github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/sys_file.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"golang.org/x/sys/unix"
    19  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    20  	"github.com/SagerNet/gvisor/pkg/context"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/hostarch"
    23  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/fs/lock"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/fs/tmpfs"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/fasync"
    31  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/limits"
    33  	"github.com/SagerNet/gvisor/pkg/syserror"
    34  )
    35  
    36  // fileOpAt performs an operation on the second last component in the path.
    37  func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, d *fs.Dirent, name string, remainingTraversals uint) error) error {
    38  	// Extract the last component.
    39  	dir, name := fs.SplitLast(path)
    40  	if dir == "/" {
    41  		// Common case: we are accessing a file in the root.
    42  		root := t.FSContext().RootDirectory()
    43  		err := fn(root, root, name, linux.MaxSymlinkTraversals)
    44  		root.DecRef(t)
    45  		return err
    46  	} else if dir == "." && dirFD == linux.AT_FDCWD {
    47  		// Common case: we are accessing a file relative to the current
    48  		// working directory; skip the look-up.
    49  		wd := t.FSContext().WorkingDirectory()
    50  		root := t.FSContext().RootDirectory()
    51  		err := fn(root, wd, name, linux.MaxSymlinkTraversals)
    52  		wd.DecRef(t)
    53  		root.DecRef(t)
    54  		return err
    55  	}
    56  
    57  	return fileOpOn(t, dirFD, dir, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error {
    58  		return fn(root, d, name, remainingTraversals)
    59  	})
    60  }
    61  
    62  // fileOpOn performs an operation on the last entry of the path.
    63  func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent, remainingTraversals uint) error) error {
    64  	var (
    65  		d   *fs.Dirent // The file.
    66  		wd  *fs.Dirent // The working directory (if required.)
    67  		rel *fs.Dirent // The relative directory for search (if required.)
    68  		f   *fs.File   // The file corresponding to dirFD (if required.)
    69  		err error
    70  	)
    71  
    72  	// Extract the working directory (maybe).
    73  	if len(path) > 0 && path[0] == '/' {
    74  		// Absolute path; rel can be nil.
    75  	} else if dirFD == linux.AT_FDCWD {
    76  		// Need to reference the working directory.
    77  		wd = t.FSContext().WorkingDirectory()
    78  		rel = wd
    79  	} else {
    80  		// Need to extract the given FD.
    81  		f = t.GetFile(dirFD)
    82  		if f == nil {
    83  			return linuxerr.EBADF
    84  		}
    85  		rel = f.Dirent
    86  		if !fs.IsDir(rel.Inode.StableAttr) {
    87  			f.DecRef(t)
    88  			return syserror.ENOTDIR
    89  		}
    90  	}
    91  
    92  	// Grab the root (always required.)
    93  	root := t.FSContext().RootDirectory()
    94  
    95  	// Lookup the node.
    96  	remainingTraversals := uint(linux.MaxSymlinkTraversals)
    97  	if resolve {
    98  		d, err = t.MountNamespace().FindInode(t, root, rel, path, &remainingTraversals)
    99  	} else {
   100  		d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals)
   101  	}
   102  	root.DecRef(t)
   103  	if wd != nil {
   104  		wd.DecRef(t)
   105  	}
   106  	if f != nil {
   107  		f.DecRef(t)
   108  	}
   109  	if err != nil {
   110  		return err
   111  	}
   112  
   113  	err = fn(root, d, remainingTraversals)
   114  	d.DecRef(t)
   115  	return err
   116  }
   117  
   118  // copyInPath copies a path in.
   119  func copyInPath(t *kernel.Task, addr hostarch.Addr, allowEmpty bool) (path string, dirPath bool, err error) {
   120  	path, err = t.CopyInString(addr, linux.PATH_MAX)
   121  	if err != nil {
   122  		return "", false, err
   123  	}
   124  	if path == "" && !allowEmpty {
   125  		return "", false, syserror.ENOENT
   126  	}
   127  
   128  	// If the path ends with a /, then checks must be enforced in various
   129  	// ways in the different callers. We pass this back to the caller.
   130  	path, dirPath = fs.TrimTrailingSlashes(path)
   131  
   132  	return path, dirPath, nil
   133  }
   134  
   135  // LINT.IfChange
   136  
   137  func openAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint) (fd uintptr, err error) {
   138  	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
   139  	if err != nil {
   140  		return 0, err
   141  	}
   142  
   143  	resolve := flags&linux.O_NOFOLLOW == 0
   144  	err = fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
   145  		// First check a few things about the filesystem before trying to get the file
   146  		// reference.
   147  		//
   148  		// It's required that Check does not try to open files not that aren't backed by
   149  		// this dirent (e.g. pipes and sockets) because this would result in opening these
   150  		// files an extra time just to check permissions.
   151  		if err := d.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
   152  			return err
   153  		}
   154  
   155  		if fs.IsSymlink(d.Inode.StableAttr) && !resolve {
   156  			return linuxerr.ELOOP
   157  		}
   158  
   159  		fileFlags := linuxToFlags(flags)
   160  		// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
   161  		fileFlags.LargeFile = true
   162  		if fs.IsDir(d.Inode.StableAttr) {
   163  			// Don't allow directories to be opened writable.
   164  			if fileFlags.Write {
   165  				return syserror.EISDIR
   166  			}
   167  		} else {
   168  			// If O_DIRECTORY is set, but the file is not a directory, then fail.
   169  			if fileFlags.Directory {
   170  				return syserror.ENOTDIR
   171  			}
   172  			// If it's a directory, then make sure.
   173  			if dirPath {
   174  				return syserror.ENOTDIR
   175  			}
   176  		}
   177  
   178  		file, err := d.Inode.GetFile(t, d, fileFlags)
   179  		if err != nil {
   180  			return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
   181  		}
   182  		defer file.DecRef(t)
   183  
   184  		// Truncate is called when O_TRUNC is specified for any kind of
   185  		// existing Dirent. Behavior is delegated to the entry's Truncate
   186  		// implementation.
   187  		if flags&linux.O_TRUNC != 0 {
   188  			if err := d.Inode.Truncate(t, d, 0); err != nil {
   189  				return err
   190  			}
   191  		}
   192  
   193  		// Success.
   194  		newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
   195  			CloseOnExec: flags&linux.O_CLOEXEC != 0,
   196  		})
   197  		if err != nil {
   198  			return err
   199  		}
   200  
   201  		// Set return result in frame.
   202  		fd = uintptr(newFD)
   203  
   204  		// Generate notification for opened file.
   205  		d.InotifyEvent(linux.IN_OPEN, 0)
   206  
   207  		return nil
   208  	})
   209  	return fd, err // Use result in frame.
   210  }
   211  
   212  func mknodAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMode) error {
   213  	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
   214  	if err != nil {
   215  		return err
   216  	}
   217  	if dirPath {
   218  		return syserror.ENOENT
   219  	}
   220  
   221  	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
   222  		if !fs.IsDir(d.Inode.StableAttr) {
   223  			return syserror.ENOTDIR
   224  		}
   225  
   226  		// Do we have the appropriate permissions on the parent?
   227  		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
   228  			return err
   229  		}
   230  
   231  		// Attempt a creation.
   232  		perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
   233  
   234  		switch mode.FileType() {
   235  		case 0:
   236  			// "Zero file type is equivalent to type S_IFREG." - mknod(2)
   237  			fallthrough
   238  		case linux.ModeRegular:
   239  			// We are not going to return the file, so the actual
   240  			// flags used don't matter, but they cannot be empty or
   241  			// Create will complain.
   242  			flags := fs.FileFlags{Read: true, Write: true}
   243  			file, err := d.Create(t, root, name, flags, perms)
   244  			if err != nil {
   245  				return err
   246  			}
   247  			file.DecRef(t)
   248  			return nil
   249  
   250  		case linux.ModeNamedPipe:
   251  			return d.CreateFifo(t, root, name, perms)
   252  
   253  		case linux.ModeSocket:
   254  			// While it is possible create a unix domain socket file on linux
   255  			// using mknod(2), in practice this is pretty useless from an
   256  			// application. Linux internally uses mknod() to create the socket
   257  			// node during bind(2), but we implement bind(2) independently. If
   258  			// an application explicitly creates a socket node using mknod(),
   259  			// you can't seem to bind() or connect() to the resulting socket.
   260  			//
   261  			// Instead of emulating this seemingly useless behaviour, we'll
   262  			// indicate that the filesystem doesn't support the creation of
   263  			// sockets.
   264  			return syserror.EOPNOTSUPP
   265  
   266  		case linux.ModeCharacterDevice:
   267  			fallthrough
   268  		case linux.ModeBlockDevice:
   269  			// TODO(b/72101894): We don't support creating block or character
   270  			// devices at the moment.
   271  			//
   272  			// When we start supporting block and character devices, we'll
   273  			// need to check for CAP_MKNOD here.
   274  			return linuxerr.EPERM
   275  
   276  		default:
   277  			// "EINVAL - mode requested creation of something other than a
   278  			// regular file, device special file, FIFO or socket." - mknod(2)
   279  			return linuxerr.EINVAL
   280  		}
   281  	})
   282  }
   283  
   284  // Mknod implements the linux syscall mknod(2).
   285  func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   286  	path := args[0].Pointer()
   287  	mode := linux.FileMode(args[1].ModeT())
   288  	// We don't need this argument until we support creation of device nodes.
   289  	_ = args[2].Uint() // dev
   290  
   291  	return 0, nil, mknodAt(t, linux.AT_FDCWD, path, mode)
   292  }
   293  
   294  // Mknodat implements the linux syscall mknodat(2).
   295  func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   296  	dirFD := args[0].Int()
   297  	path := args[1].Pointer()
   298  	mode := linux.FileMode(args[2].ModeT())
   299  	// We don't need this argument until we support creation of device nodes.
   300  	_ = args[3].Uint() // dev
   301  
   302  	return 0, nil, mknodAt(t, dirFD, path, mode)
   303  }
   304  
// createAt handles opens that carry O_CREAT: it opens the file named by the
// path at addr, looked up relative to dirFD, and creates it if the lookup of
// the final component fails with ENOENT. A newly created file gets mode with
// the task's umask bits cleared.
//
// A path with a trailing slash yields ENOENT. With O_EXCL an existing entry
// yields EEXIST, and with O_NOFOLLOW a symlink as the final component yields
// ELOOP. On success the new file descriptor is returned in fd.
func createAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, flags uint, mode linux.FileMode) (fd uintptr, err error) {
	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
	if err != nil {
		return 0, err
	}
	if dirPath {
		return 0, syserror.ENOENT
	}

	fileFlags := linuxToFlags(flags)
	// Linux always adds the O_LARGEFILE flag when running in 64-bit mode.
	fileFlags.LargeFile = true

	err = fileOpAt(t, dirFD, path, func(root *fs.Dirent, parent *fs.Dirent, name string, remainingTraversals uint) error {
		// Resolve the name to see if it exists, and follow any
		// symlinks along the way. We must do the symlink resolution
		// manually because if the symlink target does not exist, we
		// must create the target (and not the symlink itself).
		//
		// The loop leaves its outcome in found and err: err == nil means
		// an existing dirent is in found, any other err is examined by
		// the switch below. Note that the defers inside the loop run when
		// this callback returns, not at the end of each iteration, so
		// every dirent referenced during the walk stays alive until then.
		var (
			found *fs.Dirent
			err   error
		)
		for {
			if !fs.IsDir(parent.Inode.StableAttr) {
				return syserror.ENOTDIR
			}

			// Start by looking up the dirent at 'name'.
			found, err = t.MountNamespace().FindLink(t, root, parent, name, &remainingTraversals)
			if err != nil {
				break
			}
			defer found.DecRef(t)

			// We found something (possibly a symlink). If the
			// O_EXCL flag was passed, then we can immediately
			// return EEXIST.
			if flags&linux.O_EXCL != 0 {
				return syserror.EEXIST
			}

			// If we have a non-symlink, then we can proceed.
			if !fs.IsSymlink(found.Inode.StableAttr) {
				break
			}

			// If O_NOFOLLOW was passed, then don't try to resolve
			// anything.
			if flags&linux.O_NOFOLLOW != 0 {
				return linuxerr.ELOOP
			}

			// Try to resolve the symlink directly to a Dirent.
			//
			// NOTE(review): on success resolved is only released; the
			// code after the loop keeps operating on found. Confirm
			// that this is the intended target.
			var resolved *fs.Dirent
			resolved, err = found.Inode.Getlink(t)
			if err == nil {
				// No more resolution necessary.
				defer resolved.DecRef(t)
				break
			}
			if err != fs.ErrResolveViaReadlink {
				return err
			}

			// Are we able to resolve further?
			//
			// NOTE(review): this returns unix.ELOOP while the O_NOFOLLOW
			// case above returns linuxerr.ELOOP; confirm whether the
			// difference is intentional.
			if remainingTraversals == 0 {
				return unix.ELOOP
			}

			// Resolve the symlink to a path via Readlink.
			var path string
			path, err = found.Inode.Readlink(t)
			if err != nil {
				break
			}
			remainingTraversals--

			// Get the new parent from the target path. The lookup
			// starts from the current parent.
			var newParent *fs.Dirent
			newParentPath, newName := fs.SplitLast(path)
			newParent, err = t.MountNamespace().FindInode(t, root, parent, newParentPath, &remainingTraversals)
			if err != nil {
				break
			}
			defer newParent.DecRef(t)

			// Repeat the process with the parent and name of the
			// symlink target.
			parent = newParent
			name = newName
		}

		// Either open what the loop found, create the missing file, or
		// give up with the loop's error.
		var newFile *fs.File
		switch {
		case err == nil:
			// Like sys_open, check for a few things about the
			// filesystem before trying to get a reference to the
			// fs.File. The same constraints on Check apply.
			if err := found.Inode.CheckPermission(t, flagsToPermissions(flags)); err != nil {
				return err
			}

			// Truncate is called when O_TRUNC is specified for any kind of
			// existing Dirent. Behavior is delegated to the entry's Truncate
			// implementation.
			if flags&linux.O_TRUNC != 0 {
				if err := found.Inode.Truncate(t, found, 0); err != nil {
					return err
				}
			}

			// Create a new fs.File.
			newFile, err = found.Inode.GetFile(t, found, fileFlags)
			if err != nil {
				return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
			}
			defer newFile.DecRef(t)
		case linuxerr.Equals(linuxerr.ENOENT, err):
			// File does not exist. Proceed with creation.

			// Do we have write and execute permissions on the parent?
			if err := parent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
				return err
			}

			// Attempt a creation, with the task's umask applied to mode.
			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
			newFile, err = parent.Create(t, root, name, fileFlags, perms)
			if err != nil {
				// No luck, bail.
				return err
			}
			defer newFile.DecRef(t)
			// Point found at the new file's dirent so that the open
			// event below is queued on it.
			found = newFile.Dirent
		default:
			return err
		}

		// Success.
		newFD, err := t.NewFDFrom(0, newFile, kernel.FDFlags{
			CloseOnExec: flags&linux.O_CLOEXEC != 0,
		})
		if err != nil {
			return err
		}

		// Set result in frame.
		fd = uintptr(newFD)

		// Queue the open inotify event. The creation event is
		// automatically queued when the dirent is found. The open
		// events are implemented at the syscall layer so we need to
		// manually queue one here.
		found.InotifyEvent(linux.IN_OPEN, 0)

		return nil
	})
	return fd, err // Use result in frame.
}
   464  
   465  // Open implements linux syscall open(2).
   466  func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   467  	addr := args[0].Pointer()
   468  	flags := uint(args[1].Uint())
   469  	if flags&linux.O_CREAT != 0 {
   470  		mode := linux.FileMode(args[2].ModeT())
   471  		n, err := createAt(t, linux.AT_FDCWD, addr, flags, mode)
   472  		return n, nil, err
   473  	}
   474  	n, err := openAt(t, linux.AT_FDCWD, addr, flags)
   475  	return n, nil, err
   476  }
   477  
   478  // Openat implements linux syscall openat(2).
   479  func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   480  	dirFD := args[0].Int()
   481  	addr := args[1].Pointer()
   482  	flags := uint(args[2].Uint())
   483  	if flags&linux.O_CREAT != 0 {
   484  		mode := linux.FileMode(args[3].ModeT())
   485  		n, err := createAt(t, dirFD, addr, flags, mode)
   486  		return n, nil, err
   487  	}
   488  	n, err := openAt(t, dirFD, addr, flags)
   489  	return n, nil, err
   490  }
   491  
   492  // Creat implements linux syscall creat(2).
   493  func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   494  	addr := args[0].Pointer()
   495  	mode := linux.FileMode(args[1].ModeT())
   496  	n, err := createAt(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_TRUNC, mode)
   497  	return n, nil, err
   498  }
   499  
// accessContext is a context that overrides the credentials used, but
// otherwise carries the same values as the embedded context.
//
// accessContext should only be used for access(2).
type accessContext struct {
	// Context is the wrapped context. Its methods are promoted, and Value
	// falls back to it for every key other than auth.CtxCredentials.
	context.Context
	// creds is the value reported for the auth.CtxCredentials key.
	creds *auth.Credentials
}
   508  
   509  // Value implements context.Context.
   510  func (ac accessContext) Value(key interface{}) interface{} {
   511  	switch key {
   512  	case auth.CtxCredentials:
   513  		return ac.creds
   514  	default:
   515  		return ac.Context.Value(key)
   516  	}
   517  }
   518  
   519  func accessAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode uint) error {
   520  	const rOK = 4
   521  	const wOK = 2
   522  	const xOK = 1
   523  
   524  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
   525  	if err != nil {
   526  		return err
   527  	}
   528  
   529  	// Sanity check the mode.
   530  	if mode&^(rOK|wOK|xOK) != 0 {
   531  		return linuxerr.EINVAL
   532  	}
   533  
   534  	return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
   535  		// access(2) and faccessat(2) check permissions using real
   536  		// UID/GID, not effective UID/GID.
   537  		//
   538  		// "access() needs to use the real uid/gid, not the effective
   539  		// uid/gid. We do this by temporarily clearing all FS-related
   540  		// capabilities and switching the fsuid/fsgid around to the
   541  		// real ones." -fs/open.c:faccessat
   542  		creds := t.Credentials().Fork()
   543  		creds.EffectiveKUID = creds.RealKUID
   544  		creds.EffectiveKGID = creds.RealKGID
   545  		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
   546  			creds.EffectiveCaps = creds.PermittedCaps
   547  		} else {
   548  			creds.EffectiveCaps = 0
   549  		}
   550  
   551  		ctx := &accessContext{
   552  			Context: t,
   553  			creds:   creds,
   554  		}
   555  
   556  		return d.Inode.CheckPermission(ctx, fs.PermMask{
   557  			Read:    mode&rOK != 0,
   558  			Write:   mode&wOK != 0,
   559  			Execute: mode&xOK != 0,
   560  		})
   561  	})
   562  }
   563  
   564  // Access implements linux syscall access(2).
   565  func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   566  	addr := args[0].Pointer()
   567  	mode := args[1].ModeT()
   568  
   569  	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode)
   570  }
   571  
   572  // Faccessat implements linux syscall faccessat(2).
   573  //
   574  // Note that the faccessat() system call does not take a flags argument:
   575  // "The raw faccessat() system call takes only the first three arguments. The
   576  // AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within
   577  // the glibc wrapper function for faccessat().  If either of these flags is
   578  // specified, then the wrapper function employs fstatat(2) to determine access
   579  // permissions." - faccessat(2)
   580  func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   581  	dirFD := args[0].Int()
   582  	addr := args[1].Pointer()
   583  	mode := args[2].ModeT()
   584  
   585  	return 0, nil, accessAt(t, dirFD, addr, mode)
   586  }
   587  
   588  // LINT.ThenChange(vfs2/filesystem.go)
   589  
   590  // LINT.IfChange
   591  
   592  // Ioctl implements linux syscall ioctl(2).
   593  func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   594  	fd := args[0].Int()
   595  	request := int(args[1].Int())
   596  
   597  	file := t.GetFile(fd)
   598  	if file == nil {
   599  		return 0, nil, linuxerr.EBADF
   600  	}
   601  	defer file.DecRef(t)
   602  
   603  	// Shared flags between file and socket.
   604  	switch request {
   605  	case linux.FIONCLEX:
   606  		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   607  			CloseOnExec: false,
   608  		})
   609  		return 0, nil, nil
   610  	case linux.FIOCLEX:
   611  		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   612  			CloseOnExec: true,
   613  		})
   614  		return 0, nil, nil
   615  
   616  	case linux.FIONBIO:
   617  		var set int32
   618  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   619  			return 0, nil, err
   620  		}
   621  		flags := file.Flags()
   622  		if set != 0 {
   623  			flags.NonBlocking = true
   624  		} else {
   625  			flags.NonBlocking = false
   626  		}
   627  		file.SetFlags(flags.Settable())
   628  		return 0, nil, nil
   629  
   630  	case linux.FIOASYNC:
   631  		var set int32
   632  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   633  			return 0, nil, err
   634  		}
   635  		flags := file.Flags()
   636  		if set != 0 {
   637  			flags.Async = true
   638  		} else {
   639  			flags.Async = false
   640  		}
   641  		file.SetFlags(flags.Settable())
   642  		return 0, nil, nil
   643  
   644  	case linux.FIOSETOWN, linux.SIOCSPGRP:
   645  		var set int32
   646  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   647  			return 0, nil, err
   648  		}
   649  		fSetOwn(t, int(fd), file, set)
   650  		return 0, nil, nil
   651  
   652  	case linux.FIOGETOWN, linux.SIOCGPGRP:
   653  		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), fGetOwn(t, file))
   654  		return 0, nil, err
   655  
   656  	default:
   657  		ret, err := file.FileOperations.Ioctl(t, file, t.MemoryManager(), args)
   658  		if err != nil {
   659  			return 0, nil, err
   660  		}
   661  
   662  		return ret, nil, nil
   663  	}
   664  }
   665  
   666  // LINT.ThenChange(vfs2/ioctl.go)
   667  
   668  // LINT.IfChange
   669  
   670  // Getcwd implements the linux syscall getcwd(2).
   671  func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   672  	addr := args[0].Pointer()
   673  	size := args[1].SizeT()
   674  	cwd := t.FSContext().WorkingDirectory()
   675  	defer cwd.DecRef(t)
   676  	root := t.FSContext().RootDirectory()
   677  	defer root.DecRef(t)
   678  
   679  	// Get our fullname from the root and preprend unreachable if the root was
   680  	// unreachable from our current dirent this is the same behavior as on linux.
   681  	s, reachable := cwd.FullName(root)
   682  	if !reachable {
   683  		s = "(unreachable)" + s
   684  	}
   685  
   686  	// Note this is >= because we need a terminator.
   687  	if uint(len(s)) >= size {
   688  		return 0, nil, syserror.ERANGE
   689  	}
   690  
   691  	// Copy out the path name for the node.
   692  	bytes, err := t.CopyOutBytes(addr, []byte(s))
   693  	if err != nil {
   694  		return 0, nil, err
   695  	}
   696  
   697  	// Top it off with a terminator.
   698  	_, err = t.CopyOutBytes(addr+hostarch.Addr(bytes), []byte("\x00"))
   699  	return uintptr(bytes + 1), nil, err
   700  }
   701  
   702  // Chroot implements the linux syscall chroot(2).
   703  func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   704  	addr := args[0].Pointer()
   705  
   706  	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
   707  		return 0, nil, linuxerr.EPERM
   708  	}
   709  
   710  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
   711  	if err != nil {
   712  		return 0, nil, err
   713  	}
   714  
   715  	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
   716  		// Is it a directory?
   717  		if !fs.IsDir(d.Inode.StableAttr) {
   718  			return syserror.ENOTDIR
   719  		}
   720  
   721  		// Does it have execute permissions?
   722  		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
   723  			return err
   724  		}
   725  
   726  		t.FSContext().SetRootDirectory(t, d)
   727  		return nil
   728  	})
   729  }
   730  
   731  // Chdir implements the linux syscall chdir(2).
   732  func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   733  	addr := args[0].Pointer()
   734  
   735  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
   736  	if err != nil {
   737  		return 0, nil, err
   738  	}
   739  
   740  	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
   741  		// Is it a directory?
   742  		if !fs.IsDir(d.Inode.StableAttr) {
   743  			return syserror.ENOTDIR
   744  		}
   745  
   746  		// Does it have execute permissions?
   747  		if err := d.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
   748  			return err
   749  		}
   750  
   751  		t.FSContext().SetWorkingDirectory(t, d)
   752  		return nil
   753  	})
   754  }
   755  
   756  // Fchdir implements the linux syscall fchdir(2).
   757  func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   758  	fd := args[0].Int()
   759  
   760  	file := t.GetFile(fd)
   761  	if file == nil {
   762  		return 0, nil, linuxerr.EBADF
   763  	}
   764  	defer file.DecRef(t)
   765  
   766  	// Is it a directory?
   767  	if !fs.IsDir(file.Dirent.Inode.StableAttr) {
   768  		return 0, nil, syserror.ENOTDIR
   769  	}
   770  
   771  	// Does it have execute permissions?
   772  	if err := file.Dirent.Inode.CheckPermission(t, fs.PermMask{Execute: true}); err != nil {
   773  		return 0, nil, err
   774  	}
   775  
   776  	t.FSContext().SetWorkingDirectory(t, file.Dirent)
   777  	return 0, nil, nil
   778  }
   779  
   780  // LINT.ThenChange(vfs2/fscontext.go)
   781  
   782  // LINT.IfChange
   783  
   784  // Close implements linux syscall close(2).
   785  func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   786  	fd := args[0].Int()
   787  
   788  	// Note that Remove provides a reference on the file that we may use to
   789  	// flush. It is still active until we drop the final reference below
   790  	// (and other reference-holding operations complete).
   791  	file, _ := t.FDTable().Remove(t, fd)
   792  	if file == nil {
   793  		return 0, nil, linuxerr.EBADF
   794  	}
   795  	defer file.DecRef(t)
   796  
   797  	err := file.Flush(t)
   798  	return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file)
   799  }
   800  
   801  // Dup implements linux syscall dup(2).
   802  func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   803  	fd := args[0].Int()
   804  
   805  	file := t.GetFile(fd)
   806  	if file == nil {
   807  		return 0, nil, linuxerr.EBADF
   808  	}
   809  	defer file.DecRef(t)
   810  
   811  	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
   812  	if err != nil {
   813  		return 0, nil, linuxerr.EMFILE
   814  	}
   815  	return uintptr(newFD), nil, nil
   816  }
   817  
   818  // Dup2 implements linux syscall dup2(2).
   819  func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   820  	oldfd := args[0].Int()
   821  	newfd := args[1].Int()
   822  
   823  	// If oldfd is a valid file descriptor, and newfd has the same value as oldfd,
   824  	// then dup2() does nothing, and returns newfd.
   825  	if oldfd == newfd {
   826  		oldFile := t.GetFile(oldfd)
   827  		if oldFile == nil {
   828  			return 0, nil, linuxerr.EBADF
   829  		}
   830  		defer oldFile.DecRef(t)
   831  
   832  		return uintptr(newfd), nil, nil
   833  	}
   834  
   835  	// Zero out flags arg to be used by Dup3.
   836  	args[2].Value = 0
   837  	return Dup3(t, args)
   838  }
   839  
   840  // Dup3 implements linux syscall dup3(2).
   841  func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   842  	oldfd := args[0].Int()
   843  	newfd := args[1].Int()
   844  	flags := args[2].Uint()
   845  
   846  	if oldfd == newfd {
   847  		return 0, nil, linuxerr.EINVAL
   848  	}
   849  
   850  	oldFile := t.GetFile(oldfd)
   851  	if oldFile == nil {
   852  		return 0, nil, linuxerr.EBADF
   853  	}
   854  	defer oldFile.DecRef(t)
   855  
   856  	err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0})
   857  	if err != nil {
   858  		return 0, nil, err
   859  	}
   860  
   861  	return uintptr(newfd), nil, nil
   862  }
   863  
   864  func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
   865  	ma := file.Async(nil)
   866  	if ma == nil {
   867  		return linux.FOwnerEx{}
   868  	}
   869  	a := ma.(*fasync.FileAsync)
   870  	ot, otg, opg := a.Owner()
   871  	switch {
   872  	case ot != nil:
   873  		return linux.FOwnerEx{
   874  			Type: linux.F_OWNER_TID,
   875  			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
   876  		}
   877  	case otg != nil:
   878  		return linux.FOwnerEx{
   879  			Type: linux.F_OWNER_PID,
   880  			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
   881  		}
   882  	case opg != nil:
   883  		return linux.FOwnerEx{
   884  			Type: linux.F_OWNER_PGRP,
   885  			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
   886  		}
   887  	default:
   888  		return linux.FOwnerEx{}
   889  	}
   890  }
   891  
   892  func fGetOwn(t *kernel.Task, file *fs.File) int32 {
   893  	owner := fGetOwnEx(t, file)
   894  	if owner.Type == linux.F_OWNER_PGRP {
   895  		return -owner.PID
   896  	}
   897  	return owner.PID
   898  }
   899  
   900  // fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
   901  //
   902  // If who is positive, it represents a PID. If negative, it represents a PGID.
   903  // If the PID or PGID is invalid, the owner is silently unset.
   904  func fSetOwn(t *kernel.Task, fd int, file *fs.File, who int32) error {
   905  	a := file.Async(fasync.New(fd)).(*fasync.FileAsync)
   906  	if who < 0 {
   907  		// Check for overflow before flipping the sign.
   908  		if who-1 > who {
   909  			return linuxerr.EINVAL
   910  		}
   911  		pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(-who))
   912  		a.SetOwnerProcessGroup(t, pg)
   913  	} else {
   914  		tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(who))
   915  		a.SetOwnerThreadGroup(t, tg)
   916  	}
   917  	return nil
   918  }
   919  
   920  // Fcntl implements linux syscall fcntl(2).
   921  func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   922  	fd := args[0].Int()
   923  	cmd := args[1].Int()
   924  
   925  	file, flags := t.FDTable().Get(fd)
   926  	if file == nil {
   927  		return 0, nil, linuxerr.EBADF
   928  	}
   929  	defer file.DecRef(t)
   930  
   931  	switch cmd {
   932  	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
   933  		from := args[2].Int()
   934  		fd, err := t.NewFDFrom(from, file, kernel.FDFlags{
   935  			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
   936  		})
   937  		if err != nil {
   938  			return 0, nil, err
   939  		}
   940  		return uintptr(fd), nil, nil
   941  	case linux.F_GETFD:
   942  		return uintptr(flags.ToLinuxFDFlags()), nil, nil
   943  	case linux.F_SETFD:
   944  		flags := args[2].Uint()
   945  		err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   946  			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
   947  		})
   948  		return 0, nil, err
   949  	case linux.F_GETFL:
   950  		return uintptr(file.Flags().ToLinux()), nil, nil
   951  	case linux.F_SETFL:
   952  		flags := uint(args[2].Uint())
   953  		file.SetFlags(linuxToFlags(flags).Settable())
   954  		return 0, nil, nil
   955  	case linux.F_SETLK, linux.F_SETLKW:
   956  		// In Linux the file system can choose to provide lock operations for an inode.
   957  		// Normally pipe and socket types lack lock operations. We diverge and use a heavy
   958  		// hammer by only allowing locks on files and directories.
   959  		if !fs.IsFile(file.Dirent.Inode.StableAttr) && !fs.IsDir(file.Dirent.Inode.StableAttr) {
   960  			return 0, nil, linuxerr.EBADF
   961  		}
   962  
   963  		// Copy in the lock request.
   964  		flockAddr := args[2].Pointer()
   965  		var flock linux.Flock
   966  		if _, err := flock.CopyIn(t, flockAddr); err != nil {
   967  			return 0, nil, err
   968  		}
   969  
   970  		// Compute the lock whence.
   971  		var sw fs.SeekWhence
   972  		switch flock.Whence {
   973  		case 0:
   974  			sw = fs.SeekSet
   975  		case 1:
   976  			sw = fs.SeekCurrent
   977  		case 2:
   978  			sw = fs.SeekEnd
   979  		default:
   980  			return 0, nil, linuxerr.EINVAL
   981  		}
   982  
   983  		// Compute the lock offset.
   984  		var off int64
   985  		switch sw {
   986  		case fs.SeekSet:
   987  			off = 0
   988  		case fs.SeekCurrent:
   989  			// Note that Linux does not hold any mutexes while retrieving the file offset,
   990  			// see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk.
   991  			off = file.Offset()
   992  		case fs.SeekEnd:
   993  			uattr, err := file.Dirent.Inode.UnstableAttr(t)
   994  			if err != nil {
   995  				return 0, nil, err
   996  			}
   997  			off = uattr.Size
   998  		default:
   999  			return 0, nil, linuxerr.EINVAL
  1000  		}
  1001  
  1002  		// Compute the lock range.
  1003  		rng, err := lock.ComputeRange(flock.Start, flock.Len, off)
  1004  		if err != nil {
  1005  			return 0, nil, err
  1006  		}
  1007  
  1008  		// These locks don't block; execute the non-blocking operation using the inode's lock
  1009  		// context directly.
  1010  		switch flock.Type {
  1011  		case linux.F_RDLCK:
  1012  			if !file.Flags().Read {
  1013  				return 0, nil, linuxerr.EBADF
  1014  			}
  1015  			if cmd == linux.F_SETLK {
  1016  				// Non-blocking lock, provide a nil lock.Blocker.
  1017  				if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, nil) {
  1018  					return 0, nil, linuxerr.EAGAIN
  1019  				}
  1020  			} else {
  1021  				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
  1022  				if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.ReadLock, rng, t) {
  1023  					return 0, nil, syserror.EINTR
  1024  				}
  1025  			}
  1026  			return 0, nil, nil
  1027  		case linux.F_WRLCK:
  1028  			if !file.Flags().Write {
  1029  				return 0, nil, linuxerr.EBADF
  1030  			}
  1031  			if cmd == linux.F_SETLK {
  1032  				// Non-blocking lock, provide a nil lock.Blocker.
  1033  				if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, nil) {
  1034  					return 0, nil, linuxerr.EAGAIN
  1035  				}
  1036  			} else {
  1037  				// Blocking lock, pass in the task to satisfy the lock.Blocker interface.
  1038  				if !file.Dirent.Inode.LockCtx.Posix.LockRegionVFS1(t.FDTable(), lock.WriteLock, rng, t) {
  1039  					return 0, nil, syserror.EINTR
  1040  				}
  1041  			}
  1042  			return 0, nil, nil
  1043  		case linux.F_UNLCK:
  1044  			file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng)
  1045  			return 0, nil, nil
  1046  		default:
  1047  			return 0, nil, linuxerr.EINVAL
  1048  		}
  1049  	case linux.F_GETOWN:
  1050  		return uintptr(fGetOwn(t, file)), nil, nil
  1051  	case linux.F_SETOWN:
  1052  		return 0, nil, fSetOwn(t, int(fd), file, args[2].Int())
  1053  	case linux.F_GETOWN_EX:
  1054  		addr := args[2].Pointer()
  1055  		owner := fGetOwnEx(t, file)
  1056  		_, err := owner.CopyOut(t, addr)
  1057  		return 0, nil, err
  1058  	case linux.F_SETOWN_EX:
  1059  		addr := args[2].Pointer()
  1060  		var owner linux.FOwnerEx
  1061  		_, err := owner.CopyIn(t, addr)
  1062  		if err != nil {
  1063  			return 0, nil, err
  1064  		}
  1065  		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
  1066  		switch owner.Type {
  1067  		case linux.F_OWNER_TID:
  1068  			task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
  1069  			if task == nil {
  1070  				return 0, nil, syserror.ESRCH
  1071  			}
  1072  			a.SetOwnerTask(t, task)
  1073  			return 0, nil, nil
  1074  		case linux.F_OWNER_PID:
  1075  			tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
  1076  			if tg == nil {
  1077  				return 0, nil, syserror.ESRCH
  1078  			}
  1079  			a.SetOwnerThreadGroup(t, tg)
  1080  			return 0, nil, nil
  1081  		case linux.F_OWNER_PGRP:
  1082  			pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
  1083  			if pg == nil {
  1084  				return 0, nil, syserror.ESRCH
  1085  			}
  1086  			a.SetOwnerProcessGroup(t, pg)
  1087  			return 0, nil, nil
  1088  		default:
  1089  			return 0, nil, linuxerr.EINVAL
  1090  		}
  1091  	case linux.F_GET_SEALS:
  1092  		val, err := tmpfs.GetSeals(file.Dirent.Inode)
  1093  		return uintptr(val), nil, err
  1094  	case linux.F_ADD_SEALS:
  1095  		if !file.Flags().Write {
  1096  			return 0, nil, linuxerr.EPERM
  1097  		}
  1098  		err := tmpfs.AddSeals(file.Dirent.Inode, args[2].Uint())
  1099  		return 0, nil, err
  1100  	case linux.F_GETPIPE_SZ:
  1101  		sz, ok := file.FileOperations.(fs.FifoSizer)
  1102  		if !ok {
  1103  			return 0, nil, linuxerr.EINVAL
  1104  		}
  1105  		size, err := sz.FifoSize(t, file)
  1106  		return uintptr(size), nil, err
  1107  	case linux.F_SETPIPE_SZ:
  1108  		sz, ok := file.FileOperations.(fs.FifoSizer)
  1109  		if !ok {
  1110  			return 0, nil, linuxerr.EINVAL
  1111  		}
  1112  		n, err := sz.SetFifoSize(int64(args[2].Int()))
  1113  		return uintptr(n), nil, err
  1114  	case linux.F_GETSIG:
  1115  		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
  1116  		return uintptr(a.Signal()), nil, nil
  1117  	case linux.F_SETSIG:
  1118  		a := file.Async(fasync.New(int(fd))).(*fasync.FileAsync)
  1119  		return 0, nil, a.SetSignal(linux.Signal(args[2].Int()))
  1120  	default:
  1121  		// Everything else is not yet supported.
  1122  		return 0, nil, linuxerr.EINVAL
  1123  	}
  1124  }
  1125  
  1126  // Fadvise64 implements linux syscall fadvise64(2).
  1127  // This implementation currently ignores the provided advice.
  1128  func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1129  	fd := args[0].Int()
  1130  	length := args[2].Int64()
  1131  	advice := args[3].Int()
  1132  
  1133  	// Note: offset is allowed to be negative.
  1134  	if length < 0 {
  1135  		return 0, nil, linuxerr.EINVAL
  1136  	}
  1137  
  1138  	file := t.GetFile(fd)
  1139  	if file == nil {
  1140  		return 0, nil, linuxerr.EBADF
  1141  	}
  1142  	defer file.DecRef(t)
  1143  
  1144  	// If the FD refers to a pipe or FIFO, return error.
  1145  	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
  1146  		return 0, nil, linuxerr.ESPIPE
  1147  	}
  1148  
  1149  	switch advice {
  1150  	case linux.POSIX_FADV_NORMAL:
  1151  	case linux.POSIX_FADV_RANDOM:
  1152  	case linux.POSIX_FADV_SEQUENTIAL:
  1153  	case linux.POSIX_FADV_WILLNEED:
  1154  	case linux.POSIX_FADV_DONTNEED:
  1155  	case linux.POSIX_FADV_NOREUSE:
  1156  	default:
  1157  		return 0, nil, linuxerr.EINVAL
  1158  	}
  1159  
  1160  	// Sure, whatever.
  1161  	return 0, nil, nil
  1162  }
  1163  
  1164  // LINT.ThenChange(vfs2/fd.go)
  1165  
  1166  // LINT.IfChange
  1167  
  1168  func mkdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, mode linux.FileMode) error {
  1169  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
  1170  	if err != nil {
  1171  		return err
  1172  	}
  1173  
  1174  	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
  1175  		if !fs.IsDir(d.Inode.StableAttr) {
  1176  			return syserror.ENOTDIR
  1177  		}
  1178  
  1179  		// Does this directory exist already?
  1180  		remainingTraversals := uint(linux.MaxSymlinkTraversals)
  1181  		f, err := t.MountNamespace().FindInode(t, root, d, name, &remainingTraversals)
  1182  		switch {
  1183  		case err == nil:
  1184  			// The directory existed.
  1185  			defer f.DecRef(t)
  1186  			return syserror.EEXIST
  1187  		case linuxerr.Equals(linuxerr.EACCES, err):
  1188  			// Permission denied while walking to the directory.
  1189  			return err
  1190  		default:
  1191  			// Do we have write permissions on the parent?
  1192  			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
  1193  				return err
  1194  			}
  1195  
  1196  			// Create the directory.
  1197  			perms := fs.FilePermsFromMode(mode &^ linux.FileMode(t.FSContext().Umask()))
  1198  			return d.CreateDirectory(t, root, name, perms)
  1199  		}
  1200  	})
  1201  }
  1202  
  1203  // Mkdir implements linux syscall mkdir(2).
  1204  func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1205  	addr := args[0].Pointer()
  1206  	mode := linux.FileMode(args[1].ModeT())
  1207  
  1208  	return 0, nil, mkdirAt(t, linux.AT_FDCWD, addr, mode)
  1209  }
  1210  
  1211  // Mkdirat implements linux syscall mkdirat(2).
  1212  func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1213  	dirFD := args[0].Int()
  1214  	addr := args[1].Pointer()
  1215  	mode := linux.FileMode(args[2].ModeT())
  1216  
  1217  	return 0, nil, mkdirAt(t, dirFD, addr, mode)
  1218  }
  1219  
  1220  func rmdirAt(t *kernel.Task, dirFD int32, addr hostarch.Addr) error {
  1221  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
  1222  	if err != nil {
  1223  		return err
  1224  	}
  1225  
  1226  	// Special case: removing the root always returns EBUSY.
  1227  	if path == "/" {
  1228  		return linuxerr.EBUSY
  1229  	}
  1230  
  1231  	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
  1232  		if !fs.IsDir(d.Inode.StableAttr) {
  1233  			return syserror.ENOTDIR
  1234  		}
  1235  
  1236  		// Linux returns different ernos when the path ends in single
  1237  		// dot vs. double dots.
  1238  		switch name {
  1239  		case ".":
  1240  			return linuxerr.EINVAL
  1241  		case "..":
  1242  			return linuxerr.ENOTEMPTY
  1243  		}
  1244  
  1245  		if err := d.MayDelete(t, root, name); err != nil {
  1246  			return err
  1247  		}
  1248  
  1249  		return d.RemoveDirectory(t, root, name)
  1250  	})
  1251  }
  1252  
  1253  // Rmdir implements linux syscall rmdir(2).
  1254  func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1255  	addr := args[0].Pointer()
  1256  
  1257  	return 0, nil, rmdirAt(t, linux.AT_FDCWD, addr)
  1258  }
  1259  
  1260  func symlinkAt(t *kernel.Task, dirFD int32, newAddr hostarch.Addr, oldAddr hostarch.Addr) error {
  1261  	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
  1262  	if err != nil {
  1263  		return err
  1264  	}
  1265  	if dirPath {
  1266  		return syserror.ENOENT
  1267  	}
  1268  
  1269  	// The oldPath is copied in verbatim. This is because the symlink
  1270  	// will include all details, including trailing slashes.
  1271  	oldPath, err := t.CopyInString(oldAddr, linux.PATH_MAX)
  1272  	if err != nil {
  1273  		return err
  1274  	}
  1275  	if oldPath == "" {
  1276  		return syserror.ENOENT
  1277  	}
  1278  
  1279  	return fileOpAt(t, dirFD, newPath, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
  1280  		if !fs.IsDir(d.Inode.StableAttr) {
  1281  			return syserror.ENOTDIR
  1282  		}
  1283  
  1284  		// Make sure we have write permissions on the parent directory.
  1285  		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
  1286  			return err
  1287  		}
  1288  		return d.CreateLink(t, root, oldPath, name)
  1289  	})
  1290  }
  1291  
  1292  // Symlink implements linux syscall symlink(2).
  1293  func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1294  	oldAddr := args[0].Pointer()
  1295  	newAddr := args[1].Pointer()
  1296  
  1297  	return 0, nil, symlinkAt(t, linux.AT_FDCWD, newAddr, oldAddr)
  1298  }
  1299  
  1300  // Symlinkat implements linux syscall symlinkat(2).
  1301  func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1302  	oldAddr := args[0].Pointer()
  1303  	dirFD := args[1].Int()
  1304  	newAddr := args[2].Pointer()
  1305  
  1306  	return 0, nil, symlinkAt(t, dirFD, newAddr, oldAddr)
  1307  }
  1308  
  1309  // mayLinkAt determines whether t can create a hard link to target.
  1310  //
  1311  // This corresponds to Linux's fs/namei.c:may_linkat.
  1312  func mayLinkAt(t *kernel.Task, target *fs.Inode) error {
  1313  	// Linux will impose the following restrictions on hard links only if
  1314  	// sysctl_protected_hardlinks is enabled. The kernel disables this
  1315  	// setting by default for backward compatibility (see commit
  1316  	// 561ec64ae67e), but also recommends that distributions enable it (and
  1317  	// Debian does:
  1318  	// https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=889098).
  1319  	//
  1320  	// gVisor currently behaves as though sysctl_protected_hardlinks is
  1321  	// always enabled, and thus imposes the following restrictions on hard
  1322  	// links.
  1323  
  1324  	if target.CheckOwnership(t) {
  1325  		// fs/namei.c:may_linkat: "Source inode owner (or CAP_FOWNER)
  1326  		// can hardlink all they like."
  1327  		return nil
  1328  	}
  1329  
  1330  	// If we are not the owner, then the file must be regular and have
  1331  	// Read+Write permissions.
  1332  	if !fs.IsRegular(target.StableAttr) {
  1333  		return linuxerr.EPERM
  1334  	}
  1335  	if target.CheckPermission(t, fs.PermMask{Read: true, Write: true}) != nil {
  1336  		return linuxerr.EPERM
  1337  	}
  1338  
  1339  	return nil
  1340  }
  1341  
  1342  // linkAt creates a hard link to the target specified by oldDirFD and oldAddr,
  1343  // specified by newDirFD and newAddr.  If resolve is true, then the symlinks
  1344  // will be followed when evaluating the target.
  1345  func linkAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int32, newAddr hostarch.Addr, resolve, allowEmpty bool) error {
  1346  	oldPath, _, err := copyInPath(t, oldAddr, allowEmpty)
  1347  	if err != nil {
  1348  		return err
  1349  	}
  1350  	newPath, dirPath, err := copyInPath(t, newAddr, false /* allowEmpty */)
  1351  	if err != nil {
  1352  		return err
  1353  	}
  1354  	if dirPath {
  1355  		return syserror.ENOENT
  1356  	}
  1357  
  1358  	if allowEmpty && oldPath == "" {
  1359  		target := t.GetFile(oldDirFD)
  1360  		if target == nil {
  1361  			return linuxerr.EBADF
  1362  		}
  1363  		defer target.DecRef(t)
  1364  		if err := mayLinkAt(t, target.Dirent.Inode); err != nil {
  1365  			return err
  1366  		}
  1367  
  1368  		// Resolve the target directory.
  1369  		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
  1370  			if !fs.IsDir(newParent.Inode.StableAttr) {
  1371  				return syserror.ENOTDIR
  1372  			}
  1373  
  1374  			// Make sure we have write permissions on the parent directory.
  1375  			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
  1376  				return err
  1377  			}
  1378  			return newParent.CreateHardLink(t, root, target.Dirent, newName)
  1379  		})
  1380  	}
  1381  
  1382  	// Resolve oldDirFD and oldAddr to a dirent.  The "resolve" argument
  1383  	// only applies to this name.
  1384  	return fileOpOn(t, oldDirFD, oldPath, resolve, func(root *fs.Dirent, target *fs.Dirent, _ uint) error {
  1385  		if err := mayLinkAt(t, target.Inode); err != nil {
  1386  			return err
  1387  		}
  1388  
  1389  		// Next resolve newDirFD and newAddr to the parent dirent and name.
  1390  		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
  1391  			if !fs.IsDir(newParent.Inode.StableAttr) {
  1392  				return syserror.ENOTDIR
  1393  			}
  1394  
  1395  			// Make sure we have write permissions on the parent directory.
  1396  			if err := newParent.Inode.CheckPermission(t, fs.PermMask{Write: true, Execute: true}); err != nil {
  1397  				return err
  1398  			}
  1399  			return newParent.CreateHardLink(t, root, target, newName)
  1400  		})
  1401  	})
  1402  }
  1403  
  1404  // Link implements linux syscall link(2).
  1405  func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1406  	oldAddr := args[0].Pointer()
  1407  	newAddr := args[1].Pointer()
  1408  
  1409  	// man link(2):
  1410  	// POSIX.1-2001 says that link() should dereference oldpath if it is a
  1411  	// symbolic link. However, since kernel 2.0, Linux does not do so: if
  1412  	// oldpath is a symbolic link, then newpath is created as a (hard) link
  1413  	// to the same symbolic link file (i.e., newpath becomes a symbolic
  1414  	// link to the same file that oldpath refers to).
  1415  	resolve := false
  1416  	return 0, nil, linkAt(t, linux.AT_FDCWD, oldAddr, linux.AT_FDCWD, newAddr, resolve, false /* allowEmpty */)
  1417  }
  1418  
  1419  // Linkat implements linux syscall linkat(2).
  1420  func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1421  	oldDirFD := args[0].Int()
  1422  	oldAddr := args[1].Pointer()
  1423  	newDirFD := args[2].Int()
  1424  	newAddr := args[3].Pointer()
  1425  
  1426  	// man linkat(2):
  1427  	// By default, linkat(), does not dereference oldpath if it is a
  1428  	// symbolic link (like link(2)). Since Linux 2.6.18, the flag
  1429  	// AT_SYMLINK_FOLLOW can be specified in flags to cause oldpath to be
  1430  	// dereferenced if it is a symbolic link.
  1431  	flags := args[4].Int()
  1432  
  1433  	// Sanity check flags.
  1434  	if flags&^(linux.AT_SYMLINK_FOLLOW|linux.AT_EMPTY_PATH) != 0 {
  1435  		return 0, nil, linuxerr.EINVAL
  1436  	}
  1437  
  1438  	resolve := flags&linux.AT_SYMLINK_FOLLOW == linux.AT_SYMLINK_FOLLOW
  1439  	allowEmpty := flags&linux.AT_EMPTY_PATH == linux.AT_EMPTY_PATH
  1440  
  1441  	if allowEmpty && !t.HasCapabilityIn(linux.CAP_DAC_READ_SEARCH, t.UserNamespace().Root()) {
  1442  		return 0, nil, syserror.ENOENT
  1443  	}
  1444  
  1445  	return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
  1446  }
  1447  
  1448  // LINT.ThenChange(vfs2/filesystem.go)
  1449  
  1450  // LINT.IfChange
  1451  
  1452  func readlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr, bufAddr hostarch.Addr, size uint) (copied uintptr, err error) {
  1453  	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
  1454  	if err != nil {
  1455  		return 0, err
  1456  	}
  1457  	if dirPath {
  1458  		return 0, syserror.ENOENT
  1459  	}
  1460  
  1461  	err = fileOpOn(t, dirFD, path, false /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
  1462  		// Check for Read permission.
  1463  		if err := d.Inode.CheckPermission(t, fs.PermMask{Read: true}); err != nil {
  1464  			return err
  1465  		}
  1466  
  1467  		s, err := d.Inode.Readlink(t)
  1468  		if linuxerr.Equals(linuxerr.ENOLINK, err) {
  1469  			return linuxerr.EINVAL
  1470  		}
  1471  		if err != nil {
  1472  			return err
  1473  		}
  1474  
  1475  		buffer := []byte(s)
  1476  		if uint(len(buffer)) > size {
  1477  			buffer = buffer[:size]
  1478  		}
  1479  
  1480  		n, err := t.CopyOutBytes(bufAddr, buffer)
  1481  
  1482  		// Update frame return value.
  1483  		copied = uintptr(n)
  1484  
  1485  		return err
  1486  	})
  1487  	return copied, err // Return frame value.
  1488  }
  1489  
  1490  // Readlink implements linux syscall readlink(2).
  1491  func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1492  	addr := args[0].Pointer()
  1493  	bufAddr := args[1].Pointer()
  1494  	size := args[2].SizeT()
  1495  
  1496  	n, err := readlinkAt(t, linux.AT_FDCWD, addr, bufAddr, size)
  1497  	return n, nil, err
  1498  }
  1499  
  1500  // Readlinkat implements linux syscall readlinkat(2).
  1501  func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1502  	dirFD := args[0].Int()
  1503  	addr := args[1].Pointer()
  1504  	bufAddr := args[2].Pointer()
  1505  	size := args[3].SizeT()
  1506  
  1507  	n, err := readlinkAt(t, dirFD, addr, bufAddr, size)
  1508  	return n, nil, err
  1509  }
  1510  
  1511  // LINT.ThenChange(vfs2/stat.go)
  1512  
  1513  // LINT.IfChange
  1514  
  1515  func unlinkAt(t *kernel.Task, dirFD int32, addr hostarch.Addr) error {
  1516  	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
  1517  	if err != nil {
  1518  		return err
  1519  	}
  1520  
  1521  	return fileOpAt(t, dirFD, path, func(root *fs.Dirent, d *fs.Dirent, name string, _ uint) error {
  1522  		if !fs.IsDir(d.Inode.StableAttr) {
  1523  			return syserror.ENOTDIR
  1524  		}
  1525  
  1526  		if err := d.MayDelete(t, root, name); err != nil {
  1527  			return err
  1528  		}
  1529  
  1530  		return d.Remove(t, root, name, dirPath)
  1531  	})
  1532  }
  1533  
  1534  // Unlink implements linux syscall unlink(2).
  1535  func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1536  	addr := args[0].Pointer()
  1537  	return 0, nil, unlinkAt(t, linux.AT_FDCWD, addr)
  1538  }
  1539  
  1540  // Unlinkat implements linux syscall unlinkat(2).
  1541  func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1542  	dirFD := args[0].Int()
  1543  	addr := args[1].Pointer()
  1544  	flags := args[2].Uint()
  1545  	if flags&linux.AT_REMOVEDIR != 0 {
  1546  		return 0, nil, rmdirAt(t, dirFD, addr)
  1547  	}
  1548  	return 0, nil, unlinkAt(t, dirFD, addr)
  1549  }
  1550  
  1551  // LINT.ThenChange(vfs2/filesystem.go)
  1552  
  1553  // LINT.IfChange
  1554  
  1555  // Truncate implements linux syscall truncate(2).
  1556  func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1557  	addr := args[0].Pointer()
  1558  	length := args[1].Int64()
  1559  
  1560  	if length < 0 {
  1561  		return 0, nil, linuxerr.EINVAL
  1562  	}
  1563  
  1564  	path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
  1565  	if err != nil {
  1566  		return 0, nil, err
  1567  	}
  1568  	if dirPath {
  1569  		return 0, nil, linuxerr.EINVAL
  1570  	}
  1571  
  1572  	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
  1573  		t.SendSignal(&linux.SignalInfo{
  1574  			Signo: int32(linux.SIGXFSZ),
  1575  			Code:  linux.SI_USER,
  1576  		})
  1577  		return 0, nil, linuxerr.EFBIG
  1578  	}
  1579  
  1580  	return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
  1581  		if fs.IsDir(d.Inode.StableAttr) {
  1582  			return syserror.EISDIR
  1583  		}
  1584  		// In contrast to open(O_TRUNC), truncate(2) is only valid for file
  1585  		// types.
  1586  		if !fs.IsFile(d.Inode.StableAttr) {
  1587  			return linuxerr.EINVAL
  1588  		}
  1589  
  1590  		// Reject truncation if the access permissions do not allow truncation.
  1591  		// This is different from the behavior of sys_ftruncate, see below.
  1592  		if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
  1593  			return err
  1594  		}
  1595  
  1596  		if err := d.Inode.Truncate(t, d, length); err != nil {
  1597  			return err
  1598  		}
  1599  
  1600  		// File length modified, generate notification.
  1601  		d.InotifyEvent(linux.IN_MODIFY, 0)
  1602  
  1603  		return nil
  1604  	})
  1605  }
  1606  
  1607  // Ftruncate implements linux syscall ftruncate(2).
  1608  func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1609  	fd := args[0].Int()
  1610  	length := args[1].Int64()
  1611  
  1612  	file := t.GetFile(fd)
  1613  	if file == nil {
  1614  		return 0, nil, linuxerr.EBADF
  1615  	}
  1616  	defer file.DecRef(t)
  1617  
  1618  	// Reject truncation if the file flags do not permit this operation.
  1619  	// This is different from truncate(2) above.
  1620  	if !file.Flags().Write {
  1621  		return 0, nil, linuxerr.EINVAL
  1622  	}
  1623  
  1624  	// In contrast to open(O_TRUNC), truncate(2) is only valid for file
  1625  	// types. Note that this is different from truncate(2) above, where a
  1626  	// directory returns EISDIR.
  1627  	if !fs.IsFile(file.Dirent.Inode.StableAttr) {
  1628  		return 0, nil, linuxerr.EINVAL
  1629  	}
  1630  
  1631  	if length < 0 {
  1632  		return 0, nil, linuxerr.EINVAL
  1633  	}
  1634  
  1635  	if uint64(length) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
  1636  		t.SendSignal(&linux.SignalInfo{
  1637  			Signo: int32(linux.SIGXFSZ),
  1638  			Code:  linux.SI_USER,
  1639  		})
  1640  		return 0, nil, linuxerr.EFBIG
  1641  	}
  1642  
  1643  	if err := file.Dirent.Inode.Truncate(t, file.Dirent, length); err != nil {
  1644  		return 0, nil, err
  1645  	}
  1646  
  1647  	// File length modified, generate notification.
  1648  	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
  1649  
  1650  	return 0, nil, nil
  1651  }
  1652  
  1653  // LINT.ThenChange(vfs2/setstat.go)
  1654  
  1655  // Umask implements linux syscall umask(2).
  1656  func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1657  	mask := args[0].ModeT()
  1658  	mask = t.FSContext().SwapUmask(mask & 0777)
  1659  	return uintptr(mask), nil, nil
  1660  }
  1661  
  1662  // LINT.IfChange
  1663  
  1664  // Change ownership of a file.
  1665  //
  1666  // uid and gid may be -1, in which case they will not be changed.
  1667  func chown(t *kernel.Task, d *fs.Dirent, uid auth.UID, gid auth.GID) error {
  1668  	owner := fs.FileOwner{
  1669  		UID: auth.NoID,
  1670  		GID: auth.NoID,
  1671  	}
  1672  
  1673  	uattr, err := d.Inode.UnstableAttr(t)
  1674  	if err != nil {
  1675  		return err
  1676  	}
  1677  
  1678  	c := t.Credentials()
  1679  	hasCap := d.Inode.CheckCapability(t, linux.CAP_CHOWN)
  1680  	isOwner := uattr.Owner.UID == c.EffectiveKUID
  1681  	var clearPrivilege bool
  1682  	if uid.Ok() {
  1683  		kuid := c.UserNamespace.MapToKUID(uid)
  1684  		// Valid UID must be supplied if UID is to be changed.
  1685  		if !kuid.Ok() {
  1686  			return linuxerr.EINVAL
  1687  		}
  1688  
  1689  		// "Only a privileged process (CAP_CHOWN) may change the owner
  1690  		// of a file." -chown(2)
  1691  		//
  1692  		// Linux also allows chown if you own the file and are
  1693  		// explicitly not changing its UID.
  1694  		isNoop := uattr.Owner.UID == kuid
  1695  		if !(hasCap || (isOwner && isNoop)) {
  1696  			return linuxerr.EPERM
  1697  		}
  1698  
  1699  		// The setuid and setgid bits are cleared during a chown.
  1700  		if uattr.Owner.UID != kuid {
  1701  			clearPrivilege = true
  1702  		}
  1703  
  1704  		owner.UID = kuid
  1705  	}
  1706  	if gid.Ok() {
  1707  		kgid := c.UserNamespace.MapToKGID(gid)
  1708  		// Valid GID must be supplied if GID is to be changed.
  1709  		if !kgid.Ok() {
  1710  			return linuxerr.EINVAL
  1711  		}
  1712  
  1713  		// "The owner of a file may change the group of the file to any
  1714  		// group of which that owner is a member. A privileged process
  1715  		// (CAP_CHOWN) may change the group arbitrarily." -chown(2)
  1716  		isNoop := uattr.Owner.GID == kgid
  1717  		isMemberGroup := c.InGroup(kgid)
  1718  		if !(hasCap || (isOwner && (isNoop || isMemberGroup))) {
  1719  			return linuxerr.EPERM
  1720  		}
  1721  
  1722  		// The setuid and setgid bits are cleared during a chown.
  1723  		if uattr.Owner.GID != kgid {
  1724  			clearPrivilege = true
  1725  		}
  1726  
  1727  		owner.GID = kgid
  1728  	}
  1729  
  1730  	// FIXME(b/62949101): This is racy; the inode's owner may have changed in
  1731  	// the meantime. (Linux holds i_mutex while calling
  1732  	// fs/attr.c:notify_change() => inode_operations::setattr =>
  1733  	// inode_change_ok().)
  1734  	if err := d.Inode.SetOwner(t, d, owner); err != nil {
  1735  		return err
  1736  	}
  1737  	// Clear privilege bits if needed and they are set.
  1738  	if clearPrivilege && uattr.Perms.HasSetUIDOrGID() && !fs.IsDir(d.Inode.StableAttr) {
  1739  		uattr.Perms.DropSetUIDAndMaybeGID()
  1740  		if !d.Inode.SetPermissions(t, d, uattr.Perms) {
  1741  			return linuxerr.EPERM
  1742  		}
  1743  	}
  1744  
  1745  	return nil
  1746  }
  1747  
  1748  func chownAt(t *kernel.Task, fd int32, addr hostarch.Addr, resolve, allowEmpty bool, uid auth.UID, gid auth.GID) error {
  1749  	path, _, err := copyInPath(t, addr, allowEmpty)
  1750  	if err != nil {
  1751  		return err
  1752  	}
  1753  
  1754  	if path == "" {
  1755  		// Annoying. What's wrong with fchown?
  1756  		file := t.GetFile(fd)
  1757  		if file == nil {
  1758  			return linuxerr.EBADF
  1759  		}
  1760  		defer file.DecRef(t)
  1761  
  1762  		return chown(t, file.Dirent, uid, gid)
  1763  	}
  1764  
  1765  	return fileOpOn(t, fd, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
  1766  		return chown(t, d, uid, gid)
  1767  	})
  1768  }
  1769  
  1770  // Chown implements linux syscall chown(2).
  1771  func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1772  	addr := args[0].Pointer()
  1773  	uid := auth.UID(args[1].Uint())
  1774  	gid := auth.GID(args[2].Uint())
  1775  
  1776  	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, true /* resolve */, false /* allowEmpty */, uid, gid)
  1777  }
  1778  
  1779  // Lchown implements linux syscall lchown(2).
  1780  func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1781  	addr := args[0].Pointer()
  1782  	uid := auth.UID(args[1].Uint())
  1783  	gid := auth.GID(args[2].Uint())
  1784  
  1785  	return 0, nil, chownAt(t, linux.AT_FDCWD, addr, false /* resolve */, false /* allowEmpty */, uid, gid)
  1786  }
  1787  
  1788  // Fchown implements linux syscall fchown(2).
  1789  func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1790  	fd := args[0].Int()
  1791  	uid := auth.UID(args[1].Uint())
  1792  	gid := auth.GID(args[2].Uint())
  1793  
  1794  	file := t.GetFile(fd)
  1795  	if file == nil {
  1796  		return 0, nil, linuxerr.EBADF
  1797  	}
  1798  	defer file.DecRef(t)
  1799  
  1800  	return 0, nil, chown(t, file.Dirent, uid, gid)
  1801  }
  1802  
  1803  // Fchownat implements Linux syscall fchownat(2).
  1804  func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1805  	dirFD := args[0].Int()
  1806  	addr := args[1].Pointer()
  1807  	uid := auth.UID(args[2].Uint())
  1808  	gid := auth.GID(args[3].Uint())
  1809  	flags := args[4].Int()
  1810  
  1811  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
  1812  		return 0, nil, linuxerr.EINVAL
  1813  	}
  1814  
  1815  	return 0, nil, chownAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, flags&linux.AT_EMPTY_PATH != 0, uid, gid)
  1816  }
  1817  
  1818  func chmod(t *kernel.Task, d *fs.Dirent, mode linux.FileMode) error {
  1819  	// Must own file to change mode.
  1820  	if !d.Inode.CheckOwnership(t) {
  1821  		return linuxerr.EPERM
  1822  	}
  1823  
  1824  	p := fs.FilePermsFromMode(mode)
  1825  	if !d.Inode.SetPermissions(t, d, p) {
  1826  		return linuxerr.EPERM
  1827  	}
  1828  
  1829  	// File attribute changed, generate notification.
  1830  	d.InotifyEvent(linux.IN_ATTRIB, 0)
  1831  
  1832  	return nil
  1833  }
  1834  
  1835  func chmodAt(t *kernel.Task, fd int32, addr hostarch.Addr, mode linux.FileMode) error {
  1836  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
  1837  	if err != nil {
  1838  		return err
  1839  	}
  1840  
  1841  	return fileOpOn(t, fd, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
  1842  		return chmod(t, d, mode)
  1843  	})
  1844  }
  1845  
  1846  // Chmod implements linux syscall chmod(2).
  1847  func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1848  	addr := args[0].Pointer()
  1849  	mode := linux.FileMode(args[1].ModeT())
  1850  
  1851  	return 0, nil, chmodAt(t, linux.AT_FDCWD, addr, mode)
  1852  }
  1853  
  1854  // Fchmod implements linux syscall fchmod(2).
  1855  func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1856  	fd := args[0].Int()
  1857  	mode := linux.FileMode(args[1].ModeT())
  1858  
  1859  	file := t.GetFile(fd)
  1860  	if file == nil {
  1861  		return 0, nil, linuxerr.EBADF
  1862  	}
  1863  	defer file.DecRef(t)
  1864  
  1865  	return 0, nil, chmod(t, file.Dirent, mode)
  1866  }
  1867  
  1868  // Fchmodat implements linux syscall fchmodat(2).
  1869  func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1870  	fd := args[0].Int()
  1871  	addr := args[1].Pointer()
  1872  	mode := linux.FileMode(args[2].ModeT())
  1873  
  1874  	return 0, nil, chmodAt(t, fd, addr, mode)
  1875  }
  1876  
  1877  // defaultSetToSystemTimeSpec returns a TimeSpec that will set ATime and MTime
  1878  // to the system time.
  1879  func defaultSetToSystemTimeSpec() fs.TimeSpec {
  1880  	return fs.TimeSpec{
  1881  		ATimeSetSystemTime: true,
  1882  		MTimeSetSystemTime: true,
  1883  	}
  1884  }
  1885  
  1886  func utimes(t *kernel.Task, dirFD int32, addr hostarch.Addr, ts fs.TimeSpec, resolve bool) error {
  1887  	setTimestamp := func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
  1888  		// Does the task own the file?
  1889  		if !d.Inode.CheckOwnership(t) {
  1890  			// Trying to set a specific time? Must be owner.
  1891  			if (ts.ATimeOmit || !ts.ATimeSetSystemTime) && (ts.MTimeOmit || !ts.MTimeSetSystemTime) {
  1892  				return linuxerr.EPERM
  1893  			}
  1894  
  1895  			// Trying to set to current system time? Must have write access.
  1896  			if err := d.Inode.CheckPermission(t, fs.PermMask{Write: true}); err != nil {
  1897  				return err
  1898  			}
  1899  		}
  1900  
  1901  		if err := d.Inode.SetTimestamps(t, d, ts); err != nil {
  1902  			return err
  1903  		}
  1904  
  1905  		// File attribute changed, generate notification.
  1906  		d.InotifyEvent(linux.IN_ATTRIB, 0)
  1907  		return nil
  1908  	}
  1909  
  1910  	// From utimes.c:
  1911  	// "If filename is NULL and dfd refers to an open file, then operate on
  1912  	// the file.  Otherwise look up filename, possibly using dfd as a
  1913  	// starting point."
  1914  	if addr == 0 && dirFD != linux.AT_FDCWD {
  1915  		if !resolve {
  1916  			// Linux returns EINVAL in this case. See utimes.c.
  1917  			return linuxerr.EINVAL
  1918  		}
  1919  		f := t.GetFile(dirFD)
  1920  		if f == nil {
  1921  			return linuxerr.EBADF
  1922  		}
  1923  		defer f.DecRef(t)
  1924  
  1925  		root := t.FSContext().RootDirectory()
  1926  		defer root.DecRef(t)
  1927  
  1928  		return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals)
  1929  	}
  1930  
  1931  	path, _, err := copyInPath(t, addr, false /* allowEmpty */)
  1932  	if err != nil {
  1933  		return err
  1934  	}
  1935  
  1936  	return fileOpOn(t, dirFD, path, resolve, setTimestamp)
  1937  }
  1938  
  1939  // Utime implements linux syscall utime(2).
  1940  func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1941  	filenameAddr := args[0].Pointer()
  1942  	timesAddr := args[1].Pointer()
  1943  
  1944  	// No timesAddr argument will be interpreted as current system time.
  1945  	ts := defaultSetToSystemTimeSpec()
  1946  	if timesAddr != 0 {
  1947  		var times linux.Utime
  1948  		if _, err := times.CopyIn(t, timesAddr); err != nil {
  1949  			return 0, nil, err
  1950  		}
  1951  		ts = fs.TimeSpec{
  1952  			ATime: ktime.FromSeconds(times.Actime),
  1953  			MTime: ktime.FromSeconds(times.Modtime),
  1954  		}
  1955  	}
  1956  	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
  1957  }
  1958  
  1959  // Utimes implements linux syscall utimes(2).
  1960  func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1961  	filenameAddr := args[0].Pointer()
  1962  	timesAddr := args[1].Pointer()
  1963  
  1964  	// No timesAddr argument will be interpreted as current system time.
  1965  	ts := defaultSetToSystemTimeSpec()
  1966  	if timesAddr != 0 {
  1967  		var times [2]linux.Timeval
  1968  		if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
  1969  			return 0, nil, err
  1970  		}
  1971  		ts = fs.TimeSpec{
  1972  			ATime: ktime.FromTimeval(times[0]),
  1973  			MTime: ktime.FromTimeval(times[1]),
  1974  		}
  1975  	}
  1976  	return 0, nil, utimes(t, linux.AT_FDCWD, filenameAddr, ts, true)
  1977  }
  1978  
  1979  // timespecIsValid checks that the timespec is valid for use in utimensat.
  1980  func timespecIsValid(ts linux.Timespec) bool {
  1981  	// Nsec must be UTIME_OMIT, UTIME_NOW, or less than 10^9.
  1982  	return ts.Nsec == linux.UTIME_OMIT || ts.Nsec == linux.UTIME_NOW || ts.Nsec < 1e9
  1983  }
  1984  
  1985  // Utimensat implements linux syscall utimensat(2).
  1986  func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1987  	dirFD := args[0].Int()
  1988  	pathnameAddr := args[1].Pointer()
  1989  	timesAddr := args[2].Pointer()
  1990  	flags := args[3].Int()
  1991  
  1992  	// No timesAddr argument will be interpreted as current system time.
  1993  	ts := defaultSetToSystemTimeSpec()
  1994  	if timesAddr != 0 {
  1995  		var times [2]linux.Timespec
  1996  		if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
  1997  			return 0, nil, err
  1998  		}
  1999  		if !timespecIsValid(times[0]) || !timespecIsValid(times[1]) {
  2000  			return 0, nil, linuxerr.EINVAL
  2001  		}
  2002  
  2003  		// If both are UTIME_OMIT, this is a noop.
  2004  		if times[0].Nsec == linux.UTIME_OMIT && times[1].Nsec == linux.UTIME_OMIT {
  2005  			return 0, nil, nil
  2006  		}
  2007  
  2008  		ts = fs.TimeSpec{
  2009  			ATime:              ktime.FromTimespec(times[0]),
  2010  			ATimeOmit:          times[0].Nsec == linux.UTIME_OMIT,
  2011  			ATimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
  2012  			MTime:              ktime.FromTimespec(times[1]),
  2013  			MTimeOmit:          times[1].Nsec == linux.UTIME_OMIT,
  2014  			MTimeSetSystemTime: times[0].Nsec == linux.UTIME_NOW,
  2015  		}
  2016  	}
  2017  	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, flags&linux.AT_SYMLINK_NOFOLLOW == 0)
  2018  }
  2019  
  2020  // Futimesat implements linux syscall futimesat(2).
  2021  func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  2022  	dirFD := args[0].Int()
  2023  	pathnameAddr := args[1].Pointer()
  2024  	timesAddr := args[2].Pointer()
  2025  
  2026  	// No timesAddr argument will be interpreted as current system time.
  2027  	ts := defaultSetToSystemTimeSpec()
  2028  	if timesAddr != 0 {
  2029  		var times [2]linux.Timeval
  2030  		if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
  2031  			return 0, nil, err
  2032  		}
  2033  		if times[0].Usec >= 1e6 || times[0].Usec < 0 ||
  2034  			times[1].Usec >= 1e6 || times[1].Usec < 0 {
  2035  			return 0, nil, linuxerr.EINVAL
  2036  		}
  2037  
  2038  		ts = fs.TimeSpec{
  2039  			ATime: ktime.FromTimeval(times[0]),
  2040  			MTime: ktime.FromTimeval(times[1]),
  2041  		}
  2042  	}
  2043  	return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
  2044  }
  2045  
  2046  // LINT.ThenChange(vfs2/setstat.go)
  2047  
  2048  // LINT.IfChange
  2049  
  2050  func renameAt(t *kernel.Task, oldDirFD int32, oldAddr hostarch.Addr, newDirFD int32, newAddr hostarch.Addr) error {
  2051  	newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
  2052  	if err != nil {
  2053  		return err
  2054  	}
  2055  	oldPath, _, err := copyInPath(t, oldAddr, false /* allowEmpty */)
  2056  	if err != nil {
  2057  		return err
  2058  	}
  2059  
  2060  	return fileOpAt(t, oldDirFD, oldPath, func(root *fs.Dirent, oldParent *fs.Dirent, oldName string, _ uint) error {
  2061  		if !fs.IsDir(oldParent.Inode.StableAttr) {
  2062  			return syserror.ENOTDIR
  2063  		}
  2064  
  2065  		// Rename rejects paths that end in ".", "..", or empty (i.e.
  2066  		// the root) with EBUSY.
  2067  		switch oldName {
  2068  		case "", ".", "..":
  2069  			return linuxerr.EBUSY
  2070  		}
  2071  
  2072  		return fileOpAt(t, newDirFD, newPath, func(root *fs.Dirent, newParent *fs.Dirent, newName string, _ uint) error {
  2073  			if !fs.IsDir(newParent.Inode.StableAttr) {
  2074  				return syserror.ENOTDIR
  2075  			}
  2076  
  2077  			// Rename rejects paths that end in ".", "..", or empty
  2078  			// (i.e.  the root) with EBUSY.
  2079  			switch newName {
  2080  			case "", ".", "..":
  2081  				return linuxerr.EBUSY
  2082  			}
  2083  
  2084  			return fs.Rename(t, root, oldParent, oldName, newParent, newName)
  2085  		})
  2086  	})
  2087  }
  2088  
  2089  // Rename implements linux syscall rename(2).
  2090  func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  2091  	oldPathAddr := args[0].Pointer()
  2092  	newPathAddr := args[1].Pointer()
  2093  	return 0, nil, renameAt(t, linux.AT_FDCWD, oldPathAddr, linux.AT_FDCWD, newPathAddr)
  2094  }
  2095  
  2096  // Renameat implements linux syscall renameat(2).
  2097  func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  2098  	oldDirFD := args[0].Int()
  2099  	oldPathAddr := args[1].Pointer()
  2100  	newDirFD := args[2].Int()
  2101  	newPathAddr := args[3].Pointer()
  2102  	return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
  2103  }
  2104  
  2105  // LINT.ThenChange(vfs2/filesystem.go)
  2106  
  2107  // Fallocate implements linux system call fallocate(2).
  2108  func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  2109  	fd := args[0].Int()
  2110  	mode := args[1].Int64()
  2111  	offset := args[2].Int64()
  2112  	length := args[3].Int64()
  2113  
  2114  	file := t.GetFile(fd)
  2115  	if file == nil {
  2116  		return 0, nil, linuxerr.EBADF
  2117  	}
  2118  	defer file.DecRef(t)
  2119  
  2120  	if offset < 0 || length <= 0 {
  2121  		return 0, nil, linuxerr.EINVAL
  2122  	}
  2123  	if mode != 0 {
  2124  		t.Kernel().EmitUnimplementedEvent(t)
  2125  		return 0, nil, linuxerr.ENOTSUP
  2126  	}
  2127  	if !file.Flags().Write {
  2128  		return 0, nil, linuxerr.EBADF
  2129  	}
  2130  	if fs.IsPipe(file.Dirent.Inode.StableAttr) {
  2131  		return 0, nil, linuxerr.ESPIPE
  2132  	}
  2133  	if fs.IsDir(file.Dirent.Inode.StableAttr) {
  2134  		return 0, nil, syserror.EISDIR
  2135  	}
  2136  	if !fs.IsRegular(file.Dirent.Inode.StableAttr) {
  2137  		return 0, nil, linuxerr.ENODEV
  2138  	}
  2139  	size := offset + length
  2140  	if size < 0 {
  2141  		return 0, nil, linuxerr.EFBIG
  2142  	}
  2143  	if uint64(size) >= t.ThreadGroup().Limits().Get(limits.FileSize).Cur {
  2144  		t.SendSignal(&linux.SignalInfo{
  2145  			Signo: int32(linux.SIGXFSZ),
  2146  			Code:  linux.SI_USER,
  2147  		})
  2148  		return 0, nil, linuxerr.EFBIG
  2149  	}
  2150  
  2151  	if err := file.Dirent.Inode.Allocate(t, file.Dirent, offset, length); err != nil {
  2152  		return 0, nil, err
  2153  	}
  2154  
  2155  	// File length modified, generate notification.
  2156  	file.Dirent.InotifyEvent(linux.IN_MODIFY, 0)
  2157  
  2158  	return 0, nil, nil
  2159  }
  2160  
  2161  // Flock implements linux syscall flock(2).
  2162  func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  2163  	fd := args[0].Int()
  2164  	operation := args[1].Int()
  2165  
  2166  	file := t.GetFile(fd)
  2167  	if file == nil {
  2168  		// flock(2): EBADF fd is not an open file descriptor.
  2169  		return 0, nil, linuxerr.EBADF
  2170  	}
  2171  	defer file.DecRef(t)
  2172  
  2173  	nonblocking := operation&linux.LOCK_NB != 0
  2174  	operation &^= linux.LOCK_NB
  2175  
  2176  	// A BSD style lock spans the entire file.
  2177  	rng := lock.LockRange{
  2178  		Start: 0,
  2179  		End:   lock.LockEOF,
  2180  	}
  2181  
  2182  	switch operation {
  2183  	case linux.LOCK_EX:
  2184  		if nonblocking {
  2185  			// Since we're nonblocking we pass a nil lock.Blocker implementation.
  2186  			if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, nil) {
  2187  				return 0, nil, linuxerr.EWOULDBLOCK
  2188  			}
  2189  		} else {
  2190  			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
  2191  			if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.WriteLock, rng, t) {
  2192  				return 0, nil, syserror.EINTR
  2193  			}
  2194  		}
  2195  	case linux.LOCK_SH:
  2196  		if nonblocking {
  2197  			// Since we're nonblocking we pass a nil lock.Blocker implementation.
  2198  			if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, nil) {
  2199  				return 0, nil, linuxerr.EWOULDBLOCK
  2200  			}
  2201  		} else {
  2202  			// Because we're blocking we will pass the task to satisfy the lock.Blocker interface.
  2203  			if !file.Dirent.Inode.LockCtx.BSD.LockRegionVFS1(file, lock.ReadLock, rng, t) {
  2204  				return 0, nil, syserror.EINTR
  2205  			}
  2206  		}
  2207  	case linux.LOCK_UN:
  2208  		file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng)
  2209  	default:
  2210  		// flock(2): EINVAL operation is invalid.
  2211  		return 0, nil, linuxerr.EINVAL
  2212  	}
  2213  
  2214  	return 0, nil, nil
  2215  }
  2216  
// Constants used by MemfdCreate.
const (
	// memfdPrefix is prepended to the user-supplied name to form the name
	// of the dirent backing a memfd file.
	memfdPrefix     = "/memfd:"
	// memfdAllFlags is the set of flag bits MemfdCreate accepts; any other
	// bit makes the call fail with EINVAL.
	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
	// memfdMaxNameLen is the longest user-supplied name (prefix excluded)
	// MemfdCreate accepts; longer names fail with EINVAL.
	// NOTE(review): the "+ 1" presumably compensates for the leading "/"
	// in memfdPrefix, which Linux's own "memfd:" prefix lacks; confirm.
	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + 1
)
  2222  
  2223  // MemfdCreate implements the linux syscall memfd_create(2).
  2224  func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  2225  	addr := args[0].Pointer()
  2226  	flags := args[1].Uint()
  2227  
  2228  	if flags&^memfdAllFlags != 0 {
  2229  		// Unknown bits in flags.
  2230  		return 0, nil, linuxerr.EINVAL
  2231  	}
  2232  
  2233  	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
  2234  	cloExec := flags&linux.MFD_CLOEXEC != 0
  2235  
  2236  	name, err := t.CopyInString(addr, unix.PathMax-len(memfdPrefix))
  2237  	if err != nil {
  2238  		return 0, nil, err
  2239  	}
  2240  	if len(name) > memfdMaxNameLen {
  2241  		return 0, nil, linuxerr.EINVAL
  2242  	}
  2243  	name = memfdPrefix + name
  2244  
  2245  	inode := tmpfs.NewMemfdInode(t, allowSeals)
  2246  	dirent := fs.NewDirent(t, inode, name)
  2247  	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
  2248  	// FMODE_READ | FMODE_WRITE.
  2249  	file, err := inode.GetFile(t, dirent, fs.FileFlags{Read: true, Write: true})
  2250  	if err != nil {
  2251  		return 0, nil, err
  2252  	}
  2253  
  2254  	defer dirent.DecRef(t)
  2255  	defer file.DecRef(t)
  2256  
  2257  	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{
  2258  		CloseOnExec: cloExec,
  2259  	})
  2260  	if err != nil {
  2261  		return 0, nil, err
  2262  	}
  2263  
  2264  	return uintptr(newFD), nil, nil
  2265  }