github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/syscalls/linux/sys_file.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"math"
    19  
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/gohacks"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/marshal/primitive"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/lock"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/fasync"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/pipe"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
    34  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    35  )
    36  
    37  // Mknod implements Linux syscall mknod(2).
    38  func Mknod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    39  	addr := args[0].Pointer()
    40  	mode := args[1].ModeT()
    41  	dev := args[2].Uint()
    42  	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
    43  }
    44  
    45  // Mknodat implements Linux syscall mknodat(2).
    46  func Mknodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    47  	dirfd := args[0].Int()
    48  	addr := args[1].Pointer()
    49  	mode := args[2].ModeT()
    50  	dev := args[3].Uint()
    51  	return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
    52  }
    53  
    54  func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error {
    55  	path, err := copyInPath(t, addr)
    56  	if err != nil {
    57  		return err
    58  	}
    59  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
    60  	if err != nil {
    61  		return err
    62  	}
    63  	defer tpop.Release(t)
    64  
    65  	// "Zero file type is equivalent to type S_IFREG." - mknod(2)
    66  	if mode.FileType() == 0 {
    67  		mode |= linux.ModeRegular
    68  	}
    69  	major, minor := linux.DecodeDeviceID(dev)
    70  	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
    71  		Mode:     mode &^ linux.FileMode(t.FSContext().Umask()),
    72  		DevMajor: uint32(major),
    73  		DevMinor: minor,
    74  	})
    75  }
    76  
    77  // Open implements Linux syscall open(2).
    78  func Open(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    79  	addr := args[0].Pointer()
    80  	flags := args[1].Uint()
    81  	mode := args[2].ModeT()
    82  	return openat(t, linux.AT_FDCWD, addr, flags, mode)
    83  }
    84  
    85  // Openat implements Linux syscall openat(2).
    86  func Openat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    87  	dirfd := args[0].Int()
    88  	addr := args[1].Pointer()
    89  	flags := args[2].Uint()
    90  	mode := args[3].ModeT()
    91  	return openat(t, dirfd, addr, flags, mode)
    92  }
    93  
    94  // Creat implements Linux syscall creat(2).
    95  func Creat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    96  	addr := args[0].Pointer()
    97  	mode := args[1].ModeT()
    98  	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
    99  }
   100  
   101  func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
   102  	path, err := copyInPath(t, pathAddr)
   103  	if err != nil {
   104  		return 0, nil, err
   105  	}
   106  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
   107  	if err != nil {
   108  		return 0, nil, err
   109  	}
   110  	defer tpop.Release(t)
   111  
   112  	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
   113  		Flags: flags | linux.O_LARGEFILE,
   114  		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
   115  	})
   116  	if err != nil {
   117  		return 0, nil, err
   118  	}
   119  	defer file.DecRef(t)
   120  
   121  	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
   122  		CloseOnExec: flags&linux.O_CLOEXEC != 0,
   123  	})
   124  	return uintptr(fd), nil, err
   125  }
   126  
   127  // Access implements Linux syscall access(2).
   128  func Access(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   129  	addr := args[0].Pointer()
   130  	mode := args[1].ModeT()
   131  
   132  	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode, 0 /* flags */)
   133  }
   134  
   135  // Faccessat implements Linux syscall faccessat(2).
   136  func Faccessat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   137  	dirfd := args[0].Int()
   138  	addr := args[1].Pointer()
   139  	mode := args[2].ModeT()
   140  
   141  	return 0, nil, accessAt(t, dirfd, addr, mode, 0 /* flags */)
   142  }
   143  
   144  // Faccessat2 implements Linux syscall faccessat2(2).
   145  func Faccessat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   146  	dirfd := args[0].Int()
   147  	addr := args[1].Pointer()
   148  	mode := args[2].ModeT()
   149  	flags := args[3].Int()
   150  
   151  	return 0, nil, accessAt(t, dirfd, addr, mode, flags)
   152  }
   153  
   154  func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint, flags int32) error {
   155  	const rOK = 4
   156  	const wOK = 2
   157  	const xOK = 1
   158  
   159  	// Sanity check the mode.
   160  	if mode&^(rOK|wOK|xOK) != 0 {
   161  		return linuxerr.EINVAL
   162  	}
   163  
   164  	// faccessat2(2) isn't documented as supporting AT_EMPTY_PATH, but it does.
   165  	if flags&^(linux.AT_EACCESS|linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH) != 0 {
   166  		return linuxerr.EINVAL
   167  	}
   168  
   169  	path, err := copyInPath(t, pathAddr)
   170  	if err != nil {
   171  		return err
   172  	}
   173  	tpop, err := getTaskPathOperation(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0))
   174  	if err != nil {
   175  		return err
   176  	}
   177  	defer tpop.Release(t)
   178  
   179  	creds := t.Credentials()
   180  	if flags&linux.AT_EACCESS == 0 {
   181  		// access(2) and faccessat(2) check permissions using real
   182  		// UID/GID, not effective UID/GID.
   183  		//
   184  		// "access() needs to use the real uid/gid, not the effective
   185  		// uid/gid. We do this by temporarily clearing all FS-related
   186  		// capabilities and switching the fsuid/fsgid around to the
   187  		// real ones." -fs/open.c:faccessat
   188  		creds = creds.Fork()
   189  		creds.EffectiveKUID = creds.RealKUID
   190  		creds.EffectiveKGID = creds.RealKGID
   191  		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
   192  			creds.EffectiveCaps = creds.PermittedCaps
   193  		} else {
   194  			creds.EffectiveCaps = 0
   195  		}
   196  	}
   197  
   198  	return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop)
   199  }
   200  
   201  // Ioctl implements Linux syscall ioctl(2).
   202  func Ioctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   203  	fd := args[0].Int()
   204  
   205  	file := t.GetFile(fd)
   206  	if file == nil {
   207  		return 0, nil, linuxerr.EBADF
   208  	}
   209  	defer file.DecRef(t)
   210  
   211  	if file.StatusFlags()&linux.O_PATH != 0 {
   212  		return 0, nil, linuxerr.EBADF
   213  	}
   214  
   215  	// Handle ioctls that apply to all FDs.
   216  	switch args[1].Int() {
   217  	case linux.FIONCLEX:
   218  		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   219  			CloseOnExec: false,
   220  		})
   221  		return 0, nil, nil
   222  
   223  	case linux.FIOCLEX:
   224  		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   225  			CloseOnExec: true,
   226  		})
   227  		return 0, nil, nil
   228  
   229  	case linux.FIONBIO:
   230  		var set int32
   231  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   232  			return 0, nil, err
   233  		}
   234  		flags := file.StatusFlags()
   235  		if set != 0 {
   236  			flags |= linux.O_NONBLOCK
   237  		} else {
   238  			flags &^= linux.O_NONBLOCK
   239  		}
   240  		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
   241  
   242  	case linux.FIOASYNC:
   243  		var set int32
   244  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   245  			return 0, nil, err
   246  		}
   247  		flags := file.StatusFlags()
   248  		if set != 0 {
   249  			flags |= linux.O_ASYNC
   250  		} else {
   251  			flags &^= linux.O_ASYNC
   252  		}
   253  		file.SetStatusFlags(t, t.Credentials(), flags)
   254  		return 0, nil, nil
   255  
   256  	case linux.FIOGETOWN, linux.SIOCGPGRP:
   257  		var who int32
   258  		owner, hasOwner := getAsyncOwner(t, file)
   259  		if hasOwner {
   260  			if owner.Type == linux.F_OWNER_PGRP {
   261  				who = -owner.PID
   262  			} else {
   263  				who = owner.PID
   264  			}
   265  		}
   266  		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), who)
   267  		return 0, nil, err
   268  
   269  	case linux.FIOSETOWN, linux.SIOCSPGRP:
   270  		var who int32
   271  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil {
   272  			return 0, nil, err
   273  		}
   274  		ownerType := int32(linux.F_OWNER_PID)
   275  		if who < 0 {
   276  			// Check for overflow before flipping the sign.
   277  			if who-1 > who {
   278  				return 0, nil, linuxerr.EINVAL
   279  			}
   280  			ownerType = linux.F_OWNER_PGRP
   281  			who = -who
   282  		}
   283  		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
   284  	}
   285  
   286  	ret, err := file.Ioctl(t, t.MemoryManager(), sysno, args)
   287  	return ret, nil, err
   288  }
   289  
   290  // Getcwd implements Linux syscall getcwd(2).
   291  func Getcwd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   292  	addr := args[0].Pointer()
   293  	size := args[1].SizeT()
   294  
   295  	root := t.FSContext().RootDirectory()
   296  	wd := t.FSContext().WorkingDirectory()
   297  	s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
   298  	root.DecRef(t)
   299  	wd.DecRef(t)
   300  	if err != nil {
   301  		return 0, nil, err
   302  	}
   303  
   304  	// Note this is >= because we need a terminator.
   305  	if uint(len(s)) >= size {
   306  		return 0, nil, linuxerr.ERANGE
   307  	}
   308  
   309  	// Construct a byte slice containing a NUL terminator.
   310  	buf := t.CopyScratchBuffer(len(s) + 1)
   311  	copy(buf, s)
   312  	buf[len(buf)-1] = 0
   313  
   314  	// Write the pathname slice.
   315  	n, err := t.CopyOutBytes(addr, buf)
   316  	if err != nil {
   317  		return 0, nil, err
   318  	}
   319  	return uintptr(n), nil, nil
   320  }
   321  
   322  // Chdir implements Linux syscall chdir(2).
   323  func Chdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   324  	addr := args[0].Pointer()
   325  
   326  	path, err := copyInPath(t, addr)
   327  	if err != nil {
   328  		return 0, nil, err
   329  	}
   330  	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
   331  	if err != nil {
   332  		return 0, nil, err
   333  	}
   334  	defer tpop.Release(t)
   335  
   336  	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
   337  		CheckSearchable: true,
   338  	})
   339  	if err != nil {
   340  		return 0, nil, err
   341  	}
   342  	t.FSContext().SetWorkingDirectory(t, vd)
   343  	vd.DecRef(t)
   344  	return 0, nil, nil
   345  }
   346  
   347  // Fchdir implements Linux syscall fchdir(2).
   348  func Fchdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   349  	fd := args[0].Int()
   350  
   351  	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
   352  	if err != nil {
   353  		return 0, nil, err
   354  	}
   355  	defer tpop.Release(t)
   356  
   357  	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
   358  		CheckSearchable: true,
   359  	})
   360  	if err != nil {
   361  		return 0, nil, err
   362  	}
   363  	t.FSContext().SetWorkingDirectory(t, vd)
   364  	vd.DecRef(t)
   365  	return 0, nil, nil
   366  }
   367  
   368  // Chroot implements Linux syscall chroot(2).
   369  func Chroot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   370  	addr := args[0].Pointer()
   371  
   372  	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
   373  		return 0, nil, linuxerr.EPERM
   374  	}
   375  
   376  	path, err := copyInPath(t, addr)
   377  	if err != nil {
   378  		return 0, nil, err
   379  	}
   380  	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
   381  	if err != nil {
   382  		return 0, nil, err
   383  	}
   384  	defer tpop.Release(t)
   385  
   386  	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
   387  		CheckSearchable: true,
   388  	})
   389  	if err != nil {
   390  		return 0, nil, err
   391  	}
   392  	t.FSContext().SetRootDirectory(t, vd)
   393  	vd.DecRef(t)
   394  	return 0, nil, nil
   395  }
   396  
   397  // PivotRoot implements Linux syscall pivot_root(2).
   398  func PivotRoot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   399  	addr1 := args[0].Pointer()
   400  	addr2 := args[1].Pointer()
   401  
   402  	if !t.HasCapability(linux.CAP_SYS_ADMIN) {
   403  		return 0, nil, linuxerr.EPERM
   404  	}
   405  
   406  	newRootPath, err := copyInPath(t, addr1)
   407  	if err != nil {
   408  		return 0, nil, err
   409  	}
   410  	newRootTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, newRootPath, disallowEmptyPath, followFinalSymlink)
   411  	if err != nil {
   412  		return 0, nil, err
   413  	}
   414  	defer newRootTpop.Release(t)
   415  	putOldPath, err := copyInPath(t, addr2)
   416  	if err != nil {
   417  		return 0, nil, err
   418  	}
   419  	putOldTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, putOldPath, disallowEmptyPath, followFinalSymlink)
   420  	if err != nil {
   421  		return 0, nil, err
   422  	}
   423  	defer putOldTpop.Release(t)
   424  
   425  	oldRootVd := t.FSContext().RootDirectory()
   426  	defer oldRootVd.DecRef(t)
   427  	newRootVd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &newRootTpop.pop, &vfs.GetDentryOptions{
   428  		CheckSearchable: true,
   429  	})
   430  	if err != nil {
   431  		return 0, nil, err
   432  	}
   433  	defer newRootVd.DecRef(t)
   434  
   435  	if err := t.Kernel().VFS().PivotRoot(t, t.Credentials(), &newRootTpop.pop, &putOldTpop.pop); err != nil {
   436  		return 0, nil, err
   437  	}
   438  	t.Kernel().ReplaceFSContextRoots(t, oldRootVd, newRootVd)
   439  	return 0, nil, nil
   440  }
   441  
   442  // Close implements Linux syscall close(2).
   443  func Close(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   444  	fd := args[0].Int()
   445  
   446  	// Note that Remove provides a reference on the file that we may use to
   447  	// flush. It is still active until we drop the final reference below
   448  	// (and other reference-holding operations complete).
   449  	file := t.FDTable().Remove(t, fd)
   450  	if file == nil {
   451  		return 0, nil, linuxerr.EBADF
   452  	}
   453  	defer file.DecRef(t)
   454  
   455  	err := file.OnClose(t)
   456  	return 0, nil, HandleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file)
   457  }
   458  
   459  // CloseRange implements linux syscall close_range(2).
   460  func CloseRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   461  	first := args[0].Uint()
   462  	last := args[1].Uint()
   463  	flags := args[2].Uint()
   464  
   465  	if (first > last) || (last > math.MaxInt32) {
   466  		return 0, nil, linuxerr.EINVAL
   467  	}
   468  
   469  	if (flags & ^(linux.CLOSE_RANGE_CLOEXEC | linux.CLOSE_RANGE_UNSHARE)) != 0 {
   470  		return 0, nil, linuxerr.EINVAL
   471  	}
   472  
   473  	cloexec := flags & linux.CLOSE_RANGE_CLOEXEC
   474  	unshare := flags & linux.CLOSE_RANGE_UNSHARE
   475  
   476  	if unshare != 0 {
   477  		// If possible, we don't want to copy FDs to the new unshared table, because those FDs will
   478  		// be promptly closed and no longer used. So in the case where we know the range extends all
   479  		// the way to the end of the FdTable, we can simply copy the FdTable only up to the start of
   480  		// the range that we are closing.
   481  		if cloexec == 0 && int32(last) >= t.FDTable().GetLastFd() {
   482  			t.UnshareFdTable(int32(first))
   483  		} else {
   484  			t.UnshareFdTable(math.MaxInt32)
   485  		}
   486  	}
   487  
   488  	if cloexec != 0 {
   489  		flagToApply := kernel.FDFlags{
   490  			CloseOnExec: true,
   491  		}
   492  		t.FDTable().SetFlagsForRange(t.AsyncContext(), int32(first), int32(last), flagToApply)
   493  		return 0, nil, nil
   494  	}
   495  
   496  	fdTable := t.FDTable()
   497  	fd := int32(first)
   498  	for {
   499  		fd, file := fdTable.RemoveNextInRange(t, fd, int32(last))
   500  		if file == nil {
   501  			break
   502  		}
   503  
   504  		fd++
   505  		// Per the close_range(2) documentation, errors upon closing file descriptors are ignored.
   506  		_ = file.OnClose(t)
   507  		file.DecRef(t)
   508  	}
   509  
   510  	return 0, nil, nil
   511  }
   512  
   513  // Dup implements Linux syscall dup(2).
   514  func Dup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   515  	fd := args[0].Int()
   516  
   517  	file := t.GetFile(fd)
   518  	if file == nil {
   519  		return 0, nil, linuxerr.EBADF
   520  	}
   521  	defer file.DecRef(t)
   522  
   523  	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
   524  	if err != nil {
   525  		return 0, nil, linuxerr.EMFILE
   526  	}
   527  	return uintptr(newFD), nil, nil
   528  }
   529  
   530  // Dup2 implements Linux syscall dup2(2).
   531  func Dup2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   532  	oldfd := args[0].Int()
   533  	newfd := args[1].Int()
   534  
   535  	if oldfd == newfd {
   536  		// As long as oldfd is valid, dup2() does nothing and returns newfd.
   537  		file := t.GetFile(oldfd)
   538  		if file == nil {
   539  			return 0, nil, linuxerr.EBADF
   540  		}
   541  		file.DecRef(t)
   542  		return uintptr(newfd), nil, nil
   543  	}
   544  
   545  	return dup3(t, oldfd, newfd, 0)
   546  }
   547  
   548  // Dup3 implements Linux syscall dup3(2).
   549  func Dup3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   550  	oldfd := args[0].Int()
   551  	newfd := args[1].Int()
   552  	flags := args[2].Uint()
   553  
   554  	if oldfd == newfd {
   555  		return 0, nil, linuxerr.EINVAL
   556  	}
   557  
   558  	return dup3(t, oldfd, newfd, flags)
   559  }
   560  
   561  func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
   562  	if flags&^linux.O_CLOEXEC != 0 {
   563  		return 0, nil, linuxerr.EINVAL
   564  	}
   565  
   566  	file := t.GetFile(oldfd)
   567  	if file == nil {
   568  		return 0, nil, linuxerr.EBADF
   569  	}
   570  	defer file.DecRef(t)
   571  
   572  	err := t.NewFDAt(newfd, file, kernel.FDFlags{
   573  		CloseOnExec: flags&linux.O_CLOEXEC != 0,
   574  	})
   575  	if err != nil {
   576  		return 0, nil, err
   577  	}
   578  	return uintptr(newfd), nil, nil
   579  }
   580  
// Fcntl implements linux syscall fcntl(2).
//
// The syscall arguments are (fd, cmd, cmd-specific argument). EBADF is
// returned if fd is not open. For FDs opened with O_PATH only the
// duplication, FD-flag and F_GETFL commands are permitted; every other
// command yields EBADF. Commands not listed in the switch below yield EINVAL.
func Fcntl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	cmd := args[1].Int()

	// flags are the per-FD flags, as opposed to the file's status flags.
	file, flags := t.FDTable().Get(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	if file.StatusFlags()&linux.O_PATH != 0 {
		switch cmd {
		case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL:
			// allowed
		default:
			return 0, nil, linuxerr.EBADF
		}
	}

	switch cmd {
	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
		// The argument is the lowest FD number to consider. Note that fd is
		// shadowed by the newly allocated FD inside this case.
		minfd := args[2].Int()
		fd, err := t.NewFDFrom(minfd, file, kernel.FDFlags{
			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
		})
		if err != nil {
			return 0, nil, err
		}
		return uintptr(fd), nil, nil
	case linux.F_GETFD:
		return uintptr(flags.ToLinuxFDFlags()), nil, nil
	case linux.F_SETFD:
		// Shadows the FD flags fetched above with the requested flags; only
		// the FD_CLOEXEC bit is looked at.
		flags := args[2].Uint()
		err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
		})
		return 0, nil, err
	case linux.F_GETFL:
		return uintptr(file.StatusFlags()), nil, nil
	case linux.F_SETFL:
		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
	case linux.F_GETOWN:
		// Returns 0 when there is no owner, and the negated ID when the
		// owner is a process group.
		owner, hasOwner := getAsyncOwner(t, file)
		if !hasOwner {
			return 0, nil, nil
		}
		if owner.Type == linux.F_OWNER_PGRP {
			return uintptr(-owner.PID), nil, nil
		}
		return uintptr(owner.PID), nil, nil
	case linux.F_SETOWN:
		// A negative argument names a process group; otherwise the owner
		// type is F_OWNER_PID.
		who := args[2].Int()
		ownerType := int32(linux.F_OWNER_PID)
		if who < 0 {
			// Check for overflow before flipping the sign. who-1 wraps
			// around, and so compares greater than who, only for the
			// minimum int32, whose negation is not representable.
			if who-1 > who {
				return 0, nil, linuxerr.EINVAL
			}
			ownerType = linux.F_OWNER_PGRP
			who = -who
		}
		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
	case linux.F_GETOWN_EX:
		// When there is no owner, nothing is written to the user's struct.
		owner, hasOwner := getAsyncOwner(t, file)
		if !hasOwner {
			return 0, nil, nil
		}
		_, err := owner.CopyOut(t, args[2].Pointer())
		return 0, nil, err
	case linux.F_SETOWN_EX:
		// The owner type from userspace is validated by setAsyncOwner.
		var owner linux.FOwnerEx
		_, err := owner.CopyIn(t, args[2].Pointer())
		if err != nil {
			return 0, nil, err
		}
		return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID)
	case linux.F_SETPIPE_SZ:
		// Only FDs backed by *pipe.VFSPipeFD are accepted.
		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
		if !ok {
			return 0, nil, linuxerr.EBADF
		}
		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
		if err != nil {
			return 0, nil, err
		}
		return uintptr(n), nil, nil
	case linux.F_GETPIPE_SZ:
		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
		if !ok {
			return 0, nil, linuxerr.EBADF
		}
		return uintptr(pipefile.PipeSize()), nil, nil
	case linux.F_GET_SEALS:
		val, err := tmpfs.GetSeals(file)
		return uintptr(val), nil, err
	case linux.F_ADD_SEALS:
		// Seals can only be added through a writable FD.
		if !file.IsWritable() {
			return 0, nil, linuxerr.EPERM
		}
		err := tmpfs.AddSeals(file, args[2].Uint())
		return 0, nil, err
	case linux.F_SETLK:
		return 0, nil, posixLock(t, args, file, false /* ofd */, false /* block */)
	case linux.F_SETLKW:
		return 0, nil, posixLock(t, args, file, false /* ofd */, true /* block */)
	case linux.F_GETLK:
		return 0, nil, posixTestLock(t, args, file, false /* ofd */)
	case linux.F_OFD_SETLK:
		return 0, nil, posixLock(t, args, file, true /* ofd */, false /* block */)
	case linux.F_OFD_SETLKW:
		return 0, nil, posixLock(t, args, file, true /* ofd */, true /* block */)
	case linux.F_OFD_GETLK:
		return 0, nil, posixTestLock(t, args, file, true /* ofd */)
	case linux.F_GETSIG:
		a := file.AsyncHandler()
		if a == nil {
			// Default behavior aka SIGIO.
			return 0, nil, nil
		}
		return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil
	case linux.F_SETSIG:
		// NOTE(review): SetAsyncHandler presumably returns the already
		// installed handler when one exists; confirm against its
		// definition.
		a, err := file.SetAsyncHandler(fasync.New(int(fd)))
		if err != nil {
			return 0, nil, err
		}
		async := a.(*fasync.FileAsync)
		return 0, nil, async.SetSignal(linux.Signal(args[2].Int()))
	default:
		// Everything else is not yet supported.
		return 0, nil, linuxerr.EINVAL
	}
}
   714  
   715  func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
   716  	a := fd.AsyncHandler()
   717  	if a == nil {
   718  		return linux.FOwnerEx{}, false
   719  	}
   720  
   721  	ot, otg, opg := a.(*fasync.FileAsync).Owner()
   722  	switch {
   723  	case ot != nil:
   724  		return linux.FOwnerEx{
   725  			Type: linux.F_OWNER_TID,
   726  			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
   727  		}, true
   728  	case otg != nil:
   729  		return linux.FOwnerEx{
   730  			Type: linux.F_OWNER_PID,
   731  			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
   732  		}, true
   733  	case opg != nil:
   734  		return linux.FOwnerEx{
   735  			Type: linux.F_OWNER_PGRP,
   736  			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
   737  		}, true
   738  	default:
   739  		return linux.FOwnerEx{}, true
   740  	}
   741  }
   742  
   743  func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error {
   744  	switch ownerType {
   745  	case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP:
   746  		// Acceptable type.
   747  	default:
   748  		return linuxerr.EINVAL
   749  	}
   750  
   751  	a, err := file.SetAsyncHandler(fasync.New(fd))
   752  	if err != nil {
   753  		return err
   754  	}
   755  	async := a.(*fasync.FileAsync)
   756  	if pid == 0 {
   757  		async.ClearOwner()
   758  		return nil
   759  	}
   760  
   761  	switch ownerType {
   762  	case linux.F_OWNER_TID:
   763  		task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid))
   764  		if task == nil {
   765  			return linuxerr.ESRCH
   766  		}
   767  		async.SetOwnerTask(t, task)
   768  		return nil
   769  	case linux.F_OWNER_PID:
   770  		tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid))
   771  		if tg == nil {
   772  			return linuxerr.ESRCH
   773  		}
   774  		async.SetOwnerThreadGroup(t, tg)
   775  		return nil
   776  	case linux.F_OWNER_PGRP:
   777  		pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid))
   778  		if pg == nil {
   779  			return linuxerr.ESRCH
   780  		}
   781  		async.SetOwnerProcessGroup(t, pg)
   782  		return nil
   783  	default:
   784  		return linuxerr.EINVAL
   785  	}
   786  }
   787  
   788  func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool) error {
   789  	// Copy in the lock request.
   790  	flockAddr := args[2].Pointer()
   791  	var flock linux.Flock
   792  	if _, err := flock.CopyIn(t, flockAddr); err != nil {
   793  		return err
   794  	}
   795  	var typ lock.LockType
   796  	switch flock.Type {
   797  	case linux.F_RDLCK:
   798  		typ = lock.ReadLock
   799  	case linux.F_WRLCK:
   800  		typ = lock.WriteLock
   801  	default:
   802  		return linuxerr.EINVAL
   803  	}
   804  	r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence)
   805  	if err != nil {
   806  		return err
   807  	}
   808  	uid := lock.UniqueID(t.FDTable())
   809  	if ofd {
   810  		uid = lock.UniqueID(file)
   811  	}
   812  
   813  	newFlock, err := file.TestPOSIX(t, uid, typ, r)
   814  	if err != nil {
   815  		return err
   816  	}
   817  	if !ofd {
   818  		newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID)
   819  	}
   820  	if _, err = newFlock.CopyOut(t, flockAddr); err != nil {
   821  		return err
   822  	}
   823  	return nil
   824  }
   825  
   826  // translatePID translates a pid from one namespace to another. Note that this
   827  // may race with task termination/creation, in which case the original task
   828  // corresponding to pid may no longer exist. This is used to implement the
   829  // F_GETLK fcntl, which has the same potential race in Linux as well (i.e.,
   830  // there is no synchronization between retrieving the lock PID and translating
   831  // it). See fs/locks.c:posix_lock_to_flock.
   832  func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 {
   833  	return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid))))
   834  }
   835  
   836  func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool, block bool) error {
   837  	// Copy in the lock request.
   838  	flockAddr := args[2].Pointer()
   839  	var flock linux.Flock
   840  	if _, err := flock.CopyIn(t, flockAddr); err != nil {
   841  		return err
   842  	}
   843  	if ofd && flock.PID != 0 {
   844  		return linuxerr.EINVAL
   845  	}
   846  
   847  	uid := lock.UniqueID(t.FDTable())
   848  	pid := int32(t.TGIDInRoot())
   849  	if ofd {
   850  		uid = lock.UniqueID(file)
   851  		pid = -1
   852  	}
   853  
   854  	r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence)
   855  	if err != nil {
   856  		return err
   857  	}
   858  
   859  	switch flock.Type {
   860  	case linux.F_RDLCK:
   861  		if !file.IsReadable() {
   862  			return linuxerr.EBADF
   863  		}
   864  		return file.LockPOSIX(t, uid, pid, lock.ReadLock, r, block)
   865  
   866  	case linux.F_WRLCK:
   867  		if !file.IsWritable() {
   868  			return linuxerr.EBADF
   869  		}
   870  		return file.LockPOSIX(t, uid, pid, lock.WriteLock, r, block)
   871  
   872  	case linux.F_UNLCK:
   873  		return file.UnlockPOSIX(t, uid, r)
   874  
   875  	default:
   876  		return linuxerr.EINVAL
   877  	}
   878  }
   879  
   880  // Fadvise64 implements fadvise64(2).
   881  // This implementation currently ignores the provided advice.
   882  func Fadvise64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   883  	fd := args[0].Int()
   884  	length := args[2].Int64()
   885  	advice := args[3].Int()
   886  
   887  	// Note: offset is allowed to be negative.
   888  	if length < 0 {
   889  		return 0, nil, linuxerr.EINVAL
   890  	}
   891  
   892  	file := t.GetFile(fd)
   893  	if file == nil {
   894  		return 0, nil, linuxerr.EBADF
   895  	}
   896  	defer file.DecRef(t)
   897  
   898  	if file.StatusFlags()&linux.O_PATH != 0 {
   899  		return 0, nil, linuxerr.EBADF
   900  	}
   901  
   902  	// If the FD refers to a pipe or FIFO, return error.
   903  	if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe {
   904  		return 0, nil, linuxerr.ESPIPE
   905  	}
   906  
   907  	switch advice {
   908  	case linux.POSIX_FADV_NORMAL:
   909  	case linux.POSIX_FADV_RANDOM:
   910  	case linux.POSIX_FADV_SEQUENTIAL:
   911  	case linux.POSIX_FADV_WILLNEED:
   912  	case linux.POSIX_FADV_DONTNEED:
   913  	case linux.POSIX_FADV_NOREUSE:
   914  	default:
   915  		return 0, nil, linuxerr.EINVAL
   916  	}
   917  
   918  	// Sure, whatever.
   919  	return 0, nil, nil
   920  }
   921  
   922  // Mkdir implements Linux syscall mkdir(2).
   923  func Mkdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   924  	addr := args[0].Pointer()
   925  	mode := args[1].ModeT()
   926  	return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
   927  }
   928  
   929  // Mkdirat implements Linux syscall mkdirat(2).
   930  func Mkdirat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   931  	dirfd := args[0].Int()
   932  	addr := args[1].Pointer()
   933  	mode := args[2].ModeT()
   934  	return 0, nil, mkdirat(t, dirfd, addr, mode)
   935  }
   936  
   937  func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error {
   938  	path, err := copyInPath(t, addr)
   939  	if err != nil {
   940  		return err
   941  	}
   942  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
   943  	if err != nil {
   944  		return err
   945  	}
   946  	defer tpop.Release(t)
   947  	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
   948  		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
   949  	})
   950  }
   951  
   952  // Rmdir implements Linux syscall rmdir(2).
   953  func Rmdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   954  	pathAddr := args[0].Pointer()
   955  	return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
   956  }
   957  
   958  func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error {
   959  	path, err := copyInPath(t, pathAddr)
   960  	if err != nil {
   961  		return err
   962  	}
   963  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
   964  	if err != nil {
   965  		return err
   966  	}
   967  	defer tpop.Release(t)
   968  	return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
   969  }
   970  
   971  // Symlink implements Linux syscall symlink(2).
   972  func Symlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   973  	targetAddr := args[0].Pointer()
   974  	linkpathAddr := args[1].Pointer()
   975  	return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
   976  }
   977  
   978  // Symlinkat implements Linux syscall symlinkat(2).
   979  func Symlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   980  	targetAddr := args[0].Pointer()
   981  	newdirfd := args[1].Int()
   982  	linkpathAddr := args[2].Pointer()
   983  	return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
   984  }
   985  
   986  func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error {
   987  	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
   988  	if err != nil {
   989  		return err
   990  	}
   991  	if len(target) == 0 {
   992  		return linuxerr.ENOENT
   993  	}
   994  	linkpath, err := copyInPath(t, linkpathAddr)
   995  	if err != nil {
   996  		return err
   997  	}
   998  	tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
   999  	if err != nil {
  1000  		return err
  1001  	}
  1002  	defer tpop.Release(t)
  1003  	return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
  1004  }
  1005  
  1006  // Link implements Linux syscall link(2).
  1007  func Link(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1008  	oldpathAddr := args[0].Pointer()
  1009  	newpathAddr := args[1].Pointer()
  1010  	return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
  1011  }
  1012  
  1013  // Linkat implements Linux syscall linkat(2).
  1014  func Linkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1015  	olddirfd := args[0].Int()
  1016  	oldpathAddr := args[1].Pointer()
  1017  	newdirfd := args[2].Int()
  1018  	newpathAddr := args[3].Pointer()
  1019  	flags := args[4].Int()
  1020  	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
  1021  }
  1022  
  1023  func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error {
  1024  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
  1025  		return linuxerr.EINVAL
  1026  	}
  1027  	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
  1028  		return linuxerr.ENOENT
  1029  	}
  1030  
  1031  	oldpath, err := copyInPath(t, oldpathAddr)
  1032  	if err != nil {
  1033  		return err
  1034  	}
  1035  	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
  1036  	if err != nil {
  1037  		return err
  1038  	}
  1039  	defer oldtpop.Release(t)
  1040  
  1041  	newpath, err := copyInPath(t, newpathAddr)
  1042  	if err != nil {
  1043  		return err
  1044  	}
  1045  	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
  1046  	if err != nil {
  1047  		return err
  1048  	}
  1049  	defer newtpop.Release(t)
  1050  
  1051  	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
  1052  }
  1053  
  1054  // Readlinkat implements Linux syscall readlinkat(2).
  1055  func Readlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1056  	dirfd := args[0].Int()
  1057  	pathAddr := args[1].Pointer()
  1058  	bufAddr := args[2].Pointer()
  1059  	size := args[3].SizeT()
  1060  	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
  1061  }
  1062  
  1063  // Readlink implements Linux syscall readlink(2).
  1064  func Readlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1065  	pathAddr := args[0].Pointer()
  1066  	bufAddr := args[1].Pointer()
  1067  	size := args[2].SizeT()
  1068  	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
  1069  }
  1070  
  1071  func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
  1072  	if int(size) <= 0 {
  1073  		return 0, nil, linuxerr.EINVAL
  1074  	}
  1075  
  1076  	path, err := copyInPath(t, pathAddr)
  1077  	if err != nil {
  1078  		return 0, nil, err
  1079  	}
  1080  	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
  1081  	// call operates on the symbolic link referred to by dirfd ..." -
  1082  	// readlinkat(2)
  1083  	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
  1084  	if err != nil {
  1085  		return 0, nil, err
  1086  	}
  1087  	defer tpop.Release(t)
  1088  
  1089  	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
  1090  	if err != nil {
  1091  		return 0, nil, err
  1092  	}
  1093  
  1094  	if len(target) > int(size) {
  1095  		target = target[:size]
  1096  	}
  1097  	n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
  1098  	if n == 0 {
  1099  		return 0, nil, err
  1100  	}
  1101  	return uintptr(n), nil, nil
  1102  }
  1103  
  1104  // Unlink implements Linux syscall unlink(2).
  1105  func Unlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1106  	pathAddr := args[0].Pointer()
  1107  	return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
  1108  }
  1109  
  1110  func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error {
  1111  	path, err := copyInPath(t, pathAddr)
  1112  	if err != nil {
  1113  		return err
  1114  	}
  1115  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
  1116  	if err != nil {
  1117  		return err
  1118  	}
  1119  	defer tpop.Release(t)
  1120  	return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
  1121  }
  1122  
  1123  // Unlinkat implements Linux syscall unlinkat(2).
  1124  func Unlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1125  	dirfd := args[0].Int()
  1126  	pathAddr := args[1].Pointer()
  1127  	flags := args[2].Int()
  1128  
  1129  	if flags&^linux.AT_REMOVEDIR != 0 {
  1130  		return 0, nil, linuxerr.EINVAL
  1131  	}
  1132  
  1133  	if flags&linux.AT_REMOVEDIR != 0 {
  1134  		return 0, nil, rmdirat(t, dirfd, pathAddr)
  1135  	}
  1136  	return 0, nil, unlinkat(t, dirfd, pathAddr)
  1137  }
  1138  
  1139  func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
  1140  	root := t.FSContext().RootDirectory()
  1141  	defer root.DecRef(t)
  1142  	start := root
  1143  	if !path.Absolute {
  1144  		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
  1145  			return linuxerr.ENOENT
  1146  		}
  1147  		if dirfd == linux.AT_FDCWD {
  1148  			start = t.FSContext().WorkingDirectory()
  1149  			defer start.DecRef(t)
  1150  		} else {
  1151  			dirfile := t.GetFile(dirfd)
  1152  			if dirfile == nil {
  1153  				return linuxerr.EBADF
  1154  			}
  1155  			if !path.HasComponents() {
  1156  				// Use FileDescription.SetStat() instead of
  1157  				// VirtualFilesystem.SetStatAt(), since the former may be able
  1158  				// to use opened file state to expedite the SetStat.
  1159  				err := dirfile.SetStat(t, *opts)
  1160  				dirfile.DecRef(t)
  1161  				return err
  1162  			}
  1163  			start = dirfile.VirtualDentry()
  1164  			start.IncRef()
  1165  			defer start.DecRef(t)
  1166  			dirfile.DecRef(t)
  1167  		}
  1168  	}
  1169  	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
  1170  		Root:               root,
  1171  		Start:              start,
  1172  		Path:               path,
  1173  		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
  1174  	}, opts)
  1175  }
  1176  
  1177  func handleSetSizeError(t *kernel.Task, err error) error {
  1178  	if err == linuxerr.ErrExceedsFileSizeLimit {
  1179  		// Convert error to EFBIG and send a SIGXFSZ per setrlimit(2).
  1180  		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
  1181  		return linuxerr.EFBIG
  1182  	}
  1183  	return err
  1184  }
  1185  
  1186  // Truncate implements Linux syscall truncate(2).
  1187  func Truncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1188  	addr := args[0].Pointer()
  1189  	length := args[1].Int64()
  1190  
  1191  	if length < 0 {
  1192  		return 0, nil, linuxerr.EINVAL
  1193  	}
  1194  
  1195  	path, err := copyInPath(t, addr)
  1196  	if err != nil {
  1197  		return 0, nil, err
  1198  	}
  1199  
  1200  	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
  1201  		Stat: linux.Statx{
  1202  			Mask: linux.STATX_SIZE,
  1203  			Size: uint64(length),
  1204  		},
  1205  		NeedWritePerm: true,
  1206  	})
  1207  	return 0, nil, handleSetSizeError(t, err)
  1208  }
  1209  
  1210  // Ftruncate implements Linux syscall ftruncate(2).
  1211  func Ftruncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1212  	fd := args[0].Int()
  1213  	length := args[1].Int64()
  1214  
  1215  	if length < 0 {
  1216  		return 0, nil, linuxerr.EINVAL
  1217  	}
  1218  
  1219  	file := t.GetFile(fd)
  1220  	if file == nil {
  1221  		return 0, nil, linuxerr.EBADF
  1222  	}
  1223  	defer file.DecRef(t)
  1224  
  1225  	if !file.IsWritable() {
  1226  		return 0, nil, linuxerr.EINVAL
  1227  	}
  1228  
  1229  	err := file.SetStat(t, vfs.SetStatOptions{
  1230  		Stat: linux.Statx{
  1231  			Mask: linux.STATX_SIZE,
  1232  			Size: uint64(length),
  1233  		},
  1234  	})
  1235  	return 0, nil, handleSetSizeError(t, err)
  1236  }
  1237  
  1238  // Umask implements linux syscall umask(2).
  1239  func Umask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1240  	mask := args[0].ModeT()
  1241  	mask = t.FSContext().SwapUmask(mask & 0777)
  1242  	return uintptr(mask), nil, nil
  1243  }
  1244  
  1245  // Chown implements Linux syscall chown(2).
  1246  func Chown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1247  	pathAddr := args[0].Pointer()
  1248  	owner := args[1].Int()
  1249  	group := args[2].Int()
  1250  	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
  1251  }
  1252  
  1253  // Lchown implements Linux syscall lchown(2).
  1254  func Lchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1255  	pathAddr := args[0].Pointer()
  1256  	owner := args[1].Int()
  1257  	group := args[2].Int()
  1258  	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
  1259  }
  1260  
  1261  // Fchownat implements Linux syscall fchownat(2).
  1262  func Fchownat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1263  	dirfd := args[0].Int()
  1264  	pathAddr := args[1].Pointer()
  1265  	owner := args[2].Int()
  1266  	group := args[3].Int()
  1267  	flags := args[4].Int()
  1268  	return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
  1269  }
  1270  
  1271  func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error {
  1272  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
  1273  		return linuxerr.EINVAL
  1274  	}
  1275  
  1276  	path, err := copyInPath(t, pathAddr)
  1277  	if err != nil {
  1278  		return err
  1279  	}
  1280  
  1281  	var opts vfs.SetStatOptions
  1282  	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
  1283  		return err
  1284  	}
  1285  
  1286  	return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
  1287  }
  1288  
  1289  func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
  1290  	userns := t.UserNamespace()
  1291  	if owner != -1 {
  1292  		kuid := userns.MapToKUID(auth.UID(owner))
  1293  		if !kuid.Ok() {
  1294  			return linuxerr.EINVAL
  1295  		}
  1296  		opts.Stat.Mask |= linux.STATX_UID
  1297  		opts.Stat.UID = uint32(kuid)
  1298  	}
  1299  	if group != -1 {
  1300  		kgid := userns.MapToKGID(auth.GID(group))
  1301  		if !kgid.Ok() {
  1302  			return linuxerr.EINVAL
  1303  		}
  1304  		opts.Stat.Mask |= linux.STATX_GID
  1305  		opts.Stat.GID = uint32(kgid)
  1306  	}
  1307  	return nil
  1308  }
  1309  
  1310  // Fchown implements Linux syscall fchown(2).
  1311  func Fchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1312  	fd := args[0].Int()
  1313  	owner := args[1].Int()
  1314  	group := args[2].Int()
  1315  
  1316  	file := t.GetFile(fd)
  1317  	if file == nil {
  1318  		return 0, nil, linuxerr.EBADF
  1319  	}
  1320  	defer file.DecRef(t)
  1321  
  1322  	var opts vfs.SetStatOptions
  1323  	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
  1324  		return 0, nil, err
  1325  	}
  1326  	return 0, nil, file.SetStat(t, opts)
  1327  }
  1328  
  1329  const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
  1330  
  1331  // Chmod implements Linux syscall chmod(2).
  1332  func Chmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1333  	pathAddr := args[0].Pointer()
  1334  	mode := args[1].ModeT()
  1335  	return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
  1336  }
  1337  
  1338  // Fchmodat implements Linux syscall fchmodat(2).
  1339  func Fchmodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1340  	dirfd := args[0].Int()
  1341  	pathAddr := args[1].Pointer()
  1342  	mode := args[2].ModeT()
  1343  	return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
  1344  }
  1345  
  1346  func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error {
  1347  	path, err := copyInPath(t, pathAddr)
  1348  	if err != nil {
  1349  		return err
  1350  	}
  1351  
  1352  	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
  1353  		Stat: linux.Statx{
  1354  			Mask: linux.STATX_MODE,
  1355  			Mode: uint16(mode & chmodMask),
  1356  		},
  1357  	})
  1358  }
  1359  
  1360  // Fchmod implements Linux syscall fchmod(2).
  1361  func Fchmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1362  	fd := args[0].Int()
  1363  	mode := args[1].ModeT()
  1364  
  1365  	file := t.GetFile(fd)
  1366  	if file == nil {
  1367  		return 0, nil, linuxerr.EBADF
  1368  	}
  1369  	defer file.DecRef(t)
  1370  
  1371  	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
  1372  		Stat: linux.Statx{
  1373  			Mask: linux.STATX_MODE,
  1374  			Mode: uint16(mode & chmodMask),
  1375  		},
  1376  	})
  1377  }
  1378  
  1379  // Utime implements Linux syscall utime(2).
  1380  func Utime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1381  	pathAddr := args[0].Pointer()
  1382  	timesAddr := args[1].Pointer()
  1383  
  1384  	path, err := copyInPath(t, pathAddr)
  1385  	if err != nil {
  1386  		return 0, nil, err
  1387  	}
  1388  
  1389  	opts := vfs.SetStatOptions{
  1390  		Stat: linux.Statx{
  1391  			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
  1392  		},
  1393  	}
  1394  	if timesAddr == 0 {
  1395  		opts.Stat.Atime.Nsec = linux.UTIME_NOW
  1396  		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
  1397  	} else {
  1398  		var times linux.Utime
  1399  		if _, err := times.CopyIn(t, timesAddr); err != nil {
  1400  			return 0, nil, err
  1401  		}
  1402  		opts.Stat.Atime.Sec = times.Actime
  1403  		opts.Stat.Mtime.Sec = times.Modtime
  1404  	}
  1405  
  1406  	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
  1407  }
  1408  
  1409  // Utimes implements Linux syscall utimes(2).
  1410  func Utimes(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1411  	pathAddr := args[0].Pointer()
  1412  	timesAddr := args[1].Pointer()
  1413  
  1414  	path, err := copyInPath(t, pathAddr)
  1415  	if err != nil {
  1416  		return 0, nil, err
  1417  	}
  1418  
  1419  	var opts vfs.SetStatOptions
  1420  	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
  1421  		return 0, nil, err
  1422  	}
  1423  
  1424  	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
  1425  }
  1426  
  1427  // Futimesat implements Linux syscall futimesat(2).
  1428  func Futimesat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1429  	dirfd := args[0].Int()
  1430  	pathAddr := args[1].Pointer()
  1431  	timesAddr := args[2].Pointer()
  1432  
  1433  	// "If filename is NULL and dfd refers to an open file, then operate on the
  1434  	// file. Otherwise look up filename, possibly using dfd as a starting
  1435  	// point." - fs/utimes.c
  1436  	var path fspath.Path
  1437  	shouldAllowEmptyPath := allowEmptyPath
  1438  	if dirfd == linux.AT_FDCWD || pathAddr != 0 {
  1439  		var err error
  1440  		path, err = copyInPath(t, pathAddr)
  1441  		if err != nil {
  1442  			return 0, nil, err
  1443  		}
  1444  		shouldAllowEmptyPath = disallowEmptyPath
  1445  	}
  1446  
  1447  	var opts vfs.SetStatOptions
  1448  	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
  1449  		return 0, nil, err
  1450  	}
  1451  
  1452  	return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts)
  1453  }
  1454  
  1455  func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error {
  1456  	if timesAddr == 0 {
  1457  		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
  1458  		opts.Stat.Atime.Nsec = linux.UTIME_NOW
  1459  		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
  1460  		return nil
  1461  	}
  1462  	var times [2]linux.Timeval
  1463  	if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
  1464  		return err
  1465  	}
  1466  	if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
  1467  		return linuxerr.EINVAL
  1468  	}
  1469  	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
  1470  	opts.Stat.Atime = linux.StatxTimestamp{
  1471  		Sec:  times[0].Sec,
  1472  		Nsec: uint32(times[0].Usec * 1000),
  1473  	}
  1474  	opts.Stat.Mtime = linux.StatxTimestamp{
  1475  		Sec:  times[1].Sec,
  1476  		Nsec: uint32(times[1].Usec * 1000),
  1477  	}
  1478  	return nil
  1479  }
  1480  
  1481  // Utimensat implements Linux syscall utimensat(2).
  1482  func Utimensat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1483  	dirfd := args[0].Int()
  1484  	pathAddr := args[1].Pointer()
  1485  	timesAddr := args[2].Pointer()
  1486  	flags := args[3].Int()
  1487  
  1488  	// Linux requires that the UTIME_OMIT check occur before checking path or
  1489  	// flags.
  1490  	var opts vfs.SetStatOptions
  1491  	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
  1492  		return 0, nil, err
  1493  	}
  1494  	if opts.Stat.Mask == 0 {
  1495  		return 0, nil, nil
  1496  	}
  1497  
  1498  	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
  1499  		return 0, nil, linuxerr.EINVAL
  1500  	}
  1501  
  1502  	// "If filename is NULL and dfd refers to an open file, then operate on the
  1503  	// file. Otherwise look up filename, possibly using dfd as a starting
  1504  	// point." - fs/utimes.c
  1505  	var path fspath.Path
  1506  	shouldAllowEmptyPath := allowEmptyPath
  1507  	if dirfd == linux.AT_FDCWD || pathAddr != 0 {
  1508  		var err error
  1509  		path, err = copyInPath(t, pathAddr)
  1510  		if err != nil {
  1511  			return 0, nil, err
  1512  		}
  1513  		shouldAllowEmptyPath = disallowEmptyPath
  1514  	}
  1515  
  1516  	return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
  1517  }
  1518  
  1519  func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error {
  1520  	if timesAddr == 0 {
  1521  		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
  1522  		opts.Stat.Atime.Nsec = linux.UTIME_NOW
  1523  		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
  1524  		return nil
  1525  	}
  1526  	var times [2]linux.Timespec
  1527  	if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
  1528  		return err
  1529  	}
  1530  	if times[0].Nsec != linux.UTIME_OMIT {
  1531  		if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) {
  1532  			return linuxerr.EINVAL
  1533  		}
  1534  		opts.Stat.Mask |= linux.STATX_ATIME
  1535  		opts.Stat.Atime = linux.StatxTimestamp{
  1536  			Sec:  times[0].Sec,
  1537  			Nsec: uint32(times[0].Nsec),
  1538  		}
  1539  	}
  1540  	if times[1].Nsec != linux.UTIME_OMIT {
  1541  		if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) {
  1542  			return linuxerr.EINVAL
  1543  		}
  1544  		opts.Stat.Mask |= linux.STATX_MTIME
  1545  		opts.Stat.Mtime = linux.StatxTimestamp{
  1546  			Sec:  times[1].Sec,
  1547  			Nsec: uint32(times[1].Nsec),
  1548  		}
  1549  	}
  1550  	return nil
  1551  }
  1552  
  1553  // Rename implements Linux syscall rename(2).
  1554  func Rename(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1555  	oldpathAddr := args[0].Pointer()
  1556  	newpathAddr := args[1].Pointer()
  1557  	return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
  1558  }
  1559  
  1560  // Renameat implements Linux syscall renameat(2).
  1561  func Renameat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1562  	olddirfd := args[0].Int()
  1563  	oldpathAddr := args[1].Pointer()
  1564  	newdirfd := args[2].Int()
  1565  	newpathAddr := args[3].Pointer()
  1566  	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
  1567  }
  1568  
  1569  // Renameat2 implements Linux syscall renameat2(2).
  1570  func Renameat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1571  	olddirfd := args[0].Int()
  1572  	oldpathAddr := args[1].Pointer()
  1573  	newdirfd := args[2].Int()
  1574  	newpathAddr := args[3].Pointer()
  1575  	flags := args[4].Uint()
  1576  	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
  1577  }
  1578  
  1579  func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error {
  1580  	oldpath, err := copyInPath(t, oldpathAddr)
  1581  	if err != nil {
  1582  		return err
  1583  	}
  1584  	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
  1585  	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
  1586  	if err != nil {
  1587  		return err
  1588  	}
  1589  	defer oldtpop.Release(t)
  1590  
  1591  	newpath, err := copyInPath(t, newpathAddr)
  1592  	if err != nil {
  1593  		return err
  1594  	}
  1595  	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
  1596  	if err != nil {
  1597  		return err
  1598  	}
  1599  	defer newtpop.Release(t)
  1600  
  1601  	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
  1602  		Flags: flags,
  1603  	})
  1604  }
  1605  
  1606  // Fallocate implements linux system call fallocate(2).
  1607  func Fallocate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1608  	fd := args[0].Int()
  1609  	mode := args[1].Uint64()
  1610  	offset := args[2].Int64()
  1611  	length := args[3].Int64()
  1612  
  1613  	file := t.GetFile(fd)
  1614  	if file == nil {
  1615  		return 0, nil, linuxerr.EBADF
  1616  	}
  1617  	defer file.DecRef(t)
  1618  
  1619  	if !file.IsWritable() {
  1620  		return 0, nil, linuxerr.EBADF
  1621  	}
  1622  	if mode != 0 {
  1623  		return 0, nil, linuxerr.ENOTSUP
  1624  	}
  1625  	if offset < 0 || length <= 0 {
  1626  		return 0, nil, linuxerr.EINVAL
  1627  	}
  1628  
  1629  	size := offset + length
  1630  	if size < 0 {
  1631  		return 0, nil, linuxerr.EFBIG
  1632  	}
  1633  	limit := limits.FromContext(t).Get(limits.FileSize).Cur
  1634  	if uint64(size) >= limit {
  1635  		t.SendSignal(&linux.SignalInfo{
  1636  			Signo: int32(linux.SIGXFSZ),
  1637  			Code:  linux.SI_USER,
  1638  		})
  1639  		return 0, nil, linuxerr.EFBIG
  1640  	}
  1641  
  1642  	return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length))
  1643  }
  1644  
  1645  // Flock implements linux syscall flock(2).
  1646  func Flock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1647  	fd := args[0].Int()
  1648  	operation := args[1].Int()
  1649  
  1650  	file := t.GetFile(fd)
  1651  	if file == nil {
  1652  		// flock(2): EBADF fd is not an open file descriptor.
  1653  		return 0, nil, linuxerr.EBADF
  1654  	}
  1655  	defer file.DecRef(t)
  1656  
  1657  	nonblocking := operation&linux.LOCK_NB != 0
  1658  	operation &^= linux.LOCK_NB
  1659  
  1660  	switch operation {
  1661  	case linux.LOCK_EX:
  1662  		if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, !nonblocking /* block */); err != nil {
  1663  			return 0, nil, err
  1664  		}
  1665  	case linux.LOCK_SH:
  1666  		if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, !nonblocking /* block */); err != nil {
  1667  			return 0, nil, err
  1668  		}
  1669  	case linux.LOCK_UN:
  1670  		if err := file.UnlockBSD(t); err != nil {
  1671  			return 0, nil, err
  1672  		}
  1673  	default:
  1674  		// flock(2): EINVAL operation is invalid.
  1675  		return 0, nil, linuxerr.EINVAL
  1676  	}
  1677  
  1678  	return 0, nil, nil
  1679  }
  1680  
  1681  const (
  1682  	memfdPrefix     = "memfd:"
  1683  	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
  1684  	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
  1685  )
  1686  
  1687  // MemfdCreate implements the linux syscall memfd_create(2).
  1688  func MemfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1689  	addr := args[0].Pointer()
  1690  	flags := args[1].Uint()
  1691  
  1692  	if flags&^memfdAllFlags != 0 {
  1693  		// Unknown bits in flags.
  1694  		return 0, nil, linuxerr.EINVAL
  1695  	}
  1696  
  1697  	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
  1698  	cloExec := flags&linux.MFD_CLOEXEC != 0
  1699  
  1700  	name, err := t.CopyInString(addr, memfdMaxNameLen)
  1701  	if err != nil {
  1702  		return 0, nil, err
  1703  	}
  1704  
  1705  	shmMount := t.Kernel().ShmMount()
  1706  	file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name)
  1707  	if err != nil {
  1708  		return 0, nil, err
  1709  	}
  1710  	defer file.DecRef(t)
  1711  
  1712  	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
  1713  		CloseOnExec: cloExec,
  1714  	})
  1715  	if err != nil {
  1716  		return 0, nil, err
  1717  	}
  1718  
  1719  	return uintptr(fd), nil, nil
  1720  }