github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_file.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"math"
    19  
    20  	"github.com/metacubex/gvisor/pkg/abi/linux"
    21  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    22  	"github.com/metacubex/gvisor/pkg/fspath"
    23  	"github.com/metacubex/gvisor/pkg/gohacks"
    24  	"github.com/metacubex/gvisor/pkg/hostarch"
    25  	"github.com/metacubex/gvisor/pkg/marshal/primitive"
    26  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    27  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/lock"
    28  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs"
    29  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    30  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    31  	"github.com/metacubex/gvisor/pkg/sentry/kernel/fasync"
    32  	"github.com/metacubex/gvisor/pkg/sentry/kernel/pipe"
    33  	"github.com/metacubex/gvisor/pkg/sentry/limits"
    34  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    35  )
    36  
    37  // Mknod implements Linux syscall mknod(2).
    38  func Mknod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    39  	addr := args[0].Pointer()
    40  	mode := args[1].ModeT()
    41  	dev := args[2].Uint()
    42  	return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev)
    43  }
    44  
    45  // Mknodat implements Linux syscall mknodat(2).
    46  func Mknodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    47  	dirfd := args[0].Int()
    48  	addr := args[1].Pointer()
    49  	mode := args[2].ModeT()
    50  	dev := args[3].Uint()
    51  	return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev)
    52  }
    53  
    54  func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error {
    55  	path, err := copyInPath(t, addr)
    56  	if err != nil {
    57  		return err
    58  	}
    59  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
    60  	if err != nil {
    61  		return err
    62  	}
    63  	defer tpop.Release(t)
    64  
    65  	// "Zero file type is equivalent to type S_IFREG." - mknod(2)
    66  	if mode.FileType() == 0 {
    67  		mode |= linux.ModeRegular
    68  	}
    69  	major, minor := linux.DecodeDeviceID(dev)
    70  	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
    71  		Mode:     mode &^ linux.FileMode(t.FSContext().Umask()),
    72  		DevMajor: uint32(major),
    73  		DevMinor: minor,
    74  	})
    75  }
    76  
    77  // Open implements Linux syscall open(2).
    78  func Open(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    79  	addr := args[0].Pointer()
    80  	flags := args[1].Uint()
    81  	mode := args[2].ModeT()
    82  	return openat(t, linux.AT_FDCWD, addr, flags, mode)
    83  }
    84  
    85  // Openat implements Linux syscall openat(2).
    86  func Openat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    87  	dirfd := args[0].Int()
    88  	addr := args[1].Pointer()
    89  	flags := args[2].Uint()
    90  	mode := args[3].ModeT()
    91  	return openat(t, dirfd, addr, flags, mode)
    92  }
    93  
    94  // Creat implements Linux syscall creat(2).
    95  func Creat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    96  	addr := args[0].Pointer()
    97  	mode := args[1].ModeT()
    98  	return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode)
    99  }
   100  
   101  func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
   102  	path, err := copyInPath(t, pathAddr)
   103  	if err != nil {
   104  		return 0, nil, err
   105  	}
   106  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0))
   107  	if err != nil {
   108  		return 0, nil, err
   109  	}
   110  	defer tpop.Release(t)
   111  
   112  	file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{
   113  		Flags: flags | linux.O_LARGEFILE,
   114  		Mode:  linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()),
   115  	})
   116  	if err != nil {
   117  		return 0, nil, err
   118  	}
   119  	defer file.DecRef(t)
   120  
   121  	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
   122  		CloseOnExec: flags&linux.O_CLOEXEC != 0,
   123  	})
   124  	return uintptr(fd), nil, err
   125  }
   126  
   127  // Access implements Linux syscall access(2).
   128  func Access(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   129  	addr := args[0].Pointer()
   130  	mode := args[1].ModeT()
   131  
   132  	return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode, 0 /* flags */)
   133  }
   134  
   135  // Faccessat implements Linux syscall faccessat(2).
   136  func Faccessat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   137  	dirfd := args[0].Int()
   138  	addr := args[1].Pointer()
   139  	mode := args[2].ModeT()
   140  
   141  	return 0, nil, accessAt(t, dirfd, addr, mode, 0 /* flags */)
   142  }
   143  
   144  // Faccessat2 implements Linux syscall faccessat2(2).
   145  func Faccessat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   146  	dirfd := args[0].Int()
   147  	addr := args[1].Pointer()
   148  	mode := args[2].ModeT()
   149  	flags := args[3].Int()
   150  
   151  	return 0, nil, accessAt(t, dirfd, addr, mode, flags)
   152  }
   153  
   154  func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint, flags int32) error {
   155  	const rOK = 4
   156  	const wOK = 2
   157  	const xOK = 1
   158  
   159  	// Sanity check the mode.
   160  	if mode&^(rOK|wOK|xOK) != 0 {
   161  		return linuxerr.EINVAL
   162  	}
   163  
   164  	// faccessat2(2) isn't documented as supporting AT_EMPTY_PATH, but it does.
   165  	if flags&^(linux.AT_EACCESS|linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH) != 0 {
   166  		return linuxerr.EINVAL
   167  	}
   168  
   169  	path, err := copyInPath(t, pathAddr)
   170  	if err != nil {
   171  		return err
   172  	}
   173  	tpop, err := getTaskPathOperation(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0))
   174  	if err != nil {
   175  		return err
   176  	}
   177  	defer tpop.Release(t)
   178  
   179  	creds := t.Credentials()
   180  	if flags&linux.AT_EACCESS == 0 {
   181  		// access(2) and faccessat(2) check permissions using real
   182  		// UID/GID, not effective UID/GID.
   183  		//
   184  		// "access() needs to use the real uid/gid, not the effective
   185  		// uid/gid. We do this by temporarily clearing all FS-related
   186  		// capabilities and switching the fsuid/fsgid around to the
   187  		// real ones." -fs/open.c:faccessat
   188  		creds = creds.Fork()
   189  		creds.EffectiveKUID = creds.RealKUID
   190  		creds.EffectiveKGID = creds.RealKGID
   191  		if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID {
   192  			creds.EffectiveCaps = creds.PermittedCaps
   193  		} else {
   194  			creds.EffectiveCaps = 0
   195  		}
   196  	}
   197  
   198  	return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop)
   199  }
   200  
   201  // Ioctl implements Linux syscall ioctl(2).
   202  func Ioctl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   203  	fd := args[0].Int()
   204  
   205  	file := t.GetFile(fd)
   206  	if file == nil {
   207  		return 0, nil, linuxerr.EBADF
   208  	}
   209  	defer file.DecRef(t)
   210  
   211  	if file.StatusFlags()&linux.O_PATH != 0 {
   212  		return 0, nil, linuxerr.EBADF
   213  	}
   214  
   215  	// Handle ioctls that apply to all FDs.
   216  	switch args[1].Int() {
   217  	case linux.FIONCLEX:
   218  		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   219  			CloseOnExec: false,
   220  		})
   221  		return 0, nil, nil
   222  
   223  	case linux.FIOCLEX:
   224  		t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   225  			CloseOnExec: true,
   226  		})
   227  		return 0, nil, nil
   228  
   229  	case linux.FIONBIO:
   230  		var set int32
   231  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   232  			return 0, nil, err
   233  		}
   234  		flags := file.StatusFlags()
   235  		if set != 0 {
   236  			flags |= linux.O_NONBLOCK
   237  		} else {
   238  			flags &^= linux.O_NONBLOCK
   239  		}
   240  		return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags)
   241  
   242  	case linux.FIOASYNC:
   243  		var set int32
   244  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil {
   245  			return 0, nil, err
   246  		}
   247  		flags := file.StatusFlags()
   248  		if set != 0 {
   249  			flags |= linux.O_ASYNC
   250  		} else {
   251  			flags &^= linux.O_ASYNC
   252  		}
   253  		file.SetStatusFlags(t, t.Credentials(), flags)
   254  		return 0, nil, nil
   255  
   256  	case linux.FIOGETOWN, linux.SIOCGPGRP:
   257  		var who int32
   258  		owner, hasOwner := getAsyncOwner(t, file)
   259  		if hasOwner {
   260  			if owner.Type == linux.F_OWNER_PGRP {
   261  				who = -owner.PID
   262  			} else {
   263  				who = owner.PID
   264  			}
   265  		}
   266  		_, err := primitive.CopyInt32Out(t, args[2].Pointer(), who)
   267  		return 0, nil, err
   268  
   269  	case linux.FIOSETOWN, linux.SIOCSPGRP:
   270  		var who int32
   271  		if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil {
   272  			return 0, nil, err
   273  		}
   274  		ownerType := int32(linux.F_OWNER_PID)
   275  		if who < 0 {
   276  			// Check for overflow before flipping the sign.
   277  			if who-1 > who {
   278  				return 0, nil, linuxerr.EINVAL
   279  			}
   280  			ownerType = linux.F_OWNER_PGRP
   281  			who = -who
   282  		}
   283  		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
   284  	}
   285  
   286  	ret, err := file.Ioctl(t, t.MemoryManager(), sysno, args)
   287  	return ret, nil, err
   288  }
   289  
   290  // Getcwd implements Linux syscall getcwd(2).
   291  func Getcwd(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   292  	addr := args[0].Pointer()
   293  	size := args[1].SizeT()
   294  
   295  	root := t.FSContext().RootDirectory()
   296  	wd := t.FSContext().WorkingDirectory()
   297  	s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
   298  	root.DecRef(t)
   299  	wd.DecRef(t)
   300  	if err != nil {
   301  		return 0, nil, err
   302  	}
   303  
   304  	// Note this is >= because we need a terminator.
   305  	if uint(len(s)) >= size {
   306  		return 0, nil, linuxerr.ERANGE
   307  	}
   308  
   309  	// Construct a byte slice containing a NUL terminator.
   310  	buf := t.CopyScratchBuffer(len(s) + 1)
   311  	copy(buf, s)
   312  	buf[len(buf)-1] = 0
   313  
   314  	// Write the pathname slice.
   315  	n, err := t.CopyOutBytes(addr, buf)
   316  	if err != nil {
   317  		return 0, nil, err
   318  	}
   319  	return uintptr(n), nil, nil
   320  }
   321  
   322  // Chdir implements Linux syscall chdir(2).
   323  func Chdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   324  	addr := args[0].Pointer()
   325  
   326  	path, err := copyInPath(t, addr)
   327  	if err != nil {
   328  		return 0, nil, err
   329  	}
   330  	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
   331  	if err != nil {
   332  		return 0, nil, err
   333  	}
   334  	defer tpop.Release(t)
   335  
   336  	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
   337  		CheckSearchable: true,
   338  	})
   339  	if err != nil {
   340  		return 0, nil, err
   341  	}
   342  	t.FSContext().SetWorkingDirectory(t, vd)
   343  	vd.DecRef(t)
   344  	return 0, nil, nil
   345  }
   346  
   347  // Fchdir implements Linux syscall fchdir(2).
   348  func Fchdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   349  	fd := args[0].Int()
   350  
   351  	tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
   352  	if err != nil {
   353  		return 0, nil, err
   354  	}
   355  	defer tpop.Release(t)
   356  
   357  	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
   358  		CheckSearchable: true,
   359  	})
   360  	if err != nil {
   361  		return 0, nil, err
   362  	}
   363  	t.FSContext().SetWorkingDirectory(t, vd)
   364  	vd.DecRef(t)
   365  	return 0, nil, nil
   366  }
   367  
   368  // Chroot implements Linux syscall chroot(2).
   369  func Chroot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   370  	addr := args[0].Pointer()
   371  
   372  	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
   373  		return 0, nil, linuxerr.EPERM
   374  	}
   375  
   376  	path, err := copyInPath(t, addr)
   377  	if err != nil {
   378  		return 0, nil, err
   379  	}
   380  	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
   381  	if err != nil {
   382  		return 0, nil, err
   383  	}
   384  	defer tpop.Release(t)
   385  
   386  	vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{
   387  		CheckSearchable: true,
   388  	})
   389  	if err != nil {
   390  		return 0, nil, err
   391  	}
   392  	t.FSContext().SetRootDirectory(t, vd)
   393  	vd.DecRef(t)
   394  	return 0, nil, nil
   395  }
   396  
   397  // PivotRoot implements Linux syscall pivot_root(2).
   398  func PivotRoot(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   399  	addr1 := args[0].Pointer()
   400  	addr2 := args[1].Pointer()
   401  
   402  	if !t.HasCapability(linux.CAP_SYS_ADMIN) {
   403  		return 0, nil, linuxerr.EPERM
   404  	}
   405  
   406  	newRootPath, err := copyInPath(t, addr1)
   407  	if err != nil {
   408  		return 0, nil, err
   409  	}
   410  	newRootTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, newRootPath, disallowEmptyPath, followFinalSymlink)
   411  	if err != nil {
   412  		return 0, nil, err
   413  	}
   414  	defer newRootTpop.Release(t)
   415  	putOldPath, err := copyInPath(t, addr2)
   416  	if err != nil {
   417  		return 0, nil, err
   418  	}
   419  	putOldTpop, err := getTaskPathOperation(t, linux.AT_FDCWD, putOldPath, disallowEmptyPath, followFinalSymlink)
   420  	if err != nil {
   421  		return 0, nil, err
   422  	}
   423  	defer putOldTpop.Release(t)
   424  
   425  	newRoot, oldRoot, err := t.Kernel().VFS().PivotRoot(t, t.Credentials(), &newRootTpop.pop, &putOldTpop.pop)
   426  	if err != nil {
   427  		return 0, nil, err
   428  	}
   429  	defer newRoot.DecRef(t)
   430  	defer oldRoot.DecRef(t)
   431  	t.Kernel().ReplaceFSContextRoots(t, oldRoot, newRoot)
   432  	return 0, nil, nil
   433  }
   434  
   435  // Close implements Linux syscall close(2).
   436  func Close(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   437  	fd := args[0].Int()
   438  
   439  	// Note that Remove provides a reference on the file that we may use to
   440  	// flush. It is still active until we drop the final reference below
   441  	// (and other reference-holding operations complete).
   442  	file := t.FDTable().Remove(t, fd)
   443  	if file == nil {
   444  		return 0, nil, linuxerr.EBADF
   445  	}
   446  	defer file.DecRef(t)
   447  
   448  	err := file.OnClose(t)
   449  	return 0, nil, HandleIOError(t, false /* partial */, err, linuxerr.EINTR, "close", file)
   450  }
   451  
   452  // CloseRange implements linux syscall close_range(2).
   453  func CloseRange(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   454  	first := args[0].Uint()
   455  	last := args[1].Uint()
   456  	flags := args[2].Uint()
   457  
   458  	if (first > last) || (last > math.MaxInt32) {
   459  		return 0, nil, linuxerr.EINVAL
   460  	}
   461  
   462  	if (flags & ^(linux.CLOSE_RANGE_CLOEXEC | linux.CLOSE_RANGE_UNSHARE)) != 0 {
   463  		return 0, nil, linuxerr.EINVAL
   464  	}
   465  
   466  	cloexec := flags & linux.CLOSE_RANGE_CLOEXEC
   467  	unshare := flags & linux.CLOSE_RANGE_UNSHARE
   468  
   469  	if unshare != 0 {
   470  		// If possible, we don't want to copy FDs to the new unshared table, because those FDs will
   471  		// be promptly closed and no longer used. So in the case where we know the range extends all
   472  		// the way to the end of the FdTable, we can simply copy the FdTable only up to the start of
   473  		// the range that we are closing.
   474  		if cloexec == 0 && int32(last) >= t.FDTable().GetLastFd() {
   475  			t.UnshareFdTable(int32(first))
   476  		} else {
   477  			t.UnshareFdTable(math.MaxInt32)
   478  		}
   479  	}
   480  
   481  	if cloexec != 0 {
   482  		flagToApply := kernel.FDFlags{
   483  			CloseOnExec: true,
   484  		}
   485  		t.FDTable().SetFlagsForRange(t.AsyncContext(), int32(first), int32(last), flagToApply)
   486  		return 0, nil, nil
   487  	}
   488  
   489  	fdTable := t.FDTable()
   490  	fd := int32(first)
   491  	for {
   492  		fd, file := fdTable.RemoveNextInRange(t, fd, int32(last))
   493  		if file == nil {
   494  			break
   495  		}
   496  
   497  		fd++
   498  		// Per the close_range(2) documentation, errors upon closing file descriptors are ignored.
   499  		_ = file.OnClose(t)
   500  		file.DecRef(t)
   501  	}
   502  
   503  	return 0, nil, nil
   504  }
   505  
   506  // Dup implements Linux syscall dup(2).
   507  func Dup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   508  	fd := args[0].Int()
   509  
   510  	file := t.GetFile(fd)
   511  	if file == nil {
   512  		return 0, nil, linuxerr.EBADF
   513  	}
   514  	defer file.DecRef(t)
   515  
   516  	newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{})
   517  	if err != nil {
   518  		return 0, nil, linuxerr.EMFILE
   519  	}
   520  	return uintptr(newFD), nil, nil
   521  }
   522  
   523  // Dup2 implements Linux syscall dup2(2).
   524  func Dup2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   525  	oldfd := args[0].Int()
   526  	newfd := args[1].Int()
   527  
   528  	if oldfd == newfd {
   529  		// As long as oldfd is valid, dup2() does nothing and returns newfd.
   530  		file := t.GetFile(oldfd)
   531  		if file == nil {
   532  			return 0, nil, linuxerr.EBADF
   533  		}
   534  		file.DecRef(t)
   535  		return uintptr(newfd), nil, nil
   536  	}
   537  
   538  	return dup3(t, oldfd, newfd, 0)
   539  }
   540  
   541  // Dup3 implements Linux syscall dup3(2).
   542  func Dup3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   543  	oldfd := args[0].Int()
   544  	newfd := args[1].Int()
   545  	flags := args[2].Uint()
   546  
   547  	if oldfd == newfd {
   548  		return 0, nil, linuxerr.EINVAL
   549  	}
   550  
   551  	return dup3(t, oldfd, newfd, flags)
   552  }
   553  
   554  func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
   555  	if flags&^linux.O_CLOEXEC != 0 {
   556  		return 0, nil, linuxerr.EINVAL
   557  	}
   558  
   559  	file := t.GetFile(oldfd)
   560  	if file == nil {
   561  		return 0, nil, linuxerr.EBADF
   562  	}
   563  	defer file.DecRef(t)
   564  
   565  	df, err := t.NewFDAt(newfd, file, kernel.FDFlags{
   566  		CloseOnExec: flags&linux.O_CLOEXEC != 0,
   567  	})
   568  	if linuxerr.Equals(linuxerr.EMFILE, err) {
   569  		err = linuxerr.EBADF
   570  	}
   571  	if err != nil {
   572  		return 0, nil, err
   573  	}
   574  	if df != nil {
   575  		// "If the file descriptor newfd was previously open, it is closed
   576  		// before being reused; the close is performed silently (i.e., any
   577  		// errors during the close are not reported by dup2())." - dup(2)
   578  		_ = df.OnClose(t)
   579  		df.DecRef(t)
   580  	}
   581  	return uintptr(newfd), nil, nil
   582  }
   583  
   584  // Fcntl implements linux syscall fcntl(2).
   585  func Fcntl(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   586  	fd := args[0].Int()
   587  	cmd := args[1].Int()
   588  
   589  	file, flags := t.FDTable().Get(fd)
   590  	if file == nil {
   591  		return 0, nil, linuxerr.EBADF
   592  	}
   593  	defer file.DecRef(t)
   594  
   595  	if file.StatusFlags()&linux.O_PATH != 0 {
   596  		switch cmd {
   597  		case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL:
   598  			// allowed
   599  		default:
   600  			return 0, nil, linuxerr.EBADF
   601  		}
   602  	}
   603  
   604  	switch cmd {
   605  	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
   606  		minfd := args[2].Int()
   607  		fd, err := t.NewFDFrom(minfd, file, kernel.FDFlags{
   608  			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
   609  		})
   610  		if err != nil {
   611  			return 0, nil, err
   612  		}
   613  		return uintptr(fd), nil, nil
   614  	case linux.F_GETFD:
   615  		return uintptr(flags.ToLinuxFDFlags()), nil, nil
   616  	case linux.F_SETFD:
   617  		flags := args[2].Uint()
   618  		err := t.FDTable().SetFlags(t, fd, kernel.FDFlags{
   619  			CloseOnExec: flags&linux.FD_CLOEXEC != 0,
   620  		})
   621  		return 0, nil, err
   622  	case linux.F_GETFL:
   623  		return uintptr(file.StatusFlags()), nil, nil
   624  	case linux.F_SETFL:
   625  		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())
   626  	case linux.F_GETOWN:
   627  		owner, hasOwner := getAsyncOwner(t, file)
   628  		if !hasOwner {
   629  			return 0, nil, nil
   630  		}
   631  		if owner.Type == linux.F_OWNER_PGRP {
   632  			return uintptr(-owner.PID), nil, nil
   633  		}
   634  		return uintptr(owner.PID), nil, nil
   635  	case linux.F_SETOWN:
   636  		who := args[2].Int()
   637  		ownerType := int32(linux.F_OWNER_PID)
   638  		if who < 0 {
   639  			// Check for overflow before flipping the sign.
   640  			if who-1 > who {
   641  				return 0, nil, linuxerr.EINVAL
   642  			}
   643  			ownerType = linux.F_OWNER_PGRP
   644  			who = -who
   645  		}
   646  		return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who)
   647  	case linux.F_GETOWN_EX:
   648  		owner, hasOwner := getAsyncOwner(t, file)
   649  		if !hasOwner {
   650  			return 0, nil, nil
   651  		}
   652  		_, err := owner.CopyOut(t, args[2].Pointer())
   653  		return 0, nil, err
   654  	case linux.F_SETOWN_EX:
   655  		var owner linux.FOwnerEx
   656  		_, err := owner.CopyIn(t, args[2].Pointer())
   657  		if err != nil {
   658  			return 0, nil, err
   659  		}
   660  		return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID)
   661  	case linux.F_SETPIPE_SZ:
   662  		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
   663  		if !ok {
   664  			return 0, nil, linuxerr.EBADF
   665  		}
   666  		n, err := pipefile.SetPipeSize(int64(args[2].Int()))
   667  		if err != nil {
   668  			return 0, nil, err
   669  		}
   670  		return uintptr(n), nil, nil
   671  	case linux.F_GETPIPE_SZ:
   672  		pipefile, ok := file.Impl().(*pipe.VFSPipeFD)
   673  		if !ok {
   674  			return 0, nil, linuxerr.EBADF
   675  		}
   676  		return uintptr(pipefile.PipeSize()), nil, nil
   677  	case linux.F_GET_SEALS:
   678  		val, err := tmpfs.GetSeals(file)
   679  		return uintptr(val), nil, err
   680  	case linux.F_ADD_SEALS:
   681  		if !file.IsWritable() {
   682  			return 0, nil, linuxerr.EPERM
   683  		}
   684  		err := tmpfs.AddSeals(file, args[2].Uint())
   685  		return 0, nil, err
   686  	case linux.F_SETLK:
   687  		return 0, nil, posixLock(t, args, file, false /* ofd */, false /* block */)
   688  	case linux.F_SETLKW:
   689  		return 0, nil, posixLock(t, args, file, false /* ofd */, true /* block */)
   690  	case linux.F_GETLK:
   691  		return 0, nil, posixTestLock(t, args, file, false /* ofd */)
   692  	case linux.F_OFD_SETLK:
   693  		return 0, nil, posixLock(t, args, file, true /* ofd */, false /* block */)
   694  	case linux.F_OFD_SETLKW:
   695  		return 0, nil, posixLock(t, args, file, true /* ofd */, true /* block */)
   696  	case linux.F_OFD_GETLK:
   697  		return 0, nil, posixTestLock(t, args, file, true /* ofd */)
   698  	case linux.F_GETSIG:
   699  		a := file.AsyncHandler()
   700  		if a == nil {
   701  			// Default behavior aka SIGIO.
   702  			return 0, nil, nil
   703  		}
   704  		return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil
   705  	case linux.F_SETSIG:
   706  		a, err := file.SetAsyncHandler(fasync.New(int(fd)))
   707  		if err != nil {
   708  			return 0, nil, err
   709  		}
   710  		async := a.(*fasync.FileAsync)
   711  		return 0, nil, async.SetSignal(linux.Signal(args[2].Int()))
   712  	default:
   713  		// Everything else is not yet supported.
   714  		return 0, nil, linuxerr.EINVAL
   715  	}
   716  }
   717  
   718  func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) {
   719  	a := fd.AsyncHandler()
   720  	if a == nil {
   721  		return linux.FOwnerEx{}, false
   722  	}
   723  
   724  	ot, otg, opg := a.(*fasync.FileAsync).Owner()
   725  	switch {
   726  	case ot != nil:
   727  		return linux.FOwnerEx{
   728  			Type: linux.F_OWNER_TID,
   729  			PID:  int32(t.PIDNamespace().IDOfTask(ot)),
   730  		}, true
   731  	case otg != nil:
   732  		return linux.FOwnerEx{
   733  			Type: linux.F_OWNER_PID,
   734  			PID:  int32(t.PIDNamespace().IDOfThreadGroup(otg)),
   735  		}, true
   736  	case opg != nil:
   737  		return linux.FOwnerEx{
   738  			Type: linux.F_OWNER_PGRP,
   739  			PID:  int32(t.PIDNamespace().IDOfProcessGroup(opg)),
   740  		}, true
   741  	default:
   742  		return linux.FOwnerEx{}, true
   743  	}
   744  }
   745  
   746  func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error {
   747  	switch ownerType {
   748  	case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP:
   749  		// Acceptable type.
   750  	default:
   751  		return linuxerr.EINVAL
   752  	}
   753  
   754  	a, err := file.SetAsyncHandler(fasync.New(fd))
   755  	if err != nil {
   756  		return err
   757  	}
   758  	async := a.(*fasync.FileAsync)
   759  	if pid == 0 {
   760  		async.ClearOwner()
   761  		return nil
   762  	}
   763  
   764  	switch ownerType {
   765  	case linux.F_OWNER_TID:
   766  		task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid))
   767  		if task == nil {
   768  			return linuxerr.ESRCH
   769  		}
   770  		async.SetOwnerTask(t, task)
   771  		return nil
   772  	case linux.F_OWNER_PID:
   773  		tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid))
   774  		if tg == nil {
   775  			return linuxerr.ESRCH
   776  		}
   777  		async.SetOwnerThreadGroup(t, tg)
   778  		return nil
   779  	case linux.F_OWNER_PGRP:
   780  		pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid))
   781  		if pg == nil {
   782  			return linuxerr.ESRCH
   783  		}
   784  		async.SetOwnerProcessGroup(t, pg)
   785  		return nil
   786  	default:
   787  		return linuxerr.EINVAL
   788  	}
   789  }
   790  
   791  func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool) error {
   792  	// Copy in the lock request.
   793  	flockAddr := args[2].Pointer()
   794  	var flock linux.Flock
   795  	if _, err := flock.CopyIn(t, flockAddr); err != nil {
   796  		return err
   797  	}
   798  	var typ lock.LockType
   799  	switch flock.Type {
   800  	case linux.F_RDLCK:
   801  		typ = lock.ReadLock
   802  	case linux.F_WRLCK:
   803  		typ = lock.WriteLock
   804  	default:
   805  		return linuxerr.EINVAL
   806  	}
   807  	r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence)
   808  	if err != nil {
   809  		return err
   810  	}
   811  	uid := lock.UniqueID(t.FDTable())
   812  	if ofd {
   813  		uid = lock.UniqueID(file)
   814  	}
   815  
   816  	newFlock, err := file.TestPOSIX(t, uid, typ, r)
   817  	if err != nil {
   818  		return err
   819  	}
   820  	if !ofd {
   821  		newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID)
   822  	}
   823  	if _, err = newFlock.CopyOut(t, flockAddr); err != nil {
   824  		return err
   825  	}
   826  	return nil
   827  }
   828  
   829  // translatePID translates a pid from one namespace to another. Note that this
   830  // may race with task termination/creation, in which case the original task
   831  // corresponding to pid may no longer exist. This is used to implement the
   832  // F_GETLK fcntl, which has the same potential race in Linux as well (i.e.,
   833  // there is no synchronization between retrieving the lock PID and translating
   834  // it). See fs/locks.c:posix_lock_to_flock.
   835  func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 {
   836  	return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid))))
   837  }
   838  
   839  func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, ofd bool, block bool) error {
   840  	// Copy in the lock request.
   841  	flockAddr := args[2].Pointer()
   842  	var flock linux.Flock
   843  	if _, err := flock.CopyIn(t, flockAddr); err != nil {
   844  		return err
   845  	}
   846  	if ofd && flock.PID != 0 {
   847  		return linuxerr.EINVAL
   848  	}
   849  
   850  	uid := lock.UniqueID(t.FDTable())
   851  	pid := int32(t.TGIDInRoot())
   852  	if ofd {
   853  		uid = lock.UniqueID(file)
   854  		pid = -1
   855  	}
   856  
   857  	r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence)
   858  	if err != nil {
   859  		return err
   860  	}
   861  
   862  	switch flock.Type {
   863  	case linux.F_RDLCK:
   864  		if !file.IsReadable() {
   865  			return linuxerr.EBADF
   866  		}
   867  		return file.LockPOSIX(t, uid, pid, lock.ReadLock, r, block)
   868  
   869  	case linux.F_WRLCK:
   870  		if !file.IsWritable() {
   871  			return linuxerr.EBADF
   872  		}
   873  		return file.LockPOSIX(t, uid, pid, lock.WriteLock, r, block)
   874  
   875  	case linux.F_UNLCK:
   876  		return file.UnlockPOSIX(t, uid, r)
   877  
   878  	default:
   879  		return linuxerr.EINVAL
   880  	}
   881  }
   882  
   883  // Fadvise64 implements fadvise64(2).
   884  // This implementation currently ignores the provided advice.
   885  func Fadvise64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   886  	fd := args[0].Int()
   887  	length := args[2].Int64()
   888  	advice := args[3].Int()
   889  
   890  	// Note: offset is allowed to be negative.
   891  	if length < 0 {
   892  		return 0, nil, linuxerr.EINVAL
   893  	}
   894  
   895  	file := t.GetFile(fd)
   896  	if file == nil {
   897  		return 0, nil, linuxerr.EBADF
   898  	}
   899  	defer file.DecRef(t)
   900  
   901  	if file.StatusFlags()&linux.O_PATH != 0 {
   902  		return 0, nil, linuxerr.EBADF
   903  	}
   904  
   905  	// If the FD refers to a pipe or FIFO, return error.
   906  	if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe {
   907  		return 0, nil, linuxerr.ESPIPE
   908  	}
   909  
   910  	switch advice {
   911  	case linux.POSIX_FADV_NORMAL:
   912  	case linux.POSIX_FADV_RANDOM:
   913  	case linux.POSIX_FADV_SEQUENTIAL:
   914  	case linux.POSIX_FADV_WILLNEED:
   915  	case linux.POSIX_FADV_DONTNEED:
   916  	case linux.POSIX_FADV_NOREUSE:
   917  	default:
   918  		return 0, nil, linuxerr.EINVAL
   919  	}
   920  
   921  	// Sure, whatever.
   922  	return 0, nil, nil
   923  }
   924  
   925  // Mkdir implements Linux syscall mkdir(2).
   926  func Mkdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   927  	addr := args[0].Pointer()
   928  	mode := args[1].ModeT()
   929  	return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode)
   930  }
   931  
   932  // Mkdirat implements Linux syscall mkdirat(2).
   933  func Mkdirat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   934  	dirfd := args[0].Int()
   935  	addr := args[1].Pointer()
   936  	mode := args[2].ModeT()
   937  	return 0, nil, mkdirat(t, dirfd, addr, mode)
   938  }
   939  
   940  func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error {
   941  	path, err := copyInPath(t, addr)
   942  	if err != nil {
   943  		return err
   944  	}
   945  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
   946  	if err != nil {
   947  		return err
   948  	}
   949  	defer tpop.Release(t)
   950  	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{
   951  		Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()),
   952  	})
   953  }
   954  
   955  // Rmdir implements Linux syscall rmdir(2).
   956  func Rmdir(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   957  	pathAddr := args[0].Pointer()
   958  	return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr)
   959  }
   960  
   961  func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error {
   962  	path, err := copyInPath(t, pathAddr)
   963  	if err != nil {
   964  		return err
   965  	}
   966  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
   967  	if err != nil {
   968  		return err
   969  	}
   970  	defer tpop.Release(t)
   971  	return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop)
   972  }
   973  
   974  // Symlink implements Linux syscall symlink(2).
   975  func Symlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   976  	targetAddr := args[0].Pointer()
   977  	linkpathAddr := args[1].Pointer()
   978  	return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr)
   979  }
   980  
   981  // Symlinkat implements Linux syscall symlinkat(2).
   982  func Symlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   983  	targetAddr := args[0].Pointer()
   984  	newdirfd := args[1].Int()
   985  	linkpathAddr := args[2].Pointer()
   986  	return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr)
   987  }
   988  
   989  func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error {
   990  	target, err := t.CopyInString(targetAddr, linux.PATH_MAX)
   991  	if err != nil {
   992  		return err
   993  	}
   994  	if len(target) == 0 {
   995  		return linuxerr.ENOENT
   996  	}
   997  	linkpath, err := copyInPath(t, linkpathAddr)
   998  	if err != nil {
   999  		return err
  1000  	}
  1001  	tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink)
  1002  	if err != nil {
  1003  		return err
  1004  	}
  1005  	defer tpop.Release(t)
  1006  	return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target)
  1007  }
  1008  
  1009  // Link implements Linux syscall link(2).
  1010  func Link(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1011  	oldpathAddr := args[0].Pointer()
  1012  	newpathAddr := args[1].Pointer()
  1013  	return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
  1014  }
  1015  
  1016  // Linkat implements Linux syscall linkat(2).
  1017  func Linkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1018  	olddirfd := args[0].Int()
  1019  	oldpathAddr := args[1].Pointer()
  1020  	newdirfd := args[2].Int()
  1021  	newpathAddr := args[3].Pointer()
  1022  	flags := args[4].Int()
  1023  	return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
  1024  }
  1025  
  1026  func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error {
  1027  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 {
  1028  		return linuxerr.EINVAL
  1029  	}
  1030  	if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
  1031  		return linuxerr.ENOENT
  1032  	}
  1033  
  1034  	oldpath, err := copyInPath(t, oldpathAddr)
  1035  	if err != nil {
  1036  		return err
  1037  	}
  1038  	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
  1039  	if err != nil {
  1040  		return err
  1041  	}
  1042  	defer oldtpop.Release(t)
  1043  
  1044  	newpath, err := copyInPath(t, newpathAddr)
  1045  	if err != nil {
  1046  		return err
  1047  	}
  1048  	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
  1049  	if err != nil {
  1050  		return err
  1051  	}
  1052  	defer newtpop.Release(t)
  1053  
  1054  	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop)
  1055  }
  1056  
  1057  // Readlinkat implements Linux syscall readlinkat(2).
  1058  func Readlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1059  	dirfd := args[0].Int()
  1060  	pathAddr := args[1].Pointer()
  1061  	bufAddr := args[2].Pointer()
  1062  	size := args[3].SizeT()
  1063  	return readlinkat(t, dirfd, pathAddr, bufAddr, size)
  1064  }
  1065  
  1066  // Readlink implements Linux syscall readlink(2).
  1067  func Readlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1068  	pathAddr := args[0].Pointer()
  1069  	bufAddr := args[1].Pointer()
  1070  	size := args[2].SizeT()
  1071  	return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size)
  1072  }
  1073  
  1074  func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
  1075  	if int(size) <= 0 {
  1076  		return 0, nil, linuxerr.EINVAL
  1077  	}
  1078  
  1079  	path, err := copyInPath(t, pathAddr)
  1080  	if err != nil {
  1081  		return 0, nil, err
  1082  	}
  1083  	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
  1084  	// call operates on the symbolic link referred to by dirfd ..." -
  1085  	// readlinkat(2)
  1086  	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
  1087  	if err != nil {
  1088  		return 0, nil, err
  1089  	}
  1090  	defer tpop.Release(t)
  1091  
  1092  	target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
  1093  	if err != nil {
  1094  		return 0, nil, err
  1095  	}
  1096  
  1097  	if len(target) > int(size) {
  1098  		target = target[:size]
  1099  	}
  1100  	n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target))
  1101  	if n == 0 {
  1102  		return 0, nil, err
  1103  	}
  1104  	return uintptr(n), nil, nil
  1105  }
  1106  
  1107  // Unlink implements Linux syscall unlink(2).
  1108  func Unlink(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1109  	pathAddr := args[0].Pointer()
  1110  	return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr)
  1111  }
  1112  
  1113  func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error {
  1114  	path, err := copyInPath(t, pathAddr)
  1115  	if err != nil {
  1116  		return err
  1117  	}
  1118  	tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink)
  1119  	if err != nil {
  1120  		return err
  1121  	}
  1122  	defer tpop.Release(t)
  1123  	return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop)
  1124  }
  1125  
  1126  // Unlinkat implements Linux syscall unlinkat(2).
  1127  func Unlinkat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1128  	dirfd := args[0].Int()
  1129  	pathAddr := args[1].Pointer()
  1130  	flags := args[2].Int()
  1131  
  1132  	if flags&^linux.AT_REMOVEDIR != 0 {
  1133  		return 0, nil, linuxerr.EINVAL
  1134  	}
  1135  
  1136  	if flags&linux.AT_REMOVEDIR != 0 {
  1137  		return 0, nil, rmdirat(t, dirfd, pathAddr)
  1138  	}
  1139  	return 0, nil, unlinkat(t, dirfd, pathAddr)
  1140  }
  1141  
  1142  func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
  1143  	root := t.FSContext().RootDirectory()
  1144  	defer root.DecRef(t)
  1145  	start := root
  1146  	if !path.Absolute {
  1147  		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
  1148  			return linuxerr.ENOENT
  1149  		}
  1150  		if dirfd == linux.AT_FDCWD {
  1151  			start = t.FSContext().WorkingDirectory()
  1152  			defer start.DecRef(t)
  1153  		} else {
  1154  			dirfile := t.GetFile(dirfd)
  1155  			if dirfile == nil {
  1156  				return linuxerr.EBADF
  1157  			}
  1158  			if !path.HasComponents() && dirfile.StatusFlags()&linux.O_PATH == 0 {
  1159  				// For empty path, use FileDescription.SetStat() instead of
  1160  				// VirtualFilesystem.SetStatAt(), since the former may be able to use
  1161  				// opened file state to expedite the SetStat. Skip this optimization
  1162  				// for FDs with O_PATH, since the FD impl always returns EBADF.
  1163  				err := dirfile.SetStat(t, *opts)
  1164  				dirfile.DecRef(t)
  1165  				return err
  1166  			}
  1167  			start = dirfile.VirtualDentry()
  1168  			start.IncRef()
  1169  			defer start.DecRef(t)
  1170  			dirfile.DecRef(t)
  1171  		}
  1172  	}
  1173  	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
  1174  		Root:               root,
  1175  		Start:              start,
  1176  		Path:               path,
  1177  		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
  1178  	}, opts)
  1179  }
  1180  
  1181  func handleSetSizeError(t *kernel.Task, err error) error {
  1182  	if err == linuxerr.ErrExceedsFileSizeLimit {
  1183  		// Convert error to EFBIG and send a SIGXFSZ per setrlimit(2).
  1184  		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
  1185  		return linuxerr.EFBIG
  1186  	}
  1187  	return err
  1188  }
  1189  
  1190  // Truncate implements Linux syscall truncate(2).
  1191  func Truncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1192  	addr := args[0].Pointer()
  1193  	length := args[1].Int64()
  1194  
  1195  	if length < 0 {
  1196  		return 0, nil, linuxerr.EINVAL
  1197  	}
  1198  
  1199  	path, err := copyInPath(t, addr)
  1200  	if err != nil {
  1201  		return 0, nil, err
  1202  	}
  1203  
  1204  	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
  1205  		Stat: linux.Statx{
  1206  			Mask: linux.STATX_SIZE,
  1207  			Size: uint64(length),
  1208  		},
  1209  		NeedWritePerm: true,
  1210  	})
  1211  	return 0, nil, handleSetSizeError(t, err)
  1212  }
  1213  
  1214  // Ftruncate implements Linux syscall ftruncate(2).
  1215  func Ftruncate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1216  	fd := args[0].Int()
  1217  	length := args[1].Int64()
  1218  
  1219  	if length < 0 {
  1220  		return 0, nil, linuxerr.EINVAL
  1221  	}
  1222  
  1223  	file := t.GetFile(fd)
  1224  	if file == nil {
  1225  		return 0, nil, linuxerr.EBADF
  1226  	}
  1227  	defer file.DecRef(t)
  1228  
  1229  	if !file.IsWritable() {
  1230  		return 0, nil, linuxerr.EINVAL
  1231  	}
  1232  
  1233  	err := file.SetStat(t, vfs.SetStatOptions{
  1234  		Stat: linux.Statx{
  1235  			Mask: linux.STATX_SIZE,
  1236  			Size: uint64(length),
  1237  		},
  1238  	})
  1239  	return 0, nil, handleSetSizeError(t, err)
  1240  }
  1241  
  1242  // Umask implements linux syscall umask(2).
  1243  func Umask(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1244  	mask := args[0].ModeT()
  1245  	mask = t.FSContext().SwapUmask(mask & 0777)
  1246  	return uintptr(mask), nil, nil
  1247  }
  1248  
  1249  // Chown implements Linux syscall chown(2).
  1250  func Chown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1251  	pathAddr := args[0].Pointer()
  1252  	owner := args[1].Int()
  1253  	group := args[2].Int()
  1254  	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */)
  1255  }
  1256  
  1257  // Lchown implements Linux syscall lchown(2).
  1258  func Lchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1259  	pathAddr := args[0].Pointer()
  1260  	owner := args[1].Int()
  1261  	group := args[2].Int()
  1262  	return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW)
  1263  }
  1264  
  1265  // Fchownat implements Linux syscall fchownat(2).
  1266  func Fchownat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1267  	dirfd := args[0].Int()
  1268  	pathAddr := args[1].Pointer()
  1269  	owner := args[2].Int()
  1270  	group := args[3].Int()
  1271  	flags := args[4].Int()
  1272  	return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags)
  1273  }
  1274  
  1275  func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error {
  1276  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
  1277  		return linuxerr.EINVAL
  1278  	}
  1279  
  1280  	path, err := copyInPath(t, pathAddr)
  1281  	if err != nil {
  1282  		return err
  1283  	}
  1284  
  1285  	var opts vfs.SetStatOptions
  1286  	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
  1287  		return err
  1288  	}
  1289  
  1290  	return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
  1291  }
  1292  
  1293  func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
  1294  	userns := t.UserNamespace()
  1295  	if owner != -1 {
  1296  		kuid := userns.MapToKUID(auth.UID(owner))
  1297  		if !kuid.Ok() {
  1298  			return linuxerr.EINVAL
  1299  		}
  1300  		opts.Stat.Mask |= linux.STATX_UID
  1301  		opts.Stat.UID = uint32(kuid)
  1302  	}
  1303  	if group != -1 {
  1304  		kgid := userns.MapToKGID(auth.GID(group))
  1305  		if !kgid.Ok() {
  1306  			return linuxerr.EINVAL
  1307  		}
  1308  		opts.Stat.Mask |= linux.STATX_GID
  1309  		opts.Stat.GID = uint32(kgid)
  1310  	}
  1311  	return nil
  1312  }
  1313  
  1314  // Fchown implements Linux syscall fchown(2).
  1315  func Fchown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1316  	fd := args[0].Int()
  1317  	owner := args[1].Int()
  1318  	group := args[2].Int()
  1319  
  1320  	file := t.GetFile(fd)
  1321  	if file == nil {
  1322  		return 0, nil, linuxerr.EBADF
  1323  	}
  1324  	defer file.DecRef(t)
  1325  
  1326  	var opts vfs.SetStatOptions
  1327  	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
  1328  		return 0, nil, err
  1329  	}
  1330  	return 0, nil, file.SetStat(t, opts)
  1331  }
  1332  
  1333  const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
  1334  
  1335  // Chmod implements Linux syscall chmod(2).
  1336  func Chmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1337  	pathAddr := args[0].Pointer()
  1338  	mode := args[1].ModeT()
  1339  	return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode)
  1340  }
  1341  
  1342  // Fchmodat implements Linux syscall fchmodat(2).
  1343  func Fchmodat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1344  	dirfd := args[0].Int()
  1345  	pathAddr := args[1].Pointer()
  1346  	mode := args[2].ModeT()
  1347  	return 0, nil, fchmodat(t, dirfd, pathAddr, mode)
  1348  }
  1349  
  1350  func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error {
  1351  	path, err := copyInPath(t, pathAddr)
  1352  	if err != nil {
  1353  		return err
  1354  	}
  1355  
  1356  	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{
  1357  		Stat: linux.Statx{
  1358  			Mask: linux.STATX_MODE,
  1359  			Mode: uint16(mode & chmodMask),
  1360  		},
  1361  	})
  1362  }
  1363  
  1364  // Fchmod implements Linux syscall fchmod(2).
  1365  func Fchmod(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1366  	fd := args[0].Int()
  1367  	mode := args[1].ModeT()
  1368  
  1369  	file := t.GetFile(fd)
  1370  	if file == nil {
  1371  		return 0, nil, linuxerr.EBADF
  1372  	}
  1373  	defer file.DecRef(t)
  1374  
  1375  	return 0, nil, file.SetStat(t, vfs.SetStatOptions{
  1376  		Stat: linux.Statx{
  1377  			Mask: linux.STATX_MODE,
  1378  			Mode: uint16(mode & chmodMask),
  1379  		},
  1380  	})
  1381  }
  1382  
  1383  // Utime implements Linux syscall utime(2).
  1384  func Utime(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1385  	pathAddr := args[0].Pointer()
  1386  	timesAddr := args[1].Pointer()
  1387  
  1388  	opts := vfs.SetStatOptions{
  1389  		Stat: linux.Statx{
  1390  			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
  1391  		},
  1392  	}
  1393  	if timesAddr == 0 {
  1394  		opts.Stat.Atime.Nsec = linux.UTIME_NOW
  1395  		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
  1396  	} else {
  1397  		var times linux.Utime
  1398  		if _, err := times.CopyIn(t, timesAddr); err != nil {
  1399  			return 0, nil, err
  1400  		}
  1401  		opts.Stat.Atime.Sec = times.Actime
  1402  		opts.Stat.Mtime.Sec = times.Modtime
  1403  	}
  1404  
  1405  	return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts)
  1406  }
  1407  
  1408  // Utimes implements Linux syscall utimes(2).
  1409  func Utimes(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1410  	pathAddr := args[0].Pointer()
  1411  	timesAddr := args[1].Pointer()
  1412  
  1413  	var opts vfs.SetStatOptions
  1414  	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
  1415  		return 0, nil, err
  1416  	}
  1417  
  1418  	return 0, nil, utimes(t, linux.AT_FDCWD, pathAddr, followFinalSymlink, &opts)
  1419  }
  1420  
  1421  // Futimesat implements Linux syscall futimesat(2).
  1422  func Futimesat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1423  	dirfd := args[0].Int()
  1424  	pathAddr := args[1].Pointer()
  1425  	timesAddr := args[2].Pointer()
  1426  
  1427  	var opts vfs.SetStatOptions
  1428  	if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil {
  1429  		return 0, nil, err
  1430  	}
  1431  
  1432  	return 0, nil, utimes(t, dirfd, pathAddr, followFinalSymlink, &opts)
  1433  }
  1434  
  1435  func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error {
  1436  	if timesAddr == 0 {
  1437  		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
  1438  		opts.Stat.Atime.Nsec = linux.UTIME_NOW
  1439  		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
  1440  		return nil
  1441  	}
  1442  	var times [2]linux.Timeval
  1443  	if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil {
  1444  		return err
  1445  	}
  1446  	if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 {
  1447  		return linuxerr.EINVAL
  1448  	}
  1449  	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
  1450  	opts.Stat.Atime = linux.StatxTimestamp{
  1451  		Sec:  times[0].Sec,
  1452  		Nsec: uint32(times[0].Usec * 1000),
  1453  	}
  1454  	opts.Stat.Mtime = linux.StatxTimestamp{
  1455  		Sec:  times[1].Sec,
  1456  		Nsec: uint32(times[1].Usec * 1000),
  1457  	}
  1458  	return nil
  1459  }
  1460  
  1461  // Utimensat implements Linux syscall utimensat(2).
  1462  func Utimensat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1463  	dirfd := args[0].Int()
  1464  	pathAddr := args[1].Pointer()
  1465  	timesAddr := args[2].Pointer()
  1466  	flags := args[3].Int()
  1467  
  1468  	// Linux requires that the UTIME_OMIT check occur before flags.
  1469  	var opts vfs.SetStatOptions
  1470  	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
  1471  		return 0, nil, err
  1472  	}
  1473  	if opts.Stat.Mask == 0 {
  1474  		return 0, nil, nil
  1475  	}
  1476  
  1477  	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
  1478  		return 0, nil, linuxerr.EINVAL
  1479  	}
  1480  
  1481  	return 0, nil, utimes(t, dirfd, pathAddr, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts)
  1482  }
  1483  
  1484  func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error {
  1485  	if timesAddr == 0 {
  1486  		opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
  1487  		opts.Stat.Atime.Nsec = linux.UTIME_NOW
  1488  		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
  1489  		return nil
  1490  	}
  1491  	var times [2]linux.Timespec
  1492  	if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil {
  1493  		return err
  1494  	}
  1495  	if times[0].Nsec != linux.UTIME_OMIT {
  1496  		if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) {
  1497  			return linuxerr.EINVAL
  1498  		}
  1499  		opts.Stat.Mask |= linux.STATX_ATIME
  1500  		opts.Stat.Atime = linux.StatxTimestamp{
  1501  			Sec:  times[0].Sec,
  1502  			Nsec: uint32(times[0].Nsec),
  1503  		}
  1504  	}
  1505  	if times[1].Nsec != linux.UTIME_OMIT {
  1506  		if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) {
  1507  			return linuxerr.EINVAL
  1508  		}
  1509  		opts.Stat.Mask |= linux.STATX_MTIME
  1510  		opts.Stat.Mtime = linux.StatxTimestamp{
  1511  			Sec:  times[1].Sec,
  1512  			Nsec: uint32(times[1].Nsec),
  1513  		}
  1514  	}
  1515  	return nil
  1516  }
  1517  
  1518  // Analogous to fs/utimes.c:do_utimes().
  1519  func utimes(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
  1520  	// "If filename is NULL and dfd refers to an open file, then operate on the
  1521  	// file. Otherwise look up filename, possibly using dfd as a starting
  1522  	// point." - fs/utimes.c:do_utimes()
  1523  	if dirfd != linux.AT_FDCWD && pathAddr == 0 {
  1524  		file := t.GetFile(dirfd)
  1525  		if file == nil {
  1526  			return linuxerr.EBADF
  1527  		}
  1528  		defer file.DecRef(t)
  1529  		return file.SetStat(t, *opts)
  1530  	}
  1531  
  1532  	path, err := copyInPath(t, pathAddr)
  1533  	if err != nil {
  1534  		return err
  1535  	}
  1536  	return setstatat(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink, opts)
  1537  }
  1538  
  1539  // Rename implements Linux syscall rename(2).
  1540  func Rename(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1541  	oldpathAddr := args[0].Pointer()
  1542  	newpathAddr := args[1].Pointer()
  1543  	return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */)
  1544  }
  1545  
  1546  // Renameat implements Linux syscall renameat(2).
  1547  func Renameat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1548  	olddirfd := args[0].Int()
  1549  	oldpathAddr := args[1].Pointer()
  1550  	newdirfd := args[2].Int()
  1551  	newpathAddr := args[3].Pointer()
  1552  	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */)
  1553  }
  1554  
  1555  // Renameat2 implements Linux syscall renameat2(2).
  1556  func Renameat2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1557  	olddirfd := args[0].Int()
  1558  	oldpathAddr := args[1].Pointer()
  1559  	newdirfd := args[2].Int()
  1560  	newpathAddr := args[3].Pointer()
  1561  	flags := args[4].Uint()
  1562  	return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
  1563  }
  1564  
  1565  func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error {
  1566  	oldpath, err := copyInPath(t, oldpathAddr)
  1567  	if err != nil {
  1568  		return err
  1569  	}
  1570  	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
  1571  	oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink)
  1572  	if err != nil {
  1573  		return err
  1574  	}
  1575  	defer oldtpop.Release(t)
  1576  
  1577  	newpath, err := copyInPath(t, newpathAddr)
  1578  	if err != nil {
  1579  		return err
  1580  	}
  1581  	newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink)
  1582  	if err != nil {
  1583  		return err
  1584  	}
  1585  	defer newtpop.Release(t)
  1586  
  1587  	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{
  1588  		Flags: flags,
  1589  	})
  1590  }
  1591  
  1592  // Fallocate implements linux system call fallocate(2).
  1593  func Fallocate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1594  	fd := args[0].Int()
  1595  	mode := args[1].Uint64()
  1596  	offset := args[2].Int64()
  1597  	length := args[3].Int64()
  1598  
  1599  	file := t.GetFile(fd)
  1600  	if file == nil {
  1601  		return 0, nil, linuxerr.EBADF
  1602  	}
  1603  	defer file.DecRef(t)
  1604  
  1605  	if !file.IsWritable() {
  1606  		return 0, nil, linuxerr.EBADF
  1607  	}
  1608  	if mode != 0 {
  1609  		return 0, nil, linuxerr.ENOTSUP
  1610  	}
  1611  	if offset < 0 || length <= 0 {
  1612  		return 0, nil, linuxerr.EINVAL
  1613  	}
  1614  
  1615  	size := offset + length
  1616  	if size < 0 {
  1617  		return 0, nil, linuxerr.EFBIG
  1618  	}
  1619  	limit := limits.FromContext(t).Get(limits.FileSize).Cur
  1620  	if uint64(size) >= limit {
  1621  		t.SendSignal(&linux.SignalInfo{
  1622  			Signo: int32(linux.SIGXFSZ),
  1623  			Code:  linux.SI_USER,
  1624  		})
  1625  		return 0, nil, linuxerr.EFBIG
  1626  	}
  1627  
  1628  	return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length))
  1629  }
  1630  
  1631  // Flock implements linux syscall flock(2).
  1632  func Flock(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1633  	fd := args[0].Int()
  1634  	operation := args[1].Int()
  1635  
  1636  	file := t.GetFile(fd)
  1637  	if file == nil {
  1638  		// flock(2): EBADF fd is not an open file descriptor.
  1639  		return 0, nil, linuxerr.EBADF
  1640  	}
  1641  	defer file.DecRef(t)
  1642  
  1643  	nonblocking := operation&linux.LOCK_NB != 0
  1644  	operation &^= linux.LOCK_NB
  1645  
  1646  	switch operation {
  1647  	case linux.LOCK_EX:
  1648  		if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, !nonblocking /* block */); err != nil {
  1649  			return 0, nil, err
  1650  		}
  1651  	case linux.LOCK_SH:
  1652  		if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, !nonblocking /* block */); err != nil {
  1653  			return 0, nil, err
  1654  		}
  1655  	case linux.LOCK_UN:
  1656  		if err := file.UnlockBSD(t); err != nil {
  1657  			return 0, nil, err
  1658  		}
  1659  	default:
  1660  		// flock(2): EINVAL operation is invalid.
  1661  		return 0, nil, linuxerr.EINVAL
  1662  	}
  1663  
  1664  	return 0, nil, nil
  1665  }
  1666  
  1667  const (
  1668  	memfdPrefix     = "memfd:"
  1669  	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
  1670  	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
  1671  )
  1672  
  1673  // MemfdCreate implements the linux syscall memfd_create(2).
  1674  func MemfdCreate(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1675  	addr := args[0].Pointer()
  1676  	flags := args[1].Uint()
  1677  
  1678  	if flags&^memfdAllFlags != 0 {
  1679  		// Unknown bits in flags.
  1680  		return 0, nil, linuxerr.EINVAL
  1681  	}
  1682  
  1683  	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
  1684  	cloExec := flags&linux.MFD_CLOEXEC != 0
  1685  
  1686  	name, err := t.CopyInString(addr, memfdMaxNameLen)
  1687  	if err != nil {
  1688  		return 0, nil, err
  1689  	}
  1690  
  1691  	shmMount := t.Kernel().ShmMount()
  1692  	file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name)
  1693  	if err != nil {
  1694  		return 0, nil, err
  1695  	}
  1696  	defer file.DecRef(t)
  1697  
  1698  	fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
  1699  		CloseOnExec: cloExec,
  1700  	})
  1701  	if err != nil {
  1702  		return 0, nil, err
  1703  	}
  1704  
  1705  	return uintptr(fd), nil, nil
  1706  }