github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/sys_thread.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"path"
    19  
    20  	"golang.org/x/sys/unix"
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    23  	"github.com/SagerNet/gvisor/pkg/hostarch"
    24  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/fsbridge"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/loader"
    31  	"github.com/SagerNet/gvisor/pkg/syserror"
    32  	"github.com/SagerNet/gvisor/pkg/usermem"
    33  )
    34  
    35  const (
    36  	// exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux.
    37  	exitSignalMask = 0xff
    38  )
    39  
    40  var (
    41  	// ExecMaxTotalSize is the maximum length of all argv and envv entries.
    42  	//
    43  	// N.B. The behavior here is different than Linux. Linux provides a limit on
    44  	// individual arguments of 32 pages, and an aggregate limit of at least 32 pages
    45  	// but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
    46  	// any behavior based on the stack size, and instead provide a fixed hard-limit of
    47  	// 2 MB (which should work well given that 8 MB stack limits are common).
    48  	ExecMaxTotalSize = 2 * 1024 * 1024
    49  
    50  	// ExecMaxElemSize is the maximum length of a single argv or envv entry.
    51  	ExecMaxElemSize = 32 * hostarch.PageSize
    52  )
    53  
    54  // Getppid implements linux syscall getppid(2).
    55  func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    56  	parent := t.Parent()
    57  	if parent == nil {
    58  		return 0, nil, nil
    59  	}
    60  	return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil
    61  }
    62  
    63  // Getpid implements linux syscall getpid(2).
    64  func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    65  	return uintptr(t.ThreadGroup().ID()), nil, nil
    66  }
    67  
    68  // Gettid implements linux syscall gettid(2).
    69  func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    70  	return uintptr(t.ThreadID()), nil, nil
    71  }
    72  
    73  // Execve implements linux syscall execve(2).
    74  func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    75  	filenameAddr := args[0].Pointer()
    76  	argvAddr := args[1].Pointer()
    77  	envvAddr := args[2].Pointer()
    78  
    79  	return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0)
    80  }
    81  
    82  // Execveat implements linux syscall execveat(2).
    83  func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    84  	dirFD := args[0].Int()
    85  	pathnameAddr := args[1].Pointer()
    86  	argvAddr := args[2].Pointer()
    87  	envvAddr := args[3].Pointer()
    88  	flags := args[4].Int()
    89  
    90  	return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags)
    91  }
    92  
    93  func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
    94  	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
    95  	if err != nil {
    96  		return 0, nil, err
    97  	}
    98  
    99  	var argv, envv []string
   100  	if argvAddr != 0 {
   101  		var err error
   102  		argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize)
   103  		if err != nil {
   104  			return 0, nil, err
   105  		}
   106  	}
   107  	if envvAddr != 0 {
   108  		var err error
   109  		envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize)
   110  		if err != nil {
   111  			return 0, nil, err
   112  		}
   113  	}
   114  
   115  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
   116  		return 0, nil, linuxerr.EINVAL
   117  	}
   118  	atEmptyPath := flags&linux.AT_EMPTY_PATH != 0
   119  	if !atEmptyPath && len(pathname) == 0 {
   120  		return 0, nil, syserror.ENOENT
   121  	}
   122  	resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0
   123  
   124  	root := t.FSContext().RootDirectory()
   125  	defer root.DecRef(t)
   126  
   127  	var wd *fs.Dirent
   128  	var executable fsbridge.File
   129  	var closeOnExec bool
   130  	if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) {
   131  		// Even if the pathname is absolute, we may still need the wd
   132  		// for interpreter scripts if the path of the interpreter is
   133  		// relative.
   134  		wd = t.FSContext().WorkingDirectory()
   135  	} else {
   136  		// Need to extract the given FD.
   137  		f, fdFlags := t.FDTable().Get(dirFD)
   138  		if f == nil {
   139  			return 0, nil, linuxerr.EBADF
   140  		}
   141  		defer f.DecRef(t)
   142  		closeOnExec = fdFlags.CloseOnExec
   143  
   144  		if atEmptyPath && len(pathname) == 0 {
   145  			// TODO(github.com/SagerNet/issue/160): Linux requires only execute permission,
   146  			// not read. However, our backing filesystems may prevent us from reading
   147  			// the file without read permission. Additionally, a task with a
   148  			// non-readable executable has additional constraints on access via
   149  			// ptrace and procfs.
   150  			if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil {
   151  				return 0, nil, err
   152  			}
   153  			executable = fsbridge.NewFSFile(f)
   154  		} else {
   155  			wd = f.Dirent
   156  			wd.IncRef()
   157  			if !fs.IsDir(wd.Inode.StableAttr) {
   158  				return 0, nil, syserror.ENOTDIR
   159  			}
   160  		}
   161  	}
   162  	if wd != nil {
   163  		defer wd.DecRef(t)
   164  	}
   165  
   166  	// Load the new TaskImage.
   167  	remainingTraversals := uint(linux.MaxSymlinkTraversals)
   168  	loadArgs := loader.LoadArgs{
   169  		Opener:              fsbridge.NewFSLookup(t.MountNamespace(), root, wd),
   170  		RemainingTraversals: &remainingTraversals,
   171  		ResolveFinal:        resolveFinal,
   172  		Filename:            pathname,
   173  		File:                executable,
   174  		CloseOnExec:         closeOnExec,
   175  		Argv:                argv,
   176  		Envv:                envv,
   177  		Features:            t.Arch().FeatureSet(),
   178  	}
   179  
   180  	image, se := t.Kernel().LoadTaskImage(t, loadArgs)
   181  	if se != nil {
   182  		return 0, nil, se.ToError()
   183  	}
   184  
   185  	ctrl, err := t.Execve(image)
   186  	return 0, ctrl, err
   187  }
   188  
   189  // Exit implements linux syscall exit(2).
   190  func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   191  	status := int(args[0].Int())
   192  	t.PrepareExit(kernel.ExitStatus{Code: status})
   193  	return 0, kernel.CtrlDoExit, nil
   194  }
   195  
   196  // ExitGroup implements linux syscall exit_group(2).
   197  func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   198  	status := int(args[0].Int())
   199  	t.PrepareGroupExit(kernel.ExitStatus{Code: status})
   200  	return 0, kernel.CtrlDoExit, nil
   201  }
   202  
   203  // clone is used by Clone, Fork, and VFork.
   204  func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) {
   205  	opts := kernel.CloneOptions{
   206  		SharingOptions: kernel.SharingOptions{
   207  			NewAddressSpace:     flags&linux.CLONE_VM == 0,
   208  			NewSignalHandlers:   flags&linux.CLONE_SIGHAND == 0,
   209  			NewThreadGroup:      flags&linux.CLONE_THREAD == 0,
   210  			TerminationSignal:   linux.Signal(flags & exitSignalMask),
   211  			NewPIDNamespace:     flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
   212  			NewUserNamespace:    flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
   213  			NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
   214  			NewFiles:            flags&linux.CLONE_FILES == 0,
   215  			NewFSContext:        flags&linux.CLONE_FS == 0,
   216  			NewUTSNamespace:     flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
   217  			NewIPCNamespace:     flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
   218  		},
   219  		Stack:         stack,
   220  		SetTLS:        flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS,
   221  		TLS:           tls,
   222  		ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID,
   223  		ChildSetTID:   flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID,
   224  		ChildTID:      childTID,
   225  		ParentSetTID:  flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID,
   226  		ParentTID:     parentTID,
   227  		Vfork:         flags&linux.CLONE_VFORK == linux.CLONE_VFORK,
   228  		Untraced:      flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED,
   229  		InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE,
   230  	}
   231  	ntid, ctrl, err := t.Clone(&opts)
   232  	return uintptr(ntid), ctrl, err
   233  }
   234  
   235  // Fork implements Linux syscall fork(2).
   236  func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   237  	// "A call to fork() is equivalent to a call to clone(2) specifying flags
   238  	// as just SIGCHLD." - fork(2)
   239  	return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0)
   240  }
   241  
   242  // Vfork implements Linux syscall vfork(2).
   243  func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   244  	// """
   245  	// A call to vfork() is equivalent to calling clone(2) with flags specified as:
   246  	//
   247  	//     CLONE_VM | CLONE_VFORK | SIGCHLD
   248  	// """ - vfork(2)
   249  	return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0)
   250  }
   251  
   252  // parseCommonWaitOptions applies the options common to wait4 and waitid to
   253  // wopts.
   254  func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
   255  	switch options & (linux.WCLONE | linux.WALL) {
   256  	case 0:
   257  		wopts.NonCloneTasks = true
   258  	case linux.WCLONE:
   259  		wopts.CloneTasks = true
   260  	case linux.WALL:
   261  		wopts.NonCloneTasks = true
   262  		wopts.CloneTasks = true
   263  	default:
   264  		return linuxerr.EINVAL
   265  	}
   266  	if options&linux.WCONTINUED != 0 {
   267  		wopts.Events |= kernel.EventGroupContinue
   268  	}
   269  	if options&linux.WNOHANG == 0 {
   270  		wopts.BlockInterruptErr = syserror.ERESTARTSYS
   271  	}
   272  	if options&linux.WNOTHREAD == 0 {
   273  		wopts.SiblingChildren = true
   274  	}
   275  	return nil
   276  }
   277  
   278  // wait4 waits for the given child process to exit.
   279  func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) {
   280  	if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
   281  		return 0, linuxerr.EINVAL
   282  	}
   283  	wopts := kernel.WaitOptions{
   284  		Events:       kernel.EventExit | kernel.EventTraceeStop,
   285  		ConsumeEvent: true,
   286  	}
   287  	// There are four cases to consider:
   288  	//
   289  	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
   290  	// pid == -1   any child process
   291  	// pid == 0    any child process whose process group ID is equal to that of the calling process
   292  	// pid > 0     the child whose process ID is equal to the value of pid
   293  	switch {
   294  	case pid < -1:
   295  		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
   296  	case pid == -1:
   297  		// Any process is the default.
   298  	case pid == 0:
   299  		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
   300  	default:
   301  		wopts.SpecificTID = kernel.ThreadID(pid)
   302  	}
   303  
   304  	if err := parseCommonWaitOptions(&wopts, options); err != nil {
   305  		return 0, err
   306  	}
   307  	if options&linux.WUNTRACED != 0 {
   308  		wopts.Events |= kernel.EventChildGroupStop
   309  	}
   310  
   311  	wr, err := t.Wait(&wopts)
   312  	if err != nil {
   313  		if err == kernel.ErrNoWaitableEvent {
   314  			return 0, nil
   315  		}
   316  		return 0, err
   317  	}
   318  	if statusAddr != 0 {
   319  		if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil {
   320  			return 0, err
   321  		}
   322  	}
   323  	if rusageAddr != 0 {
   324  		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
   325  		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
   326  			return 0, err
   327  		}
   328  	}
   329  	return uintptr(wr.TID), nil
   330  }
   331  
   332  // Wait4 implements linux syscall wait4(2).
   333  func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   334  	pid := int(args[0].Int())
   335  	statusAddr := args[1].Pointer()
   336  	options := int(args[2].Uint())
   337  	rusageAddr := args[3].Pointer()
   338  
   339  	n, err := wait4(t, pid, statusAddr, options, rusageAddr)
   340  	return n, nil, err
   341  }
   342  
   343  // WaitPid implements linux syscall waitpid(2).
   344  func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   345  	pid := int(args[0].Int())
   346  	statusAddr := args[1].Pointer()
   347  	options := int(args[2].Uint())
   348  
   349  	n, err := wait4(t, pid, statusAddr, options, 0)
   350  	return n, nil, err
   351  }
   352  
   353  // Waitid implements linux syscall waitid(2).
   354  func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   355  	idtype := args[0].Int()
   356  	id := args[1].Int()
   357  	infop := args[2].Pointer()
   358  	options := int(args[3].Uint())
   359  	rusageAddr := args[4].Pointer()
   360  
   361  	if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
   362  		return 0, nil, linuxerr.EINVAL
   363  	}
   364  	if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
   365  		return 0, nil, linuxerr.EINVAL
   366  	}
   367  	wopts := kernel.WaitOptions{
   368  		Events:       kernel.EventTraceeStop,
   369  		ConsumeEvent: options&linux.WNOWAIT == 0,
   370  	}
   371  	switch idtype {
   372  	case linux.P_ALL:
   373  	case linux.P_PID:
   374  		wopts.SpecificTID = kernel.ThreadID(id)
   375  	case linux.P_PGID:
   376  		wopts.SpecificPGID = kernel.ProcessGroupID(id)
   377  	default:
   378  		return 0, nil, linuxerr.EINVAL
   379  	}
   380  
   381  	if err := parseCommonWaitOptions(&wopts, options); err != nil {
   382  		return 0, nil, err
   383  	}
   384  	if options&linux.WEXITED != 0 {
   385  		wopts.Events |= kernel.EventExit
   386  	}
   387  	if options&linux.WSTOPPED != 0 {
   388  		wopts.Events |= kernel.EventChildGroupStop
   389  	}
   390  
   391  	wr, err := t.Wait(&wopts)
   392  	if err != nil {
   393  		if err == kernel.ErrNoWaitableEvent {
   394  			err = nil
   395  			// "If WNOHANG was specified in options and there were no children
   396  			// in a waitable state, then waitid() returns 0 immediately and the
   397  			// state of the siginfo_t structure pointed to by infop is
   398  			// unspecified." - waitid(2). But Linux's waitid actually zeroes
   399  			// out the fields it would set for a successful waitid in this case
   400  			// as well.
   401  			if infop != 0 {
   402  				var si linux.SignalInfo
   403  				_, err = si.CopyOut(t, infop)
   404  			}
   405  		}
   406  		return 0, nil, err
   407  	}
   408  	if rusageAddr != 0 {
   409  		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
   410  		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
   411  			return 0, nil, err
   412  		}
   413  	}
   414  	if infop == 0 {
   415  		return 0, nil, nil
   416  	}
   417  	si := linux.SignalInfo{
   418  		Signo: int32(linux.SIGCHLD),
   419  	}
   420  	si.SetPID(int32(wr.TID))
   421  	si.SetUID(int32(wr.UID))
   422  	// TODO(b/73541790): convert kernel.ExitStatus to functions and make
   423  	// WaitResult.Status a linux.WaitStatus.
   424  	s := unix.WaitStatus(wr.Status)
   425  	switch {
   426  	case s.Exited():
   427  		si.Code = linux.CLD_EXITED
   428  		si.SetStatus(int32(s.ExitStatus()))
   429  	case s.Signaled():
   430  		si.Code = linux.CLD_KILLED
   431  		si.SetStatus(int32(s.Signal()))
   432  	case s.CoreDump():
   433  		si.Code = linux.CLD_DUMPED
   434  		si.SetStatus(int32(s.Signal()))
   435  	case s.Stopped():
   436  		if wr.Event == kernel.EventTraceeStop {
   437  			si.Code = linux.CLD_TRAPPED
   438  			si.SetStatus(int32(s.TrapCause()))
   439  		} else {
   440  			si.Code = linux.CLD_STOPPED
   441  			si.SetStatus(int32(s.StopSignal()))
   442  		}
   443  	case s.Continued():
   444  		si.Code = linux.CLD_CONTINUED
   445  		si.SetStatus(int32(linux.SIGCONT))
   446  	default:
   447  		t.Warningf("waitid got incomprehensible wait status %d", s)
   448  	}
   449  	_, err = si.CopyOut(t, infop)
   450  	return 0, nil, err
   451  }
   452  
   453  // SetTidAddress implements linux syscall set_tid_address(2).
   454  func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   455  	addr := args[0].Pointer()
   456  
   457  	// Always succeed, return caller's tid.
   458  	t.SetClearTID(addr)
   459  	return uintptr(t.ThreadID()), nil, nil
   460  }
   461  
   462  // Unshare implements linux syscall unshare(2).
   463  func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   464  	flags := args[0].Int()
   465  	opts := kernel.SharingOptions{
   466  		NewAddressSpace:     flags&linux.CLONE_VM == linux.CLONE_VM,
   467  		NewSignalHandlers:   flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND,
   468  		NewThreadGroup:      flags&linux.CLONE_THREAD == linux.CLONE_THREAD,
   469  		NewPIDNamespace:     flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID,
   470  		NewUserNamespace:    flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER,
   471  		NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET,
   472  		NewFiles:            flags&linux.CLONE_FILES == linux.CLONE_FILES,
   473  		NewFSContext:        flags&linux.CLONE_FS == linux.CLONE_FS,
   474  		NewUTSNamespace:     flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS,
   475  		NewIPCNamespace:     flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC,
   476  	}
   477  	// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
   478  	if opts.NewPIDNamespace {
   479  		opts.NewThreadGroup = true
   480  	}
   481  	// "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
   482  	// Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
   483  	if opts.NewUserNamespace {
   484  		opts.NewThreadGroup = true
   485  		opts.NewFSContext = true
   486  	}
   487  	return 0, nil, t.Unshare(&opts)
   488  }
   489  
   490  // SchedYield implements linux syscall sched_yield(2).
   491  func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   492  	t.Yield()
   493  	return 0, nil, nil
   494  }
   495  
   496  // SchedSetaffinity implements linux syscall sched_setaffinity(2).
   497  func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   498  	tid := args[0].Int()
   499  	size := args[1].SizeT()
   500  	maskAddr := args[2].Pointer()
   501  
   502  	var task *kernel.Task
   503  	if tid == 0 {
   504  		task = t
   505  	} else {
   506  		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
   507  		if task == nil {
   508  			return 0, nil, syserror.ESRCH
   509  		}
   510  	}
   511  
   512  	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
   513  	if size > mask.Size() {
   514  		size = mask.Size()
   515  	}
   516  	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
   517  		return 0, nil, err
   518  	}
   519  	return 0, nil, task.SetCPUMask(mask)
   520  }
   521  
   522  // SchedGetaffinity implements linux syscall sched_getaffinity(2).
   523  func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   524  	tid := args[0].Int()
   525  	size := args[1].SizeT()
   526  	maskAddr := args[2].Pointer()
   527  
   528  	// This limitation is because linux stores the cpumask
   529  	// in an array of "unsigned long" so the buffer needs to
   530  	// be a multiple of the word size.
   531  	if size&(t.Arch().Width()-1) > 0 {
   532  		return 0, nil, linuxerr.EINVAL
   533  	}
   534  
   535  	var task *kernel.Task
   536  	if tid == 0 {
   537  		task = t
   538  	} else {
   539  		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
   540  		if task == nil {
   541  			return 0, nil, syserror.ESRCH
   542  		}
   543  	}
   544  
   545  	mask := task.CPUMask()
   546  	// The buffer needs to be big enough to hold a cpumask with
   547  	// all possible cpus.
   548  	if size < mask.Size() {
   549  		return 0, nil, linuxerr.EINVAL
   550  	}
   551  	_, err := t.CopyOutBytes(maskAddr, mask)
   552  
   553  	// NOTE: The syscall interface is slightly different than the glibc
   554  	// interface. The raw sched_getaffinity syscall returns the number of
   555  	// bytes used to represent a cpu mask.
   556  	return uintptr(mask.Size()), nil, err
   557  }
   558  
   559  // Getcpu implements linux syscall getcpu(2).
   560  func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   561  	cpu := args[0].Pointer()
   562  	node := args[1].Pointer()
   563  	// third argument to this system call is nowadays unused.
   564  
   565  	if cpu != 0 {
   566  		if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
   567  			return 0, nil, err
   568  		}
   569  	}
   570  	// We always return node 0.
   571  	if node != 0 {
   572  		if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{
   573  			AddressSpaceActive: true,
   574  		}); err != nil {
   575  			return 0, nil, err
   576  		}
   577  	}
   578  	return 0, nil, nil
   579  }
   580  
   581  // Setpgid implements the linux syscall setpgid(2).
   582  func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   583  	// Note that throughout this function, pgid is interpreted with respect
   584  	// to t's namespace, not with respect to the selected ThreadGroup's
   585  	// namespace (which may be different).
   586  	pid := kernel.ThreadID(args[0].Int())
   587  	pgid := kernel.ProcessGroupID(args[1].Int())
   588  
   589  	// "If pid is zero, then the process ID of the calling process is used."
   590  	tg := t.ThreadGroup()
   591  	if pid != 0 {
   592  		ot := t.PIDNamespace().TaskWithID(pid)
   593  		if ot == nil {
   594  			return 0, nil, syserror.ESRCH
   595  		}
   596  		tg = ot.ThreadGroup()
   597  		if tg.Leader() != ot {
   598  			return 0, nil, linuxerr.EINVAL
   599  		}
   600  
   601  		// Setpgid only operates on child threadgroups.
   602  		if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
   603  			return 0, nil, syserror.ESRCH
   604  		}
   605  	}
   606  
   607  	// "If pgid is zero, then the PGID of the process specified by pid is made
   608  	// the same as its process ID."
   609  	defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
   610  	if pgid == 0 {
   611  		pgid = defaultPGID
   612  	} else if pgid < 0 {
   613  		return 0, nil, linuxerr.EINVAL
   614  	}
   615  
   616  	// If the pgid is the same as the group, then create a new one. Otherwise,
   617  	// we attempt to join an existing process group.
   618  	if pgid == defaultPGID {
   619  		// For convenience, errors line up with Linux syscall API.
   620  		if err := tg.CreateProcessGroup(); err != nil {
   621  			// Is the process group already as expected? If so,
   622  			// just return success. This is the same behavior as
   623  			// Linux.
   624  			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
   625  				return 0, nil, nil
   626  			}
   627  			return 0, nil, err
   628  		}
   629  	} else {
   630  		// Same as CreateProcessGroup, above.
   631  		if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
   632  			// See above.
   633  			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
   634  				return 0, nil, nil
   635  			}
   636  			return 0, nil, err
   637  		}
   638  	}
   639  
   640  	// Success.
   641  	return 0, nil, nil
   642  }
   643  
   644  // Getpgrp implements the linux syscall getpgrp(2).
   645  func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   646  	return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
   647  }
   648  
   649  // Getpgid implements the linux syscall getpgid(2).
   650  func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   651  	tid := kernel.ThreadID(args[0].Int())
   652  	if tid == 0 {
   653  		return Getpgrp(t, args)
   654  	}
   655  
   656  	target := t.PIDNamespace().TaskWithID(tid)
   657  	if target == nil {
   658  		return 0, nil, syserror.ESRCH
   659  	}
   660  
   661  	return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
   662  }
   663  
   664  // Setsid implements the linux syscall setsid(2).
   665  func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   666  	return 0, nil, t.ThreadGroup().CreateSession()
   667  }
   668  
   669  // Getsid implements the linux syscall getsid(2).
   670  func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   671  	tid := kernel.ThreadID(args[0].Int())
   672  	if tid == 0 {
   673  		return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil
   674  	}
   675  
   676  	target := t.PIDNamespace().TaskWithID(tid)
   677  	if target == nil {
   678  		return 0, nil, syserror.ESRCH
   679  	}
   680  
   681  	return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil
   682  }
   683  
   684  // Getpriority pretends to implement the linux syscall getpriority(2).
   685  //
   686  // This is a stub; real priorities require a full scheduler.
   687  func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   688  	which := args[0].Int()
   689  	who := kernel.ThreadID(args[1].Int())
   690  
   691  	switch which {
   692  	case linux.PRIO_PROCESS:
   693  		// Look for who, return ESRCH if not found.
   694  		var task *kernel.Task
   695  		if who == 0 {
   696  			task = t
   697  		} else {
   698  			task = t.PIDNamespace().TaskWithID(who)
   699  		}
   700  
   701  		if task == nil {
   702  			return 0, nil, syserror.ESRCH
   703  		}
   704  
   705  		// From kernel/sys.c:getpriority:
   706  		// "To avoid negative return values, 'getpriority()'
   707  		// will not return the normal nice-value, but a negated
   708  		// value that has been offset by 20"
   709  		return uintptr(20 - task.Niceness()), nil, nil
   710  	case linux.PRIO_USER:
   711  		fallthrough
   712  	case linux.PRIO_PGRP:
   713  		// PRIO_USER and PRIO_PGRP have no further implementation yet.
   714  		return 0, nil, nil
   715  	default:
   716  		return 0, nil, linuxerr.EINVAL
   717  	}
   718  }
   719  
   720  // Setpriority pretends to implement the linux syscall setpriority(2).
   721  //
   722  // This is a stub; real priorities require a full scheduler.
   723  func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   724  	which := args[0].Int()
   725  	who := kernel.ThreadID(args[1].Int())
   726  	niceval := int(args[2].Int())
   727  
   728  	// In the kernel's implementation, values outside the range
   729  	// of [-20, 19] are truncated to these minimum and maximum
   730  	// values.
   731  	if niceval < -20 /* min niceval */ {
   732  		niceval = -20
   733  	} else if niceval > 19 /* max niceval */ {
   734  		niceval = 19
   735  	}
   736  
   737  	switch which {
   738  	case linux.PRIO_PROCESS:
   739  		// Look for who, return ESRCH if not found.
   740  		var task *kernel.Task
   741  		if who == 0 {
   742  			task = t
   743  		} else {
   744  			task = t.PIDNamespace().TaskWithID(who)
   745  		}
   746  
   747  		if task == nil {
   748  			return 0, nil, syserror.ESRCH
   749  		}
   750  
   751  		task.SetNiceness(niceval)
   752  	case linux.PRIO_USER:
   753  		fallthrough
   754  	case linux.PRIO_PGRP:
   755  		// PRIO_USER and PRIO_PGRP have no further implementation yet.
   756  		return 0, nil, nil
   757  	default:
   758  		return 0, nil, linuxerr.EINVAL
   759  	}
   760  
   761  	return 0, nil, nil
   762  }
   763  
   764  // Ptrace implements linux system call ptrace(2).
   765  func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   766  	req := args[0].Int64()
   767  	pid := kernel.ThreadID(args[1].Int())
   768  	addr := args[2].Pointer()
   769  	data := args[3].Pointer()
   770  
   771  	return 0, nil, t.Ptrace(req, pid, addr, data)
   772  }