github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/syscalls/linux/sys_thread.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    19  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    20  	"github.com/MerlinKodo/gvisor/pkg/fspath"
    21  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    22  	"github.com/MerlinKodo/gvisor/pkg/marshal/primitive"
    23  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    24  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    25  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/sched"
    26  	"github.com/MerlinKodo/gvisor/pkg/sentry/loader"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    29  	"github.com/MerlinKodo/gvisor/pkg/usermem"
    30  )
    31  
    32  var (
    33  	// ExecMaxTotalSize is the maximum length of all argv and envv entries.
    34  	//
    35  	// N.B. The behavior here is different than Linux. Linux provides a limit on
    36  	// individual arguments of 32 pages, and an aggregate limit of at least 32 pages
    37  	// but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
    38  	// any behavior based on the stack size, and instead provide a fixed hard-limit of
    39  	// 2 MB (which should work well given that 8 MB stack limits are common).
    40  	ExecMaxTotalSize = 2 * 1024 * 1024
    41  
    42  	// ExecMaxElemSize is the maximum length of a single argv or envv entry.
    43  	ExecMaxElemSize = 32 * hostarch.PageSize
    44  )
    45  
    46  // Getppid implements linux syscall getppid(2).
    47  func Getppid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    48  	parent := t.Parent()
    49  	if parent == nil {
    50  		return 0, nil, nil
    51  	}
    52  	return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil
    53  }
    54  
    55  // Getpid implements linux syscall getpid(2).
    56  func Getpid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    57  	return uintptr(t.ThreadGroup().ID()), nil, nil
    58  }
    59  
    60  // Gettid implements linux syscall gettid(2).
    61  func Gettid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    62  	return uintptr(t.ThreadID()), nil, nil
    63  }
    64  
    65  // Execve implements linux syscall execve(2).
    66  func Execve(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    67  	pathnameAddr := args[0].Pointer()
    68  	argvAddr := args[1].Pointer()
    69  	envvAddr := args[2].Pointer()
    70  	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
    71  }
    72  
    73  // Execveat implements linux syscall execveat(2).
    74  func Execveat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    75  	dirfd := args[0].Int()
    76  	pathnameAddr := args[1].Pointer()
    77  	argvAddr := args[2].Pointer()
    78  	envvAddr := args[3].Pointer()
    79  	flags := args[4].Int()
    80  	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
    81  }
    82  
    83  func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
    84  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
    85  		return 0, nil, linuxerr.EINVAL
    86  	}
    87  
    88  	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
    89  	if err != nil {
    90  		return 0, nil, err
    91  	}
    92  	var argv, envv []string
    93  	if argvAddr != 0 {
    94  		var err error
    95  		argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize)
    96  		if err != nil {
    97  			return 0, nil, err
    98  		}
    99  	}
   100  	if envvAddr != 0 {
   101  		var err error
   102  		envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize)
   103  		if err != nil {
   104  			return 0, nil, err
   105  		}
   106  	}
   107  
   108  	root := t.FSContext().RootDirectory()
   109  	defer root.DecRef(t)
   110  	var executable *vfs.FileDescription
   111  	defer func() {
   112  		if executable != nil {
   113  			executable.DecRef(t)
   114  		}
   115  	}()
   116  	closeOnExec := false
   117  	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
   118  		// We must open the executable ourselves since dirfd is used as the
   119  		// starting point while resolving path, but the task working directory
   120  		// is used as the starting point while resolving interpreters (Linux:
   121  		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
   122  		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
   123  		// incapable of handling this correctly.
   124  		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
   125  			return 0, nil, linuxerr.ENOENT
   126  		}
   127  		dirfile, dirfileFlags := t.FDTable().Get(dirfd)
   128  		if dirfile == nil {
   129  			return 0, nil, linuxerr.EBADF
   130  		}
   131  		start := dirfile.VirtualDentry()
   132  		start.IncRef()
   133  		dirfile.DecRef(t)
   134  		closeOnExec = dirfileFlags.CloseOnExec
   135  		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
   136  			Root:               root,
   137  			Start:              start,
   138  			Path:               path,
   139  			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
   140  		}, &vfs.OpenOptions{
   141  			Flags:    linux.O_RDONLY,
   142  			FileExec: true,
   143  		})
   144  		start.DecRef(t)
   145  		if err != nil {
   146  			return 0, nil, err
   147  		}
   148  		executable = file
   149  		pathname = executable.MappedName(t)
   150  	}
   151  
   152  	// Load the new TaskImage.
   153  	wd := t.FSContext().WorkingDirectory()
   154  	defer wd.DecRef(t)
   155  	remainingTraversals := uint(linux.MaxSymlinkTraversals)
   156  	loadArgs := loader.LoadArgs{
   157  		Root:                root,
   158  		WorkingDir:          wd,
   159  		RemainingTraversals: &remainingTraversals,
   160  		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
   161  		Filename:            pathname,
   162  		File:                executable,
   163  		CloseOnExec:         closeOnExec,
   164  		Argv:                argv,
   165  		Envv:                envv,
   166  		Features:            t.Kernel().FeatureSet(),
   167  	}
   168  	if seccheck.Global.Enabled(seccheck.PointExecve) {
   169  		// Retain the first executable file that is opened (which may open
   170  		// multiple executable files while resolving interpreter scripts).
   171  		if executable == nil {
   172  			loadArgs.AfterOpen = func(f *vfs.FileDescription) {
   173  				if executable == nil {
   174  					f.IncRef()
   175  					executable = f
   176  					pathname = executable.MappedName(t)
   177  				}
   178  			}
   179  		}
   180  	}
   181  
   182  	image, se := t.Kernel().LoadTaskImage(t, loadArgs)
   183  	if se != nil {
   184  		return 0, nil, se.ToError()
   185  	}
   186  
   187  	ctrl, err := t.Execve(image, argv, envv, executable, pathname)
   188  	return 0, ctrl, err
   189  }
   190  
   191  // Exit implements linux syscall exit(2).
   192  func Exit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   193  	status := args[0].Int()
   194  	t.PrepareExit(linux.WaitStatusExit(status & 0xff))
   195  	return 0, kernel.CtrlDoExit, nil
   196  }
   197  
   198  // ExitGroup implements linux syscall exit_group(2).
   199  func ExitGroup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   200  	status := args[0].Int()
   201  	t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff))
   202  	return 0, kernel.CtrlDoExit, nil
   203  }
   204  
   205  // clone is used by Clone, Fork, and VFork.
   206  func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) {
   207  	args := linux.CloneArgs{
   208  		Flags:      uint64(uint32(flags) &^ linux.CSIGNAL),
   209  		ChildTID:   uint64(childTID),
   210  		ParentTID:  uint64(parentTID),
   211  		ExitSignal: uint64(flags & linux.CSIGNAL),
   212  		Stack:      uint64(stack),
   213  		TLS:        uint64(tls),
   214  	}
   215  	ntid, ctrl, err := t.Clone(&args)
   216  	return uintptr(ntid), ctrl, err
   217  }
   218  
   219  // Fork implements Linux syscall fork(2).
   220  func Fork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   221  	// "A call to fork() is equivalent to a call to clone(2) specifying flags
   222  	// as just SIGCHLD." - fork(2)
   223  	return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0)
   224  }
   225  
   226  // Vfork implements Linux syscall vfork(2).
   227  func Vfork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   228  	// """
   229  	// A call to vfork() is equivalent to calling clone(2) with flags specified as:
   230  	//
   231  	//     CLONE_VM | CLONE_VFORK | SIGCHLD
   232  	// """ - vfork(2)
   233  	return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0)
   234  }
   235  
   236  // Clone3 implements linux syscall clone3(2).
   237  func Clone3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   238  	cloneArgsPointer := args[0].Pointer()
   239  	size := args[1].SizeT()
   240  
   241  	if int(size) < linux.CLONE_ARGS_SIZE_VER0 || int(size) > linux.CLONE_ARGS_SIZE_VER2 {
   242  		return 0, nil, linuxerr.EINVAL
   243  	}
   244  
   245  	var cloneArgs linux.CloneArgs
   246  	if cloneArgsPointer != 0 {
   247  		if _, err := cloneArgs.CopyInN(t, cloneArgsPointer, int(size)); err != nil {
   248  			return 0, nil, err
   249  		}
   250  	}
   251  
   252  	ntid, ctrl, err := t.Clone(&cloneArgs)
   253  	if err != nil {
   254  		return 0, nil, err
   255  	}
   256  	return uintptr(ntid), ctrl, err
   257  }
   258  
   259  // parseCommonWaitOptions applies the options common to wait4 and waitid to
   260  // wopts.
   261  func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
   262  	switch options & (linux.WCLONE | linux.WALL) {
   263  	case 0:
   264  		wopts.NonCloneTasks = true
   265  	case linux.WCLONE:
   266  		wopts.CloneTasks = true
   267  	case linux.WALL:
   268  		wopts.NonCloneTasks = true
   269  		wopts.CloneTasks = true
   270  	default:
   271  		return linuxerr.EINVAL
   272  	}
   273  	if options&linux.WCONTINUED != 0 {
   274  		wopts.Events |= kernel.EventGroupContinue
   275  	}
   276  	if options&linux.WNOHANG == 0 {
   277  		wopts.BlockInterruptErr = linuxerr.ERESTARTSYS
   278  	}
   279  	if options&linux.WNOTHREAD == 0 {
   280  		wopts.SiblingChildren = true
   281  	}
   282  	return nil
   283  }
   284  
   285  // wait4 waits for the given child process to exit.
   286  func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) {
   287  	if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
   288  		return 0, linuxerr.EINVAL
   289  	}
   290  	wopts := kernel.WaitOptions{
   291  		Events:       kernel.EventExit | kernel.EventTraceeStop,
   292  		ConsumeEvent: true,
   293  	}
   294  	// There are four cases to consider:
   295  	//
   296  	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
   297  	// pid == -1   any child process
   298  	// pid == 0    any child process whose process group ID is equal to that of the calling process
   299  	// pid > 0     the child whose process ID is equal to the value of pid
   300  	switch {
   301  	case pid < -1:
   302  		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
   303  	case pid == -1:
   304  		// Any process is the default.
   305  	case pid == 0:
   306  		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
   307  	default:
   308  		wopts.SpecificTID = kernel.ThreadID(pid)
   309  	}
   310  
   311  	if err := parseCommonWaitOptions(&wopts, options); err != nil {
   312  		return 0, err
   313  	}
   314  	if options&linux.WUNTRACED != 0 {
   315  		wopts.Events |= kernel.EventChildGroupStop
   316  	}
   317  
   318  	wr, err := t.Wait(&wopts)
   319  	if err != nil {
   320  		if err == kernel.ErrNoWaitableEvent {
   321  			return 0, nil
   322  		}
   323  		return 0, err
   324  	}
   325  	if statusAddr != 0 {
   326  		if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil {
   327  			return 0, err
   328  		}
   329  	}
   330  	if rusageAddr != 0 {
   331  		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
   332  		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
   333  			return 0, err
   334  		}
   335  	}
   336  	return uintptr(wr.TID), nil
   337  }
   338  
   339  // Wait4 implements linux syscall wait4(2).
   340  func Wait4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   341  	pid := int(args[0].Int())
   342  	statusAddr := args[1].Pointer()
   343  	options := int(args[2].Uint())
   344  	rusageAddr := args[3].Pointer()
   345  
   346  	n, err := wait4(t, pid, statusAddr, options, rusageAddr)
   347  	return n, nil, err
   348  }
   349  
   350  // WaitPid implements linux syscall waitpid(2).
   351  func WaitPid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   352  	pid := int(args[0].Int())
   353  	statusAddr := args[1].Pointer()
   354  	options := int(args[2].Uint())
   355  
   356  	n, err := wait4(t, pid, statusAddr, options, 0)
   357  	return n, nil, err
   358  }
   359  
   360  // Waitid implements linux syscall waitid(2).
   361  func Waitid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   362  	idtype := args[0].Int()
   363  	id := args[1].Int()
   364  	infop := args[2].Pointer()
   365  	options := int(args[3].Uint())
   366  	rusageAddr := args[4].Pointer()
   367  
   368  	if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
   369  		return 0, nil, linuxerr.EINVAL
   370  	}
   371  	if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
   372  		return 0, nil, linuxerr.EINVAL
   373  	}
   374  	wopts := kernel.WaitOptions{
   375  		Events:       kernel.EventTraceeStop,
   376  		ConsumeEvent: options&linux.WNOWAIT == 0,
   377  	}
   378  	switch idtype {
   379  	case linux.P_ALL:
   380  	case linux.P_PID:
   381  		wopts.SpecificTID = kernel.ThreadID(id)
   382  	case linux.P_PGID:
   383  		wopts.SpecificPGID = kernel.ProcessGroupID(id)
   384  	default:
   385  		return 0, nil, linuxerr.EINVAL
   386  	}
   387  
   388  	if err := parseCommonWaitOptions(&wopts, options); err != nil {
   389  		return 0, nil, err
   390  	}
   391  	if options&linux.WEXITED != 0 {
   392  		wopts.Events |= kernel.EventExit
   393  	}
   394  	if options&linux.WSTOPPED != 0 {
   395  		wopts.Events |= kernel.EventChildGroupStop
   396  	}
   397  
   398  	wr, err := t.Wait(&wopts)
   399  	if err != nil {
   400  		if err == kernel.ErrNoWaitableEvent {
   401  			err = nil
   402  			// "If WNOHANG was specified in options and there were no children
   403  			// in a waitable state, then waitid() returns 0 immediately and the
   404  			// state of the siginfo_t structure pointed to by infop is
   405  			// unspecified." - waitid(2). But Linux's waitid actually zeroes
   406  			// out the fields it would set for a successful waitid in this case
   407  			// as well.
   408  			if infop != 0 {
   409  				var si linux.SignalInfo
   410  				_, err = si.CopyOut(t, infop)
   411  			}
   412  		}
   413  		return 0, nil, err
   414  	}
   415  	if rusageAddr != 0 {
   416  		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
   417  		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
   418  			return 0, nil, err
   419  		}
   420  	}
   421  	if infop == 0 {
   422  		return 0, nil, nil
   423  	}
   424  	si := linux.SignalInfo{
   425  		Signo: int32(linux.SIGCHLD),
   426  	}
   427  	si.SetPID(int32(wr.TID))
   428  	si.SetUID(int32(wr.UID))
   429  	s := wr.Status
   430  	switch {
   431  	case s.Exited():
   432  		si.Code = linux.CLD_EXITED
   433  		si.SetStatus(int32(s.ExitStatus()))
   434  	case s.Signaled():
   435  		if s.CoreDumped() {
   436  			si.Code = linux.CLD_DUMPED
   437  		} else {
   438  			si.Code = linux.CLD_KILLED
   439  		}
   440  		si.SetStatus(int32(s.TerminationSignal()))
   441  	case s.Stopped():
   442  		if wr.Event == kernel.EventTraceeStop {
   443  			si.Code = linux.CLD_TRAPPED
   444  			si.SetStatus(int32(s.PtraceEvent()))
   445  		} else {
   446  			si.Code = linux.CLD_STOPPED
   447  			si.SetStatus(int32(s.StopSignal()))
   448  		}
   449  	case s.Continued():
   450  		si.Code = linux.CLD_CONTINUED
   451  		si.SetStatus(int32(linux.SIGCONT))
   452  	default:
   453  		t.Warningf("waitid got incomprehensible wait status %d", s)
   454  	}
   455  	_, err = si.CopyOut(t, infop)
   456  	return 0, nil, err
   457  }
   458  
   459  // SetTidAddress implements linux syscall set_tid_address(2).
   460  func SetTidAddress(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   461  	addr := args[0].Pointer()
   462  
   463  	// Always succeed, return caller's tid.
   464  	t.SetClearTID(addr)
   465  	return uintptr(t.ThreadID()), nil, nil
   466  }
   467  
   468  // Setns implements linux syscall setns(2).
   469  func Setns(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   470  	fd := args[0].Int()
   471  
   472  	file := t.GetFile(fd)
   473  	if file == nil {
   474  		return 0, nil, linuxerr.EBADF
   475  	}
   476  	defer file.DecRef(t)
   477  
   478  	flags := args[1].Int()
   479  	return 0, nil, t.Setns(file, flags)
   480  }
   481  
   482  // Unshare implements linux syscall unshare(2).
   483  func Unshare(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   484  	flags := args[0].Int()
   485  	// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
   486  	if flags&linux.CLONE_NEWPID != 0 {
   487  		flags |= linux.CLONE_THREAD
   488  	}
   489  	// "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
   490  	// Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
   491  	if flags&linux.CLONE_NEWUSER != 0 {
   492  		flags |= linux.CLONE_THREAD | linux.CLONE_FS
   493  	}
   494  	return 0, nil, t.Unshare(flags)
   495  }
   496  
   497  // SchedYield implements linux syscall sched_yield(2).
   498  func SchedYield(t *kernel.Task, sysno uintptr, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   499  	t.Yield()
   500  	return 0, nil, nil
   501  }
   502  
   503  // SchedSetaffinity implements linux syscall sched_setaffinity(2).
   504  func SchedSetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   505  	tid := args[0].Int()
   506  	size := args[1].SizeT()
   507  	maskAddr := args[2].Pointer()
   508  
   509  	var task *kernel.Task
   510  	if tid == 0 {
   511  		task = t
   512  	} else {
   513  		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
   514  		if task == nil {
   515  			return 0, nil, linuxerr.ESRCH
   516  		}
   517  	}
   518  
   519  	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
   520  	if size > mask.Size() {
   521  		size = mask.Size()
   522  	}
   523  	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
   524  		return 0, nil, err
   525  	}
   526  	return 0, nil, task.SetCPUMask(mask)
   527  }
   528  
   529  // SchedGetaffinity implements linux syscall sched_getaffinity(2).
   530  func SchedGetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   531  	tid := args[0].Int()
   532  	size := args[1].SizeT()
   533  	maskAddr := args[2].Pointer()
   534  
   535  	// This limitation is because linux stores the cpumask
   536  	// in an array of "unsigned long" so the buffer needs to
   537  	// be a multiple of the word size.
   538  	if size&(t.Arch().Width()-1) > 0 {
   539  		return 0, nil, linuxerr.EINVAL
   540  	}
   541  
   542  	var task *kernel.Task
   543  	if tid == 0 {
   544  		task = t
   545  	} else {
   546  		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
   547  		if task == nil {
   548  			return 0, nil, linuxerr.ESRCH
   549  		}
   550  	}
   551  
   552  	mask := task.CPUMask()
   553  	// The buffer needs to be big enough to hold a cpumask with
   554  	// all possible cpus.
   555  	if size < mask.Size() {
   556  		return 0, nil, linuxerr.EINVAL
   557  	}
   558  	_, err := t.CopyOutBytes(maskAddr, mask)
   559  
   560  	// NOTE: The syscall interface is slightly different than the glibc
   561  	// interface. The raw sched_getaffinity syscall returns the number of
   562  	// bytes used to represent a cpu mask.
   563  	return uintptr(mask.Size()), nil, err
   564  }
   565  
   566  // Getcpu implements linux syscall getcpu(2).
   567  func Getcpu(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   568  	cpu := args[0].Pointer()
   569  	node := args[1].Pointer()
   570  	// third argument to this system call is nowadays unused.
   571  
   572  	if cpu != 0 {
   573  		if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
   574  			return 0, nil, err
   575  		}
   576  	}
   577  	// We always return node 0.
   578  	if node != 0 {
   579  		if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{
   580  			AddressSpaceActive: true,
   581  		}); err != nil {
   582  			return 0, nil, err
   583  		}
   584  	}
   585  	return 0, nil, nil
   586  }
   587  
   588  // Setpgid implements the linux syscall setpgid(2).
   589  func Setpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   590  	// Note that throughout this function, pgid is interpreted with respect
   591  	// to t's namespace, not with respect to the selected ThreadGroup's
   592  	// namespace (which may be different).
   593  	pid := kernel.ThreadID(args[0].Int())
   594  	pgid := kernel.ProcessGroupID(args[1].Int())
   595  
   596  	// "If pid is zero, then the process ID of the calling process is used."
   597  	tg := t.ThreadGroup()
   598  	if pid != 0 {
   599  		ot := t.PIDNamespace().TaskWithID(pid)
   600  		if ot == nil {
   601  			return 0, nil, linuxerr.ESRCH
   602  		}
   603  		tg = ot.ThreadGroup()
   604  		if tg.Leader() != ot {
   605  			return 0, nil, linuxerr.EINVAL
   606  		}
   607  
   608  		// Setpgid only operates on child threadgroups.
   609  		if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
   610  			return 0, nil, linuxerr.ESRCH
   611  		}
   612  	}
   613  
   614  	// "If pgid is zero, then the PGID of the process specified by pid is made
   615  	// the same as its process ID."
   616  	defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
   617  	if pgid == 0 {
   618  		pgid = defaultPGID
   619  	} else if pgid < 0 {
   620  		return 0, nil, linuxerr.EINVAL
   621  	}
   622  
   623  	// If the pgid is the same as the group, then create a new one. Otherwise,
   624  	// we attempt to join an existing process group.
   625  	if pgid == defaultPGID {
   626  		// For convenience, errors line up with Linux syscall API.
   627  		if err := tg.CreateProcessGroup(); err != nil {
   628  			// Is the process group already as expected? If so,
   629  			// just return success. This is the same behavior as
   630  			// Linux.
   631  			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
   632  				return 0, nil, nil
   633  			}
   634  			return 0, nil, err
   635  		}
   636  	} else {
   637  		// Same as CreateProcessGroup, above.
   638  		if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
   639  			// See above.
   640  			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
   641  				return 0, nil, nil
   642  			}
   643  			return 0, nil, err
   644  		}
   645  	}
   646  
   647  	// Success.
   648  	return 0, nil, nil
   649  }
   650  
   651  // Getpgrp implements the linux syscall getpgrp(2).
   652  func Getpgrp(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   653  	return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
   654  }
   655  
   656  // Getpgid implements the linux syscall getpgid(2).
   657  func Getpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   658  	tid := kernel.ThreadID(args[0].Int())
   659  	if tid == 0 {
   660  		return Getpgrp(t, sysno, args)
   661  	}
   662  
   663  	target := t.PIDNamespace().TaskWithID(tid)
   664  	if target == nil {
   665  		return 0, nil, linuxerr.ESRCH
   666  	}
   667  
   668  	return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
   669  }
   670  
   671  // Setsid implements the linux syscall setsid(2).
   672  func Setsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   673  	return 0, nil, t.ThreadGroup().CreateSession()
   674  }
   675  
   676  // Getsid implements the linux syscall getsid(2).
   677  func Getsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   678  	tid := kernel.ThreadID(args[0].Int())
   679  	if tid == 0 {
   680  		return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil
   681  	}
   682  
   683  	target := t.PIDNamespace().TaskWithID(tid)
   684  	if target == nil {
   685  		return 0, nil, linuxerr.ESRCH
   686  	}
   687  
   688  	return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil
   689  }
   690  
   691  // Getpriority pretends to implement the linux syscall getpriority(2).
   692  //
   693  // This is a stub; real priorities require a full scheduler.
   694  func Getpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   695  	which := args[0].Int()
   696  	who := kernel.ThreadID(args[1].Int())
   697  
   698  	switch which {
   699  	case linux.PRIO_PROCESS:
   700  		// Look for who, return ESRCH if not found.
   701  		var task *kernel.Task
   702  		if who == 0 {
   703  			task = t
   704  		} else {
   705  			task = t.PIDNamespace().TaskWithID(who)
   706  		}
   707  
   708  		if task == nil {
   709  			return 0, nil, linuxerr.ESRCH
   710  		}
   711  
   712  		// From kernel/sys.c:getpriority:
   713  		// "To avoid negative return values, 'getpriority()'
   714  		// will not return the normal nice-value, but a negated
   715  		// value that has been offset by 20"
   716  		return uintptr(20 - task.Niceness()), nil, nil
   717  	case linux.PRIO_USER:
   718  		fallthrough
   719  	case linux.PRIO_PGRP:
   720  		// PRIO_USER and PRIO_PGRP have no further implementation yet.
   721  		return 0, nil, nil
   722  	default:
   723  		return 0, nil, linuxerr.EINVAL
   724  	}
   725  }
   726  
   727  // Setpriority pretends to implement the linux syscall setpriority(2).
   728  //
   729  // This is a stub; real priorities require a full scheduler.
   730  func Setpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   731  	which := args[0].Int()
   732  	who := kernel.ThreadID(args[1].Int())
   733  	niceval := int(args[2].Int())
   734  
   735  	// In the kernel's implementation, values outside the range
   736  	// of [-20, 19] are truncated to these minimum and maximum
   737  	// values.
   738  	if niceval < -20 /* min niceval */ {
   739  		niceval = -20
   740  	} else if niceval > 19 /* max niceval */ {
   741  		niceval = 19
   742  	}
   743  
   744  	switch which {
   745  	case linux.PRIO_PROCESS:
   746  		// Look for who, return ESRCH if not found.
   747  		var task *kernel.Task
   748  		if who == 0 {
   749  			task = t
   750  		} else {
   751  			task = t.PIDNamespace().TaskWithID(who)
   752  		}
   753  
   754  		if task == nil {
   755  			return 0, nil, linuxerr.ESRCH
   756  		}
   757  
   758  		task.SetNiceness(niceval)
   759  	case linux.PRIO_USER:
   760  		fallthrough
   761  	case linux.PRIO_PGRP:
   762  		// PRIO_USER and PRIO_PGRP have no further implementation yet.
   763  		return 0, nil, nil
   764  	default:
   765  		return 0, nil, linuxerr.EINVAL
   766  	}
   767  
   768  	return 0, nil, nil
   769  }
   770  
   771  // Ptrace implements linux system call ptrace(2).
   772  func Ptrace(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   773  	req := args[0].Int64()
   774  	pid := kernel.ThreadID(args[1].Int())
   775  	addr := args[2].Pointer()
   776  	data := args[3].Pointer()
   777  
   778  	return 0, nil, t.Ptrace(req, pid, addr, data)
   779  }