github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/syscalls/linux/sys_thread.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package linux
    16  
    17  import (
    18  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/marshal/primitive"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/sched"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/loader"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    30  )
    31  
    32  var (
    33  	// ExecMaxTotalSize is the maximum length of all argv and envv entries.
    34  	//
    35  	// N.B. The behavior here is different than Linux. Linux provides a limit on
    36  	// individual arguments of 32 pages, and an aggregate limit of at least 32 pages
    37  	// but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
    38  	// any behavior based on the stack size, and instead provide a fixed hard-limit of
    39  	// 2 MB (which should work well given that 8 MB stack limits are common).
    40  	ExecMaxTotalSize = 2 * 1024 * 1024
    41  
    42  	// ExecMaxElemSize is the maximum length of a single argv or envv entry.
    43  	ExecMaxElemSize = 32 * hostarch.PageSize
    44  )
    45  
    46  // Getppid implements linux syscall getppid(2).
    47  func Getppid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    48  	parent := t.Parent()
    49  	if parent == nil {
    50  		return 0, nil, nil
    51  	}
    52  	return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil
    53  }
    54  
    55  // Getpid implements linux syscall getpid(2).
    56  func Getpid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    57  	return uintptr(t.ThreadGroup().ID()), nil, nil
    58  }
    59  
    60  // Gettid implements linux syscall gettid(2).
    61  func Gettid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    62  	return uintptr(t.ThreadID()), nil, nil
    63  }
    64  
    65  // Execve implements linux syscall execve(2).
    66  func Execve(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    67  	pathnameAddr := args[0].Pointer()
    68  	argvAddr := args[1].Pointer()
    69  	envvAddr := args[2].Pointer()
    70  	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
    71  }
    72  
    73  // Execveat implements linux syscall execveat(2).
    74  func Execveat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    75  	dirfd := args[0].Int()
    76  	pathnameAddr := args[1].Pointer()
    77  	argvAddr := args[2].Pointer()
    78  	envvAddr := args[3].Pointer()
    79  	flags := args[4].Int()
    80  	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
    81  }
    82  
    83  func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
    84  	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
    85  		return 0, nil, linuxerr.EINVAL
    86  	}
    87  
    88  	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
    89  	if err != nil {
    90  		return 0, nil, err
    91  	}
    92  	var argv, envv []string
    93  	if argvAddr != 0 {
    94  		var err error
    95  		argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize)
    96  		if err != nil {
    97  			return 0, nil, err
    98  		}
    99  	}
   100  	if envvAddr != 0 {
   101  		var err error
   102  		envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize)
   103  		if err != nil {
   104  			return 0, nil, err
   105  		}
   106  	}
   107  
   108  	root := t.FSContext().RootDirectory()
   109  	defer root.DecRef(t)
   110  	var executable *vfs.FileDescription
   111  	defer func() {
   112  		if executable != nil {
   113  			executable.DecRef(t)
   114  		}
   115  	}()
   116  	closeOnExec := false
   117  	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
   118  		// We must open the executable ourselves since dirfd is used as the
   119  		// starting point while resolving path, but the task working directory
   120  		// is used as the starting point while resolving interpreters (Linux:
   121  		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
   122  		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
   123  		// incapable of handling this correctly.
   124  		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
   125  			return 0, nil, linuxerr.ENOENT
   126  		}
   127  		dirfile, dirfileFlags := t.FDTable().Get(dirfd)
   128  		if dirfile == nil {
   129  			return 0, nil, linuxerr.EBADF
   130  		}
   131  		start := dirfile.VirtualDentry()
   132  		start.IncRef()
   133  		dirfile.DecRef(t)
   134  		closeOnExec = dirfileFlags.CloseOnExec
   135  		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
   136  			Root:               root,
   137  			Start:              start,
   138  			Path:               path,
   139  			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
   140  		}, &vfs.OpenOptions{
   141  			Flags:    linux.O_RDONLY,
   142  			FileExec: true,
   143  		})
   144  		start.DecRef(t)
   145  		if err != nil {
   146  			return 0, nil, err
   147  		}
   148  		executable = file
   149  		pathname = executable.MappedName(t)
   150  	}
   151  
   152  	// Load the new TaskImage.
   153  	wd := t.FSContext().WorkingDirectory()
   154  	defer wd.DecRef(t)
   155  	remainingTraversals := uint(linux.MaxSymlinkTraversals)
   156  	loadArgs := loader.LoadArgs{
   157  		Root:                root,
   158  		WorkingDir:          wd,
   159  		RemainingTraversals: &remainingTraversals,
   160  		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
   161  		Filename:            pathname,
   162  		File:                executable,
   163  		CloseOnExec:         closeOnExec,
   164  		Argv:                argv,
   165  		Envv:                envv,
   166  		Features:            t.Kernel().FeatureSet(),
   167  	}
   168  	if seccheck.Global.Enabled(seccheck.PointExecve) {
   169  		// Retain the first executable file that is opened (which may open
   170  		// multiple executable files while resolving interpreter scripts).
   171  		if executable == nil {
   172  			loadArgs.AfterOpen = func(f *vfs.FileDescription) {
   173  				if executable == nil {
   174  					f.IncRef()
   175  					executable = f
   176  					pathname = executable.MappedName(t)
   177  				}
   178  			}
   179  		}
   180  	}
   181  
   182  	image, se := t.Kernel().LoadTaskImage(t, loadArgs)
   183  	if se != nil {
   184  		return 0, nil, se.ToError()
   185  	}
   186  
   187  	ctrl, err := t.Execve(image, argv, envv, executable, pathname)
   188  	return 0, ctrl, err
   189  }
   190  
   191  // Exit implements linux syscall exit(2).
   192  func Exit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   193  	status := args[0].Int()
   194  	t.PrepareExit(linux.WaitStatusExit(status & 0xff))
   195  	return 0, kernel.CtrlDoExit, nil
   196  }
   197  
   198  // ExitGroup implements linux syscall exit_group(2).
   199  func ExitGroup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   200  	status := args[0].Int()
   201  	t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff))
   202  	return 0, kernel.CtrlDoExit, nil
   203  }
   204  
   205  // clone is used by Clone, Fork, and VFork.
   206  func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) {
   207  	args := linux.CloneArgs{
   208  		Flags:      uint64(uint32(flags) &^ linux.CSIGNAL),
   209  		Pidfd:      uint64(parentTID),
   210  		ChildTID:   uint64(childTID),
   211  		ParentTID:  uint64(parentTID),
   212  		ExitSignal: uint64(flags & linux.CSIGNAL),
   213  		Stack:      uint64(stack),
   214  		TLS:        uint64(tls),
   215  	}
   216  	ntid, ctrl, err := t.Clone(&args)
   217  	return uintptr(ntid), ctrl, err
   218  }
   219  
   220  // Fork implements Linux syscall fork(2).
   221  func Fork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   222  	// "A call to fork() is equivalent to a call to clone(2) specifying flags
   223  	// as just SIGCHLD." - fork(2)
   224  	return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0)
   225  }
   226  
   227  // Vfork implements Linux syscall vfork(2).
   228  func Vfork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   229  	// """
   230  	// A call to vfork() is equivalent to calling clone(2) with flags specified as:
   231  	//
   232  	//     CLONE_VM | CLONE_VFORK | SIGCHLD
   233  	// """ - vfork(2)
   234  	return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0)
   235  }
   236  
   237  // parseCommonWaitOptions applies the options common to wait4 and waitid to
   238  // wopts.
   239  func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
   240  	switch options & (linux.WCLONE | linux.WALL) {
   241  	case 0:
   242  		wopts.NonCloneTasks = true
   243  	case linux.WCLONE:
   244  		wopts.CloneTasks = true
   245  	case linux.WALL:
   246  		wopts.NonCloneTasks = true
   247  		wopts.CloneTasks = true
   248  	default:
   249  		return linuxerr.EINVAL
   250  	}
   251  	if options&linux.WCONTINUED != 0 {
   252  		wopts.Events |= kernel.EventGroupContinue
   253  	}
   254  	if options&linux.WNOHANG == 0 {
   255  		wopts.BlockInterruptErr = linuxerr.ERESTARTSYS
   256  	}
   257  	if options&linux.WNOTHREAD == 0 {
   258  		wopts.SiblingChildren = true
   259  	}
   260  	return nil
   261  }
   262  
   263  // wait4 waits for the given child process to exit.
   264  func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) {
   265  	if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
   266  		return 0, linuxerr.EINVAL
   267  	}
   268  	wopts := kernel.WaitOptions{
   269  		Events:       kernel.EventExit | kernel.EventTraceeStop,
   270  		ConsumeEvent: true,
   271  	}
   272  	// There are four cases to consider:
   273  	//
   274  	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
   275  	// pid == -1   any child process
   276  	// pid == 0    any child process whose process group ID is equal to that of the calling process
   277  	// pid > 0     the child whose process ID is equal to the value of pid
   278  	switch {
   279  	case pid < -1:
   280  		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
   281  	case pid == -1:
   282  		// Any process is the default.
   283  	case pid == 0:
   284  		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
   285  	default:
   286  		wopts.SpecificTID = kernel.ThreadID(pid)
   287  	}
   288  
   289  	if err := parseCommonWaitOptions(&wopts, options); err != nil {
   290  		return 0, err
   291  	}
   292  	if options&linux.WUNTRACED != 0 {
   293  		wopts.Events |= kernel.EventChildGroupStop
   294  	}
   295  
   296  	wr, err := t.Wait(&wopts)
   297  	if err != nil {
   298  		if err == kernel.ErrNoWaitableEvent {
   299  			return 0, nil
   300  		}
   301  		return 0, err
   302  	}
   303  	if statusAddr != 0 {
   304  		if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil {
   305  			return 0, err
   306  		}
   307  	}
   308  	if rusageAddr != 0 {
   309  		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
   310  		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
   311  			return 0, err
   312  		}
   313  	}
   314  	return uintptr(wr.TID), nil
   315  }
   316  
   317  // Wait4 implements linux syscall wait4(2).
   318  func Wait4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   319  	pid := int(args[0].Int())
   320  	statusAddr := args[1].Pointer()
   321  	options := int(args[2].Uint())
   322  	rusageAddr := args[3].Pointer()
   323  
   324  	n, err := wait4(t, pid, statusAddr, options, rusageAddr)
   325  	return n, nil, err
   326  }
   327  
   328  // WaitPid implements linux syscall waitpid(2).
   329  func WaitPid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   330  	pid := int(args[0].Int())
   331  	statusAddr := args[1].Pointer()
   332  	options := int(args[2].Uint())
   333  
   334  	n, err := wait4(t, pid, statusAddr, options, 0)
   335  	return n, nil, err
   336  }
   337  
   338  // Waitid implements linux syscall waitid(2).
   339  func Waitid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   340  	idtype := args[0].Int()
   341  	id := args[1].Int()
   342  	infop := args[2].Pointer()
   343  	options := int(args[3].Uint())
   344  	rusageAddr := args[4].Pointer()
   345  
   346  	if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
   347  		return 0, nil, linuxerr.EINVAL
   348  	}
   349  	if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
   350  		return 0, nil, linuxerr.EINVAL
   351  	}
   352  	wopts := kernel.WaitOptions{
   353  		Events:       kernel.EventTraceeStop,
   354  		ConsumeEvent: options&linux.WNOWAIT == 0,
   355  	}
   356  	switch idtype {
   357  	case linux.P_ALL:
   358  	case linux.P_PID:
   359  		wopts.SpecificTID = kernel.ThreadID(id)
   360  	case linux.P_PGID:
   361  		wopts.SpecificPGID = kernel.ProcessGroupID(id)
   362  	default:
   363  		return 0, nil, linuxerr.EINVAL
   364  	}
   365  
   366  	if err := parseCommonWaitOptions(&wopts, options); err != nil {
   367  		return 0, nil, err
   368  	}
   369  	if options&linux.WEXITED != 0 {
   370  		wopts.Events |= kernel.EventExit
   371  	}
   372  	if options&linux.WSTOPPED != 0 {
   373  		wopts.Events |= kernel.EventChildGroupStop
   374  	}
   375  
   376  	wr, err := t.Wait(&wopts)
   377  	if err != nil {
   378  		if err == kernel.ErrNoWaitableEvent {
   379  			err = nil
   380  			// "If WNOHANG was specified in options and there were no children
   381  			// in a waitable state, then waitid() returns 0 immediately and the
   382  			// state of the siginfo_t structure pointed to by infop is
   383  			// unspecified." - waitid(2). But Linux's waitid actually zeroes
   384  			// out the fields it would set for a successful waitid in this case
   385  			// as well.
   386  			if infop != 0 {
   387  				var si linux.SignalInfo
   388  				_, err = si.CopyOut(t, infop)
   389  			}
   390  		}
   391  		return 0, nil, err
   392  	}
   393  	if rusageAddr != 0 {
   394  		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
   395  		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
   396  			return 0, nil, err
   397  		}
   398  	}
   399  	if infop == 0 {
   400  		return 0, nil, nil
   401  	}
   402  	si := linux.SignalInfo{
   403  		Signo: int32(linux.SIGCHLD),
   404  	}
   405  	si.SetPID(int32(wr.TID))
   406  	si.SetUID(int32(wr.UID))
   407  	s := wr.Status
   408  	switch {
   409  	case s.Exited():
   410  		si.Code = linux.CLD_EXITED
   411  		si.SetStatus(int32(s.ExitStatus()))
   412  	case s.Signaled():
   413  		if s.CoreDumped() {
   414  			si.Code = linux.CLD_DUMPED
   415  		} else {
   416  			si.Code = linux.CLD_KILLED
   417  		}
   418  		si.SetStatus(int32(s.TerminationSignal()))
   419  	case s.Stopped():
   420  		if wr.Event == kernel.EventTraceeStop {
   421  			si.Code = linux.CLD_TRAPPED
   422  			si.SetStatus(int32(s.PtraceEvent()))
   423  		} else {
   424  			si.Code = linux.CLD_STOPPED
   425  			si.SetStatus(int32(s.StopSignal()))
   426  		}
   427  	case s.Continued():
   428  		si.Code = linux.CLD_CONTINUED
   429  		si.SetStatus(int32(linux.SIGCONT))
   430  	default:
   431  		t.Warningf("waitid got incomprehensible wait status %d", s)
   432  	}
   433  	_, err = si.CopyOut(t, infop)
   434  	return 0, nil, err
   435  }
   436  
   437  // SetTidAddress implements linux syscall set_tid_address(2).
   438  func SetTidAddress(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   439  	addr := args[0].Pointer()
   440  
   441  	// Always succeed, return caller's tid.
   442  	t.SetClearTID(addr)
   443  	return uintptr(t.ThreadID()), nil, nil
   444  }
   445  
   446  // Setns implements linux syscall setns(2).
   447  func Setns(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   448  	fd := args[0].Int()
   449  
   450  	file := t.GetFile(fd)
   451  	if file == nil {
   452  		return 0, nil, linuxerr.EBADF
   453  	}
   454  	defer file.DecRef(t)
   455  
   456  	flags := args[1].Int()
   457  	return 0, nil, t.Setns(file, flags)
   458  }
   459  
   460  // Unshare implements linux syscall unshare(2).
   461  func Unshare(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   462  	flags := args[0].Int()
   463  	// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
   464  	if flags&linux.CLONE_NEWPID != 0 {
   465  		flags |= linux.CLONE_THREAD
   466  	}
   467  	// "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
   468  	// Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
   469  	if flags&linux.CLONE_NEWUSER != 0 {
   470  		flags |= linux.CLONE_THREAD | linux.CLONE_FS
   471  	}
   472  	return 0, nil, t.Unshare(flags)
   473  }
   474  
   475  // SchedYield implements linux syscall sched_yield(2).
   476  func SchedYield(t *kernel.Task, sysno uintptr, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   477  	t.Yield()
   478  	return 0, nil, nil
   479  }
   480  
   481  // SchedSetaffinity implements linux syscall sched_setaffinity(2).
   482  func SchedSetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   483  	tid := args[0].Int()
   484  	size := args[1].SizeT()
   485  	maskAddr := args[2].Pointer()
   486  
   487  	var task *kernel.Task
   488  	if tid == 0 {
   489  		task = t
   490  	} else {
   491  		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
   492  		if task == nil {
   493  			return 0, nil, linuxerr.ESRCH
   494  		}
   495  	}
   496  
   497  	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
   498  	if size > mask.Size() {
   499  		size = mask.Size()
   500  	}
   501  	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
   502  		return 0, nil, err
   503  	}
   504  	return 0, nil, task.SetCPUMask(mask)
   505  }
   506  
   507  // SchedGetaffinity implements linux syscall sched_getaffinity(2).
   508  func SchedGetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   509  	tid := args[0].Int()
   510  	size := args[1].SizeT()
   511  	maskAddr := args[2].Pointer()
   512  
   513  	// This limitation is because linux stores the cpumask
   514  	// in an array of "unsigned long" so the buffer needs to
   515  	// be a multiple of the word size.
   516  	if size&(t.Arch().Width()-1) > 0 {
   517  		return 0, nil, linuxerr.EINVAL
   518  	}
   519  
   520  	var task *kernel.Task
   521  	if tid == 0 {
   522  		task = t
   523  	} else {
   524  		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
   525  		if task == nil {
   526  			return 0, nil, linuxerr.ESRCH
   527  		}
   528  	}
   529  
   530  	mask := task.CPUMask()
   531  	// The buffer needs to be big enough to hold a cpumask with
   532  	// all possible cpus.
   533  	if size < mask.Size() {
   534  		return 0, nil, linuxerr.EINVAL
   535  	}
   536  	_, err := t.CopyOutBytes(maskAddr, mask)
   537  
   538  	// NOTE: The syscall interface is slightly different than the glibc
   539  	// interface. The raw sched_getaffinity syscall returns the number of
   540  	// bytes used to represent a cpu mask.
   541  	return uintptr(mask.Size()), nil, err
   542  }
   543  
   544  // Getcpu implements linux syscall getcpu(2).
   545  func Getcpu(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   546  	cpu := args[0].Pointer()
   547  	node := args[1].Pointer()
   548  	// third argument to this system call is nowadays unused.
   549  
   550  	if cpu != 0 {
   551  		if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
   552  			return 0, nil, err
   553  		}
   554  	}
   555  	// We always return node 0.
   556  	if node != 0 {
   557  		if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{
   558  			AddressSpaceActive: true,
   559  		}); err != nil {
   560  			return 0, nil, err
   561  		}
   562  	}
   563  	return 0, nil, nil
   564  }
   565  
   566  // Setpgid implements the linux syscall setpgid(2).
   567  func Setpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   568  	// Note that throughout this function, pgid is interpreted with respect
   569  	// to t's namespace, not with respect to the selected ThreadGroup's
   570  	// namespace (which may be different).
   571  	pid := kernel.ThreadID(args[0].Int())
   572  	pgid := kernel.ProcessGroupID(args[1].Int())
   573  
   574  	// "If pid is zero, then the process ID of the calling process is used."
   575  	tg := t.ThreadGroup()
   576  	if pid != 0 {
   577  		ot := t.PIDNamespace().TaskWithID(pid)
   578  		if ot == nil {
   579  			return 0, nil, linuxerr.ESRCH
   580  		}
   581  		tg = ot.ThreadGroup()
   582  		if tg.Leader() != ot {
   583  			return 0, nil, linuxerr.EINVAL
   584  		}
   585  
   586  		// Setpgid only operates on child threadgroups.
   587  		if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
   588  			return 0, nil, linuxerr.ESRCH
   589  		}
   590  	}
   591  
   592  	// "If pgid is zero, then the PGID of the process specified by pid is made
   593  	// the same as its process ID."
   594  	defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
   595  	if pgid == 0 {
   596  		pgid = defaultPGID
   597  	} else if pgid < 0 {
   598  		return 0, nil, linuxerr.EINVAL
   599  	}
   600  
   601  	// If the pgid is the same as the group, then create a new one. Otherwise,
   602  	// we attempt to join an existing process group.
   603  	if pgid == defaultPGID {
   604  		// For convenience, errors line up with Linux syscall API.
   605  		if err := tg.CreateProcessGroup(); err != nil {
   606  			// Is the process group already as expected? If so,
   607  			// just return success. This is the same behavior as
   608  			// Linux.
   609  			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
   610  				return 0, nil, nil
   611  			}
   612  			return 0, nil, err
   613  		}
   614  	} else {
   615  		// Same as CreateProcessGroup, above.
   616  		if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
   617  			// See above.
   618  			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
   619  				return 0, nil, nil
   620  			}
   621  			return 0, nil, err
   622  		}
   623  	}
   624  
   625  	// Success.
   626  	return 0, nil, nil
   627  }
   628  
   629  // Getpgrp implements the linux syscall getpgrp(2).
   630  func Getpgrp(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   631  	return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
   632  }
   633  
   634  // Getpgid implements the linux syscall getpgid(2).
   635  func Getpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   636  	tid := kernel.ThreadID(args[0].Int())
   637  	if tid == 0 {
   638  		return Getpgrp(t, sysno, args)
   639  	}
   640  
   641  	target := t.PIDNamespace().TaskWithID(tid)
   642  	if target == nil {
   643  		return 0, nil, linuxerr.ESRCH
   644  	}
   645  
   646  	return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
   647  }
   648  
   649  // Setsid implements the linux syscall setsid(2).
   650  func Setsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   651  	return 0, nil, t.ThreadGroup().CreateSession()
   652  }
   653  
   654  // Getsid implements the linux syscall getsid(2).
   655  func Getsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   656  	tid := kernel.ThreadID(args[0].Int())
   657  	if tid == 0 {
   658  		return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil
   659  	}
   660  
   661  	target := t.PIDNamespace().TaskWithID(tid)
   662  	if target == nil {
   663  		return 0, nil, linuxerr.ESRCH
   664  	}
   665  
   666  	return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil
   667  }
   668  
   669  // Getpriority pretends to implement the linux syscall getpriority(2).
   670  //
   671  // This is a stub; real priorities require a full scheduler.
   672  func Getpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   673  	which := args[0].Int()
   674  	who := kernel.ThreadID(args[1].Int())
   675  
   676  	switch which {
   677  	case linux.PRIO_PROCESS:
   678  		// Look for who, return ESRCH if not found.
   679  		var task *kernel.Task
   680  		if who == 0 {
   681  			task = t
   682  		} else {
   683  			task = t.PIDNamespace().TaskWithID(who)
   684  		}
   685  
   686  		if task == nil {
   687  			return 0, nil, linuxerr.ESRCH
   688  		}
   689  
   690  		// From kernel/sys.c:getpriority:
   691  		// "To avoid negative return values, 'getpriority()'
   692  		// will not return the normal nice-value, but a negated
   693  		// value that has been offset by 20"
   694  		return uintptr(20 - task.Niceness()), nil, nil
   695  	case linux.PRIO_USER:
   696  		fallthrough
   697  	case linux.PRIO_PGRP:
   698  		// PRIO_USER and PRIO_PGRP have no further implementation yet.
   699  		return 0, nil, nil
   700  	default:
   701  		return 0, nil, linuxerr.EINVAL
   702  	}
   703  }
   704  
   705  // Setpriority pretends to implement the linux syscall setpriority(2).
   706  //
   707  // This is a stub; real priorities require a full scheduler.
   708  func Setpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   709  	which := args[0].Int()
   710  	who := kernel.ThreadID(args[1].Int())
   711  	niceval := int(args[2].Int())
   712  
   713  	// In the kernel's implementation, values outside the range
   714  	// of [-20, 19] are truncated to these minimum and maximum
   715  	// values.
   716  	if niceval < -20 /* min niceval */ {
   717  		niceval = -20
   718  	} else if niceval > 19 /* max niceval */ {
   719  		niceval = 19
   720  	}
   721  
   722  	switch which {
   723  	case linux.PRIO_PROCESS:
   724  		// Look for who, return ESRCH if not found.
   725  		var task *kernel.Task
   726  		if who == 0 {
   727  			task = t
   728  		} else {
   729  			task = t.PIDNamespace().TaskWithID(who)
   730  		}
   731  
   732  		if task == nil {
   733  			return 0, nil, linuxerr.ESRCH
   734  		}
   735  
   736  		task.SetNiceness(niceval)
   737  	case linux.PRIO_USER:
   738  		fallthrough
   739  	case linux.PRIO_PGRP:
   740  		// PRIO_USER and PRIO_PGRP have no further implementation yet.
   741  		return 0, nil, nil
   742  	default:
   743  		return 0, nil, linuxerr.EINVAL
   744  	}
   745  
   746  	return 0, nil, nil
   747  }
   748  
   749  // Ptrace implements linux system call ptrace(2).
   750  func Ptrace(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   751  	req := args[0].Int64()
   752  	pid := kernel.ThreadID(args[1].Int())
   753  	addr := args[2].Pointer()
   754  	data := args[3].Pointer()
   755  
   756  	return 0, nil, t.Ptrace(req, pid, addr, data)
   757  }