github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/sys_thread.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "path" 19 20 "golang.org/x/sys/unix" 21 "github.com/SagerNet/gvisor/pkg/abi/linux" 22 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 23 "github.com/SagerNet/gvisor/pkg/hostarch" 24 "github.com/SagerNet/gvisor/pkg/marshal/primitive" 25 "github.com/SagerNet/gvisor/pkg/sentry/arch" 26 "github.com/SagerNet/gvisor/pkg/sentry/fs" 27 "github.com/SagerNet/gvisor/pkg/sentry/fsbridge" 28 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 29 "github.com/SagerNet/gvisor/pkg/sentry/kernel/sched" 30 "github.com/SagerNet/gvisor/pkg/sentry/loader" 31 "github.com/SagerNet/gvisor/pkg/syserror" 32 "github.com/SagerNet/gvisor/pkg/usermem" 33 ) 34 35 const ( 36 // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. 37 exitSignalMask = 0xff 38 ) 39 40 var ( 41 // ExecMaxTotalSize is the maximum length of all argv and envv entries. 42 // 43 // N.B. The behavior here is different than Linux. Linux provides a limit on 44 // individual arguments of 32 pages, and an aggregate limit of at least 32 pages 45 // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement 46 // any behavior based on the stack size, and instead provide a fixed hard-limit of 47 // 2 MB (which should work well given that 8 MB stack limits are common). 48 ExecMaxTotalSize = 2 * 1024 * 1024 49 50 // ExecMaxElemSize is the maximum length of a single argv or envv entry. 51 ExecMaxElemSize = 32 * hostarch.PageSize 52 ) 53 54 // Getppid implements linux syscall getppid(2). 55 func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 56 parent := t.Parent() 57 if parent == nil { 58 return 0, nil, nil 59 } 60 return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil 61 } 62 63 // Getpid implements linux syscall getpid(2). 64 func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 65 return uintptr(t.ThreadGroup().ID()), nil, nil 66 } 67 68 // Gettid implements linux syscall gettid(2). 69 func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 70 return uintptr(t.ThreadID()), nil, nil 71 } 72 73 // Execve implements linux syscall execve(2). 74 func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 75 filenameAddr := args[0].Pointer() 76 argvAddr := args[1].Pointer() 77 envvAddr := args[2].Pointer() 78 79 return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0) 80 } 81 82 // Execveat implements linux syscall execveat(2). 83 func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 84 dirFD := args[0].Int() 85 pathnameAddr := args[1].Pointer() 86 argvAddr := args[2].Pointer() 87 envvAddr := args[3].Pointer() 88 flags := args[4].Int() 89 90 return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags) 91 } 92 93 func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { 94 pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) 95 if err != nil { 96 return 0, nil, err 97 } 98 99 var argv, envv []string 100 if argvAddr != 0 { 101 var err error 102 argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize) 103 if err != nil { 104 return 0, nil, err 105 } 106 } 107 if envvAddr != 0 { 108 var err error 109 envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize) 110 if err != nil { 111 return 0, nil, err 112 } 113 } 114 115 if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { 116 return 0, nil, linuxerr.EINVAL 117 } 118 atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 119 if !atEmptyPath && len(pathname) == 0 { 120 return 0, nil, syserror.ENOENT 121 } 122 resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 123 124 root := t.FSContext().RootDirectory() 125 defer root.DecRef(t) 126 127 var wd *fs.Dirent 128 var executable fsbridge.File 129 var closeOnExec bool 130 if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { 131 // Even if the pathname is absolute, we may still need the wd 132 // for interpreter scripts if the path of the interpreter is 133 // relative. 134 wd = t.FSContext().WorkingDirectory() 135 } else { 136 // Need to extract the given FD. 137 f, fdFlags := t.FDTable().Get(dirFD) 138 if f == nil { 139 return 0, nil, linuxerr.EBADF 140 } 141 defer f.DecRef(t) 142 closeOnExec = fdFlags.CloseOnExec 143 144 if atEmptyPath && len(pathname) == 0 { 145 // TODO(github.com/SagerNet/issue/160): Linux requires only execute permission, 146 // not read. However, our backing filesystems may prevent us from reading 147 // the file without read permission. Additionally, a task with a 148 // non-readable executable has additional constraints on access via 149 // ptrace and procfs. 150 if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil { 151 return 0, nil, err 152 } 153 executable = fsbridge.NewFSFile(f) 154 } else { 155 wd = f.Dirent 156 wd.IncRef() 157 if !fs.IsDir(wd.Inode.StableAttr) { 158 return 0, nil, syserror.ENOTDIR 159 } 160 } 161 } 162 if wd != nil { 163 defer wd.DecRef(t) 164 } 165 166 // Load the new TaskImage. 167 remainingTraversals := uint(linux.MaxSymlinkTraversals) 168 loadArgs := loader.LoadArgs{ 169 Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), 170 RemainingTraversals: &remainingTraversals, 171 ResolveFinal: resolveFinal, 172 Filename: pathname, 173 File: executable, 174 CloseOnExec: closeOnExec, 175 Argv: argv, 176 Envv: envv, 177 Features: t.Arch().FeatureSet(), 178 } 179 180 image, se := t.Kernel().LoadTaskImage(t, loadArgs) 181 if se != nil { 182 return 0, nil, se.ToError() 183 } 184 185 ctrl, err := t.Execve(image) 186 return 0, ctrl, err 187 } 188 189 // Exit implements linux syscall exit(2). 190 func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 191 status := int(args[0].Int()) 192 t.PrepareExit(kernel.ExitStatus{Code: status}) 193 return 0, kernel.CtrlDoExit, nil 194 } 195 196 // ExitGroup implements linux syscall exit_group(2). 197 func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 198 status := int(args[0].Int()) 199 t.PrepareGroupExit(kernel.ExitStatus{Code: status}) 200 return 0, kernel.CtrlDoExit, nil 201 } 202 203 // clone is used by Clone, Fork, and VFork. 204 func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) { 205 opts := kernel.CloneOptions{ 206 SharingOptions: kernel.SharingOptions{ 207 NewAddressSpace: flags&linux.CLONE_VM == 0, 208 NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, 209 NewThreadGroup: flags&linux.CLONE_THREAD == 0, 210 TerminationSignal: linux.Signal(flags & exitSignalMask), 211 NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, 212 NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, 213 NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, 214 NewFiles: flags&linux.CLONE_FILES == 0, 215 NewFSContext: flags&linux.CLONE_FS == 0, 216 NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, 217 NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, 218 }, 219 Stack: stack, 220 SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, 221 TLS: tls, 222 ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, 223 ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, 224 ChildTID: childTID, 225 ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, 226 ParentTID: parentTID, 227 Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, 228 Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, 229 InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, 230 } 231 ntid, ctrl, err := t.Clone(&opts) 232 return uintptr(ntid), ctrl, err 233 } 234 235 // Fork implements Linux syscall fork(2). 236 func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 237 // "A call to fork() is equivalent to a call to clone(2) specifying flags 238 // as just SIGCHLD." - fork(2) 239 return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) 240 } 241 242 // Vfork implements Linux syscall vfork(2). 243 func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 244 // """ 245 // A call to vfork() is equivalent to calling clone(2) with flags specified as: 246 // 247 // CLONE_VM | CLONE_VFORK | SIGCHLD 248 // """ - vfork(2) 249 return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) 250 } 251 252 // parseCommonWaitOptions applies the options common to wait4 and waitid to 253 // wopts. 254 func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { 255 switch options & (linux.WCLONE | linux.WALL) { 256 case 0: 257 wopts.NonCloneTasks = true 258 case linux.WCLONE: 259 wopts.CloneTasks = true 260 case linux.WALL: 261 wopts.NonCloneTasks = true 262 wopts.CloneTasks = true 263 default: 264 return linuxerr.EINVAL 265 } 266 if options&linux.WCONTINUED != 0 { 267 wopts.Events |= kernel.EventGroupContinue 268 } 269 if options&linux.WNOHANG == 0 { 270 wopts.BlockInterruptErr = syserror.ERESTARTSYS 271 } 272 if options&linux.WNOTHREAD == 0 { 273 wopts.SiblingChildren = true 274 } 275 return nil 276 } 277 278 // wait4 waits for the given child process to exit. 279 func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) { 280 if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { 281 return 0, linuxerr.EINVAL 282 } 283 wopts := kernel.WaitOptions{ 284 Events: kernel.EventExit | kernel.EventTraceeStop, 285 ConsumeEvent: true, 286 } 287 // There are four cases to consider: 288 // 289 // pid < -1 any child process whose process group ID is equal to the absolute value of pid 290 // pid == -1 any child process 291 // pid == 0 any child process whose process group ID is equal to that of the calling process 292 // pid > 0 the child whose process ID is equal to the value of pid 293 switch { 294 case pid < -1: 295 wopts.SpecificPGID = kernel.ProcessGroupID(-pid) 296 case pid == -1: 297 // Any process is the default. 298 case pid == 0: 299 wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) 300 default: 301 wopts.SpecificTID = kernel.ThreadID(pid) 302 } 303 304 if err := parseCommonWaitOptions(&wopts, options); err != nil { 305 return 0, err 306 } 307 if options&linux.WUNTRACED != 0 { 308 wopts.Events |= kernel.EventChildGroupStop 309 } 310 311 wr, err := t.Wait(&wopts) 312 if err != nil { 313 if err == kernel.ErrNoWaitableEvent { 314 return 0, nil 315 } 316 return 0, err 317 } 318 if statusAddr != 0 { 319 if _, err := primitive.CopyUint32Out(t, statusAddr, wr.Status); err != nil { 320 return 0, err 321 } 322 } 323 if rusageAddr != 0 { 324 ru := getrusage(wr.Task, linux.RUSAGE_BOTH) 325 if _, err := ru.CopyOut(t, rusageAddr); err != nil { 326 return 0, err 327 } 328 } 329 return uintptr(wr.TID), nil 330 } 331 332 // Wait4 implements linux syscall wait4(2). 333 func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 334 pid := int(args[0].Int()) 335 statusAddr := args[1].Pointer() 336 options := int(args[2].Uint()) 337 rusageAddr := args[3].Pointer() 338 339 n, err := wait4(t, pid, statusAddr, options, rusageAddr) 340 return n, nil, err 341 } 342 343 // WaitPid implements linux syscall waitpid(2). 344 func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 345 pid := int(args[0].Int()) 346 statusAddr := args[1].Pointer() 347 options := int(args[2].Uint()) 348 349 n, err := wait4(t, pid, statusAddr, options, 0) 350 return n, nil, err 351 } 352 353 // Waitid implements linux syscall waitid(2). 354 func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 355 idtype := args[0].Int() 356 id := args[1].Int() 357 infop := args[2].Pointer() 358 options := int(args[3].Uint()) 359 rusageAddr := args[4].Pointer() 360 361 if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { 362 return 0, nil, linuxerr.EINVAL 363 } 364 if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { 365 return 0, nil, linuxerr.EINVAL 366 } 367 wopts := kernel.WaitOptions{ 368 Events: kernel.EventTraceeStop, 369 ConsumeEvent: options&linux.WNOWAIT == 0, 370 } 371 switch idtype { 372 case linux.P_ALL: 373 case linux.P_PID: 374 wopts.SpecificTID = kernel.ThreadID(id) 375 case linux.P_PGID: 376 wopts.SpecificPGID = kernel.ProcessGroupID(id) 377 default: 378 return 0, nil, linuxerr.EINVAL 379 } 380 381 if err := parseCommonWaitOptions(&wopts, options); err != nil { 382 return 0, nil, err 383 } 384 if options&linux.WEXITED != 0 { 385 wopts.Events |= kernel.EventExit 386 } 387 if options&linux.WSTOPPED != 0 { 388 wopts.Events |= kernel.EventChildGroupStop 389 } 390 391 wr, err := t.Wait(&wopts) 392 if err != nil { 393 if err == kernel.ErrNoWaitableEvent { 394 err = nil 395 // "If WNOHANG was specified in options and there were no children 396 // in a waitable state, then waitid() returns 0 immediately and the 397 // state of the siginfo_t structure pointed to by infop is 398 // unspecified." - waitid(2). But Linux's waitid actually zeroes 399 // out the fields it would set for a successful waitid in this case 400 // as well. 401 if infop != 0 { 402 var si linux.SignalInfo 403 _, err = si.CopyOut(t, infop) 404 } 405 } 406 return 0, nil, err 407 } 408 if rusageAddr != 0 { 409 ru := getrusage(wr.Task, linux.RUSAGE_BOTH) 410 if _, err := ru.CopyOut(t, rusageAddr); err != nil { 411 return 0, nil, err 412 } 413 } 414 if infop == 0 { 415 return 0, nil, nil 416 } 417 si := linux.SignalInfo{ 418 Signo: int32(linux.SIGCHLD), 419 } 420 si.SetPID(int32(wr.TID)) 421 si.SetUID(int32(wr.UID)) 422 // TODO(b/73541790): convert kernel.ExitStatus to functions and make 423 // WaitResult.Status a linux.WaitStatus. 424 s := unix.WaitStatus(wr.Status) 425 switch { 426 case s.Exited(): 427 si.Code = linux.CLD_EXITED 428 si.SetStatus(int32(s.ExitStatus())) 429 case s.Signaled(): 430 si.Code = linux.CLD_KILLED 431 si.SetStatus(int32(s.Signal())) 432 case s.CoreDump(): 433 si.Code = linux.CLD_DUMPED 434 si.SetStatus(int32(s.Signal())) 435 case s.Stopped(): 436 if wr.Event == kernel.EventTraceeStop { 437 si.Code = linux.CLD_TRAPPED 438 si.SetStatus(int32(s.TrapCause())) 439 } else { 440 si.Code = linux.CLD_STOPPED 441 si.SetStatus(int32(s.StopSignal())) 442 } 443 case s.Continued(): 444 si.Code = linux.CLD_CONTINUED 445 si.SetStatus(int32(linux.SIGCONT)) 446 default: 447 t.Warningf("waitid got incomprehensible wait status %d", s) 448 } 449 _, err = si.CopyOut(t, infop) 450 return 0, nil, err 451 } 452 453 // SetTidAddress implements linux syscall set_tid_address(2). 454 func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 455 addr := args[0].Pointer() 456 457 // Always succeed, return caller's tid. 458 t.SetClearTID(addr) 459 return uintptr(t.ThreadID()), nil, nil 460 } 461 462 // Unshare implements linux syscall unshare(2). 463 func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 464 flags := args[0].Int() 465 opts := kernel.SharingOptions{ 466 NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, 467 NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, 468 NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, 469 NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, 470 NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, 471 NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, 472 NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, 473 NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, 474 NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, 475 NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, 476 } 477 // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) 478 if opts.NewPIDNamespace { 479 opts.NewThreadGroup = true 480 } 481 // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since 482 // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." 483 if opts.NewUserNamespace { 484 opts.NewThreadGroup = true 485 opts.NewFSContext = true 486 } 487 return 0, nil, t.Unshare(&opts) 488 } 489 490 // SchedYield implements linux syscall sched_yield(2). 491 func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 492 t.Yield() 493 return 0, nil, nil 494 } 495 496 // SchedSetaffinity implements linux syscall sched_setaffinity(2). 497 func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 498 tid := args[0].Int() 499 size := args[1].SizeT() 500 maskAddr := args[2].Pointer() 501 502 var task *kernel.Task 503 if tid == 0 { 504 task = t 505 } else { 506 task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) 507 if task == nil { 508 return 0, nil, syserror.ESRCH 509 } 510 } 511 512 mask := sched.NewCPUSet(t.Kernel().ApplicationCores()) 513 if size > mask.Size() { 514 size = mask.Size() 515 } 516 if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil { 517 return 0, nil, err 518 } 519 return 0, nil, task.SetCPUMask(mask) 520 } 521 522 // SchedGetaffinity implements linux syscall sched_getaffinity(2). 523 func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 524 tid := args[0].Int() 525 size := args[1].SizeT() 526 maskAddr := args[2].Pointer() 527 528 // This limitation is because linux stores the cpumask 529 // in an array of "unsigned long" so the buffer needs to 530 // be a multiple of the word size. 531 if size&(t.Arch().Width()-1) > 0 { 532 return 0, nil, linuxerr.EINVAL 533 } 534 535 var task *kernel.Task 536 if tid == 0 { 537 task = t 538 } else { 539 task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) 540 if task == nil { 541 return 0, nil, syserror.ESRCH 542 } 543 } 544 545 mask := task.CPUMask() 546 // The buffer needs to be big enough to hold a cpumask with 547 // all possible cpus. 548 if size < mask.Size() { 549 return 0, nil, linuxerr.EINVAL 550 } 551 _, err := t.CopyOutBytes(maskAddr, mask) 552 553 // NOTE: The syscall interface is slightly different than the glibc 554 // interface. The raw sched_getaffinity syscall returns the number of 555 // bytes used to represent a cpu mask. 556 return uintptr(mask.Size()), nil, err 557 } 558 559 // Getcpu implements linux syscall getcpu(2). 560 func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 561 cpu := args[0].Pointer() 562 node := args[1].Pointer() 563 // third argument to this system call is nowadays unused. 564 565 if cpu != 0 { 566 if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil { 567 return 0, nil, err 568 } 569 } 570 // We always return node 0. 571 if node != 0 { 572 if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{ 573 AddressSpaceActive: true, 574 }); err != nil { 575 return 0, nil, err 576 } 577 } 578 return 0, nil, nil 579 } 580 581 // Setpgid implements the linux syscall setpgid(2). 582 func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 583 // Note that throughout this function, pgid is interpreted with respect 584 // to t's namespace, not with respect to the selected ThreadGroup's 585 // namespace (which may be different). 586 pid := kernel.ThreadID(args[0].Int()) 587 pgid := kernel.ProcessGroupID(args[1].Int()) 588 589 // "If pid is zero, then the process ID of the calling process is used." 590 tg := t.ThreadGroup() 591 if pid != 0 { 592 ot := t.PIDNamespace().TaskWithID(pid) 593 if ot == nil { 594 return 0, nil, syserror.ESRCH 595 } 596 tg = ot.ThreadGroup() 597 if tg.Leader() != ot { 598 return 0, nil, linuxerr.EINVAL 599 } 600 601 // Setpgid only operates on child threadgroups. 602 if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { 603 return 0, nil, syserror.ESRCH 604 } 605 } 606 607 // "If pgid is zero, then the PGID of the process specified by pid is made 608 // the same as its process ID." 609 defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg)) 610 if pgid == 0 { 611 pgid = defaultPGID 612 } else if pgid < 0 { 613 return 0, nil, linuxerr.EINVAL 614 } 615 616 // If the pgid is the same as the group, then create a new one. Otherwise, 617 // we attempt to join an existing process group. 618 if pgid == defaultPGID { 619 // For convenience, errors line up with Linux syscall API. 620 if err := tg.CreateProcessGroup(); err != nil { 621 // Is the process group already as expected? If so, 622 // just return success. This is the same behavior as 623 // Linux. 624 if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID { 625 return 0, nil, nil 626 } 627 return 0, nil, err 628 } 629 } else { 630 // Same as CreateProcessGroup, above. 631 if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil { 632 // See above. 633 if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { 634 return 0, nil, nil 635 } 636 return 0, nil, err 637 } 638 } 639 640 // Success. 641 return 0, nil, nil 642 } 643 644 // Getpgrp implements the linux syscall getpgrp(2). 645 func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 646 return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil 647 } 648 649 // Getpgid implements the linux syscall getpgid(2). 650 func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 651 tid := kernel.ThreadID(args[0].Int()) 652 if tid == 0 { 653 return Getpgrp(t, args) 654 } 655 656 target := t.PIDNamespace().TaskWithID(tid) 657 if target == nil { 658 return 0, nil, syserror.ESRCH 659 } 660 661 return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil 662 } 663 664 // Setsid implements the linux syscall setsid(2). 665 func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 666 return 0, nil, t.ThreadGroup().CreateSession() 667 } 668 669 // Getsid implements the linux syscall getsid(2). 670 func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 671 tid := kernel.ThreadID(args[0].Int()) 672 if tid == 0 { 673 return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil 674 } 675 676 target := t.PIDNamespace().TaskWithID(tid) 677 if target == nil { 678 return 0, nil, syserror.ESRCH 679 } 680 681 return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil 682 } 683 684 // Getpriority pretends to implement the linux syscall getpriority(2). 685 // 686 // This is a stub; real priorities require a full scheduler. 687 func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 688 which := args[0].Int() 689 who := kernel.ThreadID(args[1].Int()) 690 691 switch which { 692 case linux.PRIO_PROCESS: 693 // Look for who, return ESRCH if not found. 694 var task *kernel.Task 695 if who == 0 { 696 task = t 697 } else { 698 task = t.PIDNamespace().TaskWithID(who) 699 } 700 701 if task == nil { 702 return 0, nil, syserror.ESRCH 703 } 704 705 // From kernel/sys.c:getpriority: 706 // "To avoid negative return values, 'getpriority()' 707 // will not return the normal nice-value, but a negated 708 // value that has been offset by 20" 709 return uintptr(20 - task.Niceness()), nil, nil 710 case linux.PRIO_USER: 711 fallthrough 712 case linux.PRIO_PGRP: 713 // PRIO_USER and PRIO_PGRP have no further implementation yet. 714 return 0, nil, nil 715 default: 716 return 0, nil, linuxerr.EINVAL 717 } 718 } 719 720 // Setpriority pretends to implement the linux syscall setpriority(2). 721 // 722 // This is a stub; real priorities require a full scheduler. 723 func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 724 which := args[0].Int() 725 who := kernel.ThreadID(args[1].Int()) 726 niceval := int(args[2].Int()) 727 728 // In the kernel's implementation, values outside the range 729 // of [-20, 19] are truncated to these minimum and maximum 730 // values. 731 if niceval < -20 /* min niceval */ { 732 niceval = -20 733 } else if niceval > 19 /* max niceval */ { 734 niceval = 19 735 } 736 737 switch which { 738 case linux.PRIO_PROCESS: 739 // Look for who, return ESRCH if not found. 740 var task *kernel.Task 741 if who == 0 { 742 task = t 743 } else { 744 task = t.PIDNamespace().TaskWithID(who) 745 } 746 747 if task == nil { 748 return 0, nil, syserror.ESRCH 749 } 750 751 task.SetNiceness(niceval) 752 case linux.PRIO_USER: 753 fallthrough 754 case linux.PRIO_PGRP: 755 // PRIO_USER and PRIO_PGRP have no further implementation yet. 756 return 0, nil, nil 757 default: 758 return 0, nil, linuxerr.EINVAL 759 } 760 761 return 0, nil, nil 762 } 763 764 // Ptrace implements linux system call ptrace(2). 765 func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 766 req := args[0].Int64() 767 pid := kernel.ThreadID(args[1].Int()) 768 addr := args[2].Pointer() 769 data := args[3].Pointer() 770 771 return 0, nil, t.Ptrace(req, pid, addr, data) 772 }