github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_thread.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/fspath"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/marshal/primitive"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/kernel"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/sched"
	"github.com/metacubex/gvisor/pkg/sentry/loader"
	"github.com/metacubex/gvisor/pkg/sentry/seccheck"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/usermem"
)

var (
	// ExecMaxTotalSize is the maximum length of all argv and envv entries.
	//
	// N.B. The behavior here is different than Linux. Linux provides a limit on
	// individual arguments of 32 pages, and an aggregate limit of at least 32 pages
	// but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement
	// any behavior based on the stack size, and instead provide a fixed hard-limit of
	// 2 MB (which should work well given that 8 MB stack limits are common).
	ExecMaxTotalSize = 2 * 1024 * 1024

	// ExecMaxElemSize is the maximum length of a single argv or envv entry.
	ExecMaxElemSize = 32 * hostarch.PageSize
)

// Getppid implements linux syscall getppid(2).
func Getppid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	parent := t.Parent()
	if parent == nil {
		return 0, nil, nil
	}
	return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil
}

// Getpid implements linux syscall getpid(2).
func Getpid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return uintptr(t.ThreadGroup().ID()), nil, nil
}

// Gettid implements linux syscall gettid(2).
func Gettid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return uintptr(t.ThreadID()), nil, nil
}

// Execve implements linux syscall execve(2).
func Execve(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathnameAddr := args[0].Pointer()
	argvAddr := args[1].Pointer()
	envvAddr := args[2].Pointer()
	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
}

// Execveat implements linux syscall execveat(2).
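//
// For reference, glibc's fexecve(3) is typically implemented on top of this
// syscall as execveat(fd, "", argv, envp, AT_EMPTY_PATH), i.e. dirfd names
// the executable itself and pathname is the empty string.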
func Execveat(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirfd := args[0].Int()
	pathnameAddr := args[1].Pointer()
	argvAddr := args[2].Pointer()
	envvAddr := args[3].Pointer()
	flags := args[4].Int()
	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
}

func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
	if err != nil {
		return 0, nil, err
	}
	var argv, envv []string
	if argvAddr != 0 {
		var err error
		argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize)
		if err != nil {
			return 0, nil, err
		}
	}
	if envvAddr != 0 {
		var err error
		envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize)
		if err != nil {
			return 0, nil, err
		}
	}

	root := t.FSContext().RootDirectory()
	defer root.DecRef(t)
	var executable *vfs.FileDescription
	defer func() {
		if executable != nil {
			executable.DecRef(t)
		}
	}()
	closeOnExec := false
	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
		// We must open the executable ourselves since dirfd is used as the
		// starting point while resolving path, but the task working directory
		// is used as the starting point while resolving interpreters (Linux:
		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
		// incapable of handling this correctly.
		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
			return 0, nil, linuxerr.ENOENT
		}
		dirfile, dirfileFlags := t.FDTable().Get(dirfd)
		if dirfile == nil {
			return 0, nil, linuxerr.EBADF
		}
		start := dirfile.VirtualDentry()
		start.IncRef()
		dirfile.DecRef(t)
		closeOnExec = dirfileFlags.CloseOnExec
		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
			Root:               root,
			Start:              start,
			Path:               path,
			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
		}, &vfs.OpenOptions{
			Flags:    linux.O_RDONLY,
			FileExec: true,
		})
		start.DecRef(t)
		if err != nil {
			return 0, nil, err
		}
		executable = file
		pathname = executable.MappedName(t)
	}

	// Load the new TaskImage.
	wd := t.FSContext().WorkingDirectory()
	defer wd.DecRef(t)
	remainingTraversals := uint(linux.MaxSymlinkTraversals)
	loadArgs := loader.LoadArgs{
		Root:                root,
		WorkingDir:          wd,
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
		Filename:            pathname,
		File:                executable,
		CloseOnExec:         closeOnExec,
		Argv:                argv,
		Envv:                envv,
		Features:            t.Kernel().FeatureSet(),
	}
	if seccheck.Global.Enabled(seccheck.PointExecve) {
		// Retain the first executable file that is opened (which may open
		// multiple executable files while resolving interpreter scripts).
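		// For example, executing an interpreter script ("#!/bin/sh" ...) makes
		// the loader open the script first and then /bin/sh; the AfterOpen hook
		// below fires on every open and keeps only the first file.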
		if executable == nil {
			loadArgs.AfterOpen = func(f *vfs.FileDescription) {
				if executable == nil {
					f.IncRef()
					executable = f
					pathname = executable.MappedName(t)
				}
			}
		}
	}

	image, se := t.Kernel().LoadTaskImage(t, loadArgs)
	if se != nil {
		return 0, nil, se.ToError()
	}

	ctrl, err := t.Execve(image, argv, envv, executable, pathname)
	return 0, ctrl, err
}

// Exit implements linux syscall exit(2).
func Exit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	status := args[0].Int()
	t.PrepareExit(linux.WaitStatusExit(status & 0xff))
	return 0, kernel.CtrlDoExit, nil
}

// ExitGroup implements linux syscall exit_group(2).
func ExitGroup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	status := args[0].Int()
	t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff))
	return 0, kernel.CtrlDoExit, nil
}

// clone is used by Clone, Fork, and VFork.
func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) {
	args := linux.CloneArgs{
		Flags:      uint64(uint32(flags) &^ linux.CSIGNAL),
		ChildTID:   uint64(childTID),
		ParentTID:  uint64(parentTID),
		ExitSignal: uint64(flags & linux.CSIGNAL),
		Stack:      uint64(stack),
		TLS:        uint64(tls),
	}
	ntid, ctrl, err := t.Clone(&args)
	return uintptr(ntid), ctrl, err
}

// Fork implements Linux syscall fork(2).
func Fork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// "A call to fork() is equivalent to a call to clone(2) specifying flags
	// as just SIGCHLD." - fork(2)
	return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0)
}

// Vfork implements Linux syscall vfork(2).
func Vfork(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// """
	// A call to vfork() is equivalent to calling clone(2) with flags specified as:
	//
	//	CLONE_VM | CLONE_VFORK | SIGCHLD
	// """ - vfork(2)
	return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0)
}

// Clone3 implements linux syscall clone3(2).
func Clone3(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	cloneArgsPointer := args[0].Pointer()
	size := args[1].SizeT()

	if int(size) < linux.CLONE_ARGS_SIZE_VER0 || int(size) > linux.CLONE_ARGS_SIZE_VER2 {
		return 0, nil, linuxerr.EINVAL
	}

	var cloneArgs linux.CloneArgs
	if cloneArgsPointer != 0 {
		if _, err := cloneArgs.CopyInN(t, cloneArgsPointer, int(size)); err != nil {
			return 0, nil, err
		}
	}

	ntid, ctrl, err := t.Clone(&cloneArgs)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(ntid), ctrl, err
}

// parseCommonWaitOptions applies the options common to wait4 and waitid to
// wopts.
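//
// For example, a user-space call like wait4(-1, &status, __WALL, NULL) sets
// both NonCloneTasks and CloneTasks below, whereas the default (neither
// __WCLONE nor __WALL) waits only for children whose exit signal is SIGCHLD.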
func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error {
	switch options & (linux.WCLONE | linux.WALL) {
	case 0:
		wopts.NonCloneTasks = true
	case linux.WCLONE:
		wopts.CloneTasks = true
	case linux.WALL:
		wopts.NonCloneTasks = true
		wopts.CloneTasks = true
	default:
		return linuxerr.EINVAL
	}
	if options&linux.WCONTINUED != 0 {
		wopts.Events |= kernel.EventGroupContinue
	}
	if options&linux.WNOHANG == 0 {
		wopts.BlockInterruptErr = linuxerr.ERESTARTSYS
	}
	if options&linux.WNOTHREAD == 0 {
		wopts.SiblingChildren = true
	}
	return nil
}

// wait4 waits for the given child process to exit.
func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) {
	if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
		return 0, linuxerr.EINVAL
	}
	wopts := kernel.WaitOptions{
		Events:       kernel.EventExit | kernel.EventTraceeStop,
		ConsumeEvent: true,
	}
	// There are four cases to consider:
	//
	// pid < -1    any child process whose process group ID is equal to the absolute value of pid
	// pid == -1   any child process
	// pid == 0    any child process whose process group ID is equal to that of the calling process
	// pid > 0     the child whose process ID is equal to the value of pid
	switch {
	case pid < -1:
		wopts.SpecificPGID = kernel.ProcessGroupID(-pid)
	case pid == -1:
		// Any process is the default.
	case pid == 0:
		wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())
	default:
		wopts.SpecificTID = kernel.ThreadID(pid)
	}

	if err := parseCommonWaitOptions(&wopts, options); err != nil {
		return 0, err
	}
	if options&linux.WUNTRACED != 0 {
		wopts.Events |= kernel.EventChildGroupStop
	}

	wr, err := t.Wait(&wopts)
	if err != nil {
		if err == kernel.ErrNoWaitableEvent {
			return 0, nil
		}
		return 0, err
	}
	if statusAddr != 0 {
		if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil {
			return 0, err
		}
	}
	if rusageAddr != 0 {
		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
			return 0, err
		}
	}
	return uintptr(wr.TID), nil
}

// Wait4 implements linux syscall wait4(2).
func Wait4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := int(args[0].Int())
	statusAddr := args[1].Pointer()
	options := int(args[2].Uint())
	rusageAddr := args[3].Pointer()

	n, err := wait4(t, pid, statusAddr, options, rusageAddr)
	return n, nil, err
}

// WaitPid implements linux syscall waitpid(2).
func WaitPid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := int(args[0].Int())
	statusAddr := args[1].Pointer()
	options := int(args[2].Uint())

	n, err := wait4(t, pid, statusAddr, options, 0)
	return n, nil, err
}

// Waitid implements linux syscall waitid(2).
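//
// Note that the raw syscall takes a fifth struct rusage * argument that the
// glibc wrapper does not expose; it is honored here via rusageAddr. The caller
// must request at least one of WEXITED, WSTOPPED, or WCONTINUED, or the call
// fails with EINVAL.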
func Waitid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	idtype := args[0].Int()
	id := args[1].Int()
	infop := args[2].Pointer()
	options := int(args[3].Uint())
	rusageAddr := args[4].Pointer()

	if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 {
		return 0, nil, linuxerr.EINVAL
	}
	wopts := kernel.WaitOptions{
		Events:       kernel.EventTraceeStop,
		ConsumeEvent: options&linux.WNOWAIT == 0,
	}
	switch idtype {
	case linux.P_ALL:
	case linux.P_PID:
		wopts.SpecificTID = kernel.ThreadID(id)
	case linux.P_PGID:
		wopts.SpecificPGID = kernel.ProcessGroupID(id)
	default:
		return 0, nil, linuxerr.EINVAL
	}

	if err := parseCommonWaitOptions(&wopts, options); err != nil {
		return 0, nil, err
	}
	if options&linux.WEXITED != 0 {
		wopts.Events |= kernel.EventExit
	}
	if options&linux.WSTOPPED != 0 {
		wopts.Events |= kernel.EventChildGroupStop
	}

	wr, err := t.Wait(&wopts)
	if err != nil {
		if err == kernel.ErrNoWaitableEvent {
			err = nil
			// "If WNOHANG was specified in options and there were no children
			// in a waitable state, then waitid() returns 0 immediately and the
			// state of the siginfo_t structure pointed to by infop is
			// unspecified." - waitid(2). But Linux's waitid actually zeroes
			// out the fields it would set for a successful waitid in this case
			// as well.
			if infop != 0 {
				var si linux.SignalInfo
				_, err = si.CopyOut(t, infop)
			}
		}
		return 0, nil, err
	}
	if rusageAddr != 0 {
		ru := getrusage(wr.Task, linux.RUSAGE_BOTH)
		if _, err := ru.CopyOut(t, rusageAddr); err != nil {
			return 0, nil, err
		}
	}
	if infop == 0 {
		return 0, nil, nil
	}
	si := linux.SignalInfo{
		Signo: int32(linux.SIGCHLD),
	}
	si.SetPID(int32(wr.TID))
	si.SetUID(int32(wr.UID))
	s := wr.Status
	switch {
	case s.Exited():
		si.Code = linux.CLD_EXITED
		si.SetStatus(int32(s.ExitStatus()))
	case s.Signaled():
		if s.CoreDumped() {
			si.Code = linux.CLD_DUMPED
		} else {
			si.Code = linux.CLD_KILLED
		}
		si.SetStatus(int32(s.TerminationSignal()))
	case s.Stopped():
		if wr.Event == kernel.EventTraceeStop {
			si.Code = linux.CLD_TRAPPED
			si.SetStatus(int32(s.PtraceEvent()))
		} else {
			si.Code = linux.CLD_STOPPED
			si.SetStatus(int32(s.StopSignal()))
		}
	case s.Continued():
		si.Code = linux.CLD_CONTINUED
		si.SetStatus(int32(linux.SIGCONT))
	default:
		t.Warningf("waitid got incomprehensible wait status %d", s)
	}
	_, err = si.CopyOut(t, infop)
	return 0, nil, err
}

// SetTidAddress implements linux syscall set_tid_address(2).
func SetTidAddress(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	// Always succeed, return caller's tid.
	t.SetClearTID(addr)
	return uintptr(t.ThreadID()), nil, nil
}

// Setns implements linux syscall setns(2).
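//
// The fd is expected to refer to a namespace, typically obtained by opening a
// /proc/<pid>/ns/* file; flags selects the namespace type to accept (e.g.
// CLONE_NEWNET), or 0 to allow any type.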
func Setns(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()

	file := t.GetFile(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	flags := args[1].Int()
	return 0, nil, t.Setns(file, flags)
}

// Unshare implements linux syscall unshare(2).
func Unshare(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()
	// "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2)
	if flags&linux.CLONE_NEWPID != 0 {
		flags |= linux.CLONE_THREAD
	}
	// "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since
	// Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS."
	if flags&linux.CLONE_NEWUSER != 0 {
		flags |= linux.CLONE_THREAD | linux.CLONE_FS
	}
	return 0, nil, t.Unshare(flags)
}

// SchedYield implements linux syscall sched_yield(2).
func SchedYield(t *kernel.Task, sysno uintptr, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	t.Yield()
	return 0, nil, nil
}

// SchedSetaffinity implements linux syscall sched_setaffinity(2).
func SchedSetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := args[0].Int()
	size := args[1].SizeT()
	maskAddr := args[2].Pointer()

	var task *kernel.Task
	if tid == 0 {
		task = t
	} else {
		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
		if task == nil {
			return 0, nil, linuxerr.ESRCH
		}
	}

	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
	if size > mask.Size() {
		size = mask.Size()
	}
	if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil {
		return 0, nil, err
	}
	return 0, nil, task.SetCPUMask(mask)
}

// SchedGetaffinity implements linux syscall sched_getaffinity(2).
func SchedGetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := args[0].Int()
	size := args[1].SizeT()
	maskAddr := args[2].Pointer()

	// This limitation is because linux stores the cpumask
	// in an array of "unsigned long" so the buffer needs to
	// be a multiple of the word size.
	if size&(t.Arch().Width()-1) > 0 {
		return 0, nil, linuxerr.EINVAL
	}

	var task *kernel.Task
	if tid == 0 {
		task = t
	} else {
		task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid))
		if task == nil {
			return 0, nil, linuxerr.ESRCH
		}
	}

	mask := task.CPUMask()
	// The buffer needs to be big enough to hold a cpumask with
	// all possible cpus.
	if size < mask.Size() {
		return 0, nil, linuxerr.EINVAL
	}
	_, err := t.CopyOutBytes(maskAddr, mask)

	// NOTE: The syscall interface is slightly different than the glibc
	// interface. The raw sched_getaffinity syscall returns the number of
	// bytes used to represent a cpu mask.
	return uintptr(mask.Size()), nil, err
}

// Getcpu implements linux syscall getcpu(2).
func Getcpu(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	cpu := args[0].Pointer()
	node := args[1].Pointer()
	// third argument to this system call is nowadays unused.
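	// (It used to point to a getcpu_cache structure; the kernel has ignored it
	// since Linux 2.6.24.)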

	if cpu != 0 {
		if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil {
			return 0, nil, err
		}
	}
	// We always return node 0.
	if node != 0 {
		if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{
			AddressSpaceActive: true,
		}); err != nil {
			return 0, nil, err
		}
	}
	return 0, nil, nil
}

// Setpgid implements the linux syscall setpgid(2).
func Setpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// Note that throughout this function, pgid is interpreted with respect
	// to t's namespace, not with respect to the selected ThreadGroup's
	// namespace (which may be different).
	pid := kernel.ThreadID(args[0].Int())
	pgid := kernel.ProcessGroupID(args[1].Int())

	// "If pid is zero, then the process ID of the calling process is used."
	tg := t.ThreadGroup()
	if pid != 0 {
		ot := t.PIDNamespace().TaskWithID(pid)
		if ot == nil {
			return 0, nil, linuxerr.ESRCH
		}
		tg = ot.ThreadGroup()
		if tg.Leader() != ot {
			return 0, nil, linuxerr.EINVAL
		}

		// Setpgid only operates on child threadgroups.
		if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) {
			return 0, nil, linuxerr.ESRCH
		}
	}

	// "If pgid is zero, then the PGID of the process specified by pid is made
	// the same as its process ID."
	defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg))
	if pgid == 0 {
		pgid = defaultPGID
	} else if pgid < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// If the pgid is the same as the group, then create a new one. Otherwise,
	// we attempt to join an existing process group.
	if pgid == defaultPGID {
		// For convenience, errors line up with Linux syscall API.
		if err := tg.CreateProcessGroup(); err != nil {
			// Is the process group already as expected? If so,
			// just return success. This is the same behavior as
			// Linux.
			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID {
				return 0, nil, nil
			}
			return 0, nil, err
		}
	} else {
		// Same as CreateProcessGroup, above.
		if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil {
			// See above.
			if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid {
				return 0, nil, nil
			}
			return 0, nil, err
		}
	}

	// Success.
	return 0, nil, nil
}

// Getpgrp implements the linux syscall getpgrp(2).
func Getpgrp(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil
}

// Getpgid implements the linux syscall getpgid(2).
func Getpgid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := kernel.ThreadID(args[0].Int())
	if tid == 0 {
		return Getpgrp(t, sysno, args)
	}

	target := t.PIDNamespace().TaskWithID(tid)
	if target == nil {
		return 0, nil, linuxerr.ESRCH
	}

	return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil
}

// Setsid implements the linux syscall setsid(2).
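//
// On success the calling thread group becomes the leader of a new session and
// of a new process group, both with ID equal to its PID; the call fails (EPERM
// in Linux) if the caller is already a process group leader.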
func Setsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	sid, err := t.ThreadGroup().CreateSession()
	if err != nil {
		return 0, nil, err
	}
	return uintptr(sid), nil, nil
}

// Getsid implements the linux syscall getsid(2).
func Getsid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	tid := kernel.ThreadID(args[0].Int())
	if tid == 0 {
		return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil
	}

	target := t.PIDNamespace().TaskWithID(tid)
	if target == nil {
		return 0, nil, linuxerr.ESRCH
	}

	return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil
}

// Getpriority pretends to implement the linux syscall getpriority(2).
//
// This is a stub; real priorities require a full scheduler.
func Getpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	which := args[0].Int()
	who := kernel.ThreadID(args[1].Int())

	switch which {
	case linux.PRIO_PROCESS:
		// Look for who, return ESRCH if not found.
		var task *kernel.Task
		if who == 0 {
			task = t
		} else {
			task = t.PIDNamespace().TaskWithID(who)
		}

		if task == nil {
			return 0, nil, linuxerr.ESRCH
		}

		// From kernel/sys.c:getpriority:
		// "To avoid negative return values, 'getpriority()'
		// will not return the normal nice-value, but a negated
		// value that has been offset by 20"
		return uintptr(20 - task.Niceness()), nil, nil
	case linux.PRIO_USER:
		fallthrough
	case linux.PRIO_PGRP:
		// PRIO_USER and PRIO_PGRP have no further implementation yet.
		return 0, nil, nil
	default:
		return 0, nil, linuxerr.EINVAL
	}
}

// Setpriority pretends to implement the linux syscall setpriority(2).
//
// This is a stub; real priorities require a full scheduler.
func Setpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	which := args[0].Int()
	who := kernel.ThreadID(args[1].Int())
	niceval := int(args[2].Int())

	// In the kernel's implementation, values outside the range
	// of [-20, 19] are truncated to these minimum and maximum
	// values.
	if niceval < -20 /* min niceval */ {
		niceval = -20
	} else if niceval > 19 /* max niceval */ {
		niceval = 19
	}

	switch which {
	case linux.PRIO_PROCESS:
		// Look for who, return ESRCH if not found.
		var task *kernel.Task
		if who == 0 {
			task = t
		} else {
			task = t.PIDNamespace().TaskWithID(who)
		}

		if task == nil {
			return 0, nil, linuxerr.ESRCH
		}

		task.SetNiceness(niceval)
	case linux.PRIO_USER:
		fallthrough
	case linux.PRIO_PGRP:
		// PRIO_USER and PRIO_PGRP have no further implementation yet.
		return 0, nil, nil
	default:
		return 0, nil, linuxerr.EINVAL
	}

	return 0, nil, nil
}

// Ptrace implements linux system call ptrace(2).
func Ptrace(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	req := args[0].Int64()
	pid := kernel.ThreadID(args[1].Int())
	addr := args[2].Pointer()
	data := args[3].Pointer()

	return 0, nil, t.Ptrace(req, pid, addr, data)
}