github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_exit.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 // This file implements the task exit cycle: 18 // 19 // - Tasks are asynchronously requested to exit with Task.Kill. 20 // 21 // - When able, the task goroutine enters the exit path starting from state 22 // runExit. 23 // 24 // - Other tasks observe completed exits with Task.Wait (which implements the 25 // wait*() family of syscalls). 26 27 import ( 28 "errors" 29 "fmt" 30 "strconv" 31 "strings" 32 33 "github.com/SagerNet/gvisor/pkg/abi/linux" 34 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 35 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 36 "github.com/SagerNet/gvisor/pkg/syserror" 37 "github.com/SagerNet/gvisor/pkg/waiter" 38 ) 39 40 // An ExitStatus is a value communicated from an exiting task or thread group 41 // to the party that reaps it. 42 // 43 // +stateify savable 44 type ExitStatus struct { 45 // Code is the numeric value passed to the call to exit or exit_group that 46 // caused the exit. If the exit was not caused by such a call, Code is 0. 47 Code int 48 49 // Signo is the signal that caused the exit. If the exit was not caused by 50 // a signal, Signo is 0. 
51 Signo int 52 } 53 54 func (es ExitStatus) String() string { 55 var b strings.Builder 56 if code := es.Code; code != 0 { 57 if b.Len() != 0 { 58 b.WriteByte(' ') 59 } 60 _, _ = fmt.Fprintf(&b, "Code=%d", code) 61 } 62 if signal := es.Signo; signal != 0 { 63 if b.Len() != 0 { 64 b.WriteByte(' ') 65 } 66 _, _ = fmt.Fprintf(&b, "Signal=%d", signal) 67 } 68 return b.String() 69 } 70 71 // Signaled returns true if the ExitStatus indicates that the exiting task or 72 // thread group was killed by a signal. 73 func (es ExitStatus) Signaled() bool { 74 return es.Signo != 0 75 } 76 77 // Status returns the numeric representation of the ExitStatus returned by e.g. 78 // the wait4() system call. 79 func (es ExitStatus) Status() uint32 { 80 return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff) 81 } 82 83 // ShellExitCode returns the numeric exit code that Bash would return for an 84 // exit status of es. 85 func (es ExitStatus) ShellExitCode() int { 86 if es.Signaled() { 87 return 128 + es.Signo 88 } 89 return es.Code 90 } 91 92 // TaskExitState represents a step in the task exit path. 93 // 94 // "Exiting" and "exited" are often ambiguous; prefer to name specific states. 95 type TaskExitState int 96 97 const ( 98 // TaskExitNone indicates that the task has not begun exiting. 99 TaskExitNone TaskExitState = iota 100 101 // TaskExitInitiated indicates that the task goroutine has entered the exit 102 // path, and the task is no longer eligible to participate in group stops 103 // or group signal handling. TaskExitInitiated is analogous to Linux's 104 // PF_EXITING. 105 TaskExitInitiated 106 107 // TaskExitZombie indicates that the task has released its resources, and 108 // the task no longer prevents a sibling thread from completing execve. 109 TaskExitZombie 110 111 // TaskExitDead indicates that the task's thread IDs have been released, 112 // and the task no longer prevents its thread group leader from being 113 // reaped. 
("Reaping" refers to the transitioning of a task from 114 // TaskExitZombie to TaskExitDead.) 115 TaskExitDead 116 ) 117 118 // String implements fmt.Stringer. 119 func (t TaskExitState) String() string { 120 switch t { 121 case TaskExitNone: 122 return "TaskExitNone" 123 case TaskExitInitiated: 124 return "TaskExitInitiated" 125 case TaskExitZombie: 126 return "TaskExitZombie" 127 case TaskExitDead: 128 return "TaskExitDead" 129 default: 130 return strconv.Itoa(int(t)) 131 } 132 } 133 134 // killLocked marks t as killed by enqueueing a SIGKILL, without causing the 135 // thread-group-affecting side effects SIGKILL usually has. 136 // 137 // Preconditions: The signal mutex must be locked. 138 func (t *Task) killLocked() { 139 // Clear killable stops. 140 if t.stop != nil && t.stop.Killable() { 141 t.endInternalStopLocked() 142 } 143 t.pendingSignals.enqueue(&linux.SignalInfo{ 144 Signo: int32(linux.SIGKILL), 145 // Linux just sets SIGKILL in the pending signal bitmask without 146 // enqueueing an actual siginfo, such that 147 // kernel/signal.c:collect_signal() initializes si_code to SI_USER. 148 Code: linux.SI_USER, 149 }, nil) 150 t.interrupt() 151 } 152 153 // killed returns true if t has a SIGKILL pending. killed is analogous to 154 // Linux's fatal_signal_pending(). 155 // 156 // Preconditions: The caller must be running on the task goroutine. 157 func (t *Task) killed() bool { 158 t.tg.signalHandlers.mu.Lock() 159 defer t.tg.signalHandlers.mu.Unlock() 160 return t.killedLocked() 161 } 162 163 func (t *Task) killedLocked() bool { 164 return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0 165 } 166 167 // PrepareExit indicates an exit with status es. 168 // 169 // Preconditions: The caller must be running on the task goroutine. 
170 func (t *Task) PrepareExit(es ExitStatus) { 171 t.tg.signalHandlers.mu.Lock() 172 defer t.tg.signalHandlers.mu.Unlock() 173 t.exitStatus = es 174 } 175 176 // PrepareGroupExit indicates a group exit with status es to t's thread group. 177 // 178 // PrepareGroupExit is analogous to Linux's do_group_exit(), except that it 179 // does not tail-call do_exit(), except that it *does* set Task.exitStatus. 180 // (Linux does not do so until within do_exit(), since it reuses exit_code for 181 // ptrace.) 182 // 183 // Preconditions: The caller must be running on the task goroutine. 184 func (t *Task) PrepareGroupExit(es ExitStatus) { 185 t.tg.signalHandlers.mu.Lock() 186 defer t.tg.signalHandlers.mu.Unlock() 187 if t.tg.exiting || t.tg.execing != nil { 188 // Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e. 189 // this "group exit" is being executed by the killed sibling of an 190 // execing task, then Task.Execve never set t.tg.exitStatus, so it's 191 // still the zero value. This is consistent with Linux, both in intent 192 // ("all other threads ... report death as if they exited via _exit(2) 193 // with exit code 0" - ptrace(2), "execve under ptrace") and in 194 // implementation (compare fs/exec.c:de_thread() => 195 // kernel/signal.c:zap_other_threads() and 196 // kernel/exit.c:do_group_exit() => 197 // include/linux/sched.h:signal_group_exit()). 198 t.exitStatus = t.tg.exitStatus 199 return 200 } 201 t.tg.exiting = true 202 t.tg.exitStatus = es 203 t.exitStatus = es 204 for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { 205 if sibling != t { 206 sibling.killLocked() 207 } 208 } 209 } 210 211 // Kill requests that all tasks in ts exit as if group exiting with status es. 212 // Kill does not wait for tasks to exit. 213 // 214 // Kill has no analogue in Linux; it's provided for save/restore only. 
func (ts *TaskSet) Kill(es ExitStatus) {
	ts.mu.Lock()
	defer ts.mu.Unlock()
	ts.Root.exiting = true
	for t := range ts.Root.tids {
		t.tg.signalHandlers.mu.Lock()
		// Only the first task seen in each thread group records es; a thread
		// group that is already exiting keeps its existing exit status.
		if !t.tg.exiting {
			t.tg.exiting = true
			t.tg.exitStatus = es
		}
		t.killLocked()
		t.tg.signalHandlers.mu.Unlock()
	}
}

// advanceExitStateLocked checks that t's current exit state is oldExit, then
// sets it to newExit. If t's current exit state is not oldExit,
// advanceExitStateLocked panics.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
	if t.exitState != oldExit {
		panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
	}
	t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
	t.exitState = newExit
}

// runExit is the entry point into the task exit path.
//
// +stateify savable
type runExit struct{}

// execute calls t.ptraceExit and then continues the exit path in
// runExitMain.
func (*runExit) execute(t *Task) taskRunState {
	t.ptraceExit()
	return (*runExitMain)(nil)
}

// runExitMain is the task run state that follows runExit; its execute method
// releases the exiting task's resources.
//
// +stateify savable
type runExitMain struct{}

// execute moves t to TaskExitInitiated (via exitThreadGroup), releases the
// resources t holds, detaches its tracees, reparents its children, and then
// continues in runExitNotify.
func (*runExitMain) execute(t *Task) taskRunState {
	t.traceExitEvent()
	lastExiter := t.exitThreadGroup()

	t.ResetKcov()

	// If the task has a cleartid, and the thread group wasn't killed by a
	// signal, handle that before releasing the MM.
	if t.cleartid != 0 {
		t.tg.signalHandlers.mu.Lock()
		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
		t.tg.signalHandlers.mu.Unlock()
		if !signaled {
			zero := ThreadID(0)
			if _, err := zero.CopyOut(t, t.cleartid); err == nil {
				t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
			}
			// If the CopyOut fails, there's nothing we can do.
		}
	}

	// Handle the robust futex list.
	t.exitRobustList()

	// Deactivate the address space and update max RSS before releasing the
	// task's MM.
	t.Deactivate()
	t.tg.pidns.owner.mu.Lock()
	t.updateRSSLocked()
	t.tg.pidns.owner.mu.Unlock()
	t.mu.Lock()
	t.image.release()
	t.mu.Unlock()

	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
	t.unstopVforkParent()

	t.fsContext.DecRef(t)
	t.fdTable.DecRef(t)

	// Detach task from all cgroups. This must happen before potentially the
	// last ref to the cgroupfs mount is dropped below.
	t.LeaveCgroups()

	t.mu.Lock()
	if t.mountNamespaceVFS2 != nil {
		t.mountNamespaceVFS2.DecRef(t)
		t.mountNamespaceVFS2 = nil
	}
	t.ipcns.DecRef(t)
	t.mu.Unlock()

	// If this is the last task to exit from the thread group, release the
	// thread group's resources.
	if lastExiter {
		t.tg.Release(t)
	}

	// Detach tracees.
	t.exitPtrace()

	// Reparent the task's children.
	t.exitChildren()

	// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
	// to wait for a PID namespace to die.
	return (*runExitNotify)(nil)
}

// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
// group that it is no longer eligible to participate in group activities. It
// returns true if t is the last task in its thread group to call
// exitThreadGroup.
func (t *Task) exitThreadGroup() bool {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	// Can't defer unlock: see below.

	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
	t.tg.activeTasks--
	last := t.tg.activeTasks == 0

	// Ensure that someone will handle the signals we can't.
	t.setSignalMaskLocked(^linux.SignalSet(0))

	// Check if this task's exit interacts with an initiated group stop.
	if !t.groupStopPending {
		t.tg.signalHandlers.mu.Unlock()
		return last
	}
	t.groupStopPending = false
	sig := t.tg.groupStopSignal
	notifyParent := t.participateGroupStopLocked()
	// signalStop must be called with t's signal mutex unlocked.
	t.tg.signalHandlers.mu.Unlock()
	if notifyParent && t.tg.leader.parent != nil {
		t.tg.leader.parent.signalStop(t, linux.CLD_STOPPED, int32(sig))
		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
	}
	return last
}

// exitChildren reparents each of t's children to the task chosen by
// findReparentTargetLocked, first sending each child its parent death signal
// if it has one. If no reparent target exists, it also marks t's PID namespace
// as exiting and sends SIGKILL to every other thread group in that namespace.
func (t *Task) exitChildren() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	newParent := t.findReparentTargetLocked()
	if newParent == nil {
		// "If the init process of a PID namespace terminates, the kernel
		// terminates all of the processes in the namespace via a SIGKILL
		// signal." - pid_namespaces(7)
		t.Debugf("Init process terminating, killing namespace")
		t.tg.pidns.exiting = true
		for other := range t.tg.pidns.tgids {
			if other == t.tg {
				continue
			}
			other.signalHandlers.mu.Lock()
			other.leader.sendSignalLocked(&linux.SignalInfo{
				Signo: int32(linux.SIGKILL),
			}, true /* group */)
			other.signalHandlers.mu.Unlock()
		}
		// TODO(b/37722272): The init process waits for all processes in the
		// namespace to exit before completing its own exit
		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
		// other tasks in the namespace are dead, except possibly for this
		// thread group's leader (which can't be reaped until this task exits).
	}
	// This is correct even if newParent is nil (it ensures that children don't
	// wait for a parent to reap them.)
	for c := range t.children {
		if sig := c.ParentDeathSignal(); sig != 0 {
			siginfo := &linux.SignalInfo{
				Signo: int32(sig),
				Code:  linux.SI_USER,
			}
			siginfo.SetPID(int32(c.tg.pidns.tids[t]))
			siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
			c.tg.signalHandlers.mu.Lock()
			c.sendSignalLocked(siginfo, true /* group */)
			c.tg.signalHandlers.mu.Unlock()
		}
		c.reparentLocked(newParent)
		// NOTE(review): reparentLocked already inserts c into
		// newParent.children, so this second insert looks redundant. It may
		// also re-add c if the exit notification triggered by reparentLocked
		// reaped c and removed it from its parent's children - confirm
		// whether this is intentional.
		if newParent != nil {
			newParent.children[c] = struct{}{}
		}
	}
}

// findReparentTargetLocked returns the task to which t's children should be
// reparented. If no such task exists, findReparentTargetLocked returns nil.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) findReparentTargetLocked() *Task {
	// Reparent to any sibling in the same thread group that hasn't begun
	// exiting.
	if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
		return t2
	}
	// "A child process that is orphaned within the namespace will be
	// reparented to [the init process for the namespace] ..." -
	// pid_namespaces(7)
	if init := t.tg.pidns.tasks[InitTID]; init != nil {
		return init.tg.anyNonExitingTaskLocked()
	}
	return nil
}

// anyNonExitingTaskLocked returns the first task in tg's task list whose exit
// state is still TaskExitNone, or nil if there is none.
func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		if t.exitState == TaskExitNone {
			return t
		}
	}
	return nil
}

// reparentLocked changes t's parent. The new parent may be nil.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) reparentLocked(parent *Task) {
	oldParent := t.parent
	t.parent = parent
	if oldParent != nil {
		delete(oldParent.children, t)
	}
	if parent != nil {
		parent.children[t] = struct{}{}
	}
	// If a thread group leader's parent changes, reset the thread group's
	// termination signal to SIGCHLD and re-check exit notification. (Compare
	// kernel/exit.c:reparent_leader().)
	if t != t.tg.leader {
		return
	}
	if oldParent == nil && parent == nil {
		return
	}
	// Moving between two tasks of the same thread group does not count as a
	// change of parent for notification purposes.
	if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
		return
	}
	t.tg.terminationSignal = linux.SIGCHLD
	// A notification that was sent but not yet acknowledged is re-issued for
	// the new parent.
	if t.exitParentNotified && !t.exitParentAcked {
		t.exitParentNotified = false
		t.exitNotifyLocked(false)
	}
}

// When a task exits, other tasks in the system, notably the task's parent and
// ptracer, may want to be notified. The exit notification system ensures that
// interested tasks receive signals and/or are woken from blocking calls to
// wait*() syscalls; these notifications must be resolved before exiting tasks
// can be reaped and disappear from the system.
//
// Each task may have a parent task and/or a tracer task. If both a parent and
// a tracer exist, they may be the same task, different tasks in the same
// thread group, or tasks in different thread groups. (In the last case, Linux
// refers to the task as being ptrace-reparented due to an implementation
// detail; we avoid this terminology to avoid confusion.)
//
// A thread group is *empty* if all non-leader tasks in the thread group are
// dead, and the leader is either a zombie or dead. The exit of a thread group
// leader is never waitable - by either the parent or tracer - until the thread
// group is empty.
//
// There are a few ways for an exit notification to be resolved:
//
// - The exit notification may be acknowledged by a call to Task.Wait with
// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
//
// - If the notified party is the parent, and the parent thread group is not
// also the tracer thread group, and the notification signal is SIGCHLD, the
// parent may explicitly ignore the notification (see quote in exitNotify).
490 // Note that it's possible for the notified party to ignore the signal in other 491 // cases, but the notification is only resolved under the above conditions. 492 // (Actually, there is one exception; see the last paragraph of the "leader, 493 // has tracer, tracer thread group is parent thread group" case below.) 494 // 495 // - If the notified party is the parent, and the parent does not exist, the 496 // notification is resolved as if ignored. (This is only possible in the 497 // sentry. In Linux, the only task / thread group without a parent is global 498 // init, and killing global init causes a kernel panic.) 499 // 500 // - If the notified party is a tracer, the tracer may detach the traced task. 501 // (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.) 502 // 503 // In addition, if the notified party is the parent, the parent may exit and 504 // cause the notifying task to be reparented to another thread group. This does 505 // not resolve the notification; instead, the notification must be resent to 506 // the new parent. 507 // 508 // The series of notifications generated for a given task's exit depend on 509 // whether it is a thread group leader; whether the task is ptraced; and, if 510 // so, whether the tracer thread group is the same as the parent thread group. 511 // 512 // - Non-leader, no tracer: No notification is generated; the task is reaped 513 // immediately. 514 // 515 // - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer 516 // notification is resolved (by waiting or detaching), the task is reaped. (For 517 // non-leaders, whether the tracer and parent thread groups are the same is 518 // irrelevant.) 519 // 520 // - Leader, no tracer: The task remains a zombie, with no notification sent, 521 // until all other tasks in the thread group are dead. 
(In Linux terms, this 522 // condition is indicated by include/linux/sched.h:thread_group_empty(); tasks 523 // are removed from their thread_group list in kernel/exit.c:release_task() => 524 // __exit_signal() => __unhash_process().) Then the thread group's termination 525 // signal is sent to the parent. When the parent notification is resolved (by 526 // waiting or ignoring), the task is reaped. 527 // 528 // - Leader, has tracer, tracer thread group is not parent thread group: 529 // SIGCHLD is sent to the tracer. When the tracer notification is resolved (by 530 // waiting or detaching), and all other tasks in the thread group are dead, the 531 // thread group's termination signal is sent to the parent. (Note that the 532 // tracer cannot resolve the exit notification by waiting until the thread 533 // group is empty.) When the parent notification is resolved, the task is 534 // reaped. 535 // 536 // - Leader, has tracer, tracer thread group is parent thread group: 537 // 538 // If all other tasks in the thread group are dead, the thread group's 539 // termination signal is sent to the parent. At this point, the notification 540 // can only be resolved by waiting. If the parent detaches from the task as a 541 // tracer, the notification is not resolved, but the notification can now be 542 // resolved by waiting or ignoring. When the parent notification is resolved, 543 // the task is reaped. 544 // 545 // If at least one task in the thread group is not dead, SIGCHLD is sent to the 546 // parent. At this point, the notification cannot be resolved at all; once the 547 // thread group becomes empty, it can be resolved only by waiting. If the 548 // parent detaches from the task as a tracer before all remaining tasks die, 549 // then exit notification proceeds as in the case where the leader never had a 550 // tracer. 
If the parent detaches from the task as a tracer after all remaining 551 // tasks die, the notification is not resolved, but the notification can now be 552 // resolved by waiting or ignoring. When the parent notification is resolved, 553 // the task is reaped. 554 // 555 // In both of the above cases, when the parent detaches from the task as a 556 // tracer while the thread group is empty, whether or not the parent resolves 557 // the notification by ignoring it is based on the parent's SIGCHLD signal 558 // action, whether or not the thread group's termination signal is SIGCHLD 559 // (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()). 560 // 561 // There is one final wrinkle: A leader can become a non-leader due to a 562 // sibling execve. In this case, the execing thread detaches the leader's 563 // tracer (if one exists) and reaps the leader immediately. In Linux, this is 564 // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked(). 565 566 // +stateify savable 567 type runExitNotify struct{} 568 569 func (*runExitNotify) execute(t *Task) taskRunState { 570 t.tg.pidns.owner.mu.Lock() 571 defer t.tg.pidns.owner.mu.Unlock() 572 t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie) 573 t.tg.liveTasks-- 574 // Check if this completes a sibling's execve. 575 if t.tg.execing != nil && t.tg.liveTasks == 1 { 576 // execing blocks the addition of new tasks to the thread group, so 577 // the sole living task must be the execing one. 578 e := t.tg.execing 579 e.tg.signalHandlers.mu.Lock() 580 if _, ok := e.stop.(*execStop); ok { 581 e.endInternalStopLocked() 582 } 583 e.tg.signalHandlers.mu.Unlock() 584 } 585 t.exitNotifyLocked(false) 586 // The task goroutine will now exit. 587 return nil 588 } 589 590 // exitNotifyLocked is called after changes to t's state that affect exit 591 // notification. 
//
// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
// thanks to Linux's haphazard implementation of this functionality, such cases
// determine whether parent notifications are ignored based on the parent's
// handling of SIGCHLD, regardless of what the exited task's thread group's
// termination signal is.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
	// Only zombies have exit notifications to process.
	if t.exitState != TaskExitZombie {
		return
	}
	// Step 1: notify the tracer (at most once).
	if !t.exitTracerNotified {
		t.exitTracerNotified = true
		tracer := t.Tracer()
		if tracer == nil {
			t.exitTracerAcked = true
		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
			// Don't set exitParentNotified if t is non-leader, even if the
			// tracer is in the parent thread group, so that if the parent
			// detaches the following call to exitNotifyLocked passes through
			// the !exitParentNotified case below and causes t to be reaped
			// immediately.
			//
			// Tracer notification doesn't care about
			// SIG_IGN/SA_NOCLDWAIT.
			tracer.tg.signalHandlers.mu.Lock()
			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
			tracer.tg.signalHandlers.mu.Unlock()
			// Wake EventTraceeStop waiters as well since this task will never
			// ptrace-stop again.
			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
		} else {
			// t is a leader and the tracer is in the parent thread group.
			t.exitParentNotified = true
			sig := linux.SIGCHLD
			if t.tg.tasksCount == 1 {
				sig = t.tg.terminationSignal
			}
			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
			// (in Linux, the check in do_notify_parent() is gated by
			// !tsk->ptrace.)
			t.parent.tg.signalHandlers.mu.Lock()
			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
			t.parent.tg.signalHandlers.mu.Unlock()
			// See below for rationale for this event mask.
			t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
		}
	}
	// Step 2: once the tracer notification is resolved, notify the parent (at
	// most once).
	if t.exitTracerAcked && !t.exitParentNotified {
		if t != t.tg.leader {
			t.exitParentNotified = true
			t.exitParentAcked = true
		} else if t.tg.tasksCount == 1 {
			t.exitParentNotified = true
			if t.parent == nil {
				t.exitParentAcked = true
			} else {
				// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
				// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
				// sigaction(2)), then children that terminate do not become
				// zombies and a call to wait() or waitpid() will block until all
				// children have terminated, and then fail with errno set to
				// ECHILD. (The original POSIX standard left the behavior of
				// setting SIGCHLD to SIG_IGN unspecified. Note that even though
				// the default disposition of SIGCHLD is "ignore", explicitly
				// setting the disposition to SIG_IGN results in different
				// treatment of zombie process children.) Linux 2.6 conforms to
				// this specification." - wait(2)
				//
				// Some undocumented Linux-specific details:
				//
				// - All of the above is ignored if the termination signal isn't
				// SIGCHLD.
				//
				// - SA_NOCLDWAIT causes the leader to be immediately reaped, but
				// does not suppress the SIGCHLD.
				signalParent := t.tg.terminationSignal.IsValid()
				t.parent.tg.signalHandlers.mu.Lock()
				if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
					if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
						if act.Handler == linux.SIG_IGN {
							t.exitParentAcked = true
							signalParent = false
						} else if act.Flags&linux.SA_NOCLDWAIT != 0 {
							t.exitParentAcked = true
						}
					}
				}
				if signalParent {
					t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
				}
				t.parent.tg.signalHandlers.mu.Unlock()
				// If a task in the parent was waiting for a child group stop
				// or continue, it needs to be notified of the exit, because
				// there may be no remaining eligible tasks (so that wait
				// should return ECHILD).
				t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
			}
		}
	}
	// Step 3: once both notifications are resolved, reap t.
	if t.exitTracerAcked && t.exitParentAcked {
		t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
		// Remove t's IDs from its PID namespace and every ancestor namespace.
		for ns := t.tg.pidns; ns != nil; ns = ns.parent {
			tid := ns.tids[t]
			delete(ns.tasks, tid)
			delete(ns.tids, t)
			if t == t.tg.leader {
				delete(ns.tgids, t.tg)
			}
		}
		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
		t.tg.ioUsage.Accumulate(t.ioUsage)
		t.tg.signalHandlers.mu.Lock()
		t.tg.tasks.Remove(t)
		t.tg.tasksCount--
		tc := t.tg.tasksCount
		t.tg.signalHandlers.mu.Unlock()
		if tc == 1 && t != t.tg.leader {
			// Our fromPtraceDetach doesn't matter here (in Linux terms, this
			// is via a call to release_task()).
			t.tg.leader.exitNotifyLocked(false)
		} else if tc == 0 {
			t.tg.processGroup.decRefWithParent(t.tg.parentPG())
		}
		if t.parent != nil {
			delete(t.parent.children, t)
			// Do not clear t.parent. It may still be needed after the task has exited
			// (for example, to perform ptrace access checks on /proc/[pid] files).
		}
	}
}

// exitNotificationSignal returns a SignalInfo for signal sig that describes
// t's exit to receiver. The PID is t's thread ID in receiver's PID namespace
// and the UID is t's real KUID mapped into receiver's user namespace. The
// code and status are CLD_KILLED and the killing signal number if t's exit
// status is signaled, or CLD_EXITED and the exit code otherwise.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.SignalInfo {
	info := &linux.SignalInfo{
		Signo: int32(sig),
	}
	info.SetPID(int32(receiver.tg.pidns.tids[t]))
	info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	if t.exitStatus.Signaled() {
		info.Code = linux.CLD_KILLED
		info.SetStatus(int32(t.exitStatus.Signo))
	} else {
		info.Code = linux.CLD_EXITED
		info.SetStatus(int32(t.exitStatus.Code))
	}
	// TODO(b/72102453): Set utime, stime.
	return info
}

// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
func (t *Task) ExitStatus() ExitStatus {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.exitStatus
}

// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg: the thread group's exit status if a group exit has been
// initiated, and the leader's exit status otherwise.
func (tg *ThreadGroup) ExitStatus() ExitStatus {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting {
		return tg.exitStatus
	}
	return tg.leader.exitStatus
}

// TerminationSignal returns the thread group's termination signal.
func (tg *ThreadGroup) TerminationSignal() linux.Signal {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.terminationSignal
}

// Task events that can be waited for.
const (
	// EventExit represents an exit notification generated for a child thread
	// group leader or a tracee under the conditions specified in the comment
	// above runExitNotify.
	EventExit waiter.EventMask = 1 << iota

	// EventChildGroupStop occurs when a child thread group completes a group
	// stop (i.e. all tasks in the child thread group have entered a stopped
	// state as a result of a group stop).
	EventChildGroupStop

	// EventTraceeStop occurs when a task that is ptraced by a task in the
	// notified thread group enters a ptrace stop (see ptrace(2)).
	EventTraceeStop

	// EventGroupContinue occurs when a child thread group, or a thread group
	// whose leader is ptraced by a task in the notified thread group, that had
	// initiated or completed a group stop leaves the group stop, due to the
	// child thread group or any task in the child thread group being sent
	// SIGCONT.
	EventGroupContinue
)

// WaitOptions controls the behavior of Task.Wait.
type WaitOptions struct {
	// If SpecificTID is non-zero, only events from the task with thread ID
	// SpecificTID are eligible to be waited for. SpecificTID is resolved in
	// the PID namespace of the waiter (the method receiver of Task.Wait). If
	// no such task exists, or that task would not otherwise be eligible to be
	// waited for by the waiting task, then there are no waitable tasks and
	// Wait will return ECHILD.
	SpecificTID ThreadID

	// If SpecificPGID is non-zero, only events from ThreadGroups with a
	// matching ProcessGroupID are eligible to be waited for. (Same
	// constraints as SpecificTID apply.)
	SpecificPGID ProcessGroupID

	// Terminology note: Per waitpid(2), "a clone child is one which delivers
	// no signal, or a signal other than SIGCHLD to its parent upon
	// termination." In Linux, termination signal is technically a per-task
	// property rather than a per-thread-group property. However, clone()
	// forces no termination signal for tasks created with CLONE_THREAD, and
	// execve() resets the termination signal to SIGCHLD, so all
	// non-group-leader threads have no termination signal and are therefore
	// "clone tasks".

	// If NonCloneTasks is true, events from non-clone tasks are eligible to be
	// waited for.
	NonCloneTasks bool

	// If CloneTasks is true, events from clone tasks are eligible to be waited
	// for.
	CloneTasks bool

	// If SiblingChildren is true, events from children tasks of any task
	// in the thread group of the waiter are eligible to be waited for.
	SiblingChildren bool

	// Events is a bitwise combination of the events defined above that specify
	// what events are of interest to the call to Wait.
	Events waiter.EventMask

	// If ConsumeEvent is true, the Wait should consume the event such that it
	// cannot be returned by a future Wait. Note that if a task exit is
	// consumed in this way, in most cases the task will be reaped.
	ConsumeEvent bool

	// If BlockInterruptErr is not nil, Wait will block until either an event
	// is available or there are no tasks that could produce a waitable event;
	// if that blocking is interrupted, Wait returns BlockInterruptErr. If
	// BlockInterruptErr is nil, Wait will not block.
	BlockInterruptErr error
}

// matchesTask returns true if t is eligible to be waited for under o. The
// SpecificTID and SpecificPGID filters are resolved in pidns. A task that
// passes them is eligible if it is a tracee; otherwise eligibility is decided
// by NonCloneTasks for a thread group leader whose termination signal is
// SIGCHLD, and by CloneTasks for every other task.
//
// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool {
	if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
		return false
	}
	if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
		return false
	}
	// Tracees are always eligible.
	if tracee {
		return true
	}
	if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
		return o.NonCloneTasks
	}
	return o.CloneTasks
}

// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
// events may exist in the future. (In contrast, if a non-blocking or blocking
// Wait determines that there are no tasks that can produce a waitable event,
// Task.Wait returns ECHILD.)
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")

// WaitResult contains information about a waited-for event.
type WaitResult struct {
	// Task is the task that reported the event.
	Task *Task

	// TID is the thread ID of Task in the PID namespace of the task that
	// called Wait (that is, the method receiver of the call to Task.Wait). TID
	// is provided because consuming exit waits cause the thread ID to be
	// deallocated.
	TID ThreadID

	// UID is the real UID of Task in the user namespace of the task that
	// called Wait.
	UID auth.UID

	// Event is exactly one of the events defined above.
	Event waiter.EventMask

	// Status is the numeric status associated with the event.
	Status uint32
}

// Wait waits for an event from a thread group that is a child of t's thread
// group, or a task in such a thread group, or a task that is ptraced by t,
// subject to the options specified in opts.
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
	// Non-blocking case: a single scan decides the result.
	if opts.BlockInterruptErr == nil {
		return t.waitOnce(opts)
	}
	// The waiter entry is registered before the first scan, presumably so
	// that an event raised between waitOnce and Block is not missed.
	w, ch := waiter.NewChannelEntry(nil)
	t.tg.eventQueue.EventRegister(&w, opts.Events)
	defer t.tg.eventQueue.EventUnregister(&w)
	for {
		wr, err := t.waitOnce(opts)
		if err != ErrNoWaitableEvent {
			// This includes err == nil.
			return wr, err
		}
		if err := t.Block(ch); err != nil {
			return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr)
		}
	}
}

// waitOnce performs one scan for a waitable event without blocking for an
// event. It returns the first WaitResult found; ErrNoWaitableEvent if no event
// was found but waitParentLocked reported waitable tasks; and ECHILD
// otherwise.
func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
	anyWaitableTasks := false

	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()

	if opts.SiblingChildren {
		// We can wait on the children and tracees of any task in the
		// same thread group.
		for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
			wr, any := t.waitParentLocked(opts, parent)
			if wr != nil {
				return wr, nil
			}
			anyWaitableTasks = anyWaitableTasks || any
		}
	} else {
		// We can only wait on this task.
		var wr *WaitResult
		wr, anyWaitableTasks = t.waitParentLocked(opts, t)
		if wr != nil {
			return wr, nil
		}
	}

	if anyWaitableTasks {
		return nil, ErrNoWaitableEvent
	}
	return nil, linuxerr.ECHILD
}

// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) {
	anyWaitableTasks := false

	for child := range parent.children {
		if !opts.matchesTask(child, parent.tg.pidns, false) {
			continue
		}
		// Non-leaders don't notify parents on exit and aren't eligible to
		// be waited on.
		if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
			anyWaitableTasks = true
			if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
				return wr, anyWaitableTasks
			}
		}
		// Check for group stops and continues. Tasks that have passed
		// TaskExitInitiated can no longer participate in group stops.
967 if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 { 968 continue 969 } 970 if child.exitState >= TaskExitInitiated { 971 continue 972 } 973 // If the waiter is in the same thread group as the task's 974 // tracer, do not report its group stops; they will be reported 975 // as ptrace stops instead. This also skips checking for group 976 // continues, but they'll be checked for when scanning tracees 977 // below. (Per kernel/exit.c:wait_consider_task(): "If a 978 // ptracer wants to distinguish the two events for its own 979 // children, it should create a separate process which takes 980 // the role of real parent.") 981 if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg { 982 continue 983 } 984 anyWaitableTasks = true 985 if opts.Events&EventChildGroupStop != 0 { 986 if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil { 987 return wr, anyWaitableTasks 988 } 989 } 990 if opts.Events&EventGroupContinue != 0 { 991 if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil { 992 return wr, anyWaitableTasks 993 } 994 } 995 } 996 for tracee := range parent.ptraceTracees { 997 if !opts.matchesTask(tracee, parent.tg.pidns, true) { 998 continue 999 } 1000 // Non-leaders do notify tracers on exit. 
1001 if opts.Events&EventExit != 0 && !tracee.exitTracerAcked { 1002 anyWaitableTasks = true 1003 if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil { 1004 return wr, anyWaitableTasks 1005 } 1006 } 1007 if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 { 1008 continue 1009 } 1010 if tracee.exitState >= TaskExitInitiated { 1011 continue 1012 } 1013 anyWaitableTasks = true 1014 if opts.Events&EventTraceeStop != 0 { 1015 if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil { 1016 return wr, anyWaitableTasks 1017 } 1018 } 1019 if opts.Events&EventGroupContinue != 0 { 1020 if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil { 1021 return wr, anyWaitableTasks 1022 } 1023 } 1024 } 1025 1026 return nil, anyWaitableTasks 1027 } 1028 1029 // Preconditions: The TaskSet mutex must be locked for writing. 1030 func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult { 1031 if asPtracer && !target.exitTracerNotified { 1032 return nil 1033 } 1034 if !asPtracer && !target.exitParentNotified { 1035 return nil 1036 } 1037 // Zombied thread group leaders are never waitable until their thread group 1038 // is otherwise empty. Usually this is caught by the 1039 // target.exitParentNotified check above, but if t is both (in the thread 1040 // group of) target's tracer and parent, asPtracer may be true. 1041 if target == target.tg.leader && target.tg.tasksCount != 1 { 1042 return nil 1043 } 1044 pid := t.tg.pidns.tids[target] 1045 uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() 1046 status := target.exitStatus.Status() 1047 if !opts.ConsumeEvent { 1048 return &WaitResult{ 1049 Task: target, 1050 TID: pid, 1051 UID: uid, 1052 Event: EventExit, 1053 Status: status, 1054 } 1055 } 1056 // Surprisingly, the exit status reported by a non-consuming wait can 1057 // differ from that reported by a consuming wait; the latter will return 1058 // the group exit code if one is available. 
1059 if target.tg.exiting { 1060 status = target.tg.exitStatus.Status() 1061 } 1062 // t may be (in the thread group of) target's parent, tracer, or both. We 1063 // don't need to check for !exitTracerAcked because tracees are detached 1064 // here, and we don't need to check for !exitParentAcked because zombies 1065 // will be reaped here. 1066 if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified { 1067 target.exitTracerAcked = true 1068 target.ptraceTracer.Store((*Task)(nil)) 1069 delete(t.ptraceTracees, target) 1070 } 1071 if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified { 1072 target.exitParentAcked = true 1073 if target == target.tg.leader { 1074 // target.tg.exitedCPUStats doesn't include target.CPUStats() yet, 1075 // and won't until after target.exitNotifyLocked() (maybe). Include 1076 // target.CPUStats() explicitly. This is consistent with Linux, 1077 // which accounts an exited task's cputime to its thread group in 1078 // kernel/exit.c:release_task() => __exit_signal(), and uses 1079 // thread_group_cputime_adjusted() in wait_task_zombie(). 1080 t.tg.childCPUStats.Accumulate(target.CPUStats()) 1081 t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats) 1082 t.tg.childCPUStats.Accumulate(target.tg.childCPUStats) 1083 // Update t's child max resident set size. The size will be the maximum 1084 // of this thread's size and all its childrens' sizes. 1085 if t.tg.childMaxRSS < target.tg.maxRSS { 1086 t.tg.childMaxRSS = target.tg.maxRSS 1087 } 1088 if t.tg.childMaxRSS < target.tg.childMaxRSS { 1089 t.tg.childMaxRSS = target.tg.childMaxRSS 1090 } 1091 } 1092 } 1093 target.exitNotifyLocked(false) 1094 return &WaitResult{ 1095 Task: target, 1096 TID: pid, 1097 UID: uid, 1098 Event: EventExit, 1099 Status: status, 1100 } 1101 } 1102 1103 // updateRSSLocked updates t.tg.maxRSS. 1104 // 1105 // Preconditions: The TaskSet mutex must be locked for writing. 
1106 func (t *Task) updateRSSLocked() { 1107 if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS { 1108 t.tg.maxRSS = mmMaxRSS 1109 } 1110 } 1111 1112 // Preconditions: The TaskSet mutex must be locked for writing. 1113 func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult { 1114 target.tg.signalHandlers.mu.Lock() 1115 defer target.tg.signalHandlers.mu.Unlock() 1116 if !target.tg.groupStopWaitable { 1117 return nil 1118 } 1119 pid := t.tg.pidns.tids[target] 1120 uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() 1121 sig := target.tg.groupStopSignal 1122 if opts.ConsumeEvent { 1123 target.tg.groupStopWaitable = false 1124 } 1125 return &WaitResult{ 1126 Task: target, 1127 TID: pid, 1128 UID: uid, 1129 Event: EventChildGroupStop, 1130 // There is no name for these status constants. 1131 Status: (uint32(sig)&0xff)<<8 | 0x7f, 1132 } 1133 } 1134 1135 // Preconditions: The TaskSet mutex must be locked for writing. 1136 func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult { 1137 target.tg.signalHandlers.mu.Lock() 1138 defer target.tg.signalHandlers.mu.Unlock() 1139 if !target.tg.groupContWaitable { 1140 return nil 1141 } 1142 pid := t.tg.pidns.tids[target] 1143 uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() 1144 if opts.ConsumeEvent { 1145 target.tg.groupContWaitable = false 1146 } 1147 return &WaitResult{ 1148 Task: target, 1149 TID: pid, 1150 UID: uid, 1151 Event: EventGroupContinue, 1152 Status: 0xffff, 1153 } 1154 } 1155 1156 // Preconditions: The TaskSet mutex must be locked for writing. 
1157 func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult { 1158 target.tg.signalHandlers.mu.Lock() 1159 defer target.tg.signalHandlers.mu.Unlock() 1160 if target.stop == nil { 1161 return nil 1162 } 1163 if _, ok := target.stop.(*ptraceStop); !ok { 1164 return nil 1165 } 1166 if target.ptraceCode == 0 { 1167 return nil 1168 } 1169 pid := t.tg.pidns.tids[target] 1170 uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow() 1171 code := target.ptraceCode 1172 if opts.ConsumeEvent { 1173 target.ptraceCode = 0 1174 } 1175 return &WaitResult{ 1176 Task: target, 1177 TID: pid, 1178 UID: uid, 1179 Event: EventTraceeStop, 1180 Status: uint32(code)<<8 | 0x7f, 1181 } 1182 } 1183 1184 // ExitState returns t's current progress through the exit path. 1185 func (t *Task) ExitState() TaskExitState { 1186 t.tg.pidns.owner.mu.RLock() 1187 defer t.tg.pidns.owner.mu.RUnlock() 1188 return t.exitState 1189 } 1190 1191 // ParentDeathSignal returns t's parent death signal. 1192 func (t *Task) ParentDeathSignal() linux.Signal { 1193 t.mu.Lock() 1194 defer t.mu.Unlock() 1195 return t.parentDeathSignal 1196 } 1197 1198 // SetParentDeathSignal sets t's parent death signal. 1199 func (t *Task) SetParentDeathSignal(sig linux.Signal) { 1200 t.mu.Lock() 1201 defer t.mu.Unlock() 1202 t.parentDeathSignal = sig 1203 }