github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task_exit.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task_exit.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  // This file implements the task exit cycle:
    18  //
    19  //	- Tasks are asynchronously requested to exit with Task.Kill.
    20  //
    21  //	- When able, the task goroutine enters the exit path starting from state
    22  //		runExit.
    23  //
    24  //	- Other tasks observe completed exits with Task.Wait (which implements the
    25  //		wait*() family of syscalls).
    26  
    27  import (
    28  	"errors"
    29  	"fmt"
    30  	"strconv"
    31  
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    34  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    35  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    36  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
    37  	pb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    39  )
    40  
    41  // TaskExitState represents a step in the task exit path.
    42  //
    43  // "Exiting" and "exited" are often ambiguous; prefer to name specific states.
    44  type TaskExitState int
    45  
    46  const (
    47  	// TaskExitNone indicates that the task has not begun exiting.
    48  	TaskExitNone TaskExitState = iota
    49  
    50  	// TaskExitInitiated indicates that the task goroutine has entered the exit
    51  	// path, and the task is no longer eligible to participate in group stops
    52  	// or group signal handling. TaskExitInitiated is analogous to Linux's
    53  	// PF_EXITING.
    54  	TaskExitInitiated
    55  
    56  	// TaskExitZombie indicates that the task has released its resources, and
    57  	// the task no longer prevents a sibling thread from completing execve.
    58  	TaskExitZombie
    59  
    60  	// TaskExitDead indicates that the task's thread IDs have been released,
    61  	// and the task no longer prevents its thread group leader from being
    62  	// reaped. ("Reaping" refers to the transitioning of a task from
    63  	// TaskExitZombie to TaskExitDead.)
    64  	TaskExitDead
    65  )
    66  
    67  // String implements fmt.Stringer.
    68  func (t TaskExitState) String() string {
    69  	switch t {
    70  	case TaskExitNone:
    71  		return "TaskExitNone"
    72  	case TaskExitInitiated:
    73  		return "TaskExitInitiated"
    74  	case TaskExitZombie:
    75  		return "TaskExitZombie"
    76  	case TaskExitDead:
    77  		return "TaskExitDead"
    78  	default:
    79  		return strconv.Itoa(int(t))
    80  	}
    81  }
    82  
    83  // killLocked marks t as killed by enqueueing a SIGKILL, without causing the
    84  // thread-group-affecting side effects SIGKILL usually has.
    85  //
    86  // Preconditions: The signal mutex must be locked.
    87  func (t *Task) killLocked() {
    88  	// Clear killable stops.
    89  	if t.stop != nil && t.stop.Killable() {
    90  		t.endInternalStopLocked()
    91  	}
    92  	t.pendingSignals.enqueue(&linux.SignalInfo{
    93  		Signo: int32(linux.SIGKILL),
    94  		// Linux just sets SIGKILL in the pending signal bitmask without
    95  		// enqueueing an actual siginfo, such that
    96  		// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
    97  		Code: linux.SI_USER,
    98  	}, nil)
    99  	t.interrupt()
   100  }
   101  
   102  // killed returns true if t has a SIGKILL pending. killed is analogous to
   103  // Linux's fatal_signal_pending().
   104  //
   105  // Preconditions: The caller must be running on the task goroutine.
   106  func (t *Task) killed() bool {
   107  	t.tg.signalHandlers.mu.Lock()
   108  	defer t.tg.signalHandlers.mu.Unlock()
   109  	return t.killedLocked()
   110  }
   111  
   112  func (t *Task) killedLocked() bool {
   113  	return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
   114  }
   115  
   116  // PrepareExit indicates an exit with the given status.
   117  //
   118  // Preconditions: The caller must be running on the task goroutine.
   119  func (t *Task) PrepareExit(ws linux.WaitStatus) {
   120  	t.tg.pidns.owner.mu.RLock()
   121  	defer t.tg.pidns.owner.mu.RUnlock()
   122  	t.tg.signalHandlers.mu.Lock()
   123  	defer t.tg.signalHandlers.mu.Unlock()
   124  
   125  	last := t.tg.activeTasks == 1
   126  	if last {
   127  		t.prepareGroupExitLocked(ws)
   128  		return
   129  	}
   130  
   131  	t.exitStatus = ws
   132  }
   133  
// PrepareGroupExit indicates a group exit with status ws to t's thread group.
//
// PrepareGroupExit is analogous to Linux's do_group_exit(), with two
// differences: it does not tail-call do_exit(), and it *does* set
// Task.exitStatus. (Linux does not do so until within do_exit(), since it
// reuses exit_code for ptrace.)
//
// It takes the signal mutex and delegates to prepareGroupExitLocked.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareGroupExit(ws linux.WaitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.prepareGroupExitLocked(ws)
}
   147  
   148  // Preconditions:
   149  //   - The caller must be running on the task goroutine.
   150  //   - The signal mutex must be locked.
   151  func (t *Task) prepareGroupExitLocked(ws linux.WaitStatus) {
   152  	if t.tg.exiting || t.tg.execing != nil {
   153  		// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
   154  		// this "group exit" is being executed by the killed sibling of an
   155  		// execing task, then Task.Execve never set t.tg.exitStatus, so it's
   156  		// still the zero value. This is consistent with Linux, both in intent
   157  		// ("all other threads ... report death as if they exited via _exit(2)
   158  		// with exit code 0" - ptrace(2), "execve under ptrace") and in
   159  		// implementation (compare fs/exec.c:de_thread() =>
   160  		// kernel/signal.c:zap_other_threads() and
   161  		// kernel/exit.c:do_group_exit() =>
   162  		// include/linux/sched.h:signal_group_exit()).
   163  		t.exitStatus = t.tg.exitStatus
   164  		return
   165  	}
   166  	t.tg.exiting = true
   167  	t.tg.exitStatus = ws
   168  	t.exitStatus = ws
   169  	for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
   170  		if sibling != t {
   171  			sibling.killLocked()
   172  		}
   173  	}
   174  }
   175  
   176  // Kill requests that all tasks in ts exit as if group exiting with status ws.
   177  // Kill does not wait for tasks to exit.
   178  //
   179  // Kill has no analogue in Linux; it's provided for save/restore only.
   180  func (ts *TaskSet) Kill(ws linux.WaitStatus) {
   181  	ts.mu.Lock()
   182  	defer ts.mu.Unlock()
   183  	ts.Root.exiting = true
   184  	for t := range ts.Root.tids {
   185  		t.tg.signalHandlers.mu.Lock()
   186  		if !t.tg.exiting {
   187  			t.tg.exiting = true
   188  			t.tg.exitStatus = ws
   189  		}
   190  		t.killLocked()
   191  		t.tg.signalHandlers.mu.Unlock()
   192  	}
   193  }
   194  
   195  // advanceExitStateLocked checks that t's current exit state is oldExit, then
   196  // sets it to newExit. If t's current exit state is not oldExit,
   197  // advanceExitStateLocked panics.
   198  //
   199  // Preconditions: The TaskSet mutex must be locked.
   200  func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
   201  	if t.exitState != oldExit {
   202  		panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
   203  	}
   204  	t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
   205  	t.exitState = newExit
   206  }
   207  
   208  // runExit is the entry point into the task exit path.
   209  //
   210  // +stateify savable
   211  type runExit struct{}
   212  
   213  func (*runExit) execute(t *Task) taskRunState {
   214  	t.ptraceExit()
   215  	return (*runExitMain)(nil)
   216  }
   217  
   218  // +stateify savable
   219  type runExitMain struct{}
   220  
   221  func (*runExitMain) execute(t *Task) taskRunState {
   222  	t.traceExitEvent()
   223  
   224  	if seccheck.Global.Enabled(seccheck.PointTaskExit) {
   225  		info := &pb.TaskExit{
   226  			ExitStatus: int32(t.tg.exitStatus),
   227  		}
   228  		fields := seccheck.Global.GetFieldSet(seccheck.PointTaskExit)
   229  		if !fields.Context.Empty() {
   230  			info.ContextData = &pb.ContextData{}
   231  			LoadSeccheckData(t, fields.Context, info.ContextData)
   232  		}
   233  		seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   234  			return c.TaskExit(t, fields, info)
   235  		})
   236  	}
   237  
   238  	lastExiter := t.exitThreadGroup()
   239  
   240  	t.ResetKcov()
   241  
   242  	// If the task has a cleartid, and the thread group wasn't killed by a
   243  	// signal, handle that before releasing the MM.
   244  	if t.cleartid != 0 {
   245  		t.tg.signalHandlers.mu.Lock()
   246  		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
   247  		t.tg.signalHandlers.mu.Unlock()
   248  		if !signaled {
   249  			zero := ThreadID(0)
   250  			if _, err := zero.CopyOut(t, t.cleartid); err == nil {
   251  				t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
   252  			}
   253  			// If the CopyOut fails, there's nothing we can do.
   254  		}
   255  	}
   256  
   257  	// Handle the robust futex list.
   258  	t.exitRobustList()
   259  
   260  	// Deactivate the address space and update max RSS before releasing the
   261  	// task's MM.
   262  	t.Deactivate()
   263  	t.tg.pidns.owner.mu.Lock()
   264  	t.updateRSSLocked()
   265  	t.tg.pidns.owner.mu.Unlock()
   266  
   267  	// Release the task image resources. Accessing these fields must be
   268  	// done with t.mu held, but the mm.DecUsers() call must be done outside
   269  	// of that lock.
   270  	t.mu.Lock()
   271  	mm := t.image.MemoryManager
   272  	t.image.MemoryManager = nil
   273  	t.image.fu = nil
   274  	t.mu.Unlock()
   275  	mm.DecUsers(t)
   276  
   277  	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
   278  	t.unstopVforkParent()
   279  
   280  	t.fsContext.DecRef(t)
   281  	t.fdTable.DecRef(t)
   282  
   283  	// Detach task from all cgroups. This must happen before potentially the
   284  	// last ref to the cgroupfs mount is dropped below.
   285  	t.LeaveCgroups()
   286  
   287  	t.mu.Lock()
   288  	mntns := t.mountNamespace
   289  	t.mountNamespace = nil
   290  	ipcns := t.ipcns
   291  	netns := t.netns.Swap(nil)
   292  	t.mu.Unlock()
   293  	if mntns != nil {
   294  		mntns.DecRef(t)
   295  	}
   296  	ipcns.DecRef(t)
   297  	netns.DecRef(t)
   298  
   299  	// If this is the last task to exit from the thread group, release the
   300  	// thread group's resources.
   301  	if lastExiter {
   302  		t.tg.Release(t)
   303  	}
   304  
   305  	// Detach tracees.
   306  	t.exitPtrace()
   307  
   308  	// Reparent the task's children.
   309  	t.exitChildren()
   310  
   311  	// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
   312  	// to wait for a PID namespace to die.
   313  	return (*runExitNotify)(nil)
   314  }
   315  
   316  // exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
   317  // group that it is no longer eligible to participate in group activities. It
   318  // returns true if t is the last task in its thread group to call
   319  // exitThreadGroup.
   320  func (t *Task) exitThreadGroup() bool {
   321  	t.tg.pidns.owner.mu.Lock()
   322  	defer t.tg.pidns.owner.mu.Unlock()
   323  	t.tg.signalHandlers.mu.Lock()
   324  	// Can't defer unlock: see below.
   325  
   326  	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
   327  	t.tg.activeTasks--
   328  	last := t.tg.activeTasks == 0
   329  
   330  	// Ensure that someone will handle the signals we can't.
   331  	t.setSignalMaskLocked(^linux.SignalSet(0))
   332  
   333  	// Check if this task's exit interacts with an initiated group stop.
   334  	if !t.groupStopPending {
   335  		t.tg.signalHandlers.mu.Unlock()
   336  		return last
   337  	}
   338  	t.groupStopPending = false
   339  	sig := t.tg.groupStopSignal
   340  	notifyParent := t.participateGroupStopLocked()
   341  	// signalStop must be called with t's signal mutex unlocked.
   342  	t.tg.signalHandlers.mu.Unlock()
   343  	if notifyParent && t.tg.leader.parent != nil {
   344  		t.tg.leader.parent.signalStop(t, linux.CLD_STOPPED, int32(sig))
   345  		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
   346  	}
   347  	return last
   348  }
   349  
   350  func (t *Task) exitChildren() {
   351  	t.tg.pidns.owner.mu.Lock()
   352  	defer t.tg.pidns.owner.mu.Unlock()
   353  	newParent := t.findReparentTargetLocked()
   354  	if newParent == nil {
   355  		// "If the init process of a PID namespace terminates, the kernel
   356  		// terminates all of the processes in the namespace via a SIGKILL
   357  		// signal." - pid_namespaces(7)
   358  		t.Debugf("Init process terminating, killing namespace")
   359  		t.tg.pidns.exiting = true
   360  		for other := range t.tg.pidns.tgids {
   361  			if other == t.tg {
   362  				continue
   363  			}
   364  			other.signalHandlers.mu.Lock()
   365  			other.leader.sendSignalLocked(&linux.SignalInfo{
   366  				Signo: int32(linux.SIGKILL),
   367  			}, true /* group */)
   368  			other.signalHandlers.mu.Unlock()
   369  		}
   370  		// TODO(b/37722272): The init process waits for all processes in the
   371  		// namespace to exit before completing its own exit
   372  		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
   373  		// other tasks in the namespace are dead, except possibly for this
   374  		// thread group's leader (which can't be reaped until this task exits).
   375  	}
   376  	// This is correct even if newParent is nil (it ensures that children don't
   377  	// wait for a parent to reap them.)
   378  	for c := range t.children {
   379  		if sig := c.ParentDeathSignal(); sig != 0 {
   380  			siginfo := &linux.SignalInfo{
   381  				Signo: int32(sig),
   382  				Code:  linux.SI_USER,
   383  			}
   384  			siginfo.SetPID(int32(c.tg.pidns.tids[t]))
   385  			siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
   386  			c.tg.signalHandlers.mu.Lock()
   387  			c.sendSignalLocked(siginfo, true /* group */)
   388  			c.tg.signalHandlers.mu.Unlock()
   389  		}
   390  		c.reparentLocked(newParent)
   391  		if newParent != nil {
   392  			newParent.children[c] = struct{}{}
   393  		}
   394  	}
   395  }
   396  
   397  // findReparentTargetLocked returns the task to which t's children should be
   398  // reparented. If no such task exists, findNewParentLocked returns nil.
   399  //
   400  // This corresponds to Linux's find_new_reaper().
   401  //
   402  // Preconditions: The TaskSet mutex must be locked.
   403  func (t *Task) findReparentTargetLocked() *Task {
   404  	// Reparent to any sibling in the same thread group that hasn't begun
   405  	// exiting.
   406  	if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
   407  		return t2
   408  	}
   409  
   410  	if !t.tg.hasChildSubreaper {
   411  		// No child subreaper exists. We can immediately return the
   412  		// init process in this PID namespace if it exists.
   413  		if init := t.tg.pidns.tasks[initTID]; init != nil {
   414  			return init.tg.anyNonExitingTaskLocked()
   415  		}
   416  		return nil
   417  	}
   418  
   419  	// Walk up the process tree until we either find a subreaper, or we hit
   420  	// the init process in the PID namespace.
   421  	for parent := t.parent; parent != nil; parent = parent.parent {
   422  		if parent.tg.isInitInLocked(parent.PIDNamespace()) {
   423  			// We found the init process for this pid namespace,
   424  			// return a task from it. If the init process is
   425  			// exiting, this might return nil.
   426  			return parent.tg.anyNonExitingTaskLocked()
   427  		}
   428  		if parent.tg.isChildSubreaper {
   429  			// We found a subreaper process. Return a non-exiting
   430  			// task if there is one, otherwise keep walking up the
   431  			// process tree.
   432  			if target := parent.tg.anyNonExitingTaskLocked(); target != nil {
   433  				return target
   434  			}
   435  		}
   436  	}
   437  
   438  	return nil
   439  }
   440  
   441  func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
   442  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   443  		if t.exitState == TaskExitNone {
   444  			return t
   445  		}
   446  	}
   447  	return nil
   448  }
   449  
   450  // reparentLocked changes t's parent. The new parent may be nil.
   451  //
   452  // Preconditions: The TaskSet mutex must be locked for writing.
   453  func (t *Task) reparentLocked(parent *Task) {
   454  	oldParent := t.parent
   455  	t.parent = parent
   456  	if oldParent != nil {
   457  		delete(oldParent.children, t)
   458  	}
   459  	if parent != nil {
   460  		parent.children[t] = struct{}{}
   461  	}
   462  	// If a thread group leader's parent changes, reset the thread group's
   463  	// termination signal to SIGCHLD and re-check exit notification. (Compare
   464  	// kernel/exit.c:reparent_leader().)
   465  	if t != t.tg.leader {
   466  		return
   467  	}
   468  	if oldParent == nil && parent == nil {
   469  		return
   470  	}
   471  	if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
   472  		return
   473  	}
   474  	t.tg.terminationSignal = linux.SIGCHLD
   475  	if t.exitParentNotified && !t.exitParentAcked {
   476  		t.exitParentNotified = false
   477  		t.exitNotifyLocked(false)
   478  	}
   479  }
   480  
   481  // When a task exits, other tasks in the system, notably the task's parent and
   482  // ptracer, may want to be notified. The exit notification system ensures that
   483  // interested tasks receive signals and/or are woken from blocking calls to
   484  // wait*() syscalls; these notifications must be resolved before exiting tasks
   485  // can be reaped and disappear from the system.
   486  //
   487  // Each task may have a parent task and/or a tracer task. If both a parent and
   488  // a tracer exist, they may be the same task, different tasks in the same
   489  // thread group, or tasks in different thread groups. (In the last case, Linux
   490  // refers to the task as being ptrace-reparented due to an implementation
   491  // detail; we avoid this terminology to avoid confusion.)
   492  //
   493  // A thread group is *empty* if all non-leader tasks in the thread group are
   494  // dead, and the leader is either a zombie or dead. The exit of a thread group
   495  // leader is never waitable - by either the parent or tracer - until the thread
   496  // group is empty.
   497  //
   498  // There are a few ways for an exit notification to be resolved:
   499  //
//	- The exit notification may be acknowledged by a call to Task.Wait with
//		WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
   502  //
   503  //	- If the notified party is the parent, and the parent thread group is not
   504  //		also the tracer thread group, and the notification signal is SIGCHLD, the
   505  //		parent may explicitly ignore the notification (see quote in exitNotify).
   506  //		Note that it's possible for the notified party to ignore the signal in other
   507  //		cases, but the notification is only resolved under the above conditions.
   508  //		(Actually, there is one exception; see the last paragraph of the "leader,
   509  //		has tracer, tracer thread group is parent thread group" case below.)
   510  //
   511  //	- If the notified party is the parent, and the parent does not exist, the
   512  //		notification is resolved as if ignored. (This is only possible in the
   513  //		sentry. In Linux, the only task / thread group without a parent is global
   514  //		init, and killing global init causes a kernel panic.)
   515  //
   516  //	- If the notified party is a tracer, the tracer may detach the traced task.
   517  //		(Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
   518  //
   519  // In addition, if the notified party is the parent, the parent may exit and
   520  // cause the notifying task to be reparented to another thread group. This does
   521  // not resolve the notification; instead, the notification must be resent to
   522  // the new parent.
   523  //
   524  // The series of notifications generated for a given task's exit depend on
   525  // whether it is a thread group leader; whether the task is ptraced; and, if
   526  // so, whether the tracer thread group is the same as the parent thread group.
   527  //
   528  //	- Non-leader, no tracer: No notification is generated; the task is reaped
   529  //		immediately.
   530  //
   531  //	- Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
   532  //		notification is resolved (by waiting or detaching), the task is reaped. (For
   533  //		non-leaders, whether the tracer and parent thread groups are the same is
   534  //		irrelevant.)
   535  //
   536  //	- Leader, no tracer: The task remains a zombie, with no notification sent,
   537  //		until all other tasks in the thread group are dead. (In Linux terms, this
   538  //		condition is indicated by include/linux/sched.h:thread_group_empty(); tasks
   539  //		are removed from their thread_group list in kernel/exit.c:release_task() =>
//		__exit_signal() => __unhash_process().) Then the thread group's termination
   541  //		signal is sent to the parent. When the parent notification is resolved (by
   542  //		waiting or ignoring), the task is reaped.
   543  //
//	- Leader, has tracer, tracer thread group is not parent thread group:
//		SIGCHLD is sent to the tracer. When the tracer notification is resolved (by
//		waiting or detaching), and all other tasks in the thread group are dead, the
//		thread group's termination signal is sent to the parent. (Note that the
//		tracer cannot resolve the exit notification by waiting until the thread
//		group is empty.) When the parent notification is resolved, the task is
//		reaped.
   551  //
   552  //	- Leader, has tracer, tracer thread group is parent thread group:
   553  //
   554  // If all other tasks in the thread group are dead, the thread group's
   555  // termination signal is sent to the parent. At this point, the notification
   556  // can only be resolved by waiting. If the parent detaches from the task as a
   557  // tracer, the notification is not resolved, but the notification can now be
   558  // resolved by waiting or ignoring. When the parent notification is resolved,
   559  // the task is reaped.
   560  //
   561  // If at least one task in the thread group is not dead, SIGCHLD is sent to the
   562  // parent. At this point, the notification cannot be resolved at all; once the
   563  // thread group becomes empty, it can be resolved only by waiting. If the
   564  // parent detaches from the task as a tracer before all remaining tasks die,
   565  // then exit notification proceeds as in the case where the leader never had a
   566  // tracer. If the parent detaches from the task as a tracer after all remaining
   567  // tasks die, the notification is not resolved, but the notification can now be
   568  // resolved by waiting or ignoring. When the parent notification is resolved,
   569  // the task is reaped.
   570  //
   571  // In both of the above cases, when the parent detaches from the task as a
   572  // tracer while the thread group is empty, whether or not the parent resolves
   573  // the notification by ignoring it is based on the parent's SIGCHLD signal
   574  // action, whether or not the thread group's termination signal is SIGCHLD
   575  // (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
   576  //
   577  // There is one final wrinkle: A leader can become a non-leader due to a
   578  // sibling execve. In this case, the execing thread detaches the leader's
   579  // tracer (if one exists) and reaps the leader immediately. In Linux, this is
   580  // in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
   581  
   582  // +stateify savable
   583  type runExitNotify struct{}
   584  
   585  func (*runExitNotify) execute(t *Task) taskRunState {
   586  	t.tg.pidns.owner.mu.Lock()
   587  	defer t.tg.pidns.owner.mu.Unlock()
   588  	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
   589  	t.tg.liveTasks--
   590  	// Check if this completes a sibling's execve.
   591  	if t.tg.execing != nil && t.tg.liveTasks == 1 {
   592  		// execing blocks the addition of new tasks to the thread group, so
   593  		// the sole living task must be the execing one.
   594  		e := t.tg.execing
   595  		e.tg.signalHandlers.mu.Lock()
   596  		if _, ok := e.stop.(*execStop); ok {
   597  			e.endInternalStopLocked()
   598  		}
   599  		e.tg.signalHandlers.mu.Unlock()
   600  	}
   601  	t.exitNotifyLocked(false)
   602  	// The task goroutine will now exit.
   603  	return nil
   604  }
   605  
   606  // exitNotifyLocked is called after changes to t's state that affect exit
   607  // notification.
   608  //
   609  // If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
   610  // thanks to Linux's haphazard implementation of this functionality, such cases
   611  // determine whether parent notifications are ignored based on the parent's
   612  // handling of SIGCHLD, regardless of what the exited task's thread group's
   613  // termination signal is.
   614  //
   615  // Preconditions: The TaskSet mutex must be locked for writing.
   616  func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
   617  	if t.exitState != TaskExitZombie {
   618  		return
   619  	}
   620  	if !t.exitTracerNotified {
   621  		t.exitTracerNotified = true
   622  		tracer := t.Tracer()
   623  		if tracer == nil {
   624  			t.exitTracerAcked = true
   625  		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
   626  			// Don't set exitParentNotified if t is non-leader, even if the
   627  			// tracer is in the parent thread group, so that if the parent
   628  			// detaches the following call to exitNotifyLocked passes through
   629  			// the !exitParentNotified case below and causes t to be reaped
   630  			// immediately.
   631  			//
   632  			// Tracer notification doesn't care about about
   633  			// SIG_IGN/SA_NOCLDWAIT.
   634  			tracer.tg.signalHandlers.mu.Lock()
   635  			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
   636  			tracer.tg.signalHandlers.mu.Unlock()
   637  			// Wake EventTraceeStop waiters as well since this task will never
   638  			// ptrace-stop again.
   639  			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
   640  		} else {
   641  			// t is a leader and the tracer is in the parent thread group.
   642  			t.exitParentNotified = true
   643  			sig := linux.SIGCHLD
   644  			if t.tg.tasksCount == 1 {
   645  				sig = t.tg.terminationSignal
   646  			}
   647  			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
   648  			// (in Linux, the check in do_notify_parent() is gated by
   649  			// !tsk->ptrace.)
   650  			t.parent.tg.signalHandlers.mu.Lock()
   651  			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
   652  			t.parent.tg.signalHandlers.mu.Unlock()
   653  			// See below for rationale for this event mask.
   654  			t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
   655  		}
   656  	}
   657  	if t.exitTracerAcked && !t.exitParentNotified {
   658  		if t != t.tg.leader {
   659  			t.exitParentNotified = true
   660  			t.exitParentAcked = true
   661  		} else if t.tg.tasksCount == 1 {
   662  			t.exitParentNotified = true
   663  			if t.parent == nil {
   664  				t.exitParentAcked = true
   665  			} else {
   666  				// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
   667  				// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
   668  				// sigaction(2)), then children that terminate do not become
   669  				// zombies and a call to wait() or waitpid() will block until all
   670  				// children have terminated, and then fail with errno set to
   671  				// ECHILD. (The original POSIX standard left the behavior of
   672  				// setting SIGCHLD to SIG_IGN unspecified. Note that even though
   673  				// the default disposition of SIGCHLD is "ignore", explicitly
   674  				// setting the disposition to SIG_IGN results in different
   675  				// treatment of zombie process children.) Linux 2.6 conforms to
   676  				// this specification." - wait(2)
   677  				//
   678  				// Some undocumented Linux-specific details:
   679  				//
   680  				//	- All of the above is ignored if the termination signal isn't
   681  				//		SIGCHLD.
   682  				//
   683  				//	- SA_NOCLDWAIT causes the leader to be immediately reaped, but
   684  				//		does not suppress the SIGCHLD.
   685  				signalParent := t.tg.terminationSignal.IsValid()
   686  				t.parent.tg.signalHandlers.mu.Lock()
   687  				if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
   688  					if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
   689  						if act.Handler == linux.SIG_IGN {
   690  							t.exitParentAcked = true
   691  							signalParent = false
   692  						} else if act.Flags&linux.SA_NOCLDWAIT != 0 {
   693  							t.exitParentAcked = true
   694  						}
   695  					}
   696  				}
   697  				if signalParent {
   698  					t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
   699  				}
   700  				t.parent.tg.signalHandlers.mu.Unlock()
   701  				// If a task in the parent was waiting for a child group stop
   702  				// or continue, it needs to be notified of the exit, because
   703  				// there may be no remaining eligible tasks (so that wait
   704  				// should return ECHILD).
   705  				t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
   706  			}
   707  
   708  			// We don't send exit events for the root process because we don't send
   709  			// Clone or Exec events for the initial process.
   710  			if t.tg != t.k.globalInit && seccheck.Global.Enabled(seccheck.PointExitNotifyParent) {
   711  				mask, info := getExitNotifyParentSeccheckInfo(t)
   712  				if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   713  					return c.ExitNotifyParent(t, mask, info)
   714  				}); err != nil {
   715  					log.Infof("Ignoring error from ExitNotifyParent point: %v", err)
   716  				}
   717  			}
   718  		}
   719  	}
   720  	if t.exitTracerAcked && t.exitParentAcked {
   721  		t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
   722  		for ns := t.tg.pidns; ns != nil; ns = ns.parent {
   723  			ns.deleteTask(t)
   724  		}
   725  		t.userCounters.decRLimitNProc()
   726  		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
   727  		t.tg.ioUsage.Accumulate(t.ioUsage)
   728  		t.tg.signalHandlers.mu.Lock()
   729  		t.tg.tasks.Remove(t)
   730  		t.tg.tasksCount--
   731  		tc := t.tg.tasksCount
   732  		t.tg.signalHandlers.mu.Unlock()
   733  		if tc == 1 && t != t.tg.leader {
   734  			// Our fromPtraceDetach doesn't matter here (in Linux terms, this
   735  			// is via a call to release_task()).
   736  			t.tg.leader.exitNotifyLocked(false)
   737  		} else if tc == 0 {
   738  			t.tg.pidWithinNS.Store(0)
   739  			t.tg.processGroup.decRefWithParent(t.tg.parentPG())
   740  		}
   741  		if t.parent != nil {
   742  			delete(t.parent.children, t)
   743  			// Do not clear t.parent. It may still be needed after the task has exited
   744  			// (for example, to perform ptrace access checks on /proc/[pid] files).
   745  		}
   746  	}
   747  }
   748  
   749  // Preconditions: The TaskSet mutex must be locked.
   750  func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.SignalInfo {
   751  	info := &linux.SignalInfo{
   752  		Signo: int32(sig),
   753  	}
   754  	info.SetPID(int32(receiver.tg.pidns.tids[t]))
   755  	info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
   756  	if t.exitStatus.Signaled() {
   757  		info.Code = linux.CLD_KILLED
   758  		info.SetStatus(int32(t.exitStatus.TerminationSignal()))
   759  	} else {
   760  		info.Code = linux.CLD_EXITED
   761  		info.SetStatus(int32(t.exitStatus.ExitStatus()))
   762  	}
   763  	// TODO(b/72102453): Set utime, stime.
   764  	return info
   765  }
   766  
   767  // Preconditions: The TaskSet mutex must be locked.
   768  func getExitNotifyParentSeccheckInfo(t *Task) (seccheck.FieldSet, *pb.ExitNotifyParentInfo) {
   769  	fields := seccheck.Global.GetFieldSet(seccheck.PointExitNotifyParent)
   770  
   771  	info := &pb.ExitNotifyParentInfo{
   772  		ExitStatus: int32(t.tg.exitStatus),
   773  	}
   774  	if !fields.Context.Empty() {
   775  		info.ContextData = &pb.ContextData{}
   776  		// cwd isn't used for notifyExit seccheck so it's ok to pass an empty
   777  		// string.
   778  		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, "")
   779  	}
   780  
   781  	return fields, info
   782  }
   783  
   784  // ExitStatus returns t's exit status, which is only guaranteed to be
   785  // meaningful if t.ExitState() != TaskExitNone.
   786  func (t *Task) ExitStatus() linux.WaitStatus {
   787  	t.tg.pidns.owner.mu.RLock()
   788  	defer t.tg.pidns.owner.mu.RUnlock()
   789  	t.tg.signalHandlers.mu.Lock()
   790  	defer t.tg.signalHandlers.mu.Unlock()
   791  	return t.exitStatus
   792  }
   793  
   794  // ExitStatus returns the exit status that would be returned by a consuming
   795  // wait*() on tg.
   796  func (tg *ThreadGroup) ExitStatus() linux.WaitStatus {
   797  	tg.pidns.owner.mu.RLock()
   798  	defer tg.pidns.owner.mu.RUnlock()
   799  	tg.signalHandlers.mu.Lock()
   800  	defer tg.signalHandlers.mu.Unlock()
   801  	if tg.exiting {
   802  		return tg.exitStatus
   803  	}
   804  	return tg.leader.exitStatus
   805  }
   806  
   807  // TerminationSignal returns the thread group's termination signal, which is
   808  // the signal that will be sent to its leader's parent when all threads have
   809  // exited.
   810  func (tg *ThreadGroup) TerminationSignal() linux.Signal {
   811  	tg.pidns.owner.mu.RLock()
   812  	defer tg.pidns.owner.mu.RUnlock()
   813  	return tg.terminationSignal
   814  }
   815  
// Task events that can be waited for. Each event occupies its own bit, so
// events can be combined into a single waiter.EventMask (see
// WaitOptions.Events).
const (
	// EventExit represents an exit notification generated for a child thread
	// group leader or a tracee under the conditions specified in the comment
	// above runExitNotify.
	EventExit waiter.EventMask = 1 << iota

	// EventChildGroupStop occurs when a child thread group completes a group
	// stop (i.e. all tasks in the child thread group have entered a stopped
	// state as a result of a group stop).
	EventChildGroupStop

	// EventTraceeStop occurs when a task that is ptraced by a task in the
	// notified thread group enters a ptrace stop (see ptrace(2)).
	EventTraceeStop

	// EventGroupContinue occurs when a child thread group, or a thread group
	// whose leader is ptraced by a task in the notified thread group, that had
	// initiated or completed a group stop leaves the group stop, due to the
	// child thread group or any task in the child thread group being sent
	// SIGCONT.
	EventGroupContinue
)
   839  
// WaitOptions controls the behavior of Task.Wait: which tasks are eligible to
// be waited for, which events are of interest, whether a reported event is
// consumed, and whether the wait may block.
type WaitOptions struct {
	// If SpecificTID is non-zero, only events from the task with thread ID
	// SpecificTID are eligible to be waited for. SpecificTID is resolved in
	// the PID namespace of the waiter (the method receiver of Task.Wait). If
	// no such task exists, or that task would not otherwise be eligible to be
	// waited for by the waiting task, then there are no waitable tasks and
	// Wait will return ECHILD.
	SpecificTID ThreadID

	// If SpecificPGID is non-zero, only events from ThreadGroups with a
	// matching ProcessGroupID are eligible to be waited for. (Same
	// constraints as SpecificTID apply.)
	SpecificPGID ProcessGroupID

	// Terminology note: Per waitpid(2), "a clone child is one which delivers
	// no signal, or a signal other than SIGCHLD to its parent upon
	// termination." In Linux, termination signal is technically a per-task
	// property rather than a per-thread-group property. However, clone()
	// forces no termination signal for tasks created with CLONE_THREAD, and
	// execve() resets the termination signal to SIGCHLD, so all
	// non-group-leader threads have no termination signal and are therefore
	// "clone tasks".

	// If NonCloneTasks is true, events from non-clone tasks (thread group
	// leaders whose termination signal is SIGCHLD) are eligible to be waited
	// for.
	NonCloneTasks bool

	// If CloneTasks is true, events from clone tasks are eligible to be waited
	// for.
	CloneTasks bool

	// If SiblingChildren is true, events from the children and tracees of any
	// task in the thread group of the waiter are eligible to be waited for.
	// Otherwise only the waiter's own children and tracees are considered.
	SiblingChildren bool

	// Events is a bitwise combination of the events defined above that specify
	// what events are of interest to the call to Wait.
	Events waiter.EventMask

	// If ConsumeEvent is true, the Wait should consume the event such that it
	// cannot be returned by a future Wait. Note that if a task exit is
	// consumed in this way, in most cases the task will be reaped.
	ConsumeEvent bool

	// If BlockInterruptErr is not nil, Wait will block until either an event
	// is available or there are no tasks that could produce a waitable event;
	// if that blocking is interrupted, Wait returns BlockInterruptErr. If
	// BlockInterruptErr is nil, Wait will not block.
	BlockInterruptErr error
}
   891  
   892  // Preconditions: The TaskSet mutex must be locked (for reading or writing).
   893  func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool {
   894  	if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
   895  		return false
   896  	}
   897  	if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
   898  		return false
   899  	}
   900  	// Tracees are always eligible.
   901  	if tracee {
   902  		return true
   903  	}
   904  	if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
   905  		return o.NonCloneTasks
   906  	}
   907  	return o.CloneTasks
   908  }
   909  
// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
// events may exist in the future. (In contrast, if a non-blocking or blocking
// Wait determines that there are no tasks that can produce a waitable event,
// Task.Wait returns ECHILD.)
//
// Blocking Waits use this error internally as the signal to block and retry;
// it is never returned from a blocking Wait.
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")
   916  
// WaitResult contains information about a waited-for event, as returned by
// Task.Wait.
type WaitResult struct {
	// Task is the task that reported the event.
	Task *Task

	// TID is the thread ID of Task in the PID namespace of the task that
	// called Wait (that is, the method receiver of the call to Task.Wait). TID
	// is provided because consuming exit waits cause the thread ID to be
	// deallocated.
	TID ThreadID

	// UID is the real UID of Task in the user namespace of the task that
	// called Wait.
	UID auth.UID

	// Event is exactly one of the events defined above.
	Event waiter.EventMask

	// Status is the wait status associated with the event.
	Status linux.WaitStatus
}
   938  
   939  // Wait waits for an event from a thread group that is a child of t's thread
   940  // group, or a task in such a thread group, or a task that is ptraced by t,
   941  // subject to the options specified in opts.
   942  func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
   943  	if opts.BlockInterruptErr == nil {
   944  		return t.waitOnce(opts)
   945  	}
   946  	w, ch := waiter.NewChannelEntry(opts.Events)
   947  	t.tg.eventQueue.EventRegister(&w)
   948  	defer t.tg.eventQueue.EventUnregister(&w)
   949  	for {
   950  		wr, err := t.waitOnce(opts)
   951  		if err != ErrNoWaitableEvent {
   952  			// This includes err == nil.
   953  			return wr, err
   954  		}
   955  		if err := t.Block(ch); err != nil {
   956  			return wr, linuxerr.ConvertIntr(err, opts.BlockInterruptErr)
   957  		}
   958  	}
   959  }
   960  
   961  func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
   962  	anyWaitableTasks := false
   963  
   964  	t.tg.pidns.owner.mu.Lock()
   965  	defer t.tg.pidns.owner.mu.Unlock()
   966  
   967  	if opts.SiblingChildren {
   968  		// We can wait on the children and tracees of any task in the
   969  		// same thread group.
   970  		for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
   971  			wr, any := t.waitParentLocked(opts, parent)
   972  			if wr != nil {
   973  				return wr, nil
   974  			}
   975  			anyWaitableTasks = anyWaitableTasks || any
   976  		}
   977  	} else {
   978  		// We can only wait on this task.
   979  		var wr *WaitResult
   980  		wr, anyWaitableTasks = t.waitParentLocked(opts, t)
   981  		if wr != nil {
   982  			return wr, nil
   983  		}
   984  	}
   985  
   986  	if anyWaitableTasks {
   987  		return nil, ErrNoWaitableEvent
   988  	}
   989  	return nil, linuxerr.ECHILD
   990  }
   991  
   992  // Preconditions: The TaskSet mutex must be locked for writing.
   993  func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) {
   994  	anyWaitableTasks := false
   995  
   996  	for child := range parent.children {
   997  		if !opts.matchesTask(child, parent.tg.pidns, false) {
   998  			continue
   999  		}
  1000  		// Non-leaders don't notify parents on exit and aren't eligible to
  1001  		// be waited on.
  1002  		if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
  1003  			anyWaitableTasks = true
  1004  			if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
  1005  				return wr, anyWaitableTasks
  1006  			}
  1007  		}
  1008  		// Check for group stops and continues. Tasks that have passed
  1009  		// TaskExitInitiated can no longer participate in group stops.
  1010  		if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
  1011  			continue
  1012  		}
  1013  		if child.exitState >= TaskExitInitiated {
  1014  			continue
  1015  		}
  1016  		// If the waiter is in the same thread group as the task's
  1017  		// tracer, do not report its group stops; they will be reported
  1018  		// as ptrace stops instead. This also skips checking for group
  1019  		// continues, but they'll be checked for when scanning tracees
  1020  		// below. (Per kernel/exit.c:wait_consider_task(): "If a
  1021  		// ptracer wants to distinguish the two events for its own
  1022  		// children, it should create a separate process which takes
  1023  		// the role of real parent.")
  1024  		if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
  1025  			continue
  1026  		}
  1027  		anyWaitableTasks = true
  1028  		if opts.Events&EventChildGroupStop != 0 {
  1029  			if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
  1030  				return wr, anyWaitableTasks
  1031  			}
  1032  		}
  1033  		if opts.Events&EventGroupContinue != 0 {
  1034  			if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
  1035  				return wr, anyWaitableTasks
  1036  			}
  1037  		}
  1038  	}
  1039  	for tracee := range parent.ptraceTracees {
  1040  		if !opts.matchesTask(tracee, parent.tg.pidns, true) {
  1041  			continue
  1042  		}
  1043  		// Non-leaders do notify tracers on exit.
  1044  		if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
  1045  			anyWaitableTasks = true
  1046  			if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
  1047  				return wr, anyWaitableTasks
  1048  			}
  1049  		}
  1050  		if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
  1051  			continue
  1052  		}
  1053  		if tracee.exitState >= TaskExitInitiated {
  1054  			continue
  1055  		}
  1056  		anyWaitableTasks = true
  1057  		if opts.Events&EventTraceeStop != 0 {
  1058  			if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
  1059  				return wr, anyWaitableTasks
  1060  			}
  1061  		}
  1062  		if opts.Events&EventGroupContinue != 0 {
  1063  			if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
  1064  				return wr, anyWaitableTasks
  1065  			}
  1066  		}
  1067  	}
  1068  
  1069  	return nil, anyWaitableTasks
  1070  }
  1071  
  1072  // Preconditions: The TaskSet mutex must be locked for writing.
  1073  func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
  1074  	if asPtracer && !target.exitTracerNotified {
  1075  		return nil
  1076  	}
  1077  	if !asPtracer && !target.exitParentNotified {
  1078  		return nil
  1079  	}
  1080  	// Zombied thread group leaders are never waitable until their thread group
  1081  	// is otherwise empty. Usually this is caught by the
  1082  	// target.exitParentNotified check above, but if t is both (in the thread
  1083  	// group of) target's tracer and parent, asPtracer may be true.
  1084  	if target == target.tg.leader && target.tg.tasksCount != 1 {
  1085  		return nil
  1086  	}
  1087  	pid := t.tg.pidns.tids[target]
  1088  	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
  1089  	status := target.exitStatus
  1090  	if !opts.ConsumeEvent {
  1091  		return &WaitResult{
  1092  			Task:   target,
  1093  			TID:    pid,
  1094  			UID:    uid,
  1095  			Event:  EventExit,
  1096  			Status: status,
  1097  		}
  1098  	}
  1099  	// Surprisingly, the exit status reported by a non-consuming wait can
  1100  	// differ from that reported by a consuming wait; the latter will return
  1101  	// the group exit code if one is available.
  1102  	if target.tg.exiting {
  1103  		status = target.tg.exitStatus
  1104  	}
  1105  	// t may be (in the thread group of) target's parent, tracer, or both. We
  1106  	// don't need to check for !exitTracerAcked because tracees are detached
  1107  	// here, and we don't need to check for !exitParentAcked because zombies
  1108  	// will be reaped here.
  1109  	if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
  1110  		target.exitTracerAcked = true
  1111  		target.ptraceTracer.Store((*Task)(nil))
  1112  		delete(t.ptraceTracees, target)
  1113  	}
  1114  	if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
  1115  		target.exitParentAcked = true
  1116  		if target == target.tg.leader {
  1117  			// target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
  1118  			// and won't until after target.exitNotifyLocked() (maybe). Include
  1119  			// target.CPUStats() explicitly. This is consistent with Linux,
  1120  			// which accounts an exited task's cputime to its thread group in
  1121  			// kernel/exit.c:release_task() => __exit_signal(), and uses
  1122  			// thread_group_cputime_adjusted() in wait_task_zombie().
  1123  			t.tg.childCPUStats.Accumulate(target.CPUStats())
  1124  			t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
  1125  			t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
  1126  			// Update t's child max resident set size. The size will be the maximum
  1127  			// of this thread's size and all its childrens' sizes.
  1128  			if t.tg.childMaxRSS < target.tg.maxRSS {
  1129  				t.tg.childMaxRSS = target.tg.maxRSS
  1130  			}
  1131  			if t.tg.childMaxRSS < target.tg.childMaxRSS {
  1132  				t.tg.childMaxRSS = target.tg.childMaxRSS
  1133  			}
  1134  		}
  1135  	}
  1136  	target.exitNotifyLocked(false)
  1137  	return &WaitResult{
  1138  		Task:   target,
  1139  		TID:    pid,
  1140  		UID:    uid,
  1141  		Event:  EventExit,
  1142  		Status: status,
  1143  	}
  1144  }
  1145  
  1146  // updateRSSLocked updates t.tg.maxRSS.
  1147  //
  1148  // Preconditions: The TaskSet mutex must be locked for writing.
  1149  func (t *Task) updateRSSLocked() {
  1150  	if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS {
  1151  		t.tg.maxRSS = mmMaxRSS
  1152  	}
  1153  }
  1154  
  1155  // Preconditions: The TaskSet mutex must be locked for writing.
  1156  func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
  1157  	target.tg.signalHandlers.mu.Lock()
  1158  	defer target.tg.signalHandlers.mu.Unlock()
  1159  	if !target.tg.groupStopWaitable {
  1160  		return nil
  1161  	}
  1162  	pid := t.tg.pidns.tids[target]
  1163  	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
  1164  	sig := target.tg.groupStopSignal
  1165  	if opts.ConsumeEvent {
  1166  		target.tg.groupStopWaitable = false
  1167  	}
  1168  	return &WaitResult{
  1169  		Task:   target,
  1170  		TID:    pid,
  1171  		UID:    uid,
  1172  		Event:  EventChildGroupStop,
  1173  		Status: linux.WaitStatusStopped(uint32(sig)),
  1174  	}
  1175  }
  1176  
  1177  // Preconditions: The TaskSet mutex must be locked for writing.
  1178  func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
  1179  	target.tg.signalHandlers.mu.Lock()
  1180  	defer target.tg.signalHandlers.mu.Unlock()
  1181  	if !target.tg.groupContWaitable {
  1182  		return nil
  1183  	}
  1184  	pid := t.tg.pidns.tids[target]
  1185  	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
  1186  	if opts.ConsumeEvent {
  1187  		target.tg.groupContWaitable = false
  1188  	}
  1189  	return &WaitResult{
  1190  		Task:   target,
  1191  		TID:    pid,
  1192  		UID:    uid,
  1193  		Event:  EventGroupContinue,
  1194  		Status: linux.WaitStatusContinued(),
  1195  	}
  1196  }
  1197  
  1198  // Preconditions: The TaskSet mutex must be locked for writing.
  1199  func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
  1200  	target.tg.signalHandlers.mu.Lock()
  1201  	defer target.tg.signalHandlers.mu.Unlock()
  1202  	if target.stop == nil {
  1203  		return nil
  1204  	}
  1205  	if _, ok := target.stop.(*ptraceStop); !ok {
  1206  		return nil
  1207  	}
  1208  	if target.ptraceCode == 0 {
  1209  		return nil
  1210  	}
  1211  	pid := t.tg.pidns.tids[target]
  1212  	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
  1213  	code := target.ptraceCode
  1214  	if opts.ConsumeEvent {
  1215  		target.ptraceCode = 0
  1216  	}
  1217  	return &WaitResult{
  1218  		Task:   target,
  1219  		TID:    pid,
  1220  		UID:    uid,
  1221  		Event:  EventTraceeStop,
  1222  		Status: linux.WaitStatusStopped(uint32(code)),
  1223  	}
  1224  }
  1225  
  1226  // ExitState returns t's current progress through the exit path.
  1227  func (t *Task) ExitState() TaskExitState {
  1228  	t.tg.pidns.owner.mu.RLock()
  1229  	defer t.tg.pidns.owner.mu.RUnlock()
  1230  	return t.exitState
  1231  }
  1232  
  1233  // ParentDeathSignal returns t's parent death signal.
  1234  func (t *Task) ParentDeathSignal() linux.Signal {
  1235  	t.mu.Lock()
  1236  	defer t.mu.Unlock()
  1237  	return t.parentDeathSignal
  1238  }
  1239  
  1240  // SetParentDeathSignal sets t's parent death signal.
  1241  func (t *Task) SetParentDeathSignal(sig linux.Signal) {
  1242  	t.mu.Lock()
  1243  	defer t.mu.Unlock()
  1244  	t.parentDeathSignal = sig
  1245  }