github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/thread_group.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"sync/atomic"
    19  
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    25  	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    29  )
    30  
// A ThreadGroup is a logical grouping of tasks that has widespread
// significance to other kernel features (e.g. signal handling). ("Thread
// groups" are usually called "processes" in userspace documentation.)
//
// ThreadGroup is a superset of Linux's struct signal_struct.
//
// +stateify savable
type ThreadGroup struct {
	threadGroupNode

	// signalHandlers is the set of signal handlers used by every task in this
	// thread group. (signalHandlers may also be shared with other thread
	// groups.)
	//
	// signalHandlers.mu (hereafter "the signal mutex") protects state related
	// to signal handling, as well as state that usually needs to be atomic
	// with signal handling, for all ThreadGroups and Tasks using
	// signalHandlers. (This is analogous to Linux's use of struct
	// sighand_struct::siglock.)
	//
	// The signalHandlers pointer can only be mutated during an execve
	// (Task.finishExec). Consequently, when it's possible for a task in the
	// thread group to be completing an execve, signalHandlers is protected by
	// the owning TaskSet.mu. Otherwise, it is possible to read the
	// signalHandlers pointer without synchronization. In particular,
	// completing an execve requires that all other tasks in the thread group
	// have exited, so task goroutines do not need the owning TaskSet.mu to
	// read the signalHandlers pointer of their thread groups.
	signalHandlers *SignalHandlers

	// pendingSignals is the set of pending signals that may be handled by any
	// task in this thread group.
	//
	// pendingSignals is protected by the signal mutex.
	pendingSignals pendingSignals

	// If groupStopDequeued is true, a task in the thread group has dequeued a
	// stop signal, but has not yet initiated the group stop.
	//
	// groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED.
	//
	// groupStopDequeued is protected by the signal mutex.
	groupStopDequeued bool

	// groupStopSignal is the signal that caused a group stop to be initiated.
	//
	// groupStopSignal is protected by the signal mutex.
	groupStopSignal linux.Signal

	// groupStopPendingCount is the number of active tasks in the thread group
	// for which Task.groupStopPending is set.
	//
	// groupStopPendingCount is analogous to Linux's
	// signal_struct::group_stop_count.
	//
	// groupStopPendingCount is protected by the signal mutex.
	groupStopPendingCount int

	// If groupStopComplete is true, groupStopPendingCount transitioned from
	// non-zero to zero without an intervening SIGCONT.
	//
	// groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED.
	//
	// groupStopComplete is protected by the signal mutex.
	groupStopComplete bool

	// If groupStopWaitable is true, the thread group is indicating a waitable
	// group stop event (as defined by EventChildGroupStop).
	//
	// Linux represents the analogous state as SIGNAL_STOP_STOPPED being set
	// and group_exit_code being non-zero.
	//
	// groupStopWaitable is protected by the signal mutex.
	groupStopWaitable bool

	// If groupContNotify is true, then a SIGCONT has recently ended a group
	// stop on this thread group, and the first task to observe it should
	// notify its parent. groupContInterrupted is true iff SIGCONT ended an
	// incomplete group stop. If groupContNotify is false, groupContInterrupted is
	// meaningless.
	//
	// Analogues in Linux:
	//
	//	- groupContNotify && groupContInterrupted is represented by
	//		SIGNAL_CLD_STOPPED.
	//
	//	- groupContNotify && !groupContInterrupted is represented by
	//		SIGNAL_CLD_CONTINUED.
	//
	//	- !groupContNotify is represented by neither flag being set.
	//
	// groupContNotify and groupContInterrupted are protected by the signal
	// mutex.
	groupContNotify      bool
	groupContInterrupted bool

	// If groupContWaitable is true, the thread group is indicating a waitable
	// continue event (as defined by EventGroupContinue).
	//
	// groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED.
	//
	// groupContWaitable is protected by the signal mutex.
	groupContWaitable bool

	// exiting is true if all tasks in the ThreadGroup should exit. exiting is
	// analogous to Linux's SIGNAL_GROUP_EXIT.
	//
	// exiting is protected by the signal mutex. exiting can only transition
	// from false to true.
	exiting bool

	// exitStatus is the thread group's exit status.
	//
	// While exiting is false, exitStatus is protected by the signal mutex.
	// When exiting becomes true, exitStatus becomes immutable.
	exitStatus linux.WaitStatus

	// terminationSignal is the signal that this thread group's leader will
	// send to its parent when it exits.
	//
	// terminationSignal is protected by the TaskSet mutex.
	terminationSignal linux.Signal

	// liveGoroutines is the number of non-exited task goroutines in the thread
	// group.
	//
	// liveGoroutines is not saved; it is reset as task goroutines are
	// restarted by Task.Start.
	liveGoroutines sync.WaitGroup `state:"nosave"`

	// timerMu protects the thread group's POSIX interval timer state (timers
	// and nextTimerID below). It is not saved.
	timerMu threadGroupTimerMutex `state:"nosave"`

	// itimerRealTimer implements ITIMER_REAL for the thread group.
	itimerRealTimer *ktime.Timer

	// itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group.
	//
	// itimerVirtSetting is protected by the signal mutex.
	itimerVirtSetting ktime.Setting

	// itimerProfSetting is the ITIMER_PROF setting for the thread group.
	//
	// itimerProfSetting is protected by the signal mutex.
	itimerProfSetting ktime.Setting

	// rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit
	// notifications for the thread group.
	//
	// rlimitCPUSoftSetting is protected by the signal mutex.
	rlimitCPUSoftSetting ktime.Setting

	// cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true,
	// itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is true,
	// or limits.Get(CPU) is finite.
	//
	// cpuTimersEnabled is protected by the signal mutex.
	cpuTimersEnabled atomicbitops.Uint32

	// timers is the thread group's POSIX interval timers. nextTimerID is the
	// TimerID at which allocation should begin searching for an unused ID.
	//
	// timers and nextTimerID are protected by timerMu.
	timers      map[linux.TimerID]*IntervalTimer
	nextTimerID linux.TimerID

	// exitedCPUStats is the CPU usage for all exited tasks in the thread
	// group. exitedCPUStats is protected by the TaskSet mutex.
	exitedCPUStats usage.CPUStats

	// childCPUStats is the CPU usage of all joined descendants of this thread
	// group. childCPUStats is protected by the TaskSet mutex.
	childCPUStats usage.CPUStats

	// ioUsage is the I/O usage for all exited tasks in the thread group.
	// The ioUsage pointer is immutable.
	ioUsage *usage.IO

	// maxRSS is the historical maximum resident set size of the thread group, updated when:
	//
	//	- A task in the thread group exits, since after all tasks have
	//		exited the MemoryManager is no longer reachable.
	//
	//	- The thread group completes an execve, since this changes
	//		MemoryManagers.
	//
	// maxRSS is protected by the TaskSet mutex.
	maxRSS uint64

	// childMaxRSS is the maximum resident set size in bytes of all joined
	// descendants of this thread group.
	//
	// childMaxRSS is protected by the TaskSet mutex.
	childMaxRSS uint64

	// Resource limits for this ThreadGroup. The limits pointer is immutable.
	limits *limits.LimitSet

	// processGroup is the processGroup for this thread group.
	//
	// processGroup is protected by the TaskSet mutex.
	processGroup *ProcessGroup

	// execed indicates an exec has occurred since creation. This will be
	// set by finishExec, and new ThreadGroups will have this field cleared.
	// When execed is set, the processGroup may no longer be changed.
	//
	// execed is protected by the TaskSet mutex.
	execed bool

	// oldRSeqCritical is the thread group's old rseq critical region.
	oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"`

	// tty is the thread group's controlling terminal. If nil, there is no
	// controlling terminal.
	//
	// tty is protected by the signal mutex.
	tty *TTY

	// oomScoreAdj is the thread group's OOM score adjustment. This is
	// currently not used but is maintained for consistency.
	// TODO(gvisor.dev/issue/1967)
	oomScoreAdj atomicbitops.Int32

	// isChildSubreaper and hasChildSubreaper correspond to Linux's
	// signal_struct::is_child_subreaper and has_child_subreaper.
	//
	// Both fields are protected by the TaskSet mutex.
	//
	// Quoting from signal.h:
	// "PR_SET_CHILD_SUBREAPER marks a process, like a service manager, to
	// re-parent orphan (double-forking) child processes to this process
	// instead of 'init'. The service manager is able to receive SIGCHLD
	// signals and is able to investigate the process until it calls
	// wait(). All children of this process will inherit a flag if they
	// should look for a child_subreaper process at exit"
	isChildSubreaper  bool
	hasChildSubreaper bool
}
   269  
   270  // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The
   271  // thread group leader will send its parent terminationSignal when it exits.
   272  // The new thread group isn't visible to the system until a task has been
   273  // created inside of it by a successful call to TaskSet.NewTask.
   274  func (k *Kernel) NewThreadGroup(pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
   275  	tg := &ThreadGroup{
   276  		threadGroupNode: threadGroupNode{
   277  			pidns: pidns,
   278  		},
   279  		signalHandlers:    sh,
   280  		terminationSignal: terminationSignal,
   281  		ioUsage:           &usage.IO{},
   282  		limits:            limits,
   283  	}
   284  	tg.itimerRealTimer = ktime.NewTimer(k.timekeeper.monotonicClock, &itimerRealListener{tg: tg})
   285  	tg.timers = make(map[linux.TimerID]*IntervalTimer)
   286  	tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
   287  	return tg
   288  }
   289  
   290  // saveOldRSeqCritical is invoked by stateify.
   291  func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
   292  	return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
   293  }
   294  
   295  // loadOldRSeqCritical is invoked by stateify.
   296  func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) {
   297  	tg.oldRSeqCritical.Store(r)
   298  }
   299  
   300  // SignalHandlers returns the signal handlers used by tg.
   301  //
   302  // Preconditions: The caller must provide the synchronization required to read
   303  // tg.signalHandlers, as described in the field's comment.
   304  func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
   305  	return tg.signalHandlers
   306  }
   307  
   308  // Limits returns tg's limits.
   309  func (tg *ThreadGroup) Limits() *limits.LimitSet {
   310  	return tg.limits
   311  }
   312  
   313  // Release releases the thread group's resources.
   314  func (tg *ThreadGroup) Release(ctx context.Context) {
   315  	// Timers must be destroyed without holding the TaskSet or signal mutexes
   316  	// since timers send signals with Timer.mu locked.
   317  	tg.itimerRealTimer.Destroy()
   318  	var its []*IntervalTimer
   319  	tg.pidns.owner.mu.Lock()
   320  	tg.signalHandlers.mu.Lock()
   321  	for _, it := range tg.timers {
   322  		its = append(its, it)
   323  	}
   324  	tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved
   325  	tg.signalHandlers.mu.Unlock()
   326  	tg.pidns.owner.mu.Unlock()
   327  	for _, it := range its {
   328  		it.DestroyTimer()
   329  	}
   330  }
   331  
   332  // forEachChildThreadGroupLocked indicates over all child ThreadGroups.
   333  //
   334  // Precondition: TaskSet.mu must be held.
   335  func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
   336  	tg.walkDescendantThreadGroupsLocked(func(child *ThreadGroup) bool {
   337  		fn(child)
   338  		// Don't recurse below the immediate children.
   339  		return false
   340  	})
   341  }
   342  
   343  // walkDescendantThreadGroupsLocked recursively walks all descendent
   344  // ThreadGroups and executes the visitor function. If visitor returns false for
   345  // a given ThreadGroup, then that ThreadGroups descendants are excluded from
   346  // further iteration.
   347  //
   348  // This corresponds to Linux's walk_process_tree.
   349  //
   350  // Precondition: TaskSet.mu must be held.
   351  func (tg *ThreadGroup) walkDescendantThreadGroupsLocked(visitor func(*ThreadGroup) bool) {
   352  	for t := tg.tasks.Front(); t != nil; t = t.Next() {
   353  		for child := range t.children {
   354  			if child == child.tg.leader {
   355  				if !visitor(child.tg) {
   356  					// Don't recurse below child.
   357  					continue
   358  				}
   359  				child.tg.walkDescendantThreadGroupsLocked(visitor)
   360  			}
   361  		}
   362  	}
   363  }
   364  
// SetControllingTTY sets tty as the controlling terminal of tg.
//
// steal requests that tty be taken away from any other session currently
// controlling it (requires CAP_SYS_ADMIN in the root user namespace).
// isReadable indicates whether the caller's terminal file description is
// readable; setting a controlling terminal via an unreadable fd also requires
// CAP_SYS_ADMIN.
//
// Returns EINVAL if tg is not a session leader or already has a controlling
// terminal, and EPERM if the required capability is missing.
func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) error {
	tty.mu.Lock()
	defer tty.mu.Unlock()

	// We might be asked to set the controlling terminal of multiple
	// processes, so we lock both the TaskSet and SignalHandlers.
	tg.pidns.owner.mu.Lock()
	defer tg.pidns.owner.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()

	// "The calling process must be a session leader and not have a
	// controlling terminal already." - tty_ioctl(4)
	if tg.processGroup.session.leader != tg || tg.tty != nil {
		return linuxerr.EINVAL
	}

	creds := auth.CredentialsFromContext(tg.leader)
	hasAdmin := creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root())

	// "If this terminal is already the controlling terminal of a different
	// session group, then the ioctl fails with EPERM, unless the caller
	// has the CAP_SYS_ADMIN capability and arg equals 1, in which case the
	// terminal is stolen, and all processes that had it as controlling
	// terminal lose it." - tty_ioctl(4)
	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
		// Stealing requires CAP_SYS_ADMIN in the root user namespace.
		if !hasAdmin || !steal {
			return linuxerr.EPERM
		}
		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
		for othertg := range tg.pidns.owner.Root.tgids {
			// This won't deadlock by locking tg.signalHandlers
			// because at this point:
			//	- We only lock signalHandlers if it's in the same
			//		session as the tty's controlling thread group.
			//	- We know that the calling thread group is not in
			//		the same session as the tty's controlling thread
			//		group.
			if othertg.processGroup.session == tty.tg.processGroup.session {
				othertg.signalHandlers.mu.NestedLock(signalHandlersLockTg)
				othertg.tty = nil
				othertg.signalHandlers.mu.NestedUnlock(signalHandlersLockTg)
			}
		}
	}

	if !isReadable && !hasAdmin {
		return linuxerr.EPERM
	}

	// Set the controlling terminal and foreground process group.
	tg.tty = tty
	tg.processGroup.session.foreground = tg.processGroup
	// Set this as the controlling process of the terminal.
	tty.tg = tg

	return nil
}
   425  
// ReleaseControllingTTY gives up tty as the controlling tty of tg.
//
// Returns ENOTTY if tty is not tg's controlling terminal. If tg is the
// session leader, every thread group in the session loses its controlling
// terminal, and the foreground process group is sent SIGHUP then SIGCONT;
// the last signal-delivery error (if any) is returned.
func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
	tty.mu.Lock()
	defer tty.mu.Unlock()

	// We might be asked to set the controlling terminal of multiple
	// processes, so we lock both the TaskSet and SignalHandlers.
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()

	// Just below, we may re-lock signalHandlers in order to send signals.
	// Thus we can't defer Unlock here.
	tg.signalHandlers.mu.Lock()

	if tg.tty == nil || tg.tty != tty {
		tg.signalHandlers.mu.Unlock()
		return linuxerr.ENOTTY
	}

	// "If the process was session leader, then send SIGHUP and SIGCONT to
	// the foreground process group and all processes in the current
	// session lose their controlling terminal." - tty_ioctl(4)
	// Remove tty as the controlling tty for each process in the session,
	// then send them SIGHUP and SIGCONT.

	// If we're not the session leader, we don't have to do much.
	if tty.tg != tg {
		tg.tty = nil
		tg.signalHandlers.mu.Unlock()
		return nil
	}

	// Drop our own signal mutex before iterating over the session's thread
	// groups; the loop below re-acquires each group's signal mutex
	// (including possibly our own) one at a time.
	tg.signalHandlers.mu.Unlock()

	// We're the session leader. SIGHUP and SIGCONT the foreground process
	// group and remove all controlling terminals in the session.
	var lastErr error
	for othertg := range tg.pidns.owner.Root.tgids {
		if othertg.processGroup.session == tg.processGroup.session {
			othertg.signalHandlers.mu.Lock()
			othertg.tty = nil
			if othertg.processGroup == tg.processGroup.session.foreground {
				if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil {
					lastErr = err
				}
				if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil {
					lastErr = err
				}
			}
			othertg.signalHandlers.mu.Unlock()
		}
	}

	return lastErr
}
   481  
   482  // ForegroundProcessGroupID returns the foreground process group ID of the
   483  // thread group.
   484  func (tg *ThreadGroup) ForegroundProcessGroupID(tty *TTY) (ProcessGroupID, error) {
   485  	tty.mu.Lock()
   486  	defer tty.mu.Unlock()
   487  
   488  	tg.pidns.owner.mu.Lock()
   489  	defer tg.pidns.owner.mu.Unlock()
   490  	tg.signalHandlers.mu.Lock()
   491  	defer tg.signalHandlers.mu.Unlock()
   492  
   493  	// fd must refer to the controlling terminal of the calling process.
   494  	// See tcgetpgrp(3)
   495  	if tg.tty != tty {
   496  		return 0, linuxerr.ENOTTY
   497  	}
   498  
   499  	return tg.processGroup.session.foreground.id, nil
   500  }
   501  
   502  // SetForegroundProcessGroupID sets the foreground process group of tty to
   503  // pgid.
   504  func (tg *ThreadGroup) SetForegroundProcessGroupID(tty *TTY, pgid ProcessGroupID) error {
   505  	tty.mu.Lock()
   506  	defer tty.mu.Unlock()
   507  
   508  	tg.pidns.owner.mu.Lock()
   509  	defer tg.pidns.owner.mu.Unlock()
   510  	tg.signalHandlers.mu.Lock()
   511  	defer tg.signalHandlers.mu.Unlock()
   512  
   513  	// tty must be the controlling terminal.
   514  	if tg.tty != tty {
   515  		return linuxerr.ENOTTY
   516  	}
   517  
   518  	// pgid must be positive.
   519  	if pgid < 0 {
   520  		return linuxerr.EINVAL
   521  	}
   522  
   523  	// pg must not be empty. Empty process groups are removed from their
   524  	// pid namespaces.
   525  	pg, ok := tg.pidns.processGroups[pgid]
   526  	if !ok {
   527  		return linuxerr.ESRCH
   528  	}
   529  
   530  	// pg must be part of this process's session.
   531  	if tg.processGroup.session != pg.session {
   532  		return linuxerr.EPERM
   533  	}
   534  
   535  	signalAction := tg.signalHandlers.actions[linux.SIGTTOU]
   536  	// If the calling process is a member of a background group, a SIGTTOU
   537  	// signal is sent to all members of this background process group.
   538  	// We need also need to check whether it is ignoring or blocking SIGTTOU.
   539  	ignored := signalAction.Handler == linux.SIG_IGN
   540  	blocked := (linux.SignalSet(tg.leader.signalMask.RacyLoad()) & linux.SignalSetOf(linux.SIGTTOU)) != 0
   541  	if tg.processGroup.id != tg.processGroup.session.foreground.id && !ignored && !blocked {
   542  		tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGTTOU), true)
   543  		return linuxerr.ERESTARTSYS
   544  	}
   545  
   546  	tg.processGroup.session.foreground = pg
   547  	return nil
   548  }
   549  
   550  // SetChildSubreaper marks this ThreadGroup sets the isChildSubreaper field on
   551  // this ThreadGroup, and marks all child ThreadGroups as having a subreaper.
   552  // Recursion stops if we find another subreaper process, which is either a
   553  // ThreadGroup with isChildSubreaper bit set, or a ThreadGroup with PID=1
   554  // inside a PID namespace.
   555  func (tg *ThreadGroup) SetChildSubreaper(isSubreaper bool) {
   556  	ts := tg.TaskSet()
   557  	ts.mu.Lock()
   558  	defer ts.mu.Unlock()
   559  	tg.isChildSubreaper = isSubreaper
   560  	tg.walkDescendantThreadGroupsLocked(func(child *ThreadGroup) bool {
   561  		// Is this child PID 1 in its PID namespace, or already a
   562  		// subreaper?
   563  		if child.isInitInLocked(child.PIDNamespace()) || child.isChildSubreaper {
   564  			// Don't set hasChildSubreaper, and don't recurse.
   565  			return false
   566  		}
   567  		child.hasChildSubreaper = isSubreaper
   568  		return true // Recurse.
   569  	})
   570  }
   571  
   572  // IsChildSubreaper returns whether this ThreadGroup is a child subreaper.
   573  func (tg *ThreadGroup) IsChildSubreaper() bool {
   574  	ts := tg.TaskSet()
   575  	ts.mu.RLock()
   576  	defer ts.mu.RUnlock()
   577  	return tg.isChildSubreaper
   578  }
   579  
   580  // IsInitIn returns whether this ThreadGroup has TID 1 int the given
   581  // PIDNamespace.
   582  func (tg *ThreadGroup) IsInitIn(pidns *PIDNamespace) bool {
   583  	ts := tg.TaskSet()
   584  	ts.mu.RLock()
   585  	defer ts.mu.RUnlock()
   586  	return tg.isInitInLocked(pidns)
   587  }
   588  
   589  // isInitInLocked returns whether this ThreadGroup has TID 1 in the given
   590  // PIDNamespace.
   591  //
   592  // Preconditions: TaskSet.mu must be locked.
   593  func (tg *ThreadGroup) isInitInLocked(pidns *PIDNamespace) bool {
   594  	return pidns.tgids[tg] == initTID
   595  }
   596  
// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
//
// +stateify savable
type itimerRealListener struct {
	// tg is the thread group that receives SIGALRM when the timer expires.
	tg *ThreadGroup
}
   603  
   604  // NotifyTimer implements ktime.TimerListener.NotifyTimer.
   605  func (l *itimerRealListener) NotifyTimer(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
   606  	l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM))
   607  	return ktime.Setting{}, false
   608  }