github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	gocontext "context"
    19  	"runtime/trace"
    20  	"sync/atomic"
    21  
    22  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    23  	"github.com/SagerNet/gvisor/pkg/bpf"
    24  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    25  	"github.com/SagerNet/gvisor/pkg/hostarch"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/inet"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/futex"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
    31  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/platform"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    35  	"github.com/SagerNet/gvisor/pkg/sync"
    36  	"github.com/SagerNet/gvisor/pkg/waiter"
    37  )
    38  
     39  // Task represents a thread of execution in the untrusted app.  It
     40  // includes registers and any thread-specific state that you would
     41  // normally expect.
     42  //
     43  // Each task is associated with a goroutine, called the task goroutine, that
     44  // executes code (application code, system calls, etc.) on behalf of that task.
     45  // See Task.run (task_run.go).
     46  //
     47  // All fields that are "owned by the task goroutine" can only be mutated by the
     48  // task goroutine while it is running. The task goroutine does not require
     49  // synchronization to read these fields, although it still requires
     50  // synchronization as described for those fields to mutate them.
     51  //
     52  // All fields that are "exclusive to the task goroutine" can only be accessed
     53  // by the task goroutine while it is running. The task goroutine does not
     54  // require synchronization to read or write these fields.
     55  //
     56  // +stateify savable
     57  type Task struct {
     58  	taskNode
     59  
     60  	// goid is the task goroutine's ID. goid is owned by the task goroutine,
     61  	// but since it's used to detect cases where non-task goroutines
     62  	// incorrectly access state owned by, or exclusive to, the task goroutine,
     63  	// goid is always accessed using atomic memory operations.
     64  	goid int64 `state:"nosave"`
     65  
     66  	// runState is what the task goroutine is executing if it is not stopped.
     67  	// If runState is nil, the task goroutine should exit or has exited.
     68  	// runState is exclusive to the task goroutine.
     69  	runState taskRunState
     70  
     71  	// taskWorkCount represents the current size of the task work queue. It is
     72  	// used to avoid acquiring taskWorkMu when the queue is empty.
     73  	//
     74  	// Must be accessed with atomic memory operations.
     75  	taskWorkCount int32
     76  
     77  	// taskWorkMu protects taskWork.
     78  	taskWorkMu sync.Mutex `state:"nosave"`
     79  
     80  	// taskWork is a queue of work to be executed before resuming user execution.
     81  	// It is similar to the task_work mechanism in Linux.
     82  	//
     83  	// taskWork is exclusive to the task goroutine.
     84  	taskWork []TaskWorker
     85  
     86  	// haveSyscallReturn is true if image.Arch().Return() represents a value
     87  	// returned by a syscall (or set by ptrace after a syscall).
     88  	//
     89  	// haveSyscallReturn is exclusive to the task goroutine.
     90  	haveSyscallReturn bool
     91  
     92  	// interruptChan is notified whenever the task goroutine is interrupted
     93  	// (usually by a pending signal). interruptChan is effectively a condition
     94  	// variable that can be used in select statements.
     95  	//
     96  	// interruptChan is not saved; because saving interrupts all tasks,
     97  	// interruptChan is always notified after restore (see Task.run).
     98  	interruptChan chan struct{} `state:"nosave"`
     99  
    100  	// gosched contains the current scheduling state of the task goroutine.
    101  	//
    102  	// gosched is protected by goschedSeq. gosched is owned by the task
    103  	// goroutine.
    104  	goschedSeq sync.SeqCount `state:"nosave"`
    105  	gosched    TaskGoroutineSchedInfo
    106  
    107  	// yieldCount is the number of times the task goroutine has called
    108  	// Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
    109  	// Task.Yield(), voluntarily ceasing execution.
    110  	//
    111  	// yieldCount is accessed using atomic memory operations. yieldCount is
    112  	// owned by the task goroutine.
    113  	yieldCount uint64
    114  
    115  	// pendingSignals is the set of pending signals that may be handled only by
    116  	// this task.
    117  	//
    118  	// pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
    119  	// (hereafter "the signal mutex"); see comment on
    120  	// ThreadGroup.signalHandlers.
    121  	pendingSignals pendingSignals
    122  
    123  	// signalMask is the set of signals whose delivery is currently blocked.
    124  	//
    125  	// signalMask is accessed using atomic memory operations, and is protected
    126  	// by the signal mutex (such that reading signalMask is safe if either the
    127  	// signal mutex is locked or if atomic memory operations are used, while
    128  	// writing signalMask requires both). signalMask is owned by the task
    129  	// goroutine.
    130  	signalMask linux.SignalSet
    131  
    132  	// If the task goroutine is currently executing Task.sigtimedwait,
    133  	// realSignalMask is the previous value of signalMask, which has temporarily
    134  	// been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
    135  	//
    136  	// realSignalMask is exclusive to the task goroutine.
    137  	realSignalMask linux.SignalSet
    138  
    139  	// If haveSavedSignalMask is true, savedSignalMask is the signal mask that
    140  	// should be applied after the task has either delivered one signal to a
    141  	// user handler or is about to resume execution in the untrusted
    142  	// application.
    143  	//
    144  	// Both haveSavedSignalMask and savedSignalMask are exclusive to the task
    145  	// goroutine.
    146  	haveSavedSignalMask bool
    147  	savedSignalMask     linux.SignalSet
    148  
    149  	// signalStack is the alternate signal stack used by signal handlers for
    150  	// which the SA_ONSTACK flag is set.
    151  	//
    152  	// signalStack is exclusive to the task goroutine.
    153  	signalStack linux.SignalStack
    154  
    155  	// signalQueue is a set of registered waiters for signal-related events.
    156  	//
    157  	// signalQueue is protected by the signal mutex. Note that the task does
    158  	// not implement all queue methods, specifically the readiness checks.
    159  	// The task only broadcasts a notification on signal delivery.
    160  	signalQueue waiter.Queue `state:"zerovalue"`
    161  
    162  	// If groupStopPending is true, the task should participate in a group
    163  	// stop in the interrupt path.
    164  	//
    165  	// groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
    166  	//
    167  	// groupStopPending is protected by the signal mutex.
    168  	groupStopPending bool
    169  
    170  	// If groupStopAcknowledged is true, the task has already acknowledged that
    171  	// it is entering the most recent group stop that has been initiated on its
    172  	// thread group.
    173  	//
    174  	// groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
    175  	//
    176  	// groupStopAcknowledged is protected by the signal mutex.
    177  	groupStopAcknowledged bool
    178  
    179  	// If trapStopPending is true, the task goroutine should enter a
    180  	// PTRACE_INTERRUPT-induced stop from the interrupt path.
    181  	//
    182  	// trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
    183  	// Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
    184  	// JOBCTL_STOP_PENDING.
    185  	//
    186  	// trapStopPending is protected by the signal mutex.
    187  	trapStopPending bool
    188  
    189  	// If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
    190  	// stop has begun or ended since the last time the task entered a
    191  	// ptrace-stop from the group-stop path.
    192  	//
    193  	// trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
    194  	//
    195  	// trapNotifyPending is protected by the signal mutex.
    196  	trapNotifyPending bool
    197  
    198  	// If stop is not nil, it is the internally-initiated condition that
    199  	// currently prevents the task goroutine from running.
    200  	//
    201  	// stop is protected by the signal mutex.
    202  	stop TaskStop
    203  
    204  	// stopCount is the number of active external stops (calls to
    205  	// Task.BeginExternalStop that have not been paired with a call to
    206  	// Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
    207  	// non-zero if the task goroutine should stop.
    208  	//
    209  	// Mutating stopCount requires both locking the signal mutex and using
    210  	// atomic memory operations. Reading stopCount requires either locking the
    211  	// signal mutex or using atomic memory operations. This allows Task.doStop
    212  	// to require only a single atomic read in the common case where stopCount
    213  	// is 0.
    214  	//
    215  	// stopCount is not saved, because external stops cannot be retained across
    216  	// a save/restore cycle. (Suppose a sentryctl command issues an external
    217  	// stop; after a save/restore cycle, the restored sentry has no knowledge
    218  	// of the pre-save sentryctl command, and the stopped task would remain
    219  	// stopped forever.)
    220  	stopCount int32 `state:"nosave"`
    221  
    222  	// endStopCond is signaled when stopCount transitions to 0. The combination
    223  	// of stopCount and endStopCond effectively forms a sync.WaitGroup, but
    224  	// WaitGroup provides no way to read its counter value.
    225  	//
    226  	// Invariant: endStopCond.L is the signal mutex. (This is not racy because
    227  	// sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
    228  	// calls sync.Cond.Wait; and only the task goroutine can change the
    229  	// identity of the signal mutex, in Task.finishExec.)
    230  	endStopCond sync.Cond `state:"nosave"`
    231  
    232  	// exitStatus is the task's exit status.
    233  	//
    234  	// exitStatus is protected by the signal mutex.
    235  	exitStatus ExitStatus
    236  
    237  	// syscallRestartBlock represents a custom restart function to run in
    238  	// restart_syscall(2) to resume an interrupted syscall.
    239  	//
    240  	// syscallRestartBlock is exclusive to the task goroutine.
    241  	syscallRestartBlock SyscallRestartBlock
    242  
    243  	// p provides the mechanism by which the task runs code in userspace. The p
    244  	// interface object is immutable.
    245  	p platform.Context `state:"nosave"`
    246  
    247  	// k is the Kernel that this task belongs to. The k pointer is immutable.
    248  	k *Kernel
    249  
    250  	// containerID has no equivalent in Linux; it's used by runsc to track all
    251  	// tasks that belong to a given container since cgroups aren't implemented.
    252  	// It's inherited by the children, is immutable, and may be empty.
    253  	//
    254  	// NOTE: cgroups can be used to track this when implemented.
    255  	containerID string
    256  
    257  	// mu protects some of the following fields.
    258  	mu sync.Mutex `state:"nosave"`
    259  
    260  	// image holds task data provided by the ELF loader.
    261  	//
    262  	// image is protected by mu, and is owned by the task goroutine.
    263  	image TaskImage
    264  
    265  	// fsContext is the task's filesystem context.
    266  	//
    267  	// fsContext is protected by mu, and is owned by the task goroutine.
    268  	fsContext *FSContext
    269  
    270  	// fdTable is the task's file descriptor table.
    271  	//
    272  	// fdTable is protected by mu, and is owned by the task goroutine.
    273  	fdTable *FDTable
    274  
    275  	// If vforkParent is not nil, it is the task that created this task with
    276  	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
    277  	// this TaskImage is released.
    278  	//
    279  	// vforkParent is protected by the TaskSet mutex.
    280  	vforkParent *Task
    281  
    282  	// exitState is the task's progress through the exit path.
    283  	//
    284  	// exitState is protected by the TaskSet mutex. exitState is owned by the
    285  	// task goroutine.
    286  	exitState TaskExitState
    287  
    288  	// exitTracerNotified is true if the exit path has either signaled the
    289  	// task's tracer to indicate the exit, or determined that no such signal is
    290  	// needed. exitTracerNotified can only be true if exitState is
    291  	// TaskExitZombie or TaskExitDead.
    292  	//
    293  	// exitTracerNotified is protected by the TaskSet mutex.
    294  	exitTracerNotified bool
    295  
    296  	// exitTracerAcked is true if exitTracerNotified is true and either the
    297  	// task's tracer has acknowledged the exit notification, or the exit path
    298  	// has determined that no such notification is needed.
    299  	//
    300  	// exitTracerAcked is protected by the TaskSet mutex.
    301  	exitTracerAcked bool
    302  
    303  	// exitParentNotified is true if the exit path has either signaled the
    304  	// task's parent to indicate the exit, or determined that no such signal is
    305  	// needed. exitParentNotified can only be true if exitState is
    306  	// TaskExitZombie or TaskExitDead.
    307  	//
    308  	// exitParentNotified is protected by the TaskSet mutex.
    309  	exitParentNotified bool
    310  
    311  	// exitParentAcked is true if exitParentNotified is true and either the
    312  	// task's parent has acknowledged the exit notification, or the exit path
    313  	// has determined that no such acknowledgment is needed.
    314  	//
    315  	// exitParentAcked is protected by the TaskSet mutex.
    316  	exitParentAcked bool
    317  
    318  	// goroutineStopped is a WaitGroup whose counter value is 1 when the task
    319  	// goroutine is running and 0 when the task goroutine is stopped or has
    320  	// exited.
    321  	goroutineStopped sync.WaitGroup `state:"nosave"`
    322  
    323  	// ptraceTracer is the task that is ptrace-attached to this one. If
    324  	// ptraceTracer is nil, this task is not being traced. Note that due to
    325  	// atomic.Value limitations (atomic.Value.Store(nil) panics), a nil
    326  	// ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)).
    327  	//
    328  	// ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
    329  	// operations. This allows paths that wouldn't otherwise lock the TaskSet
    330  	// mutex, notably the syscall path, to check if ptraceTracer is nil without
    331  	// additional synchronization.
    332  	ptraceTracer atomic.Value `state:".(*Task)"`
    333  
    334  	// ptraceTracees is the set of tasks that this task is ptrace-attached to.
    335  	//
    336  	// ptraceTracees is protected by the TaskSet mutex.
    337  	ptraceTracees map[*Task]struct{}
    338  
    339  	// ptraceSeized is true if ptraceTracer attached to this task with
    340  	// PTRACE_SEIZE.
    341  	//
    342  	// ptraceSeized is protected by the TaskSet mutex.
    343  	ptraceSeized bool
    344  
    345  	// ptraceOpts contains ptrace options explicitly set by the tracer. If
    346  	// ptraceTracer is nil, ptraceOpts is expected to be the zero value.
    347  	//
    348  	// ptraceOpts is protected by the TaskSet mutex.
    349  	ptraceOpts ptraceOptions
    350  
    351  	// ptraceSyscallMode controls ptrace behavior around syscall entry and
    352  	// exit.
    353  	//
    354  	// ptraceSyscallMode is protected by the TaskSet mutex.
    355  	ptraceSyscallMode ptraceSyscallMode
    356  
    357  	// If ptraceSinglestep is true, the next time the task executes application
    358  	// code, single-stepping should be enabled. ptraceSinglestep is stored
    359  	// independently of the architecture-specific trap flag because tracer
    360  	// detaching (which can happen concurrently with the tracee's execution if
    361  	// the tracer exits) must disable single-stepping, and the task's
    362  	// architectural state is implicitly exclusive to the task goroutine (no
    363  	// synchronization occurs before passing registers to SwitchToApp).
    364  	//
    365  	// ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
    366  	//
    367  	// ptraceSinglestep is protected by the TaskSet mutex.
    368  	ptraceSinglestep bool
    369  
    370  	// If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
    371  	// time that t entered the ptrace stop, reset to 0 when the tracer
    372  	// acknowledges the stop with a wait*() syscall. Otherwise, it is the
    373  	// signal number passed to the ptrace operation that ended the last ptrace
    374  	// stop on this task. In the latter case, the effect of ptraceCode depends
    375  	// on the nature of the ptrace stop; signal-delivery-stop uses it to
    376  	// conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
    377  	// signal to the task after leaving the stop, and PTRACE_EVENT stops and
    378  	// traced group stops ignore it entirely.
    379  	//
    380  	// Linux contextually stores the equivalent of ptraceCode in
    381  	// task_struct::exit_code.
    382  	//
    383  	// ptraceCode is protected by the TaskSet mutex.
    384  	ptraceCode int32
    385  
    386  	// ptraceSiginfo is the value returned to the tracer by
    387  	// ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
    388  	// (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
    389  	// ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
    390  	// required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
    391  	// is in turn required to distinguish group stops from other ptrace stops,
    392  	// per subsection "Group-stop" in ptrace(2)).
    393  	//
    394  	// ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
    395  	//
    396  	// ptraceSiginfo is protected by the TaskSet mutex.
    397  	ptraceSiginfo *linux.SignalInfo
    398  
    399  	// ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
    400  	// the tracer by ptrace(PTRACE_GETEVENTMSG).
    401  	//
    402  	// ptraceEventMsg is protected by the TaskSet mutex.
    403  	ptraceEventMsg uint64
    404  
    405  	// ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
    406  	// been added before. This is used during task exit to decide whether we need
    407  	// to clean up YAMA exceptions.
    408  	//
    409  	// ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
    410  	ptraceYAMAExceptionAdded bool
    411  
    412  	// The struct that holds the IO-related usage. The ioUsage pointer is
    413  	// immutable.
    414  	ioUsage *usage.IO
    415  
    416  	// logPrefix is a string containing the task's thread ID in the root PID
    417  	// namespace, and is prepended to log messages emitted by Task.Infof etc.
    418  	logPrefix atomic.Value `state:"nosave"`
    419  
    420  	// traceContext and traceTask are both used for tracing, and are
    421  	// updated along with the logPrefix in updateInfoLocked.
    422  	//
    423  	// These are exclusive to the task goroutine.
    424  	traceContext gocontext.Context `state:"nosave"`
    425  	traceTask    *trace.Task       `state:"nosave"`
    426  
    427  	// creds is the task's credentials.
    428  	//
    429  	// creds.Load() may be called without synchronization. creds.Store() is
    430  	// serialized by mu. creds is owned by the task goroutine. All
    431  	// auth.Credentials objects that creds may point to, or have pointed to
    432  	// in the past, must be treated as immutable.
    433  	creds auth.AtomicPtrCredentials
    434  
    435  	// utsns is the task's UTS namespace.
    436  	//
    437  	// utsns is protected by mu. utsns is owned by the task goroutine.
    438  	utsns *UTSNamespace
    439  
    440  	// ipcns is the task's IPC namespace.
    441  	//
    442  	// ipcns is protected by mu. ipcns is owned by the task goroutine.
    443  	ipcns *IPCNamespace
    444  
    445  	// abstractSockets tracks abstract sockets that are in use.
    446  	//
    447  	// abstractSockets is protected by mu.
    448  	abstractSockets *AbstractSocketNamespace
    449  
    450  	// mountNamespaceVFS2 is the task's mount namespace.
    451  	//
    452  	// It is protected by mu. It is owned by the task goroutine.
    453  	mountNamespaceVFS2 *vfs.MountNamespace
    454  
    455  	// parentDeathSignal is sent to this task's thread group when its parent exits.
    456  	//
    457  	// parentDeathSignal is protected by mu.
    458  	parentDeathSignal linux.Signal
    459  
    460  	// syscallFilters is all seccomp-bpf syscall filters applicable to the
    461  	// task, in the order in which they were installed. The type of the atomic
    462  	// is []bpf.Program. Writing needs to be protected by the signal mutex.
    463  	//
    464  	// syscallFilters is owned by the task goroutine.
    465  	syscallFilters atomic.Value `state:".([]bpf.Program)"`
    466  
    467  	// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
    468  	// task's virtual address space; when the task exits, set the pointed-to
    469  	// ThreadID to 0, and wake any futex waiters.
    470  	//
    471  	// cleartid is exclusive to the task goroutine.
    472  	cleartid hostarch.Addr
    473  
    474  	// This is mostly a fake cpumask just for sched_set/getaffinity as we
    475  	// don't really control the affinity.
    476  	//
    477  	// Invariant: allowedCPUMask.Size() ==
    478  	// sched.CPUMaskSize(Kernel.applicationCores).
    479  	//
    480  	// allowedCPUMask is protected by mu.
    481  	allowedCPUMask sched.CPUSet
    482  
    483  	// cpu is the fake cpu number returned by getcpu(2). cpu is ignored
    484  	// entirely if Kernel.useHostCores is true.
    485  	//
    486  	// cpu is accessed using atomic memory operations.
    487  	cpu int32
    488  
    489  	// This is used to keep track of changes made to a process' priority/niceness.
    490  	// It is mostly used to provide some reasonable return value from
    491  	// getpriority(2) after a call to setpriority(2) has been made.
    492  	// We currently do not actually modify a process' scheduling priority.
    493  	// NOTE: This represents the userspace view of priority (nice).
    494  	// This means that the value should be in the range [-20, 19].
    495  	//
    496  	// niceness is protected by mu.
    497  	niceness int
    498  
    499  	// This is used to track the numa policy for the current thread. This can be
    500  	// modified through a set_mempolicy(2) syscall. Since we always report a
    501  	// single numa node, all policies are no-ops. We only track this information
    502  	// so that we can return reasonable values if the application calls
    503  	// get_mempolicy(2) after setting a non-default policy. Note that in the
    504  	// real syscall, nodemask can be longer than a single unsigned long, but we
    505  	// always report a single node so never need to save more than a single
    506  	// bit.
    507  	//
    508  	// numaPolicy and numaNodeMask are protected by mu.
    509  	numaPolicy   linux.NumaPolicy
    510  	numaNodeMask uint64
    511  
    512  	// netns is the task's network namespace. netns is never nil.
    513  	//
    514  	// netns is protected by mu.
    515  	netns *inet.Namespace
    516  
    517  	// If rseqPreempted is true, before the next call to p.Switch(),
    518  	// interrupt rseq critical regions as defined by rseqAddr and
    519  	// tg.oldRSeqCritical and write the task goroutine's CPU number to
    520  	// rseqAddr/oldRSeqCPUAddr.
    521  	//
    522  	// We support two ABIs for restartable sequences:
    523  	//
    524  	//  1. The upstream interface added in v4.18,
    525  	//  2. An "old" interface never merged upstream. In the implementation,
    526  	//     this is referred to as "old rseq".
    527  	//
    528  	// rseqPreempted is exclusive to the task goroutine.
    529  	rseqPreempted bool `state:"nosave"`
    530  
    531  	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
    532  	//
    533  	// If rseq is unused, rseqCPU is -1 for convenient use in
    534  	// platform.Context.Switch.
    535  	//
    536  	// rseqCPU is exclusive to the task goroutine.
    537  	rseqCPU int32
    538  
    539  	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
    540  	//
    541  	// oldRSeqCPUAddr is exclusive to the task goroutine.
    542  	oldRSeqCPUAddr hostarch.Addr
    543  
    544  	// rseqAddr is a pointer to the userspace linux.RSeq structure.
    545  	//
    546  	// rseqAddr is exclusive to the task goroutine.
    547  	rseqAddr hostarch.Addr
    548  
    549  	// rseqSignature is the signature that the rseq abort IP must be signed
    550  	// with.
    551  	//
    552  	// rseqSignature is exclusive to the task goroutine.
    553  	rseqSignature uint32
    554  
    555  	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
    556  	// implementations that require an intermediate buffer to copy data
    557  	// into/out of. It prevents these buffers from being allocated/zeroed in
    558  	// each syscall and eventually garbage collected.
    559  	//
    560  	// copyScratchBuffer is exclusive to the task goroutine.
    561  	copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`
    562  
    563  	// blockingTimer is used for blocking timeouts. blockingTimerChan is the
    564  	// channel that is sent to when blockingTimer fires.
    565  	//
    566  	// blockingTimer is exclusive to the task goroutine.
    567  	blockingTimer     *ktime.Timer    `state:"nosave"`
    568  	blockingTimerChan <-chan struct{} `state:"nosave"`
    569  
    570  	// futexWaiter is used for futex(FUTEX_WAIT) syscalls.
    571  	//
    572  	// futexWaiter is exclusive to the task goroutine.
    573  	futexWaiter *futex.Waiter `state:"nosave"`
    574  
    575  	// robustList is a pointer to the head of the task's robust futex
    576  	// list.
    577  	robustList hostarch.Addr
    578  
    579  	// startTime is the real time at which the task started. It is set when
    580  	// a Task is created or invokes execve(2).
    581  	//
    582  	// startTime is protected by mu.
    583  	startTime ktime.Time
    584  
    585  	// kcov is the kcov instance providing code coverage owned by this task.
    586  	//
    587  	// kcov is exclusive to the task goroutine.
    588  	kcov *Kcov
    589  
    590  	// cgroups is the set of cgroups this task belongs to. This may be empty if
    591  	// no cgroup controllers are enabled. Protected by mu.
    592  	//
    593  	// +checklocks:mu
    594  	cgroups map[Cgroup]struct{}
    595  }
   596  
    597  func (t *Task) savePtraceTracer() *Task {
         	// Save-side counterpart of loadPtraceTracer for the ptraceTracer field,
         	// whose `state:".(*Task)"` tag asks for it to be saved as a *Task.
         	// Per the field comment, "no tracer" is stored as a typed nil
         	// (*Task)(nil), which comes back here as a nil *Task. The assertion is
         	// unchecked, so it panics if nothing was ever stored.
    598  	return t.ptraceTracer.Load().(*Task)
    599  }
   600  
    601  func (t *Task) loadPtraceTracer(tracer *Task) {
         	// Restore-side counterpart of savePtraceTracer. A nil tracer reaches
         	// Store as a typed nil *Task, which avoids the atomic.Value.Store(nil)
         	// panic mentioned in the ptraceTracer field comment.
    602  	t.ptraceTracer.Store(tracer)
    603  }
   604  
   605  func (t *Task) saveSyscallFilters() []bpf.Program {
   606  	if f := t.syscallFilters.Load(); f != nil {
   607  		return f.([]bpf.Program)
   608  	}
   609  	return nil
   610  }
   611  
    612  func (t *Task) loadSyscallFilters(filters []bpf.Program) {
         	// Restore-side counterpart of saveSyscallFilters. filters is stored even
         	// when it is a nil slice: the interface passed to Store still carries the
         	// []bpf.Program type, so Store does not panic.
         	//
         	// NOTE(review): the syscallFilters field comment says writes need the
         	// signal mutex, which is not taken here; presumably that is safe because
         	// this only runs during restore. Confirm.
    613  	t.syscallFilters.Store(filters)
    614  }
   615  
    616  // afterLoad is invoked by stateify.
         //
         // It rebuilds Task fields that are tagged `state:"nosave"` and therefore
         // come back from a restore as zero values.
    617  func (t *Task) afterLoad() {
         	// Per the traceContext/traceTask field comment, updateInfoLocked updates
         	// logPrefix and the tracing fields, none of which are saved.
    618  	t.updateInfoLocked()
         	// interruptChan is not saved; recreate it with a one-slot buffer.
    619  	t.interruptChan = make(chan struct{}, 1)
    620  	t.gosched.State = TaskGoroutineNonexistent
         	// stopCount is not saved, and external stops are not retained across
         	// save/restore, so only the "plus 1 if stop is not nil" part of its
         	// definition is rebuilt.
    621  	if t.stop != nil {
    622  		t.stopCount = 1
    623  	}
         	// Re-establish the invariant that endStopCond.L is the signal mutex.
    624  	t.endStopCond.L = &t.tg.signalHandlers.mu
         	// The platform context is not saved; obtain a new one.
    625  	t.p = t.k.Platform.NewContext()
         	// See the rseqPreempted field comment: setting it requests rseq handling
         	// before the next call to p.Switch().
    626  	t.rseqPreempted = true
    627  	t.futexWaiter = futex.NewWaiter()
    628  }
   629  
    630  // copyScratchBufferLen is the length in bytes of Task.copyScratchBuffer, and
         // so the largest size that CopyScratchBuffer serves without allocating.
    631  const copyScratchBufferLen = 144 // sizeof(struct stat)
   632  
   633  // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut
   634  // functions. It must only be used within those functions and can only be used
   635  // by the task goroutine; it exists to improve performance and thus
   636  // intentionally lacks any synchronization.
   637  //
   638  // Callers should pass a constant value as an argument if possible, which will
   639  // allow the compiler to inline and optimize out the if statement below.
   640  func (t *Task) CopyScratchBuffer(size int) []byte {
   641  	if size > copyScratchBufferLen {
   642  		return make([]byte, size)
   643  	}
   644  	return t.copyScratchBuffer[:size]
   645  }
   646  
    647  // FutexWaiter returns the Task's futex.Waiter.
         //
         // Per the futexWaiter field comment, the waiter is used for
         // futex(FUTEX_WAIT) syscalls and is exclusive to the task goroutine.
    648  func (t *Task) FutexWaiter() *futex.Waiter {
    649  	return t.futexWaiter
    650  }
   651  
    652  // Kernel returns the Kernel containing t.
         //
         // The k pointer is immutable (see the Task.k field comment), and it is
         // read here without synchronization.
    653  func (t *Task) Kernel() *Kernel {
    654  	return t.k
    655  }
   656  
    657  // SetClearTID sets t's cleartid.
         //
         // Per the cleartid field comment, a non-zero addr is treated as a pointer
         // to a ThreadID in the task's virtual address space; when the task exits,
         // that ThreadID is set to 0 and any futex waiters are woken.
    658  //
    659  // Preconditions: The caller must be running on the task goroutine.
    660  func (t *Task) SetClearTID(addr hostarch.Addr) {
    661  	t.cleartid = addr
    662  }
   663  
    664  // SetSyscallRestartBlock sets the restart block for use in
    665  // restart_syscall(2). After registering a restart block, a syscall should
    666  // return ERESTART_RESTARTBLOCK to request a restart using the block.
         // Any previously registered block is overwritten.
    667  //
    668  // Precondition: The caller must be running on the task goroutine.
    669  func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) {
    670  	t.syscallRestartBlock = r
    671  }
   672  
   673  // SyscallRestartBlock returns the currently registered restart block for use in
   674  // restart_syscall(2). This function is *not* idempotent and may be called once
   675  // per syscall. This function must not be called if a restart block has not been
   676  // registered for the current syscall.
   677  //
   678  // Precondition: The caller must be running on the task goroutine.
   679  func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
   680  	r := t.syscallRestartBlock
   681  	// Explicitly set the restart block to nil so that a future syscall can't
   682  	// accidentally reuse it.
   683  	t.syscallRestartBlock = nil
   684  	return r
   685  }
   686  
   687  // IsChrooted returns true if the root directory of t's FSContext is not the
   688  // root directory of t's MountNamespace.
   689  //
   690  // Preconditions: The caller must be running on the task goroutine, or t.mu
   691  // must be locked.
   692  func (t *Task) IsChrooted() bool {
   693  	if VFS2Enabled {
   694  		realRoot := t.mountNamespaceVFS2.Root()
   695  		root := t.fsContext.RootDirectoryVFS2()
   696  		defer root.DecRef(t)
   697  		return root != realRoot
   698  	}
   699  
   700  	realRoot := t.tg.mounts.Root()
   701  	defer realRoot.DecRef(t)
   702  	root := t.fsContext.RootDirectory()
   703  	if root != nil {
   704  		defer root.DecRef(t)
   705  	}
   706  	return root != realRoot
   707  }
   708  
   709  // TaskImage returns t's TaskImage.
   710  //
   711  // Precondition: The caller must be running on the task goroutine, or t.mu must
   712  // be locked.
   713  func (t *Task) TaskImage() *TaskImage {
   714  	return &t.image
   715  }
   716  
   717  // FSContext returns t's FSContext. FSContext does not take an additional
   718  // reference on the returned FSContext.
   719  //
   720  // Precondition: The caller must be running on the task goroutine, or t.mu must
   721  // be locked.
   722  func (t *Task) FSContext() *FSContext {
   723  	return t.fsContext
   724  }
   725  
   726  // FDTable returns t's FDTable. FDMTable does not take an additional reference
   727  // on the returned FDMap.
   728  //
   729  // Precondition: The caller must be running on the task goroutine, or t.mu must
   730  // be locked.
   731  func (t *Task) FDTable() *FDTable {
   732  	return t.fdTable
   733  }
   734  
   735  // GetFile is a convenience wrapper for t.FDTable().Get.
   736  //
   737  // Precondition: same as FDTable.Get.
   738  func (t *Task) GetFile(fd int32) *fs.File {
   739  	f, _ := t.fdTable.Get(fd)
   740  	return f
   741  }
   742  
   743  // GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2.
   744  //
   745  // Precondition: same as FDTable.Get.
   746  func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
   747  	f, _ := t.fdTable.GetVFS2(fd)
   748  	return f
   749  }
   750  
   751  // NewFDs is a convenience wrapper for t.FDTable().NewFDs.
   752  //
   753  // This automatically passes the task as the context.
   754  //
   755  // Precondition: same as FDTable.
   756  func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
   757  	return t.fdTable.NewFDs(t, fd, files, flags)
   758  }
   759  
   760  // NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
   761  //
   762  // This automatically passes the task as the context.
   763  //
   764  // Precondition: same as FDTable.
   765  func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
   766  	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
   767  }
   768  
   769  // NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
   770  //
   771  // This automatically passes the task as the context.
   772  //
   773  // Precondition: same as FDTable.
   774  func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) {
   775  	fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags)
   776  	if err != nil {
   777  		return 0, err
   778  	}
   779  	return fds[0], nil
   780  }
   781  
   782  // NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2.
   783  //
   784  // This automatically passes the task as the context.
   785  //
   786  // Precondition: same as FDTable.Get.
   787  func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
   788  	return t.fdTable.NewFDVFS2(t, fd, file, flags)
   789  }
   790  
   791  // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
   792  //
   793  // This automatically passes the task as the context.
   794  //
   795  // Precondition: same as FDTable.
   796  func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
   797  	return t.fdTable.NewFDAt(t, fd, file, flags)
   798  }
   799  
   800  // NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
   801  //
   802  // This automatically passes the task as the context.
   803  //
   804  // Precondition: same as FDTable.
   805  func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
   806  	return t.fdTable.NewFDAtVFS2(t, fd, file, flags)
   807  }
   808  
   809  // WithMuLocked executes f with t.mu locked.
   810  func (t *Task) WithMuLocked(f func(*Task)) {
   811  	t.mu.Lock()
   812  	f(t)
   813  	t.mu.Unlock()
   814  }
   815  
   816  // MountNamespace returns t's MountNamespace. MountNamespace does not take an
   817  // additional reference on the returned MountNamespace.
   818  func (t *Task) MountNamespace() *fs.MountNamespace {
   819  	return t.tg.mounts
   820  }
   821  
   822  // MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the
   823  // returned mount namespace.
   824  func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace {
   825  	t.mu.Lock()
   826  	defer t.mu.Unlock()
   827  	return t.mountNamespaceVFS2
   828  }
   829  
   830  // AbstractSockets returns t's AbstractSocketNamespace.
   831  func (t *Task) AbstractSockets() *AbstractSocketNamespace {
   832  	return t.abstractSockets
   833  }
   834  
   835  // ContainerID returns t's container ID.
   836  func (t *Task) ContainerID() string {
   837  	return t.containerID
   838  }
   839  
   840  // OOMScoreAdj gets the task's thread group's OOM score adjustment.
   841  func (t *Task) OOMScoreAdj() int32 {
   842  	return atomic.LoadInt32(&t.tg.oomScoreAdj)
   843  }
   844  
   845  // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The
   846  // value should be between -1000 and 1000 inclusive.
   847  func (t *Task) SetOOMScoreAdj(adj int32) error {
   848  	if adj > 1000 || adj < -1000 {
   849  		return linuxerr.EINVAL
   850  	}
   851  	atomic.StoreInt32(&t.tg.oomScoreAdj, adj)
   852  	return nil
   853  }
   854  
   855  // KUID returns t's kuid.
   856  func (t *Task) KUID() uint32 {
   857  	return uint32(t.Credentials().EffectiveKUID)
   858  }
   859  
   860  // KGID returns t's kgid.
   861  func (t *Task) KGID() uint32 {
   862  	return uint32(t.Credentials().EffectiveKGID)
   863  }
   864  
   865  // SetKcov sets the kcov instance associated with t.
   866  func (t *Task) SetKcov(k *Kcov) {
   867  	t.kcov = k
   868  }
   869  
   870  // ResetKcov clears the kcov instance associated with t.
   871  func (t *Task) ResetKcov() {
   872  	if t.kcov != nil {
   873  		t.kcov.OnTaskExit()
   874  		t.kcov = nil
   875  	}
   876  }