// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	gocontext "context"
	"runtime/trace"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// TaskOrigin indicates how the task was initially created.
type TaskOrigin int

const (
	// OriginUnknown indicates that the task's creation source is not known
	// (or not important). It is the zero value of TaskOrigin.
	OriginUnknown TaskOrigin = iota

	// OriginExec indicates that the task was created due to an exec request
	// inside a container.
	OriginExec
)

// Task represents a thread of execution in the untrusted app. It
// includes registers and any thread-specific state that you would
// normally expect.
//
// Each task is associated with a goroutine, called the task goroutine, that
// executes code (application code, system calls, etc.) on behalf of that task.
// See Task.run (task_run.go).
//
// All fields that are "owned by the task goroutine" can only be mutated by the
// task goroutine while it is running. The task goroutine does not require
// synchronization to read these fields, although it still requires
// synchronization as described for those fields to mutate them.
//
// All fields that are "exclusive to the task goroutine" can only be accessed
// by the task goroutine while it is running. The task goroutine does not
// require synchronization to read or write these fields.
//
// +stateify savable
type Task struct {
	taskNode

	// goid is the task goroutine's ID. goid is owned by the task goroutine,
	// but since it's used to detect cases where non-task goroutines
	// incorrectly access state owned by, or exclusive to, the task goroutine,
	// goid is always accessed using atomic memory operations.
	goid atomicbitops.Int64 `state:"nosave"`

	// runState is what the task goroutine is executing if it is not stopped.
	// If runState is nil, the task goroutine should exit or has exited.
	// runState is exclusive to the task goroutine.
	runState taskRunState

	// taskWorkCount represents the current size of the task work queue. It is
	// used to avoid acquiring taskWorkMu when the queue is empty.
	taskWorkCount atomicbitops.Int32

	// taskWorkMu protects taskWork.
	taskWorkMu taskWorkMutex `state:"nosave"`

	// taskWork is a queue of work to be executed before resuming user execution.
	// It is similar to the task_work mechanism in Linux.
	//
	// taskWork is exclusive to the task goroutine.
	taskWork []TaskWorker

	// haveSyscallReturn is true if image.Arch().Return() represents a value
	// returned by a syscall (or set by ptrace after a syscall).
	//
	// haveSyscallReturn is exclusive to the task goroutine.
	haveSyscallReturn bool

	// interruptChan is notified whenever the task goroutine is interrupted
	// (usually by a pending signal). interruptChan is effectively a condition
	// variable that can be used in select statements.
	//
	// interruptChan is not saved; because saving interrupts all tasks,
	// interruptChan is always notified after restore (see Task.run).
	interruptChan chan struct{} `state:"nosave"`

	// gosched contains the current scheduling state of the task goroutine.
	//
	// gosched is protected by goschedSeq. gosched is owned by the task
	// goroutine.
	goschedSeq sync.SeqCount `state:"nosave"`
	gosched    TaskGoroutineSchedInfo

	// yieldCount is the number of times the task goroutine has called
	// Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
	// Task.Yield(), voluntarily ceasing execution.
	//
	// yieldCount is accessed using atomic memory operations. yieldCount is
	// owned by the task goroutine.
	yieldCount atomicbitops.Uint64

	// pendingSignals is the set of pending signals that may be handled only by
	// this task.
	//
	// pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
	// (hereafter "the signal mutex"); see comment on
	// ThreadGroup.signalHandlers.
	pendingSignals pendingSignals

	// signalMask is the set of signals whose delivery is currently blocked.
	//
	// signalMask is accessed using atomic memory operations, and is protected
	// by the signal mutex (such that reading signalMask is safe if either the
	// signal mutex is locked or if atomic memory operations are used, while
	// writing signalMask requires both). signalMask is owned by the task
	// goroutine.
	signalMask atomicbitops.Uint64

	// If the task goroutine is currently executing Task.sigtimedwait,
	// realSignalMask is the previous value of signalMask, which has temporarily
	// been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
	//
	// realSignalMask is exclusive to the task goroutine.
	realSignalMask linux.SignalSet

	// If haveSavedSignalMask is true, savedSignalMask is the signal mask that
	// should be applied after the task has either delivered one signal to a
	// user handler or is about to resume execution in the untrusted
	// application.
	//
	// Both haveSavedSignalMask and savedSignalMask are exclusive to the task
	// goroutine.
	haveSavedSignalMask bool
	savedSignalMask     linux.SignalSet

	// signalStack is the alternate signal stack used by signal handlers for
	// which the SA_ONSTACK flag is set.
	//
	// signalStack is exclusive to the task goroutine.
	signalStack linux.SignalStack

	// signalQueue is a set of registered waiters for signal-related events.
	//
	// signalQueue is protected by the signalMutex. Note that the task does
	// not implement all queue methods, specifically the readiness checks.
	// The task only broadcasts a notification on signal delivery.
	signalQueue waiter.Queue

	// If groupStopPending is true, the task should participate in a group
	// stop in the interrupt path.
	//
	// groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
	//
	// groupStopPending is protected by the signal mutex.
	groupStopPending bool

	// If groupStopAcknowledged is true, the task has already acknowledged that
	// it is entering the most recent group stop that has been initiated on its
	// thread group.
	//
	// groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
	//
	// groupStopAcknowledged is protected by the signal mutex.
	groupStopAcknowledged bool

	// If trapStopPending is true, the task goroutine should enter a
	// PTRACE_INTERRUPT-induced stop from the interrupt path.
	//
	// trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
	// Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
	// JOBCTL_STOP_PENDING.
	//
	// trapStopPending is protected by the signal mutex.
	trapStopPending bool

	// If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
	// stop has begun or ended since the last time the task entered a
	// ptrace-stop from the group-stop path.
	//
	// trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
	//
	// trapNotifyPending is protected by the signal mutex.
	trapNotifyPending bool

	// If stop is not nil, it is the internally-initiated condition that
	// currently prevents the task goroutine from running.
	//
	// stop is protected by the signal mutex.
	stop TaskStop

	// stopCount is the number of active external stops (calls to
	// Task.BeginExternalStop that have not been paired with a call to
	// Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
	// non-zero if the task goroutine should stop.
	//
	// Mutating stopCount requires both locking the signal mutex and using
	// atomic memory operations. Reading stopCount requires either locking the
	// signal mutex or using atomic memory operations. This allows Task.doStop
	// to require only a single atomic read in the common case where stopCount
	// is 0.
	//
	// stopCount is not saved, because external stops cannot be retained across
	// a save/restore cycle. (Suppose a sentryctl command issues an external
	// stop; after a save/restore cycle, the restored sentry has no knowledge
	// of the pre-save sentryctl command, and the stopped task would remain
	// stopped forever.)
	stopCount atomicbitops.Int32 `state:"nosave"`

	// endStopCond is signaled when stopCount transitions to 0. The combination
	// of stopCount and endStopCond effectively form a sync.WaitGroup, but
	// WaitGroup provides no way to read its counter value.
	//
	// Invariant: endStopCond.L is the signal mutex. (This is not racy because
	// sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
	// calls sync.Cond.Wait; and only the task goroutine can change the
	// identity of the signal mutex, in Task.finishExec.)
	endStopCond sync.Cond `state:"nosave"`

	// exitStatus is the task's exit status.
	//
	// exitStatus is protected by the signal mutex.
	exitStatus linux.WaitStatus

	// syscallRestartBlock represents a custom restart function to run in
	// restart_syscall(2) to resume an interrupted syscall.
	//
	// syscallRestartBlock is exclusive to the task goroutine.
	syscallRestartBlock SyscallRestartBlock

	// p provides the mechanism by which the task runs code in userspace. The p
	// interface object is immutable.
	p platform.Context `state:"nosave"`

	// k is the Kernel that this task belongs to. The k pointer is immutable.
	k *Kernel

	// containerID has no equivalent in Linux; it's used by runsc to track all
	// tasks that belong to a given container since cgroups aren't implemented.
	// It's inherited by the children, is immutable, and may be empty.
	//
	// NOTE: cgroups can be used to track this when implemented.
	containerID string

	// mu protects some of the following fields.
	mu taskMutex `state:"nosave"`

	// image holds task data provided by the ELF loader.
	//
	// image is protected by mu, and is owned by the task goroutine.
	image TaskImage

	// fsContext is the task's filesystem context.
	//
	// fsContext is protected by mu, and is owned by the task goroutine.
	fsContext *FSContext

	// fdTable is the task's file descriptor table.
	//
	// fdTable is protected by mu, and is owned by the task goroutine.
	fdTable *FDTable

	// If vforkParent is not nil, it is the task that created this task with
	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
	// this TaskImage is released.
	//
	// vforkParent is protected by the TaskSet mutex.
	vforkParent *Task

	// exitState is the task's progress through the exit path.
	//
	// exitState is protected by the TaskSet mutex. exitState is owned by the
	// task goroutine.
	exitState TaskExitState

	// exitTracerNotified is true if the exit path has either signaled the
	// task's tracer to indicate the exit, or determined that no such signal is
	// needed. exitTracerNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitTracerNotified is protected by the TaskSet mutex.
	exitTracerNotified bool

	// exitTracerAcked is true if exitTracerNotified is true and either the
	// task's tracer has acknowledged the exit notification, or the exit path
	// has determined that no such notification is needed.
	//
	// exitTracerAcked is protected by the TaskSet mutex.
	exitTracerAcked bool

	// exitParentNotified is true if the exit path has either signaled the
	// task's parent to indicate the exit, or determined that no such signal is
	// needed. exitParentNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitParentNotified is protected by the TaskSet mutex.
	exitParentNotified bool

	// exitParentAcked is true if exitParentNotified is true and either the
	// task's parent has acknowledged the exit notification, or the exit path
	// has determined that no such acknowledgment is needed.
	//
	// exitParentAcked is protected by the TaskSet mutex.
	exitParentAcked bool

	// goroutineStopped is a WaitGroup whose counter value is 1 when the task
	// goroutine is running and 0 when the task goroutine is stopped or has
	// exited.
	goroutineStopped sync.WaitGroup `state:"nosave"`

	// ptraceTracer is the task that is ptrace-attached to this one. If
	// ptraceTracer is nil, this task is not being traced.
	//
	// ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
	// operations. This allows paths that wouldn't otherwise lock the TaskSet
	// mutex, notably the syscall path, to check if ptraceTracer is nil without
	// additional synchronization.
	ptraceTracer atomic.Pointer[Task] `state:".(*Task)"`

	// ptraceTracees is the set of tasks that this task is ptrace-attached to.
	//
	// ptraceTracees is protected by the TaskSet mutex.
	ptraceTracees map[*Task]struct{}

	// ptraceSeized is true if ptraceTracer attached to this task with
	// PTRACE_SEIZE.
	//
	// ptraceSeized is protected by the TaskSet mutex.
	ptraceSeized bool

	// ptraceOpts contains ptrace options explicitly set by the tracer. If
	// ptraceTracer is nil, ptraceOpts is expected to be the zero value.
	//
	// ptraceOpts is protected by the TaskSet mutex.
	ptraceOpts ptraceOptions

	// ptraceSyscallMode controls ptrace behavior around syscall entry and
	// exit.
	//
	// ptraceSyscallMode is protected by the TaskSet mutex.
	ptraceSyscallMode ptraceSyscallMode

	// If ptraceSinglestep is true, the next time the task executes application
	// code, single-stepping should be enabled. ptraceSinglestep is stored
	// independently of the architecture-specific trap flag because tracer
	// detaching (which can happen concurrently with the tracee's execution if
	// the tracer exits) must disable single-stepping, and the task's
	// architectural state is implicitly exclusive to the task goroutine (no
	// synchronization occurs before passing registers to SwitchToApp).
	//
	// ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
	//
	// ptraceSinglestep is protected by the TaskSet mutex.
	ptraceSinglestep bool

	// If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
	// time that t entered the ptrace stop, reset to 0 when the tracer
	// acknowledges the stop with a wait*() syscall. Otherwise, it is the
	// signal number passed to the ptrace operation that ended the last ptrace
	// stop on this task. In the latter case, the effect of ptraceCode depends
	// on the nature of the ptrace stop; signal-delivery-stop uses it to
	// conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
	// signal to the task after leaving the stop, and PTRACE_EVENT stops and
	// traced group stops ignore it entirely.
	//
	// Linux contextually stores the equivalent of ptraceCode in
	// task_struct::exit_code.
	//
	// ptraceCode is protected by the TaskSet mutex.
	ptraceCode int32

	// ptraceSiginfo is the value returned to the tracer by
	// ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
	// (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
	// ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
	// required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
	// is in turn required to distinguish group stops from other ptrace stops,
	// per subsection "Group-stop" in ptrace(2)).
	//
	// ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
	//
	// ptraceSiginfo is protected by the TaskSet mutex.
	ptraceSiginfo *linux.SignalInfo

	// ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
	// the tracer by ptrace(PTRACE_GETEVENTMSG).
	//
	// ptraceEventMsg is protected by the TaskSet mutex.
	ptraceEventMsg uint64

	// ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
	// been added before. This is used during task exit to decide whether we need
	// to clean up YAMA exceptions.
	//
	// ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
	ptraceYAMAExceptionAdded bool

	// The struct that holds the IO-related usage. The ioUsage pointer is
	// immutable.
	ioUsage *usage.IO

	// logPrefix is a string containing the task's thread ID in the root PID
	// namespace, and is prepended to log messages emitted by Task.Infof etc.
	logPrefix atomic.Pointer[string] `state:"nosave"`

	// traceContext and traceTask are both used for tracing, and are
	// updated along with the logPrefix in updateInfoLocked.
	//
	// These are exclusive to the task goroutine.
	traceContext gocontext.Context `state:"nosave"`
	traceTask    *trace.Task       `state:"nosave"`

	// creds is the task's credentials.
	//
	// creds.Load() may be called without synchronization. creds.Store() is
	// serialized by mu. creds is owned by the task goroutine. All
	// auth.Credentials objects that creds may point to, or have pointed to
	// in the past, must be treated as immutable.
	creds auth.AtomicPtrCredentials

	// utsns is the task's UTS namespace.
	//
	// utsns is protected by mu. utsns is owned by the task goroutine.
	utsns *UTSNamespace

	// ipcns is the task's IPC namespace.
	//
	// ipcns is protected by mu. ipcns is owned by the task goroutine.
	ipcns *IPCNamespace

	// mountNamespace is the task's mount namespace.
	//
	// It is protected by mu. It is owned by the task goroutine.
	mountNamespace *vfs.MountNamespace

	// parentDeathSignal is sent to this task's thread group when its parent exits.
	//
	// parentDeathSignal is protected by mu.
	parentDeathSignal linux.Signal

	// seccomp contains all seccomp-bpf syscall filters applicable to the task.
	// The type of the atomic is *taskSeccomp.
	// Writing needs to be protected by the signal mutex.
	//
	// seccomp is owned by the task goroutine.
	seccomp atomic.Pointer[taskSeccomp] `state:".(*taskSeccomp)"`

	// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
	// task's virtual address space; when the task exits, set the pointed-to
	// ThreadID to 0, and wake any futex waiters.
	//
	// cleartid is exclusive to the task goroutine.
	cleartid hostarch.Addr

	// This is mostly a fake cpumask just for sched_set/getaffinity as we
	// don't really control the affinity.
	//
	// Invariant: allowedCPUMask.Size() ==
	// sched.CPUMaskSize(Kernel.applicationCores).
	//
	// allowedCPUMask is protected by mu.
	allowedCPUMask sched.CPUSet

	// cpu is the fake cpu number returned by getcpu(2). cpu is ignored
	// entirely if Kernel.useHostCores is true.
	cpu atomicbitops.Int32

	// This is used to keep track of changes made to a process' priority/niceness.
	// It is mostly used to provide some reasonable return value from
	// getpriority(2) after a call to setpriority(2) has been made.
	// We currently do not actually modify a process' scheduling priority.
	// NOTE: This represents the userspace view of priority (nice).
	// This means that the value should be in the range [-20, 19].
	//
	// niceness is protected by mu.
	niceness int

	// This is used to track the numa policy for the current thread. This can be
	// modified through a set_mempolicy(2) syscall. Since we always report a
	// single numa node, all policies are no-ops. We only track this information
	// so that we can return reasonable values if the application calls
	// get_mempolicy(2) after setting a non-default policy. Note that in the
	// real syscall, nodemask can be longer than a single unsigned long, but we
	// always report a single node so never need to save more than a single
	// bit.
	//
	// numaPolicy and numaNodeMask are protected by mu.
	numaPolicy   linux.NumaPolicy
	numaNodeMask uint64

	// netns is the task's network namespace. It has to be changed under mu
	// so that GetNetworkNamespace can take a reference before it is
	// released. It is changed only from the task goroutine.
	netns *inet.Namespace

	// If rseqPreempted is true, before the next call to p.Switch(),
	// interrupt rseq critical regions as defined by rseqAddr and
	// tg.oldRSeqCritical and write the task goroutine's CPU number to
	// rseqAddr/oldRSeqCPUAddr.
	//
	// We support two ABIs for restartable sequences:
	//
	//  1. The upstream interface added in v4.18,
	//  2. An "old" interface never merged upstream. In the implementation,
	//     this is referred to as "old rseq".
	//
	// rseqPreempted is exclusive to the task goroutine.
	rseqPreempted bool `state:"nosave"`

	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
	//
	// If rseq is unused, rseqCPU is -1 for convenient use in
	// platform.Context.Switch.
	//
	// rseqCPU is exclusive to the task goroutine.
	rseqCPU int32

	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
	//
	// oldRSeqCPUAddr is exclusive to the task goroutine.
	oldRSeqCPUAddr hostarch.Addr

	// rseqAddr is a pointer to the userspace linux.RSeq structure.
	//
	// rseqAddr is exclusive to the task goroutine.
	rseqAddr hostarch.Addr

	// rseqSignature is the signature that the rseq abort IP must be signed
	// with.
	//
	// rseqSignature is exclusive to the task goroutine.
	rseqSignature uint32

	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
	// implementations that require an intermediate buffer to copy data
	// into/out of. It prevents these buffers from being allocated/zeroed in
	// each syscall and eventually garbage collected.
	//
	// copyScratchBuffer is exclusive to the task goroutine.
	copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`

	// blockingTimer is used for blocking timeouts. blockingTimerChan is the
	// channel that is sent to when blockingTimer fires.
	//
	// blockingTimer is exclusive to the task goroutine.
	blockingTimer     *ktime.Timer    `state:"nosave"`
	blockingTimerChan <-chan struct{} `state:"nosave"`

	// futexWaiter is used for futex(FUTEX_WAIT) syscalls.
	//
	// futexWaiter is exclusive to the task goroutine.
	futexWaiter *futex.Waiter `state:"nosave"`

	// robustList is a pointer to the head of the task's robust futex
	// list.
	robustList hostarch.Addr

	// startTime is the real time at which the task started. It is set when
	// a Task is created or invokes execve(2).
	//
	// startTime is protected by mu.
	startTime ktime.Time

	// kcov is the kcov instance providing code coverage owned by this task.
	//
	// kcov is exclusive to the task goroutine.
	kcov *Kcov

	// cgroups is the set of cgroups this task belongs to. This may be empty if
	// no cgroup controllers are enabled. Protected by mu.
	//
	// +checklocks:mu
	cgroups map[Cgroup]struct{}

	// memCgID is the memory cgroup id.
	memCgID atomicbitops.Uint32

	// userCounters is a pointer to a set of user counters.
	//
	// The userCounters pointer is exclusive to the task goroutine, but the
	// userCounters instance must be atomically accessed.
	userCounters *UserCounters

	// sessionKeyring is a pointer to the task's session keyring, if set.
	// It is guaranteed to be of type "keyring".
	//
	// +checklocks:mu
	sessionKeyring *auth.Key

	// Origin is the origin of the task.
	Origin TaskOrigin
}

// Task related metrics
var (
	// syscallCounter is a metric that tracks how many syscalls the sentry has
	// executed.
	syscallCounter = metric.SentryProfiling.MustCreateNewUint64Metric(
		"/task/syscalls", false, "The number of syscalls the sentry has executed for the user.")

	// faultCounter is a metric that tracks how many faults the sentry has had to
	// handle.
	faultCounter = metric.SentryProfiling.MustCreateNewUint64Metric(
		"/task/faults", false, "The number of faults the sentry has handled.")
)

// savePtraceTracer returns the task currently stored in t.ptraceTracer.
//
// NOTE(review): presumably the stateify save hook selected by the
// `state:".(*Task)"` tag on Task.ptraceTracer -- confirm against the
// generated state code.
func (t *Task) savePtraceTracer() *Task {
	return t.ptraceTracer.Load()
}

// loadPtraceTracer stores tracer into t.ptraceTracer. The context argument is
// unused.
//
// NOTE(review): presumably the stateify load hook paired with
// savePtraceTracer -- confirm against the generated state code.
func (t *Task) loadPtraceTracer(_ gocontext.Context, tracer *Task) {
	t.ptraceTracer.Store(tracer)
}

// saveSeccomp returns the *taskSeccomp currently stored in t.seccomp.
//
// NOTE(review): presumably the stateify save hook selected by the
// `state:".(*taskSeccomp)"` tag on Task.seccomp -- confirm against the
// generated state code.
func (t *Task) saveSeccomp() *taskSeccomp {
	return t.seccomp.Load()
}

// loadSeccomp stores seccompData into t.seccomp. The context argument is
// unused.
//
// NOTE(review): presumably the stateify load hook paired with saveSeccomp --
// confirm against the generated state code.
func (t *Task) loadSeccomp(_ gocontext.Context, seccompData *taskSeccomp) {
	t.seccomp.Store(seccompData)
}

// afterLoad is invoked by stateify.
//
// It re-establishes Task state that is not carried in the saved image; most
// of the fields assigned below are tagged `state:"nosave"` in the Task
// struct.
func (t *Task) afterLoad(gocontext.Context) {
	t.updateInfoLocked()
	// Only tasks that actually have seccomp state need their cache populated.
	if ts := t.seccomp.Load(); ts != nil {
		ts.populateCache(t)
	}
	// interruptChan is not saved, so a fresh channel (buffered, capacity 1) is
	// created here.
	t.interruptChan = make(chan struct{}, 1)
	t.gosched.State = TaskGoroutineNonexistent
	// stopCount is not saved. Per its field comment it counts external stops
	// plus 1 if stop is not nil; external stops do not survive save/restore,
	// so only the internal stop, if any, is counted.
	if t.stop != nil {
		t.stopCount = atomicbitops.FromInt32(1)
	}
	// Restore the invariant that endStopCond.L is the signal mutex.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.rseqPreempted = true
	t.futexWaiter = futex.NewWaiter()
	t.p = t.k.Platform.NewContext(t.AsyncContext())
}

// copyScratchBufferLen is the length of Task.copyScratchBuffer.
662 const copyScratchBufferLen = 144 // sizeof(struct stat) 663 664 // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut 665 // functions. It must only be used within those functions and can only be used 666 // by the task goroutine; it exists to improve performance and thus 667 // intentionally lacks any synchronization. 668 // 669 // Callers should pass a constant value as an argument if possible, which will 670 // allow the compiler to inline and optimize out the if statement below. 671 func (t *Task) CopyScratchBuffer(size int) []byte { 672 if size > copyScratchBufferLen { 673 return make([]byte, size) 674 } 675 return t.copyScratchBuffer[:size] 676 } 677 678 // FutexWaiter returns the Task's futex.Waiter. 679 func (t *Task) FutexWaiter() *futex.Waiter { 680 return t.futexWaiter 681 } 682 683 // Kernel returns the Kernel containing t. 684 func (t *Task) Kernel() *Kernel { 685 return t.k 686 } 687 688 // SetClearTID sets t's cleartid. 689 // 690 // Preconditions: The caller must be running on the task goroutine. 691 func (t *Task) SetClearTID(addr hostarch.Addr) { 692 t.cleartid = addr 693 } 694 695 // SetSyscallRestartBlock sets the restart block for use in 696 // restart_syscall(2). After registering a restart block, a syscall should 697 // return ERESTART_RESTARTBLOCK to request a restart using the block. 698 // 699 // Precondition: The caller must be running on the task goroutine. 700 func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { 701 t.syscallRestartBlock = r 702 } 703 704 // SyscallRestartBlock returns the currently registered restart block for use in 705 // restart_syscall(2). This function is *not* idempotent and may be called once 706 // per syscall. This function must not be called if a restart block has not been 707 // registered for the current syscall. 708 // 709 // Precondition: The caller must be running on the task goroutine. 
710 func (t *Task) SyscallRestartBlock() SyscallRestartBlock { 711 r := t.syscallRestartBlock 712 // Explicitly set the restart block to nil so that a future syscall can't 713 // accidentally reuse it. 714 t.syscallRestartBlock = nil 715 return r 716 } 717 718 // IsChrooted returns true if the root directory of t's FSContext is not the 719 // root directory of t's MountNamespace. 720 // 721 // Preconditions: The caller must be running on the task goroutine, or t.mu 722 // must be locked. 723 func (t *Task) IsChrooted() bool { 724 realRoot := t.mountNamespace.Root(t) 725 defer realRoot.DecRef(t) 726 root := t.fsContext.RootDirectory() 727 defer root.DecRef(t) 728 return root != realRoot 729 } 730 731 // TaskImage returns t's TaskImage. 732 // 733 // Precondition: The caller must be running on the task goroutine, or t.mu must 734 // be locked. 735 func (t *Task) TaskImage() *TaskImage { 736 return &t.image 737 } 738 739 // FSContext returns t's FSContext. FSContext does not take an additional 740 // reference on the returned FSContext. 741 // 742 // Precondition: The caller must be running on the task goroutine, or t.mu must 743 // be locked. 744 func (t *Task) FSContext() *FSContext { 745 return t.fsContext 746 } 747 748 // FDTable returns t's FDTable. FDMTable does not take an additional reference 749 // on the returned FDMap. 750 // 751 // Precondition: The caller must be running on the task goroutine, or t.mu must 752 // be locked. 753 func (t *Task) FDTable() *FDTable { 754 return t.fdTable 755 } 756 757 // GetFile is a convenience wrapper for t.FDTable().Get. 758 // 759 // Precondition: same as FDTable.Get. 760 func (t *Task) GetFile(fd int32) *vfs.FileDescription { 761 f, _ := t.fdTable.Get(fd) 762 return f 763 } 764 765 // NewFDs is a convenience wrapper for t.FDTable().NewFDs. 766 // 767 // This automatically passes the task as the context. 768 // 769 // Precondition: same as FDTable. 
770 func (t *Task) NewFDs(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { 771 return t.fdTable.NewFDs(t, fd, files, flags) 772 } 773 774 // NewFDFrom is a convenience wrapper for t.FDTable().NewFD. 775 // 776 // This automatically passes the task as the context. 777 // 778 // Precondition: same as FDTable.Get. 779 func (t *Task) NewFDFrom(minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { 780 return t.fdTable.NewFD(t, minFD, file, flags) 781 } 782 783 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. 784 // 785 // This automatically passes the task as the context. 786 // 787 // Precondition: same as FDTable. 788 func (t *Task) NewFDAt(fd int32, file *vfs.FileDescription, flags FDFlags) (*vfs.FileDescription, error) { 789 return t.fdTable.NewFDAt(t, fd, file, flags) 790 } 791 792 // WithMuLocked executes f with t.mu locked. 793 func (t *Task) WithMuLocked(f func(*Task)) { 794 t.mu.Lock() 795 f(t) 796 t.mu.Unlock() 797 } 798 799 // MountNamespace returns t's MountNamespace. 800 func (t *Task) MountNamespace() *vfs.MountNamespace { 801 t.mu.Lock() 802 defer t.mu.Unlock() 803 return t.mountNamespace 804 } 805 806 // GetMountNamespace returns t's MountNamespace. A reference is taken on the 807 // returned mount namespace. 808 func (t *Task) GetMountNamespace() *vfs.MountNamespace { 809 t.mu.Lock() 810 defer t.mu.Unlock() 811 mntns := t.mountNamespace 812 if mntns != nil { 813 mntns.IncRef() 814 } 815 return mntns 816 } 817 818 // ContainerID returns t's container ID. 819 func (t *Task) ContainerID() string { 820 return t.containerID 821 } 822 823 // RestoreContainerID sets t's container ID in case the restored container ID 824 // is different from when it was saved. 825 func (t *Task) RestoreContainerID(cid string) { 826 t.containerID = cid 827 } 828 829 // OOMScoreAdj gets the task's thread group's OOM score adjustment. 
830 func (t *Task) OOMScoreAdj() int32 { 831 return t.tg.oomScoreAdj.Load() 832 } 833 834 // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The 835 // value should be between -1000 and 1000 inclusive. 836 func (t *Task) SetOOMScoreAdj(adj int32) error { 837 if adj > 1000 || adj < -1000 { 838 return linuxerr.EINVAL 839 } 840 t.tg.oomScoreAdj.Store(adj) 841 return nil 842 } 843 844 // KUID returns t's kuid. 845 func (t *Task) KUID() uint32 { 846 return uint32(t.Credentials().EffectiveKUID) 847 } 848 849 // KGID returns t's kgid. 850 func (t *Task) KGID() uint32 { 851 return uint32(t.Credentials().EffectiveKGID) 852 } 853 854 // SetKcov sets the kcov instance associated with t. 855 func (t *Task) SetKcov(k *Kcov) { 856 t.kcov = k 857 } 858 859 // ResetKcov clears the kcov instance associated with t. 860 func (t *Task) ResetKcov() { 861 if t.kcov != nil { 862 t.kcov.OnTaskExit() 863 t.kcov = nil 864 } 865 }