github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/task.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 gocontext "context" 19 "runtime/trace" 20 "sync/atomic" 21 22 "github.com/metacubex/gvisor/pkg/abi/linux" 23 "github.com/metacubex/gvisor/pkg/atomicbitops" 24 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 25 "github.com/metacubex/gvisor/pkg/hostarch" 26 "github.com/metacubex/gvisor/pkg/metric" 27 "github.com/metacubex/gvisor/pkg/sentry/inet" 28 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 29 "github.com/metacubex/gvisor/pkg/sentry/kernel/futex" 30 "github.com/metacubex/gvisor/pkg/sentry/kernel/sched" 31 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 32 "github.com/metacubex/gvisor/pkg/sentry/platform" 33 "github.com/metacubex/gvisor/pkg/sentry/usage" 34 "github.com/metacubex/gvisor/pkg/sentry/vfs" 35 "github.com/metacubex/gvisor/pkg/sync" 36 "github.com/metacubex/gvisor/pkg/waiter" 37 ) 38 39 // Task represents a thread of execution in the untrusted app. It 40 // includes registers and any thread-specific state that you would 41 // normally expect. 42 // 43 // Each task is associated with a goroutine, called the task goroutine, that 44 // executes code (application code, system calls, etc.) on behalf of that task. 45 // See Task.run (task_run.go). 
//
// All fields that are "owned by the task goroutine" can only be mutated by the
// task goroutine while it is running. The task goroutine does not require
// synchronization to read these fields, although it still requires
// synchronization as described for those fields to mutate them.
//
// All fields that are "exclusive to the task goroutine" can only be accessed
// by the task goroutine while it is running. The task goroutine does not
// require synchronization to read or write these fields.
//
// +stateify savable
type Task struct {
	taskNode

	// goid is the task goroutine's ID. goid is owned by the task goroutine,
	// but since it's used to detect cases where non-task goroutines
	// incorrectly access state owned by, or exclusive to, the task goroutine,
	// goid is always accessed using atomic memory operations.
	goid atomicbitops.Int64 `state:"nosave"`

	// runState is what the task goroutine is executing if it is not stopped.
	// If runState is nil, the task goroutine should exit or has exited.
	// runState is exclusive to the task goroutine.
	runState taskRunState

	// taskWorkCount represents the current size of the task work queue. It is
	// used to avoid acquiring taskWorkMu when the queue is empty.
	taskWorkCount atomicbitops.Int32

	// taskWorkMu protects taskWork.
	taskWorkMu taskWorkMutex `state:"nosave"`

	// taskWork is a queue of work to be executed before resuming user execution.
	// It is similar to the task_work mechanism in Linux.
	//
	// taskWork is exclusive to the task goroutine.
	taskWork []TaskWorker

	// haveSyscallReturn is true if image.Arch().Return() represents a value
	// returned by a syscall (or set by ptrace after a syscall).
	//
	// haveSyscallReturn is exclusive to the task goroutine.
	haveSyscallReturn bool

	// interruptChan is notified whenever the task goroutine is interrupted
	// (usually by a pending signal). interruptChan is effectively a condition
	// variable that can be used in select statements.
	//
	// interruptChan is not saved; because saving interrupts all tasks,
	// interruptChan is always notified after restore (see Task.run).
	interruptChan chan struct{} `state:"nosave"`

	// gosched contains the current scheduling state of the task goroutine.
	//
	// gosched is protected by goschedSeq. gosched is owned by the task
	// goroutine.
	goschedSeq sync.SeqCount `state:"nosave"`
	gosched    TaskGoroutineSchedInfo

	// yieldCount is the number of times the task goroutine has called
	// Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
	// Task.Yield(), voluntarily ceasing execution.
	//
	// yieldCount is accessed using atomic memory operations. yieldCount is
	// owned by the task goroutine.
	yieldCount atomicbitops.Uint64

	// pendingSignals is the set of pending signals that may be handled only by
	// this task.
	//
	// pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
	// (hereafter "the signal mutex"); see comment on
	// ThreadGroup.signalHandlers.
	pendingSignals pendingSignals

	// signalMask is the set of signals whose delivery is currently blocked.
	//
	// signalMask is accessed using atomic memory operations, and is protected
	// by the signal mutex (such that reading signalMask is safe if either the
	// signal mutex is locked or if atomic memory operations are used, while
	// writing signalMask requires both). signalMask is owned by the task
	// goroutine.
	signalMask atomicbitops.Uint64

	// If the task goroutine is currently executing Task.sigtimedwait,
	// realSignalMask is the previous value of signalMask, which has temporarily
	// been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
	//
	// realSignalMask is exclusive to the task goroutine.
	realSignalMask linux.SignalSet

	// If haveSavedSignalMask is true, savedSignalMask is the signal mask that
	// should be applied after the task has either delivered one signal to a
	// user handler or is about to resume execution in the untrusted
	// application.
	//
	// Both haveSavedSignalMask and savedSignalMask are exclusive to the task
	// goroutine.
	haveSavedSignalMask bool
	savedSignalMask     linux.SignalSet

	// signalStack is the alternate signal stack used by signal handlers for
	// which the SA_ONSTACK flag is set.
	//
	// signalStack is exclusive to the task goroutine.
	signalStack linux.SignalStack

	// signalQueue is a set of registered waiters for signal-related events.
	//
	// signalQueue is protected by the signal mutex. Note that the task does
	// not implement all queue methods, specifically the readiness checks.
	// The task only broadcasts a notification on signal delivery.
	signalQueue waiter.Queue

	// If groupStopPending is true, the task should participate in a group
	// stop in the interrupt path.
	//
	// groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
	//
	// groupStopPending is protected by the signal mutex.
	groupStopPending bool

	// If groupStopAcknowledged is true, the task has already acknowledged that
	// it is entering the most recent group stop that has been initiated on its
	// thread group.
	//
	// groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
	//
	// groupStopAcknowledged is protected by the signal mutex.
	groupStopAcknowledged bool

	// If trapStopPending is true, the task goroutine should enter a
	// PTRACE_INTERRUPT-induced stop from the interrupt path.
	//
	// trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
	// Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
	// JOBCTL_STOP_PENDING.
	//
	// trapStopPending is protected by the signal mutex.
	trapStopPending bool

	// If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
	// stop has begun or ended since the last time the task entered a
	// ptrace-stop from the group-stop path.
	//
	// trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
	//
	// trapNotifyPending is protected by the signal mutex.
	trapNotifyPending bool

	// If stop is not nil, it is the internally-initiated condition that
	// currently prevents the task goroutine from running.
	//
	// stop is protected by the signal mutex.
	stop TaskStop

	// stopCount is the number of active external stops (calls to
	// Task.BeginExternalStop that have not been paired with a call to
	// Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
	// non-zero if the task goroutine should stop.
	//
	// Mutating stopCount requires both locking the signal mutex and using
	// atomic memory operations. Reading stopCount requires either locking the
	// signal mutex or using atomic memory operations. This allows Task.doStop
	// to require only a single atomic read in the common case where stopCount
	// is 0.
	//
	// stopCount is not saved, because external stops cannot be retained across
	// a save/restore cycle. (Suppose a sentryctl command issues an external
	// stop; after a save/restore cycle, the restored sentry has no knowledge
	// of the pre-save sentryctl command, and the stopped task would remain
	// stopped forever.)
	stopCount atomicbitops.Int32 `state:"nosave"`

	// endStopCond is signaled when stopCount transitions to 0. The combination
	// of stopCount and endStopCond effectively forms a sync.WaitGroup, but
	// WaitGroup provides no way to read its counter value.
	//
	// Invariant: endStopCond.L is the signal mutex. (This is not racy because
	// sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
	// calls sync.Cond.Wait; and only the task goroutine can change the
	// identity of the signal mutex, in Task.finishExec.)
	endStopCond sync.Cond `state:"nosave"`

	// exitStatus is the task's exit status.
	//
	// exitStatus is protected by the signal mutex.
	exitStatus linux.WaitStatus

	// syscallRestartBlock represents a custom restart function to run in
	// restart_syscall(2) to resume an interrupted syscall.
	//
	// syscallRestartBlock is exclusive to the task goroutine.
	syscallRestartBlock SyscallRestartBlock

	// p provides the mechanism by which the task runs code in userspace. The p
	// interface object is immutable.
	p platform.Context `state:"nosave"`

	// k is the Kernel that this task belongs to. The k pointer is immutable.
	k *Kernel

	// containerID has no equivalent in Linux; it's used by runsc to track all
	// tasks that belong to a given container since cgroups aren't implemented.
	// It's inherited by the children, is immutable, and may be empty.
	//
	// NOTE: cgroups can be used to track this when implemented.
	containerID string

	// mu protects some of the following fields.
	mu taskMutex `state:"nosave"`

	// image holds task data provided by the ELF loader.
	//
	// image is protected by mu, and is owned by the task goroutine.
	image TaskImage

	// fsContext is the task's filesystem context.
	//
	// fsContext is protected by mu, and is owned by the task goroutine.
	fsContext *FSContext

	// fdTable is the task's file descriptor table.
	//
	// fdTable is protected by mu, and is owned by the task goroutine.
	fdTable *FDTable

	// If vforkParent is not nil, it is the task that created this task with
	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
	// this TaskImage is released.
	//
	// vforkParent is protected by the TaskSet mutex.
	vforkParent *Task

	// exitState is the task's progress through the exit path.
	//
	// exitState is protected by the TaskSet mutex. exitState is owned by the
	// task goroutine.
	exitState TaskExitState

	// exitTracerNotified is true if the exit path has either signaled the
	// task's tracer to indicate the exit, or determined that no such signal is
	// needed. exitTracerNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitTracerNotified is protected by the TaskSet mutex.
	exitTracerNotified bool

	// exitTracerAcked is true if exitTracerNotified is true and either the
	// task's tracer has acknowledged the exit notification, or the exit path
	// has determined that no such notification is needed.
	//
	// exitTracerAcked is protected by the TaskSet mutex.
	exitTracerAcked bool

	// exitParentNotified is true if the exit path has either signaled the
	// task's parent to indicate the exit, or determined that no such signal is
	// needed. exitParentNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitParentNotified is protected by the TaskSet mutex.
	exitParentNotified bool

	// exitParentAcked is true if exitParentNotified is true and either the
	// task's parent has acknowledged the exit notification, or the exit path
	// has determined that no such acknowledgment is needed.
	//
	// exitParentAcked is protected by the TaskSet mutex.
	exitParentAcked bool

	// goroutineStopped is a WaitGroup whose counter value is 1 when the task
	// goroutine is running and 0 when the task goroutine is stopped or has
	// exited.
	goroutineStopped sync.WaitGroup `state:"nosave"`

	// ptraceTracer is the task that is ptrace-attached to this one. If
	// ptraceTracer is nil, this task is not being traced.
	//
	// ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
	// operations. This allows paths that wouldn't otherwise lock the TaskSet
	// mutex, notably the syscall path, to check if ptraceTracer is nil without
	// additional synchronization.
	ptraceTracer atomic.Pointer[Task] `state:".(*Task)"`

	// ptraceTracees is the set of tasks that this task is ptrace-attached to.
	//
	// ptraceTracees is protected by the TaskSet mutex.
	ptraceTracees map[*Task]struct{}

	// ptraceSeized is true if ptraceTracer attached to this task with
	// PTRACE_SEIZE.
	//
	// ptraceSeized is protected by the TaskSet mutex.
	ptraceSeized bool

	// ptraceOpts contains ptrace options explicitly set by the tracer. If
	// ptraceTracer is nil, ptraceOpts is expected to be the zero value.
	//
	// ptraceOpts is protected by the TaskSet mutex.
	ptraceOpts ptraceOptions

	// ptraceSyscallMode controls ptrace behavior around syscall entry and
	// exit.
	//
	// ptraceSyscallMode is protected by the TaskSet mutex.
	ptraceSyscallMode ptraceSyscallMode

	// If ptraceSinglestep is true, the next time the task executes application
	// code, single-stepping should be enabled. ptraceSinglestep is stored
	// independently of the architecture-specific trap flag because tracer
	// detaching (which can happen concurrently with the tracee's execution if
	// the tracer exits) must disable single-stepping, and the task's
	// architectural state is implicitly exclusive to the task goroutine (no
	// synchronization occurs before passing registers to SwitchToApp).
	//
	// ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
	//
	// ptraceSinglestep is protected by the TaskSet mutex.
	ptraceSinglestep bool

	// If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
	// time that t entered the ptrace stop, reset to 0 when the tracer
	// acknowledges the stop with a wait*() syscall. Otherwise, it is the
	// signal number passed to the ptrace operation that ended the last ptrace
	// stop on this task. In the latter case, the effect of ptraceCode depends
	// on the nature of the ptrace stop; signal-delivery-stop uses it to
	// conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
	// signal to the task after leaving the stop, and PTRACE_EVENT stops and
	// traced group stops ignore it entirely.
	//
	// Linux contextually stores the equivalent of ptraceCode in
	// task_struct::exit_code.
	//
	// ptraceCode is protected by the TaskSet mutex.
	ptraceCode int32

	// ptraceSiginfo is the value returned to the tracer by
	// ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
	// (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
	// ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
	// required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
	// is in turn required to distinguish group stops from other ptrace stops,
	// per subsection "Group-stop" in ptrace(2)).
	//
	// ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
	//
	// ptraceSiginfo is protected by the TaskSet mutex.
	ptraceSiginfo *linux.SignalInfo

	// ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
	// the tracer by ptrace(PTRACE_GETEVENTMSG).
	//
	// ptraceEventMsg is protected by the TaskSet mutex.
	ptraceEventMsg uint64

	// ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
	// been added before. This is used during task exit to decide whether we need
	// to clean up YAMA exceptions.
	//
	// ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
	ptraceYAMAExceptionAdded bool

	// The struct that holds the IO-related usage. The ioUsage pointer is
	// immutable.
	ioUsage *usage.IO

	// logPrefix is a string containing the task's thread ID in the root PID
	// namespace, and is prepended to log messages emitted by Task.Infof etc.
	logPrefix atomic.Pointer[string] `state:"nosave"`

	// traceContext and traceTask are both used for tracing, and are
	// updated along with the logPrefix in updateInfoLocked.
	//
	// These are exclusive to the task goroutine.
	traceContext gocontext.Context `state:"nosave"`
	traceTask    *trace.Task       `state:"nosave"`

	// creds is the task's credentials.
	//
	// creds.Load() may be called without synchronization. creds.Store() is
	// serialized by mu. creds is owned by the task goroutine. All
	// auth.Credentials objects that creds may point to, or have pointed to
	// in the past, must be treated as immutable.
	creds auth.AtomicPtrCredentials

	// utsns is the task's UTS namespace.
	//
	// utsns is protected by mu. utsns is owned by the task goroutine.
	utsns *UTSNamespace

	// ipcns is the task's IPC namespace.
	//
	// ipcns is protected by mu. ipcns is owned by the task goroutine.
	ipcns *IPCNamespace

	// mountNamespace is the task's mount namespace.
	//
	// It is protected by mu. It is owned by the task goroutine.
	mountNamespace *vfs.MountNamespace

	// parentDeathSignal is sent to this task's thread group when its parent exits.
	//
	// parentDeathSignal is protected by mu.
	parentDeathSignal linux.Signal

	// seccomp contains all seccomp-bpf syscall filters applicable to the task.
	// The type of the atomic is *taskSeccomp.
	// Writing needs to be protected by the signal mutex.
	//
	// seccomp is owned by the task goroutine.
	seccomp atomic.Pointer[taskSeccomp] `state:".(*taskSeccomp)"`

	// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
	// task's virtual address space; when the task exits, set the pointed-to
	// ThreadID to 0, and wake any futex waiters.
	//
	// cleartid is exclusive to the task goroutine.
	cleartid hostarch.Addr

	// This is mostly a fake cpumask just for sched_set/getaffinity as we
	// don't really control the affinity.
	//
	// Invariant: allowedCPUMask.Size() ==
	// sched.CPUMaskSize(Kernel.applicationCores).
	//
	// allowedCPUMask is protected by mu.
	allowedCPUMask sched.CPUSet

	// cpu is the fake cpu number returned by getcpu(2). cpu is ignored
	// entirely if Kernel.useHostCores is true.
	cpu atomicbitops.Int32

	// This is used to keep track of changes made to a process' priority/niceness.
	// It is mostly used to provide some reasonable return value from
	// getpriority(2) after a call to setpriority(2) has been made.
	// We currently do not actually modify a process' scheduling priority.
	// NOTE: This represents the userspace view of priority (nice).
	// This means that the value should be in the range [-20, 19].
	//
	// niceness is protected by mu.
	niceness int

	// This is used to track the numa policy for the current thread. This can be
	// modified through a set_mempolicy(2) syscall. Since we always report a
	// single numa node, all policies are no-ops. We only track this information
	// so that we can return reasonable values if the application calls
	// get_mempolicy(2) after setting a non-default policy. Note that in the
	// real syscall, nodemask can be longer than a single unsigned long, but we
	// always report a single node so never need to save more than a single
	// bit.
	//
	// numaPolicy and numaNodeMask are protected by mu.
	numaPolicy   linux.NumaPolicy
	numaNodeMask uint64

	// netns is the task's network namespace. It has to be changed under mu
	// so that GetNetworkNamespace can take a reference before it is
	// released. It is changed only from the task goroutine.
	netns *inet.Namespace

	// If rseqPreempted is true, before the next call to p.Switch(),
	// interrupt rseq critical regions as defined by rseqAddr and
	// tg.oldRSeqCritical and write the task goroutine's CPU number to
	// rseqAddr/oldRSeqCPUAddr.
	//
	// We support two ABIs for restartable sequences:
	//
	//  1. The upstream interface added in v4.18,
	//  2. An "old" interface never merged upstream. In the implementation,
	//     this is referred to as "old rseq".
	//
	// rseqPreempted is exclusive to the task goroutine.
	rseqPreempted bool `state:"nosave"`

	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
	//
	// If rseq is unused, rseqCPU is -1 for convenient use in
	// platform.Context.Switch.
	//
	// rseqCPU is exclusive to the task goroutine.
	rseqCPU int32

	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
	//
	// oldRSeqCPUAddr is exclusive to the task goroutine.
	oldRSeqCPUAddr hostarch.Addr

	// rseqAddr is a pointer to the userspace linux.RSeq structure.
	//
	// rseqAddr is exclusive to the task goroutine.
	rseqAddr hostarch.Addr

	// rseqSignature is the signature that the rseq abort IP must be signed
	// with.
	//
	// rseqSignature is exclusive to the task goroutine.
	rseqSignature uint32

	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
	// implementations that require an intermediate buffer to copy data
	// into/out of. It prevents these buffers from being allocated/zeroed in
	// each syscall and eventually garbage collected.
	//
	// copyScratchBuffer is exclusive to the task goroutine.
	copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`

	// blockingTimer is used for blocking timeouts. blockingTimerChan is the
	// channel that is sent to when blockingTimer fires.
	//
	// blockingTimer is exclusive to the task goroutine.
	blockingTimer     *ktime.Timer    `state:"nosave"`
	blockingTimerChan <-chan struct{} `state:"nosave"`

	// futexWaiter is used for futex(FUTEX_WAIT) syscalls.
	//
	// futexWaiter is exclusive to the task goroutine.
	futexWaiter *futex.Waiter `state:"nosave"`

	// robustList is a pointer to the head of the task's robust futex
	// list.
	robustList hostarch.Addr

	// startTime is the real time at which the task started. It is set when
	// a Task is created or invokes execve(2).
	//
	// startTime is protected by mu.
	startTime ktime.Time

	// kcov is the kcov instance providing code coverage owned by this task.
	//
	// kcov is exclusive to the task goroutine.
	kcov *Kcov

	// cgroups is the set of cgroups this task belongs to. This may be empty if
	// no cgroup controllers are enabled. Protected by mu.
	//
	// +checklocks:mu
	cgroups map[Cgroup]struct{}

	// memCgID is the memory cgroup id.
	memCgID atomicbitops.Uint32

	// userCounters is a pointer to a set of user counters.
	//
	// The userCounters pointer is exclusive to the task goroutine, but the
	// userCounters instance must be atomically accessed.
	userCounters *UserCounters

	// sessionKeyring is a pointer to the task's session keyring, if set.
	// It is guaranteed to be of type "keyring".
	//
	// +checklocks:mu
	sessionKeyring *auth.Key
}

// Task related metrics
var (
	// syscallCounter is a metric that tracks how many syscalls the sentry has
	// executed.
	syscallCounter = metric.SentryProfiling.MustCreateNewUint64Metric(
		"/task/syscalls", false, "The number of syscalls the sentry has executed for the user.")

	// faultCounter is a metric that tracks how many faults the sentry has had to
	// handle.
	faultCounter = metric.SentryProfiling.MustCreateNewUint64Metric(
		"/task/faults", false, "The number of faults the sentry has handled.")
)

// savePtraceTracer returns the current value of t.ptraceTracer as a plain
// *Task. It is the save-side counterpart of loadPtraceTracer and matches the
// `state:".(*Task)"` tag on the ptraceTracer field.
//
// NOTE(review): presumably called only by stateify-generated code; confirm.
func (t *Task) savePtraceTracer() *Task {
	return t.ptraceTracer.Load()
}

// loadPtraceTracer stores tracer into t.ptraceTracer. It is the load-side
// counterpart of savePtraceTracer. The context argument is unused.
func (t *Task) loadPtraceTracer(_ gocontext.Context, tracer *Task) {
	t.ptraceTracer.Store(tracer)
}

// saveSeccomp returns the current value of t.seccomp as a plain *taskSeccomp.
// It is the save-side counterpart of loadSeccomp and matches the
// `state:".(*taskSeccomp)"` tag on the seccomp field.
//
// NOTE(review): presumably called only by stateify-generated code; confirm.
func (t *Task) saveSeccomp() *taskSeccomp {
	return t.seccomp.Load()
}

// loadSeccomp stores seccompData into t.seccomp. It is the load-side
// counterpart of saveSeccomp. The context argument is unused.
func (t *Task) loadSeccomp(_ gocontext.Context, seccompData *taskSeccomp) {
	t.seccomp.Store(seccompData)
}

// afterLoad is invoked by stateify.
//
// It re-initializes Task state that is tagged `state:"nosave"` or otherwise
// has to be rebuilt: the seccomp cache, interruptChan, the scheduling state,
// stopCount, endStopCond.L, rseqPreempted, futexWaiter and the platform
// context.
func (t *Task) afterLoad(gocontext.Context) {
	// logPrefix, traceContext and traceTask are not saved; per their field
	// comments they are maintained by updateInfoLocked.
	t.updateInfoLocked()
	if ts := t.seccomp.Load(); ts != nil {
		ts.populateCache(t)
	}
	// interruptChan is not saved, so a fresh channel is created here.
	t.interruptChan = make(chan struct{}, 1)
	t.gosched.State = TaskGoroutineNonexistent
	// stopCount is not saved. External stops do not survive save/restore, so
	// the only remaining contribution is the 1 counted for a non-nil stop.
	if t.stop != nil {
		t.stopCount = atomicbitops.FromInt32(1)
	}
	// Re-establish the invariant that endStopCond.L is the signal mutex.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// rseqPreempted is not saved; setting it requests the rseq handling
	// described on the field before the next call to p.Switch().
	t.rseqPreempted = true
	t.futexWaiter = futex.NewWaiter()
	t.p = t.k.Platform.NewContext(t.AsyncContext())
}

// copyScratchBufferLen is the length of Task.copyScratchBuffer.
648 const copyScratchBufferLen = 144 // sizeof(struct stat) 649 650 // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut 651 // functions. It must only be used within those functions and can only be used 652 // by the task goroutine; it exists to improve performance and thus 653 // intentionally lacks any synchronization. 654 // 655 // Callers should pass a constant value as an argument if possible, which will 656 // allow the compiler to inline and optimize out the if statement below. 657 func (t *Task) CopyScratchBuffer(size int) []byte { 658 if size > copyScratchBufferLen { 659 return make([]byte, size) 660 } 661 return t.copyScratchBuffer[:size] 662 } 663 664 // FutexWaiter returns the Task's futex.Waiter. 665 func (t *Task) FutexWaiter() *futex.Waiter { 666 return t.futexWaiter 667 } 668 669 // Kernel returns the Kernel containing t. 670 func (t *Task) Kernel() *Kernel { 671 return t.k 672 } 673 674 // SetClearTID sets t's cleartid. 675 // 676 // Preconditions: The caller must be running on the task goroutine. 677 func (t *Task) SetClearTID(addr hostarch.Addr) { 678 t.cleartid = addr 679 } 680 681 // SetSyscallRestartBlock sets the restart block for use in 682 // restart_syscall(2). After registering a restart block, a syscall should 683 // return ERESTART_RESTARTBLOCK to request a restart using the block. 684 // 685 // Precondition: The caller must be running on the task goroutine. 686 func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { 687 t.syscallRestartBlock = r 688 } 689 690 // SyscallRestartBlock returns the currently registered restart block for use in 691 // restart_syscall(2). This function is *not* idempotent and may be called once 692 // per syscall. This function must not be called if a restart block has not been 693 // registered for the current syscall. 694 // 695 // Precondition: The caller must be running on the task goroutine. 
696 func (t *Task) SyscallRestartBlock() SyscallRestartBlock { 697 r := t.syscallRestartBlock 698 // Explicitly set the restart block to nil so that a future syscall can't 699 // accidentally reuse it. 700 t.syscallRestartBlock = nil 701 return r 702 } 703 704 // IsChrooted returns true if the root directory of t's FSContext is not the 705 // root directory of t's MountNamespace. 706 // 707 // Preconditions: The caller must be running on the task goroutine, or t.mu 708 // must be locked. 709 func (t *Task) IsChrooted() bool { 710 realRoot := t.mountNamespace.Root(t) 711 defer realRoot.DecRef(t) 712 root := t.fsContext.RootDirectory() 713 defer root.DecRef(t) 714 return root != realRoot 715 } 716 717 // TaskImage returns t's TaskImage. 718 // 719 // Precondition: The caller must be running on the task goroutine, or t.mu must 720 // be locked. 721 func (t *Task) TaskImage() *TaskImage { 722 return &t.image 723 } 724 725 // FSContext returns t's FSContext. FSContext does not take an additional 726 // reference on the returned FSContext. 727 // 728 // Precondition: The caller must be running on the task goroutine, or t.mu must 729 // be locked. 730 func (t *Task) FSContext() *FSContext { 731 return t.fsContext 732 } 733 734 // FDTable returns t's FDTable. FDMTable does not take an additional reference 735 // on the returned FDMap. 736 // 737 // Precondition: The caller must be running on the task goroutine, or t.mu must 738 // be locked. 739 func (t *Task) FDTable() *FDTable { 740 return t.fdTable 741 } 742 743 // GetFile is a convenience wrapper for t.FDTable().Get. 744 // 745 // Precondition: same as FDTable.Get. 746 func (t *Task) GetFile(fd int32) *vfs.FileDescription { 747 f, _ := t.fdTable.Get(fd) 748 return f 749 } 750 751 // NewFDs is a convenience wrapper for t.FDTable().NewFDs. 752 // 753 // This automatically passes the task as the context. 754 // 755 // Precondition: same as FDTable. 
756 func (t *Task) NewFDs(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { 757 return t.fdTable.NewFDs(t, fd, files, flags) 758 } 759 760 // NewFDFrom is a convenience wrapper for t.FDTable().NewFD. 761 // 762 // This automatically passes the task as the context. 763 // 764 // Precondition: same as FDTable.Get. 765 func (t *Task) NewFDFrom(minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { 766 return t.fdTable.NewFD(t, minFD, file, flags) 767 } 768 769 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. 770 // 771 // This automatically passes the task as the context. 772 // 773 // Precondition: same as FDTable. 774 func (t *Task) NewFDAt(fd int32, file *vfs.FileDescription, flags FDFlags) (*vfs.FileDescription, error) { 775 return t.fdTable.NewFDAt(t, fd, file, flags) 776 } 777 778 // WithMuLocked executes f with t.mu locked. 779 func (t *Task) WithMuLocked(f func(*Task)) { 780 t.mu.Lock() 781 f(t) 782 t.mu.Unlock() 783 } 784 785 // MountNamespace returns t's MountNamespace. 786 func (t *Task) MountNamespace() *vfs.MountNamespace { 787 t.mu.Lock() 788 defer t.mu.Unlock() 789 return t.mountNamespace 790 } 791 792 // GetMountNamespace returns t's MountNamespace. A reference is taken on the 793 // returned mount namespace. 794 func (t *Task) GetMountNamespace() *vfs.MountNamespace { 795 t.mu.Lock() 796 defer t.mu.Unlock() 797 mntns := t.mountNamespace 798 if mntns != nil { 799 mntns.IncRef() 800 } 801 return mntns 802 } 803 804 // ContainerID returns t's container ID. 805 func (t *Task) ContainerID() string { 806 return t.containerID 807 } 808 809 // RestoreContainerID sets t's container ID in case the restored container ID 810 // is different from when it was saved. 811 func (t *Task) RestoreContainerID(cid string) { 812 t.containerID = cid 813 } 814 815 // OOMScoreAdj gets the task's thread group's OOM score adjustment. 
816 func (t *Task) OOMScoreAdj() int32 { 817 return t.tg.oomScoreAdj.Load() 818 } 819 820 // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The 821 // value should be between -1000 and 1000 inclusive. 822 func (t *Task) SetOOMScoreAdj(adj int32) error { 823 if adj > 1000 || adj < -1000 { 824 return linuxerr.EINVAL 825 } 826 t.tg.oomScoreAdj.Store(adj) 827 return nil 828 } 829 830 // KUID returns t's kuid. 831 func (t *Task) KUID() uint32 { 832 return uint32(t.Credentials().EffectiveKUID) 833 } 834 835 // KGID returns t's kgid. 836 func (t *Task) KGID() uint32 { 837 return uint32(t.Credentials().EffectiveKGID) 838 } 839 840 // SetKcov sets the kcov instance associated with t. 841 func (t *Task) SetKcov(k *Kcov) { 842 t.kcov = k 843 } 844 845 // ResetKcov clears the kcov instance associated with t. 846 func (t *Task) ResetKcov() { 847 if t.kcov != nil { 848 t.kcov.OnTaskExit() 849 t.kcov = nil 850 } 851 }