// Source: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	gocontext "context"
	"runtime/trace"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/bpf"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/inet"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/futex"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/sched"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/waiter"
)

// Task represents a thread of execution in the untrusted app.  It
// includes registers and any thread-specific state that you would
// normally expect.
//
// Each task is associated with a goroutine, called the task goroutine, that
// executes code (application code, system calls, etc.) on behalf of that task.
// See Task.run (task_run.go).
//
// All fields that are "owned by the task goroutine" can only be mutated by the
// task goroutine while it is running. The task goroutine does not require
// synchronization to read these fields, although it still requires
// synchronization as described for those fields to mutate them.
//
// All fields that are "exclusive to the task goroutine" can only be accessed
// by the task goroutine while it is running. The task goroutine does not
// require synchronization to read or write these fields.
//
// +stateify savable
type Task struct {
	taskNode

	// goid is the task goroutine's ID. goid is owned by the task goroutine,
	// but since it's used to detect cases where non-task goroutines
	// incorrectly access state owned by, or exclusive to, the task goroutine,
	// goid is always accessed using atomic memory operations.
	goid int64 `state:"nosave"`

	// runState is what the task goroutine is executing if it is not stopped.
	// If runState is nil, the task goroutine should exit or has exited.
	// runState is exclusive to the task goroutine.
	runState taskRunState

	// taskWorkCount represents the current size of the task work queue. It is
	// used to avoid acquiring taskWorkMu when the queue is empty.
	//
	// Must be accessed with atomic memory operations.
	taskWorkCount int32

	// taskWorkMu protects taskWork.
	taskWorkMu sync.Mutex `state:"nosave"`

	// taskWork is a queue of work to be executed before resuming user execution.
	// It is similar to the task_work mechanism in Linux.
	//
	// taskWork is exclusive to the task goroutine.
	taskWork []TaskWorker

	// haveSyscallReturn is true if image.Arch().Return() represents a value
	// returned by a syscall (or set by ptrace after a syscall).
	//
	// haveSyscallReturn is exclusive to the task goroutine.
	haveSyscallReturn bool

	// interruptChan is notified whenever the task goroutine is interrupted
	// (usually by a pending signal). interruptChan is effectively a condition
	// variable that can be used in select statements.
	//
	// interruptChan is not saved; because saving interrupts all tasks,
	// interruptChan is always notified after restore (see Task.run).
	interruptChan chan struct{} `state:"nosave"`

	// gosched contains the current scheduling state of the task goroutine.
	//
	// gosched is protected by goschedSeq. gosched is owned by the task
	// goroutine.
	goschedSeq sync.SeqCount `state:"nosave"`
	gosched    TaskGoroutineSchedInfo

	// yieldCount is the number of times the task goroutine has called
	// Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
	// Task.Yield(), voluntarily ceasing execution.
	//
	// yieldCount is accessed using atomic memory operations. yieldCount is
	// owned by the task goroutine.
	yieldCount uint64

	// pendingSignals is the set of pending signals that may be handled only by
	// this task.
	//
	// pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
	// (hereafter "the signal mutex"); see comment on
	// ThreadGroup.signalHandlers.
	pendingSignals pendingSignals

	// signalMask is the set of signals whose delivery is currently blocked.
	//
	// signalMask is accessed using atomic memory operations, and is protected
	// by the signal mutex (such that reading signalMask is safe if either the
	// signal mutex is locked or if atomic memory operations are used, while
	// writing signalMask requires both). signalMask is owned by the task
	// goroutine.
	signalMask linux.SignalSet

	// If the task goroutine is currently executing Task.sigtimedwait,
	// realSignalMask is the previous value of signalMask, which has temporarily
	// been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
	//
	// realSignalMask is exclusive to the task goroutine.
	realSignalMask linux.SignalSet

	// If haveSavedSignalMask is true, savedSignalMask is the signal mask that
	// should be applied after the task has either delivered one signal to a
	// user handler or is about to resume execution in the untrusted
	// application.
	//
	// Both haveSavedSignalMask and savedSignalMask are exclusive to the task
	// goroutine.
	haveSavedSignalMask bool
	savedSignalMask     linux.SignalSet

	// signalStack is the alternate signal stack used by signal handlers for
	// which the SA_ONSTACK flag is set.
	//
	// signalStack is exclusive to the task goroutine.
	signalStack linux.SignalStack

	// signalQueue is a set of registered waiters for signal-related events.
	//
	// signalQueue is protected by the signal mutex. Note that the task does
	// not implement all queue methods, specifically the readiness checks.
	// The task only broadcasts a notification on signal delivery.
	signalQueue waiter.Queue `state:"zerovalue"`

	// If groupStopPending is true, the task should participate in a group
	// stop in the interrupt path.
	//
	// groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
	//
	// groupStopPending is protected by the signal mutex.
	groupStopPending bool

	// If groupStopAcknowledged is true, the task has already acknowledged that
	// it is entering the most recent group stop that has been initiated on its
	// thread group.
	//
	// groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
	//
	// groupStopAcknowledged is protected by the signal mutex.
	groupStopAcknowledged bool

	// If trapStopPending is true, the task goroutine should enter a
	// PTRACE_INTERRUPT-induced stop from the interrupt path.
	//
	// trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
	// Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
	// JOBCTL_STOP_PENDING.
	//
	// trapStopPending is protected by the signal mutex.
	trapStopPending bool

	// If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
	// stop has begun or ended since the last time the task entered a
	// ptrace-stop from the group-stop path.
	//
	// trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
	//
	// trapNotifyPending is protected by the signal mutex.
	trapNotifyPending bool

	// If stop is not nil, it is the internally-initiated condition that
	// currently prevents the task goroutine from running.
	//
	// stop is protected by the signal mutex.
	stop TaskStop

	// stopCount is the number of active external stops (calls to
	// Task.BeginExternalStop that have not been paired with a call to
	// Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
	// non-zero if the task goroutine should stop.
	//
	// Mutating stopCount requires both locking the signal mutex and using
	// atomic memory operations. Reading stopCount requires either locking the
	// signal mutex or using atomic memory operations. This allows Task.doStop
	// to require only a single atomic read in the common case where stopCount
	// is 0.
	//
	// stopCount is not saved, because external stops cannot be retained across
	// a save/restore cycle. (Suppose a sentryctl command issues an external
	// stop; after a save/restore cycle, the restored sentry has no knowledge
	// of the pre-save sentryctl command, and the stopped task would remain
	// stopped forever.)
	stopCount int32 `state:"nosave"`

	// endStopCond is signaled when stopCount transitions to 0. The combination
	// of stopCount and endStopCond effectively forms a sync.WaitGroup, but
	// WaitGroup provides no way to read its counter value.
	//
	// Invariant: endStopCond.L is the signal mutex. (This is not racy because
	// sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
	// calls sync.Cond.Wait; and only the task goroutine can change the
	// identity of the signal mutex, in Task.finishExec.)
	endStopCond sync.Cond `state:"nosave"`

	// exitStatus is the task's exit status.
	//
	// exitStatus is protected by the signal mutex.
	exitStatus ExitStatus

	// syscallRestartBlock represents a custom restart function to run in
	// restart_syscall(2) to resume an interrupted syscall.
	//
	// syscallRestartBlock is exclusive to the task goroutine.
	syscallRestartBlock SyscallRestartBlock

	// p provides the mechanism by which the task runs code in userspace. The p
	// interface object is immutable.
	p platform.Context `state:"nosave"`

	// k is the Kernel that this task belongs to. The k pointer is immutable.
	k *Kernel

	// containerID has no equivalent in Linux; it's used by runsc to track all
	// tasks that belong to a given container since cgroups aren't implemented.
	// It's inherited by the children, is immutable, and may be empty.
	//
	// NOTE: cgroups can be used to track this when implemented.
	containerID string

	// mu protects some of the following fields.
	mu sync.Mutex `state:"nosave"`

	// image holds task data provided by the ELF loader.
	//
	// image is protected by mu, and is owned by the task goroutine.
	image TaskImage

	// fsContext is the task's filesystem context.
	//
	// fsContext is protected by mu, and is owned by the task goroutine.
	fsContext *FSContext

	// fdTable is the task's file descriptor table.
	//
	// fdTable is protected by mu, and is owned by the task goroutine.
	fdTable *FDTable

	// If vforkParent is not nil, it is the task that created this task with
	// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
	// this TaskImage is released.
	//
	// vforkParent is protected by the TaskSet mutex.
	vforkParent *Task

	// exitState is the task's progress through the exit path.
	//
	// exitState is protected by the TaskSet mutex. exitState is owned by the
	// task goroutine.
	exitState TaskExitState

	// exitTracerNotified is true if the exit path has either signaled the
	// task's tracer to indicate the exit, or determined that no such signal is
	// needed. exitTracerNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitTracerNotified is protected by the TaskSet mutex.
	exitTracerNotified bool

	// exitTracerAcked is true if exitTracerNotified is true and either the
	// task's tracer has acknowledged the exit notification, or the exit path
	// has determined that no such notification is needed.
	//
	// exitTracerAcked is protected by the TaskSet mutex.
	exitTracerAcked bool

	// exitParentNotified is true if the exit path has either signaled the
	// task's parent to indicate the exit, or determined that no such signal is
	// needed. exitParentNotified can only be true if exitState is
	// TaskExitZombie or TaskExitDead.
	//
	// exitParentNotified is protected by the TaskSet mutex.
	exitParentNotified bool

	// exitParentAcked is true if exitParentNotified is true and either the
	// task's parent has acknowledged the exit notification, or the exit path
	// has determined that no such acknowledgment is needed.
	//
	// exitParentAcked is protected by the TaskSet mutex.
	exitParentAcked bool

	// goroutineStopped is a WaitGroup whose counter value is 1 when the task
	// goroutine is running and 0 when the task goroutine is stopped or has
	// exited.
	goroutineStopped sync.WaitGroup `state:"nosave"`

	// ptraceTracer is the task that is ptrace-attached to this one. If
	// ptraceTracer is nil, this task is not being traced. Note that due to
	// atomic.Value limitations (atomic.Value.Store(nil) panics), a nil
	// ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)).
	//
	// ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
	// operations. This allows paths that wouldn't otherwise lock the TaskSet
	// mutex, notably the syscall path, to check if ptraceTracer is nil without
	// additional synchronization.
	ptraceTracer atomic.Value `state:".(*Task)"`

	// ptraceTracees is the set of tasks that this task is ptrace-attached to.
	//
	// ptraceTracees is protected by the TaskSet mutex.
	ptraceTracees map[*Task]struct{}

	// ptraceSeized is true if ptraceTracer attached to this task with
	// PTRACE_SEIZE.
	//
	// ptraceSeized is protected by the TaskSet mutex.
	ptraceSeized bool

	// ptraceOpts contains ptrace options explicitly set by the tracer. If
	// ptraceTracer is nil, ptraceOpts is expected to be the zero value.
	//
	// ptraceOpts is protected by the TaskSet mutex.
	ptraceOpts ptraceOptions

	// ptraceSyscallMode controls ptrace behavior around syscall entry and
	// exit.
	//
	// ptraceSyscallMode is protected by the TaskSet mutex.
	ptraceSyscallMode ptraceSyscallMode

	// If ptraceSinglestep is true, the next time the task executes application
	// code, single-stepping should be enabled. ptraceSinglestep is stored
	// independently of the architecture-specific trap flag because tracer
	// detaching (which can happen concurrently with the tracee's execution if
	// the tracer exits) must disable single-stepping, and the task's
	// architectural state is implicitly exclusive to the task goroutine (no
	// synchronization occurs before passing registers to SwitchToApp).
	//
	// ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
	//
	// ptraceSinglestep is protected by the TaskSet mutex.
	ptraceSinglestep bool

	// If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
	// time that t entered the ptrace stop, reset to 0 when the tracer
	// acknowledges the stop with a wait*() syscall. Otherwise, it is the
	// signal number passed to the ptrace operation that ended the last ptrace
	// stop on this task. In the latter case, the effect of ptraceCode depends
	// on the nature of the ptrace stop; signal-delivery-stop uses it to
	// conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
	// signal to the task after leaving the stop, and PTRACE_EVENT stops and
	// traced group stops ignore it entirely.
	//
	// Linux contextually stores the equivalent of ptraceCode in
	// task_struct::exit_code.
	//
	// ptraceCode is protected by the TaskSet mutex.
	ptraceCode int32

	// ptraceSiginfo is the value returned to the tracer by
	// ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
	// (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
	// ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
	// required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
	// is in turn required to distinguish group stops from other ptrace stops,
	// per subsection "Group-stop" in ptrace(2)).
	//
	// ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
	//
	// ptraceSiginfo is protected by the TaskSet mutex.
	ptraceSiginfo *linux.SignalInfo

	// ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
	// the tracer by ptrace(PTRACE_GETEVENTMSG).
	//
	// ptraceEventMsg is protected by the TaskSet mutex.
	ptraceEventMsg uint64

	// ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
	// been added before. This is used during task exit to decide whether we need
	// to clean up YAMA exceptions.
	//
	// ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
	ptraceYAMAExceptionAdded bool

	// The struct that holds the IO-related usage. The ioUsage pointer is
	// immutable.
	ioUsage *usage.IO

	// logPrefix is a string containing the task's thread ID in the root PID
	// namespace, and is prepended to log messages emitted by Task.Infof etc.
	logPrefix atomic.Value `state:"nosave"`

	// traceContext and traceTask are both used for tracing, and are
	// updated along with the logPrefix in updateInfoLocked.
	//
	// These are exclusive to the task goroutine.
	traceContext gocontext.Context `state:"nosave"`
	traceTask    *trace.Task       `state:"nosave"`

	// creds is the task's credentials.
	//
	// creds.Load() may be called without synchronization. creds.Store() is
	// serialized by mu. creds is owned by the task goroutine. All
	// auth.Credentials objects that creds may point to, or have pointed to
	// in the past, must be treated as immutable.
	creds auth.AtomicPtrCredentials

	// utsns is the task's UTS namespace.
	//
	// utsns is protected by mu. utsns is owned by the task goroutine.
	utsns *UTSNamespace

	// ipcns is the task's IPC namespace.
	//
	// ipcns is protected by mu. ipcns is owned by the task goroutine.
	ipcns *IPCNamespace

	// abstractSockets tracks abstract sockets that are in use.
	//
	// abstractSockets is protected by mu.
	abstractSockets *AbstractSocketNamespace

	// mountNamespaceVFS2 is the task's mount namespace.
	//
	// It is protected by mu. It is owned by the task goroutine.
	mountNamespaceVFS2 *vfs.MountNamespace

	// parentDeathSignal is sent to this task's thread group when its parent exits.
	//
	// parentDeathSignal is protected by mu.
	parentDeathSignal linux.Signal

	// syscallFilters is all seccomp-bpf syscall filters applicable to the
	// task, in the order in which they were installed. The type of the atomic
	// is []bpf.Program. Writing needs to be protected by the signal mutex.
	//
	// syscallFilters is owned by the task goroutine.
	syscallFilters atomic.Value `state:".([]bpf.Program)"`

	// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
	// task's virtual address space; when the task exits, set the pointed-to
	// ThreadID to 0, and wake any futex waiters.
	//
	// cleartid is exclusive to the task goroutine.
	cleartid hostarch.Addr

	// This is mostly a fake cpumask just for sched_set/getaffinity as we
	// don't really control the affinity.
	//
	// Invariant: allowedCPUMask.Size() ==
	// sched.CPUMaskSize(Kernel.applicationCores).
	//
	// allowedCPUMask is protected by mu.
	allowedCPUMask sched.CPUSet

	// cpu is the fake cpu number returned by getcpu(2). cpu is ignored
	// entirely if Kernel.useHostCores is true.
	//
	// cpu is accessed using atomic memory operations.
	cpu int32

	// This is used to keep track of changes made to a process' priority/niceness.
	// It is mostly used to provide some reasonable return value from
	// getpriority(2) after a call to setpriority(2) has been made.
	// We currently do not actually modify a process' scheduling priority.
	// NOTE: This represents the userspace view of priority (nice).
	// This means that the value should be in the range [-20, 19].
	//
	// niceness is protected by mu.
	niceness int

	// This is used to track the numa policy for the current thread. This can be
	// modified through a set_mempolicy(2) syscall. Since we always report a
	// single numa node, all policies are no-ops. We only track this information
	// so that we can return reasonable values if the application calls
	// get_mempolicy(2) after setting a non-default policy. Note that in the
	// real syscall, nodemask can be longer than a single unsigned long, but we
	// always report a single node so never need to save more than a single
	// bit.
	//
	// numaPolicy and numaNodeMask are protected by mu.
	numaPolicy   linux.NumaPolicy
	numaNodeMask uint64

	// netns is the task's network namespace. netns is never nil.
	//
	// netns is protected by mu.
	netns *inet.Namespace

	// If rseqPreempted is true, before the next call to p.Switch(),
	// interrupt rseq critical regions as defined by rseqAddr and
	// tg.oldRSeqCritical and write the task goroutine's CPU number to
	// rseqAddr/oldRSeqCPUAddr.
	//
	// We support two ABIs for restartable sequences:
	//
	//  1. The upstream interface added in v4.18,
	//  2. An "old" interface never merged upstream. In the implementation,
	//     this is referred to as "old rseq".
	//
	// rseqPreempted is exclusive to the task goroutine.
	rseqPreempted bool `state:"nosave"`

	// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
	//
	// If rseq is unused, rseqCPU is -1 for convenient use in
	// platform.Context.Switch.
	//
	// rseqCPU is exclusive to the task goroutine.
	rseqCPU int32

	// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
	//
	// oldRSeqCPUAddr is exclusive to the task goroutine.
	oldRSeqCPUAddr hostarch.Addr

	// rseqAddr is a pointer to the userspace linux.RSeq structure.
	//
	// rseqAddr is exclusive to the task goroutine.
	rseqAddr hostarch.Addr

	// rseqSignature is the signature that the rseq abort IP must be signed
	// with.
	//
	// rseqSignature is exclusive to the task goroutine.
	rseqSignature uint32

	// copyScratchBuffer is a buffer available to CopyIn/CopyOut
	// implementations that require an intermediate buffer to copy data
	// into/out of. It prevents these buffers from being allocated/zeroed in
	// each syscall and eventually garbage collected.
	//
	// copyScratchBuffer is exclusive to the task goroutine.
	copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`

	// blockingTimer is used for blocking timeouts. blockingTimerChan is the
	// channel that is sent to when blockingTimer fires.
	//
	// blockingTimer is exclusive to the task goroutine.
	blockingTimer     *ktime.Timer    `state:"nosave"`
	blockingTimerChan <-chan struct{} `state:"nosave"`

	// futexWaiter is used for futex(FUTEX_WAIT) syscalls.
	//
	// futexWaiter is exclusive to the task goroutine.
	futexWaiter *futex.Waiter `state:"nosave"`

	// robustList is a pointer to the head of the task's robust futex
	// list.
	robustList hostarch.Addr

	// startTime is the real time at which the task started. It is set when
	// a Task is created or invokes execve(2).
	//
	// startTime is protected by mu.
	startTime ktime.Time

	// kcov is the kcov instance providing code coverage owned by this task.
	//
	// kcov is exclusive to the task goroutine.
	kcov *Kcov

	// cgroups is the set of cgroups this task belongs to. This may be empty if
	// no cgroup controllers are enabled. Protected by mu.
	//
	// +checklocks:mu
	cgroups map[Cgroup]struct{}
}

// savePtraceTracer returns the *Task currently held in ptraceTracer. It is the
// save-side counterpart of the `state:".(*Task)"` tag on that field.
//
// The unchecked type assertion relies on ptraceTracer always holding a
// (possibly nil) *Task, as described on the field; it panics if nothing has
// ever been stored.
func (t *Task) savePtraceTracer() *Task {
	return t.ptraceTracer.Load().(*Task)
}

// loadPtraceTracer stores tracer in ptraceTracer. It is the load-side
// counterpart of savePtraceTracer. Because tracer is statically typed as
// *Task, a nil tracer is stored as a typed nil, which atomic.Value accepts.
func (t *Task) loadPtraceTracer(tracer *Task) {
	t.ptraceTracer.Store(tracer)
}

// saveSyscallFilters returns the []bpf.Program held in syscallFilters, or nil
// if no value has been stored. It is the save-side counterpart of the
// `state:".([]bpf.Program)"` tag on that field.
func (t *Task) saveSyscallFilters() []bpf.Program {
	if f := t.syscallFilters.Load(); f != nil {
		return f.([]bpf.Program)
	}
	return nil
}

// loadSyscallFilters stores filters in syscallFilters. It is the load-side
// counterpart of saveSyscallFilters.
func (t *Task) loadSyscallFilters(filters []bpf.Program) {
	t.syscallFilters.Store(filters)
}

// afterLoad is invoked by stateify.
//
// It rebuilds the Task fields that are tagged `state:"nosave"` (or that must
// not keep their pre-save value) so that the restored Task is usable.
func (t *Task) afterLoad() {
	// logPrefix, traceContext and traceTask are not saved; updateInfoLocked
	// regenerates them.
	t.updateInfoLocked()
	t.interruptChan = make(chan struct{}, 1)
	// NOTE(review): judging by the constant's name, no task goroutine exists
	// yet at this point — confirm against Task.run/Start.
	t.gosched.State = TaskGoroutineNonexistent
	// stopCount is not saved and external stops do not survive save/restore,
	// so the only contribution left is the internal stop, if any.
	if t.stop != nil {
		t.stopCount = 1
	}
	// Re-establish the invariant that endStopCond.L is the signal mutex.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.p = t.k.Platform.NewContext()
	// Request that rseq state be refreshed before the next p.Switch() (see
	// the rseqPreempted field).
	t.rseqPreempted = true
	t.futexWaiter = futex.NewWaiter()
}

// copyScratchBufferLen is the length of Task.copyScratchBuffer.
const copyScratchBufferLen = 144 // sizeof(struct stat)

// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut
// functions. It must only be used within those functions and can only be used
// by the task goroutine; it exists to improve performance and thus
// intentionally lacks any synchronization.
//
// Requests larger than copyScratchBufferLen cannot be served from the
// per-task buffer and receive a freshly allocated slice instead.
//
// Callers should pass a constant value as an argument if possible, which will
// allow the compiler to inline and optimize out the if statement below.
func (t *Task) CopyScratchBuffer(size int) []byte {
	if size > copyScratchBufferLen {
		return make([]byte, size)
	}
	return t.copyScratchBuffer[:size]
}

// FutexWaiter returns the Task's futex.Waiter.
func (t *Task) FutexWaiter() *futex.Waiter {
	return t.futexWaiter
}

// Kernel returns the Kernel containing t.
func (t *Task) Kernel() *Kernel {
	return t.k
}

// SetClearTID sets t's cleartid.
658 // 659 // Preconditions: The caller must be running on the task goroutine. 660 func (t *Task) SetClearTID(addr hostarch.Addr) { 661 t.cleartid = addr 662 } 663 664 // SetSyscallRestartBlock sets the restart block for use in 665 // restart_syscall(2). After registering a restart block, a syscall should 666 // return ERESTART_RESTARTBLOCK to request a restart using the block. 667 // 668 // Precondition: The caller must be running on the task goroutine. 669 func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { 670 t.syscallRestartBlock = r 671 } 672 673 // SyscallRestartBlock returns the currently registered restart block for use in 674 // restart_syscall(2). This function is *not* idempotent and may be called once 675 // per syscall. This function must not be called if a restart block has not been 676 // registered for the current syscall. 677 // 678 // Precondition: The caller must be running on the task goroutine. 679 func (t *Task) SyscallRestartBlock() SyscallRestartBlock { 680 r := t.syscallRestartBlock 681 // Explicitly set the restart block to nil so that a future syscall can't 682 // accidentally reuse it. 683 t.syscallRestartBlock = nil 684 return r 685 } 686 687 // IsChrooted returns true if the root directory of t's FSContext is not the 688 // root directory of t's MountNamespace. 689 // 690 // Preconditions: The caller must be running on the task goroutine, or t.mu 691 // must be locked. 692 func (t *Task) IsChrooted() bool { 693 if VFS2Enabled { 694 realRoot := t.mountNamespaceVFS2.Root() 695 root := t.fsContext.RootDirectoryVFS2() 696 defer root.DecRef(t) 697 return root != realRoot 698 } 699 700 realRoot := t.tg.mounts.Root() 701 defer realRoot.DecRef(t) 702 root := t.fsContext.RootDirectory() 703 if root != nil { 704 defer root.DecRef(t) 705 } 706 return root != realRoot 707 } 708 709 // TaskImage returns t's TaskImage. 710 // 711 // Precondition: The caller must be running on the task goroutine, or t.mu must 712 // be locked. 
func (t *Task) TaskImage() *TaskImage {
	return &t.image
}

// FSContext returns t's FSContext. FSContext does not take an additional
// reference on the returned FSContext.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FSContext() *FSContext {
	return t.fsContext
}

// FDTable returns t's FDTable. FDTable does not take an additional reference
// on the returned FDTable.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FDTable() *FDTable {
	return t.fdTable
}

// GetFile is a convenience wrapper for t.FDTable().Get. The second value
// returned by FDTable.Get is discarded.
//
// Precondition: same as FDTable.Get.
func (t *Task) GetFile(fd int32) *fs.File {
	f, _ := t.fdTable.Get(fd)
	return f
}

// GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2. The second
// value returned by FDTable.GetVFS2 is discarded.
//
// Precondition: same as FDTable.GetVFS2.
func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription {
	f, _ := t.fdTable.GetVFS2(fd)
	return f
}

// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDs.
func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) {
	return t.fdTable.NewFDs(t, fd, files, flags)
}

// NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDsVFS2.
func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
	return t.fdTable.NewFDsVFS2(t, fd, files, flags)
}

// NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.
774 func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) { 775 fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags) 776 if err != nil { 777 return 0, err 778 } 779 return fds[0], nil 780 } 781 782 // NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2. 783 // 784 // This automatically passes the task as the context. 785 // 786 // Precondition: same as FDTable.Get. 787 func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { 788 return t.fdTable.NewFDVFS2(t, fd, file, flags) 789 } 790 791 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. 792 // 793 // This automatically passes the task as the context. 794 // 795 // Precondition: same as FDTable. 796 func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { 797 return t.fdTable.NewFDAt(t, fd, file, flags) 798 } 799 800 // NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2. 801 // 802 // This automatically passes the task as the context. 803 // 804 // Precondition: same as FDTable. 805 func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error { 806 return t.fdTable.NewFDAtVFS2(t, fd, file, flags) 807 } 808 809 // WithMuLocked executes f with t.mu locked. 810 func (t *Task) WithMuLocked(f func(*Task)) { 811 t.mu.Lock() 812 f(t) 813 t.mu.Unlock() 814 } 815 816 // MountNamespace returns t's MountNamespace. MountNamespace does not take an 817 // additional reference on the returned MountNamespace. 818 func (t *Task) MountNamespace() *fs.MountNamespace { 819 return t.tg.mounts 820 } 821 822 // MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the 823 // returned mount namespace. 824 func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { 825 t.mu.Lock() 826 defer t.mu.Unlock() 827 return t.mountNamespaceVFS2 828 } 829 830 // AbstractSockets returns t's AbstractSocketNamespace. 
831 func (t *Task) AbstractSockets() *AbstractSocketNamespace { 832 return t.abstractSockets 833 } 834 835 // ContainerID returns t's container ID. 836 func (t *Task) ContainerID() string { 837 return t.containerID 838 } 839 840 // OOMScoreAdj gets the task's thread group's OOM score adjustment. 841 func (t *Task) OOMScoreAdj() int32 { 842 return atomic.LoadInt32(&t.tg.oomScoreAdj) 843 } 844 845 // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The 846 // value should be between -1000 and 1000 inclusive. 847 func (t *Task) SetOOMScoreAdj(adj int32) error { 848 if adj > 1000 || adj < -1000 { 849 return linuxerr.EINVAL 850 } 851 atomic.StoreInt32(&t.tg.oomScoreAdj, adj) 852 return nil 853 } 854 855 // KUID returns t's kuid. 856 func (t *Task) KUID() uint32 { 857 return uint32(t.Credentials().EffectiveKUID) 858 } 859 860 // KGID returns t's kgid. 861 func (t *Task) KGID() uint32 { 862 return uint32(t.Credentials().EffectiveKGID) 863 } 864 865 // SetKcov sets the kcov instance associated with t. 866 func (t *Task) SetKcov(k *Kcov) { 867 t.kcov = k 868 } 869 870 // ResetKcov clears the kcov instance associated with t. 871 func (t *Task) ResetKcov() { 872 if t.kcov != nil { 873 t.kcov.OnTaskExit() 874 t.kcov = nil 875 } 876 }