github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 gocontext "context" 19 "runtime/trace" 20 "sync/atomic" 21 22 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 23 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 24 "github.com/nicocha30/gvisor-ligolo/pkg/bpf" 25 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 26 "github.com/nicocha30/gvisor-ligolo/pkg/hostarch" 27 "github.com/nicocha30/gvisor-ligolo/pkg/metric" 28 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet" 29 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 30 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/futex" 31 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/sched" 32 ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time" 33 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform" 34 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage" 35 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 36 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 37 "github.com/nicocha30/gvisor-ligolo/pkg/waiter" 38 ) 39 40 // Task represents a thread of execution in the untrusted app. It 41 // includes registers and any thread-specific state that you would 42 // normally expect. 43 // 44 // Each task is associated with a goroutine, called the task goroutine, that 45 // executes code (application code, system calls, etc.) on behalf of that task. 46 // See Task.run (task_run.go). 47 // 48 // All fields that are "owned by the task goroutine" can only be mutated by the 49 // task goroutine while it is running. The task goroutine does not require 50 // synchronization to read these fields, although it still requires 51 // synchronization as described for those fields to mutate them. 52 // 53 // All fields that are "exclusive to the task goroutine" can only be accessed 54 // by the task goroutine while it is running. The task goroutine does not 55 // require synchronization to read or write these fields. 56 // 57 // +stateify savable 58 type Task struct { 59 taskNode 60 61 // goid is the task goroutine's ID. goid is owned by the task goroutine, 62 // but since it's used to detect cases where non-task goroutines 63 // incorrectly access state owned by, or exclusive to, the task goroutine, 64 // goid is always accessed using atomic memory operations. 65 goid atomicbitops.Int64 `state:"nosave"` 66 67 // runState is what the task goroutine is executing if it is not stopped. 68 // If runState is nil, the task goroutine should exit or has exited. 69 // runState is exclusive to the task goroutine. 70 runState taskRunState 71 72 // taskWorkCount represents the current size of the task work queue. It is 73 // used to avoid acquiring taskWorkMu when the queue is empty. 74 taskWorkCount atomicbitops.Int32 75 76 // taskWorkMu protects taskWork. 77 taskWorkMu taskWorkMutex `state:"nosave"` 78 79 // taskWork is a queue of work to be executed before resuming user execution. 80 // It is similar to the task_work mechanism in Linux. 81 // 82 // taskWork is exclusive to the task goroutine. 83 taskWork []TaskWorker 84 85 // haveSyscallReturn is true if image.Arch().Return() represents a value 86 // returned by a syscall (or set by ptrace after a syscall). 87 // 88 // haveSyscallReturn is exclusive to the task goroutine. 89 haveSyscallReturn bool 90 91 // interruptChan is notified whenever the task goroutine is interrupted 92 // (usually by a pending signal). interruptChan is effectively a condition 93 // variable that can be used in select statements. 94 // 95 // interruptChan is not saved; because saving interrupts all tasks, 96 // interruptChan is always notified after restore (see Task.run). 97 interruptChan chan struct{} `state:"nosave"` 98 99 // gosched contains the current scheduling state of the task goroutine. 100 // 101 // gosched is protected by goschedSeq. gosched is owned by the task 102 // goroutine. 103 goschedSeq sync.SeqCount `state:"nosave"` 104 gosched TaskGoroutineSchedInfo 105 106 // yieldCount is the number of times the task goroutine has called 107 // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or 108 // Task.Yield(), voluntarily ceasing execution. 109 // 110 // yieldCount is accessed using atomic memory operations. yieldCount is 111 // owned by the task goroutine. 112 yieldCount atomicbitops.Uint64 113 114 // pendingSignals is the set of pending signals that may be handled only by 115 // this task. 116 // 117 // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu 118 // (hereafter "the signal mutex"); see comment on 119 // ThreadGroup.signalHandlers. 120 pendingSignals pendingSignals 121 122 // signalMask is the set of signals whose delivery is currently blocked. 123 // 124 // signalMask is accessed using atomic memory operations, and is protected 125 // by the signal mutex (such that reading signalMask is safe if either the 126 // signal mutex is locked or if atomic memory operations are used, while 127 // writing signalMask requires both). signalMask is owned by the task 128 // goroutine. 129 signalMask atomicbitops.Uint64 130 131 // If the task goroutine is currently executing Task.sigtimedwait, 132 // realSignalMask is the previous value of signalMask, which has temporarily 133 // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. 134 // 135 // realSignalMask is exclusive to the task goroutine. 136 realSignalMask linux.SignalSet 137 138 // If haveSavedSignalMask is true, savedSignalMask is the signal mask that 139 // should be applied after the task has either delivered one signal to a 140 // user handler or is about to resume execution in the untrusted 141 // application. 142 // 143 // Both haveSavedSignalMask and savedSignalMask are exclusive to the task 144 // goroutine. 145 haveSavedSignalMask bool 146 savedSignalMask linux.SignalSet 147 148 // signalStack is the alternate signal stack used by signal handlers for 149 // which the SA_ONSTACK flag is set. 150 // 151 // signalStack is exclusive to the task goroutine. 152 signalStack linux.SignalStack 153 154 // signalQueue is a set of registered waiters for signal-related events. 155 // 156 // signalQueue is protected by the signalMutex. Note that the task does 157 // not implement all queue methods, specifically the readiness checks. 158 // The task only broadcast a notification on signal delivery. 159 signalQueue waiter.Queue 160 161 // If groupStopPending is true, the task should participate in a group 162 // stop in the interrupt path. 163 // 164 // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. 165 // 166 // groupStopPending is protected by the signal mutex. 167 groupStopPending bool 168 169 // If groupStopAcknowledged is true, the task has already acknowledged that 170 // it is entering the most recent group stop that has been initiated on its 171 // thread group. 172 // 173 // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. 174 // 175 // groupStopAcknowledged is protected by the signal mutex. 176 groupStopAcknowledged bool 177 178 // If trapStopPending is true, the task goroutine should enter a 179 // PTRACE_INTERRUPT-induced stop from the interrupt path. 180 // 181 // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that 182 // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects 183 // JOBCTL_STOP_PENDING. 184 // 185 // trapStopPending is protected by the signal mutex. 186 trapStopPending bool 187 188 // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group 189 // stop has begun or ended since the last time the task entered a 190 // ptrace-stop from the group-stop path. 191 // 192 // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. 193 // 194 // trapNotifyPending is protected by the signal mutex. 195 trapNotifyPending bool 196 197 // If stop is not nil, it is the internally-initiated condition that 198 // currently prevents the task goroutine from running. 199 // 200 // stop is protected by the signal mutex. 201 stop TaskStop 202 203 // stopCount is the number of active external stops (calls to 204 // Task.BeginExternalStop that have not been paired with a call to 205 // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is 206 // non-zero if the task goroutine should stop. 207 // 208 // Mutating stopCount requires both locking the signal mutex and using 209 // atomic memory operations. Reading stopCount requires either locking the 210 // signal mutex or using atomic memory operations. This allows Task.doStop 211 // to require only a single atomic read in the common case where stopCount 212 // is 0. 213 // 214 // stopCount is not saved, because external stops cannot be retained across 215 // a save/restore cycle. (Suppose a sentryctl command issues an external 216 // stop; after a save/restore cycle, the restored sentry has no knowledge 217 // of the pre-save sentryctl command, and the stopped task would remain 218 // stopped forever.) 219 stopCount atomicbitops.Int32 `state:"nosave"` 220 221 // endStopCond is signaled when stopCount transitions to 0. The combination 222 // of stopCount and endStopCond effectively form a sync.WaitGroup, but 223 // WaitGroup provides no way to read its counter value. 224 // 225 // Invariant: endStopCond.L is the signal mutex. (This is not racy because 226 // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine 227 // calls sync.Cond.Wait; and only the task goroutine can change the 228 // identity of the signal mutex, in Task.finishExec.) 229 endStopCond sync.Cond `state:"nosave"` 230 231 // exitStatus is the task's exit status. 232 // 233 // exitStatus is protected by the signal mutex. 234 exitStatus linux.WaitStatus 235 236 // syscallRestartBlock represents a custom restart function to run in 237 // restart_syscall(2) to resume an interrupted syscall. 238 // 239 // syscallRestartBlock is exclusive to the task goroutine. 240 syscallRestartBlock SyscallRestartBlock 241 242 // p provides the mechanism by which the task runs code in userspace. The p 243 // interface object is immutable. 244 p platform.Context `state:"nosave"` 245 246 // k is the Kernel that this task belongs to. The k pointer is immutable. 247 k *Kernel 248 249 // containerID has no equivalent in Linux; it's used by runsc to track all 250 // tasks that belong to a given containers since cgroups aren't implemented. 251 // It's inherited by the children, is immutable, and may be empty. 252 // 253 // NOTE: cgroups can be used to track this when implemented. 254 containerID string 255 256 // mu protects some of the following fields. 257 mu taskMutex `state:"nosave"` 258 259 // image holds task data provided by the ELF loader. 260 // 261 // image is protected by mu, and is owned by the task goroutine. 262 image TaskImage 263 264 // fsContext is the task's filesystem context. 265 // 266 // fsContext is protected by mu, and is owned by the task goroutine. 267 fsContext *FSContext 268 269 // fdTable is the task's file descriptor table. 270 // 271 // fdTable is protected by mu, and is owned by the task goroutine. 272 fdTable *FDTable 273 274 // If vforkParent is not nil, it is the task that created this task with 275 // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when 276 // this TaskImage is released. 277 // 278 // vforkParent is protected by the TaskSet mutex. 279 vforkParent *Task 280 281 // exitState is the task's progress through the exit path. 282 // 283 // exitState is protected by the TaskSet mutex. exitState is owned by the 284 // task goroutine. 285 exitState TaskExitState 286 287 // exitTracerNotified is true if the exit path has either signaled the 288 // task's tracer to indicate the exit, or determined that no such signal is 289 // needed. exitTracerNotified can only be true if exitState is 290 // TaskExitZombie or TaskExitDead. 291 // 292 // exitTracerNotified is protected by the TaskSet mutex. 293 exitTracerNotified bool 294 295 // exitTracerAcked is true if exitTracerNotified is true and either the 296 // task's tracer has acknowledged the exit notification, or the exit path 297 // has determined that no such notification is needed. 298 // 299 // exitTracerAcked is protected by the TaskSet mutex. 300 exitTracerAcked bool 301 302 // exitParentNotified is true if the exit path has either signaled the 303 // task's parent to indicate the exit, or determined that no such signal is 304 // needed. exitParentNotified can only be true if exitState is 305 // TaskExitZombie or TaskExitDead. 306 // 307 // exitParentNotified is protected by the TaskSet mutex. 308 exitParentNotified bool 309 310 // exitParentAcked is true if exitParentNotified is true and either the 311 // task's parent has acknowledged the exit notification, or the exit path 312 // has determined that no such acknowledgment is needed. 313 // 314 // exitParentAcked is protected by the TaskSet mutex. 315 exitParentAcked bool 316 317 // goroutineStopped is a WaitGroup whose counter value is 1 when the task 318 // goroutine is running and 0 when the task goroutine is stopped or has 319 // exited. 320 goroutineStopped sync.WaitGroup `state:"nosave"` 321 322 // ptraceTracer is the task that is ptrace-attached to this one. If 323 // ptraceTracer is nil, this task is not being traced. Note that due to 324 // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil 325 // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). 326 // 327 // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic 328 // operations. This allows paths that wouldn't otherwise lock the TaskSet 329 // mutex, notably the syscall path, to check if ptraceTracer is nil without 330 // additional synchronization. 331 ptraceTracer atomic.Value `state:".(*Task)"` 332 333 // ptraceTracees is the set of tasks that this task is ptrace-attached to. 334 // 335 // ptraceTracees is protected by the TaskSet mutex. 336 ptraceTracees map[*Task]struct{} 337 338 // ptraceSeized is true if ptraceTracer attached to this task with 339 // PTRACE_SEIZE. 340 // 341 // ptraceSeized is protected by the TaskSet mutex. 342 ptraceSeized bool 343 344 // ptraceOpts contains ptrace options explicitly set by the tracer. If 345 // ptraceTracer is nil, ptraceOpts is expected to be the zero value. 346 // 347 // ptraceOpts is protected by the TaskSet mutex. 348 ptraceOpts ptraceOptions 349 350 // ptraceSyscallMode controls ptrace behavior around syscall entry and 351 // exit. 352 // 353 // ptraceSyscallMode is protected by the TaskSet mutex. 354 ptraceSyscallMode ptraceSyscallMode 355 356 // If ptraceSinglestep is true, the next time the task executes application 357 // code, single-stepping should be enabled. ptraceSinglestep is stored 358 // independently of the architecture-specific trap flag because tracer 359 // detaching (which can happen concurrently with the tracee's execution if 360 // the tracer exits) must disable single-stepping, and the task's 361 // architectural state is implicitly exclusive to the task goroutine (no 362 // synchronization occurs before passing registers to SwitchToApp). 363 // 364 // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. 365 // 366 // ptraceSinglestep is protected by the TaskSet mutex. 367 ptraceSinglestep bool 368 369 // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the 370 // time that t entered the ptrace stop, reset to 0 when the tracer 371 // acknowledges the stop with a wait*() syscall. Otherwise, it is the 372 // signal number passed to the ptrace operation that ended the last ptrace 373 // stop on this task. In the latter case, the effect of ptraceCode depends 374 // on the nature of the ptrace stop; signal-delivery-stop uses it to 375 // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the 376 // signal to the task after leaving the stop, and PTRACE_EVENT stops and 377 // traced group stops ignore it entirely. 378 // 379 // Linux contextually stores the equivalent of ptraceCode in 380 // task_struct::exit_code. 381 // 382 // ptraceCode is protected by the TaskSet mutex. 383 ptraceCode int32 384 385 // ptraceSiginfo is the value returned to the tracer by 386 // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). 387 // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) 388 // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is 389 // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which 390 // is in turn required to distinguish group stops from other ptrace stops, 391 // per subsection "Group-stop" in ptrace(2)). 392 // 393 // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. 394 // 395 // ptraceSiginfo is protected by the TaskSet mutex. 396 ptraceSiginfo *linux.SignalInfo 397 398 // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to 399 // the tracer by ptrace(PTRACE_GETEVENTMSG). 400 // 401 // ptraceEventMsg is protected by the TaskSet mutex. 402 ptraceEventMsg uint64 403 404 // ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has 405 // been added before. This is used during task exit to decide whether we need 406 // to clean up YAMA exceptions. 407 // 408 // ptraceYAMAExceptionAdded is protected by the TaskSet mutex. 409 ptraceYAMAExceptionAdded bool 410 411 // The struct that holds the IO-related usage. The ioUsage pointer is 412 // immutable. 413 ioUsage *usage.IO 414 415 // logPrefix is a string containing the task's thread ID in the root PID 416 // namespace, and is prepended to log messages emitted by Task.Infof etc. 417 logPrefix atomic.Value `state:"nosave"` 418 419 // traceContext and traceTask are both used for tracing, and are 420 // updated along with the logPrefix in updateInfoLocked. 421 // 422 // These are exclusive to the task goroutine. 423 traceContext gocontext.Context `state:"nosave"` 424 traceTask *trace.Task `state:"nosave"` 425 426 // creds is the task's credentials. 427 // 428 // creds.Load() may be called without synchronization. creds.Store() is 429 // serialized by mu. creds is owned by the task goroutine. All 430 // auth.Credentials objects that creds may point to, or have pointed to 431 // in the past, must be treated as immutable. 432 creds auth.AtomicPtrCredentials 433 434 // utsns is the task's UTS namespace. 435 // 436 // utsns is protected by mu. utsns is owned by the task goroutine. 437 utsns *UTSNamespace 438 439 // ipcns is the task's IPC namespace. 440 // 441 // ipcns is protected by mu. ipcns is owned by the task goroutine. 442 ipcns *IPCNamespace 443 444 // abstractSockets tracks abstract sockets that are in use. 445 // 446 // abstractSockets is protected by mu. 447 abstractSockets *AbstractSocketNamespace 448 449 // mountNamespace is the task's mount namespace. 450 // 451 // It is protected by mu. It is owned by the task goroutine. 452 mountNamespace *vfs.MountNamespace 453 454 // parentDeathSignal is sent to this task's thread group when its parent exits. 455 // 456 // parentDeathSignal is protected by mu. 457 parentDeathSignal linux.Signal 458 459 // syscallFilters is all seccomp-bpf syscall filters applicable to the 460 // task, in the order in which they were installed. The type of the atomic 461 // is []bpf.Program. Writing needs to be protected by the signal mutex. 462 // 463 // syscallFilters is owned by the task goroutine. 464 syscallFilters atomic.Value `state:".([]bpf.Program)"` 465 466 // If cleartid is non-zero, treat it as a pointer to a ThreadID in the 467 // task's virtual address space; when the task exits, set the pointed-to 468 // ThreadID to 0, and wake any futex waiters. 469 // 470 // cleartid is exclusive to the task goroutine. 471 cleartid hostarch.Addr 472 473 // This is mostly a fake cpumask just for sched_set/getaffinity as we 474 // don't really control the affinity. 475 // 476 // Invariant: allowedCPUMask.Size() == 477 // sched.CPUMaskSize(Kernel.applicationCores). 478 // 479 // allowedCPUMask is protected by mu. 480 allowedCPUMask sched.CPUSet 481 482 // cpu is the fake cpu number returned by getcpu(2). cpu is ignored 483 // entirely if Kernel.useHostCores is true. 484 cpu atomicbitops.Int32 485 486 // This is used to keep track of changes made to a process' priority/niceness. 487 // It is mostly used to provide some reasonable return value from 488 // getpriority(2) after a call to setpriority(2) has been made. 489 // We currently do not actually modify a process' scheduling priority. 490 // NOTE: This represents the userspace view of priority (nice). 491 // This means that the value should be in the range [-20, 19]. 492 // 493 // niceness is protected by mu. 494 niceness int 495 496 // This is used to track the numa policy for the current thread. This can be 497 // modified through a set_mempolicy(2) syscall. Since we always report a 498 // single numa node, all policies are no-ops. We only track this information 499 // so that we can return reasonable values if the application calls 500 // get_mempolicy(2) after setting a non-default policy. Note that in the 501 // real syscall, nodemask can be longer than a single unsigned long, but we 502 // always report a single node so never need to save more than a single 503 // bit. 504 // 505 // numaPolicy and numaNodeMask are protected by mu. 506 numaPolicy linux.NumaPolicy 507 numaNodeMask uint64 508 509 // netns is the task's network namespace. It has to be changed under mu 510 // so that GetNetworkNamespace can take a reference before it is 511 // released. 512 netns inet.NamespaceAtomicPtr 513 514 // If rseqPreempted is true, before the next call to p.Switch(), 515 // interrupt rseq critical regions as defined by rseqAddr and 516 // tg.oldRSeqCritical and write the task goroutine's CPU number to 517 // rseqAddr/oldRSeqCPUAddr. 518 // 519 // We support two ABIs for restartable sequences: 520 // 521 // 1. The upstream interface added in v4.18, 522 // 2. An "old" interface never merged upstream. In the implementation, 523 // this is referred to as "old rseq". 524 // 525 // rseqPreempted is exclusive to the task goroutine. 526 rseqPreempted bool `state:"nosave"` 527 528 // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. 529 // 530 // If rseq is unused, rseqCPU is -1 for convenient use in 531 // platform.Context.Switch. 532 // 533 // rseqCPU is exclusive to the task goroutine. 534 rseqCPU int32 535 536 // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. 537 // 538 // oldRSeqCPUAddr is exclusive to the task goroutine. 539 oldRSeqCPUAddr hostarch.Addr 540 541 // rseqAddr is a pointer to the userspace linux.RSeq structure. 542 // 543 // rseqAddr is exclusive to the task goroutine. 544 rseqAddr hostarch.Addr 545 546 // rseqSignature is the signature that the rseq abort IP must be signed 547 // with. 548 // 549 // rseqSignature is exclusive to the task goroutine. 550 rseqSignature uint32 551 552 // copyScratchBuffer is a buffer available to CopyIn/CopyOut 553 // implementations that require an intermediate buffer to copy data 554 // into/out of. It prevents these buffers from being allocated/zeroed in 555 // each syscall and eventually garbage collected. 556 // 557 // copyScratchBuffer is exclusive to the task goroutine. 558 copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` 559 560 // blockingTimer is used for blocking timeouts. blockingTimerChan is the 561 // channel that is sent to when blockingTimer fires. 562 // 563 // blockingTimer is exclusive to the task goroutine. 564 blockingTimer *ktime.Timer `state:"nosave"` 565 blockingTimerChan <-chan struct{} `state:"nosave"` 566 567 // futexWaiter is used for futex(FUTEX_WAIT) syscalls. 568 // 569 // futexWaiter is exclusive to the task goroutine. 570 futexWaiter *futex.Waiter `state:"nosave"` 571 572 // robustList is a pointer to the head of the tasks's robust futex 573 // list. 574 robustList hostarch.Addr 575 576 // startTime is the real time at which the task started. It is set when 577 // a Task is created or invokes execve(2). 578 // 579 // startTime is protected by mu. 580 startTime ktime.Time 581 582 // kcov is the kcov instance providing code coverage owned by this task. 583 // 584 // kcov is exclusive to the task goroutine. 585 kcov *Kcov 586 587 // cgroups is the set of cgroups this task belongs to. This may be empty if 588 // no cgroup controllers are enabled. Protected by mu. 589 // 590 // +checklocks:mu 591 cgroups map[Cgroup]struct{} 592 593 // memCgID is the memory cgroup id. 594 memCgID atomicbitops.Uint32 595 596 // userCounters is a pointer to a set of user counters. 597 // 598 // The userCounters pointer is exclusive to the task goroutine, but the 599 // userCounters instance must be atomically accessed. 600 userCounters *userCounters 601 } 602 603 // Task related metrics 604 var ( 605 // syscallCounter is a metric that tracks how many syscalls the sentry has 606 // executed. 607 syscallCounter = metric.MustCreateNewProfilingUint64Metric( 608 "/task/syscalls", false, "The number of syscalls the sentry has executed for the user.") 609 610 // faultCounter is a metric that tracks how many faults the sentry has had to 611 // handle. 612 faultCounter = metric.MustCreateNewProfilingUint64Metric( 613 "/task/faults", false, "The number of faults the sentry has handled.") 614 ) 615 616 func (t *Task) savePtraceTracer() *Task { 617 return t.ptraceTracer.Load().(*Task) 618 } 619 620 func (t *Task) loadPtraceTracer(tracer *Task) { 621 t.ptraceTracer.Store(tracer) 622 } 623 624 func (t *Task) saveSyscallFilters() []bpf.Program { 625 if f := t.syscallFilters.Load(); f != nil { 626 return f.([]bpf.Program) 627 } 628 return nil 629 } 630 631 func (t *Task) loadSyscallFilters(filters []bpf.Program) { 632 t.syscallFilters.Store(filters) 633 } 634 635 // afterLoad is invoked by stateify. 636 func (t *Task) afterLoad() { 637 t.updateInfoLocked() 638 t.interruptChan = make(chan struct{}, 1) 639 t.gosched.State = TaskGoroutineNonexistent 640 if t.stop != nil { 641 t.stopCount = atomicbitops.FromInt32(1) 642 } 643 t.endStopCond.L = &t.tg.signalHandlers.mu 644 t.rseqPreempted = true 645 t.futexWaiter = futex.NewWaiter() 646 t.p = t.k.Platform.NewContext(t.AsyncContext()) 647 } 648 649 // copyScratchBufferLen is the length of Task.copyScratchBuffer. 650 const copyScratchBufferLen = 144 // sizeof(struct stat) 651 652 // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut 653 // functions. It must only be used within those functions and can only be used 654 // by the task goroutine; it exists to improve performance and thus 655 // intentionally lacks any synchronization. 656 // 657 // Callers should pass a constant value as an argument if possible, which will 658 // allow the compiler to inline and optimize out the if statement below. 659 func (t *Task) CopyScratchBuffer(size int) []byte { 660 if size > copyScratchBufferLen { 661 return make([]byte, size) 662 } 663 return t.copyScratchBuffer[:size] 664 } 665 666 // FutexWaiter returns the Task's futex.Waiter. 667 func (t *Task) FutexWaiter() *futex.Waiter { 668 return t.futexWaiter 669 } 670 671 // Kernel returns the Kernel containing t. 672 func (t *Task) Kernel() *Kernel { 673 return t.k 674 } 675 676 // SetClearTID sets t's cleartid. 677 // 678 // Preconditions: The caller must be running on the task goroutine. 679 func (t *Task) SetClearTID(addr hostarch.Addr) { 680 t.cleartid = addr 681 } 682 683 // SetSyscallRestartBlock sets the restart block for use in 684 // restart_syscall(2). After registering a restart block, a syscall should 685 // return ERESTART_RESTARTBLOCK to request a restart using the block. 686 // 687 // Precondition: The caller must be running on the task goroutine. 688 func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { 689 t.syscallRestartBlock = r 690 } 691 692 // SyscallRestartBlock returns the currently registered restart block for use in 693 // restart_syscall(2). This function is *not* idempotent and may be called once 694 // per syscall. This function must not be called if a restart block has not been 695 // registered for the current syscall. 696 // 697 // Precondition: The caller must be running on the task goroutine. 698 func (t *Task) SyscallRestartBlock() SyscallRestartBlock { 699 r := t.syscallRestartBlock 700 // Explicitly set the restart block to nil so that a future syscall can't 701 // accidentally reuse it. 702 t.syscallRestartBlock = nil 703 return r 704 } 705 706 // IsChrooted returns true if the root directory of t's FSContext is not the 707 // root directory of t's MountNamespace. 708 // 709 // Preconditions: The caller must be running on the task goroutine, or t.mu 710 // must be locked. 711 func (t *Task) IsChrooted() bool { 712 realRoot := t.mountNamespace.Root() 713 root := t.fsContext.RootDirectory() 714 defer root.DecRef(t) 715 return root != realRoot 716 } 717 718 // TaskImage returns t's TaskImage. 719 // 720 // Precondition: The caller must be running on the task goroutine, or t.mu must 721 // be locked. 722 func (t *Task) TaskImage() *TaskImage { 723 return &t.image 724 } 725 726 // FSContext returns t's FSContext. FSContext does not take an additional 727 // reference on the returned FSContext. 728 // 729 // Precondition: The caller must be running on the task goroutine, or t.mu must 730 // be locked. 731 func (t *Task) FSContext() *FSContext { 732 return t.fsContext 733 } 734 735 // FDTable returns t's FDTable. FDMTable does not take an additional reference 736 // on the returned FDMap. 737 // 738 // Precondition: The caller must be running on the task goroutine, or t.mu must 739 // be locked. 740 func (t *Task) FDTable() *FDTable { 741 return t.fdTable 742 } 743 744 // GetFile is a convenience wrapper for t.FDTable().Get. 745 // 746 // Precondition: same as FDTable.Get. 747 func (t *Task) GetFile(fd int32) *vfs.FileDescription { 748 f, _ := t.fdTable.Get(fd) 749 return f 750 } 751 752 // NewFDs is a convenience wrapper for t.FDTable().NewFDs. 753 // 754 // This automatically passes the task as the context. 755 // 756 // Precondition: same as FDTable. 757 func (t *Task) NewFDs(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { 758 return t.fdTable.NewFDs(t, fd, files, flags) 759 } 760 761 // NewFDFrom is a convenience wrapper for t.FDTable().NewFD. 762 // 763 // This automatically passes the task as the context. 764 // 765 // Precondition: same as FDTable.Get. 766 func (t *Task) NewFDFrom(minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { 767 return t.fdTable.NewFD(t, minFD, file, flags) 768 } 769 770 // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. 771 // 772 // This automatically passes the task as the context. 773 // 774 // Precondition: same as FDTable. 775 func (t *Task) NewFDAt(fd int32, file *vfs.FileDescription, flags FDFlags) error { 776 return t.fdTable.NewFDAt(t, fd, file, flags) 777 } 778 779 // WithMuLocked executes f with t.mu locked. 780 func (t *Task) WithMuLocked(f func(*Task)) { 781 t.mu.Lock() 782 f(t) 783 t.mu.Unlock() 784 } 785 786 // MountNamespace returns t's MountNamespace. A reference is taken on the 787 // returned mount namespace. 788 func (t *Task) MountNamespace() *vfs.MountNamespace { 789 t.mu.Lock() 790 defer t.mu.Unlock() 791 return t.mountNamespace 792 } 793 794 // AbstractSockets returns t's AbstractSocketNamespace. 795 func (t *Task) AbstractSockets() *AbstractSocketNamespace { 796 return t.abstractSockets 797 } 798 799 // ContainerID returns t's container ID. 800 func (t *Task) ContainerID() string { 801 return t.containerID 802 } 803 804 // OOMScoreAdj gets the task's thread group's OOM score adjustment. 805 func (t *Task) OOMScoreAdj() int32 { 806 return t.tg.oomScoreAdj.Load() 807 } 808 809 // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The 810 // value should be between -1000 and 1000 inclusive. 811 func (t *Task) SetOOMScoreAdj(adj int32) error { 812 if adj > 1000 || adj < -1000 { 813 return linuxerr.EINVAL 814 } 815 t.tg.oomScoreAdj.Store(adj) 816 return nil 817 } 818 819 // KUID returns t's kuid. 820 func (t *Task) KUID() uint32 { 821 return uint32(t.Credentials().EffectiveKUID) 822 } 823 824 // KGID returns t's kgid. 825 func (t *Task) KGID() uint32 { 826 return uint32(t.Credentials().EffectiveKGID) 827 } 828 829 // SetKcov sets the kcov instance associated with t. 830 func (t *Task) SetKcov(k *Kcov) { 831 t.kcov = k 832 } 833 834 // ResetKcov clears the kcov instance associated with t. 835 func (t *Task) ResetKcov() { 836 if t.kcov != nil { 837 t.kcov.OnTaskExit() 838 t.kcov = nil 839 } 840 }