github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/task_run.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"runtime"
	"runtime/trace"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/goid"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/hostcpu"
	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/platform"
)

// A taskRunState is a reified state in the task state machine. See README.md
// for details. The canonical list of all run states, as well as transitions
// between them, is given in run_states.dot.
//
// The set of possible states is enumerable and completely defined by the
// kernel package, so taskRunState would ideally be represented by a
// discriminated union. However, Go does not support sum types.
//
// Hence, as with TaskStop, data-free taskRunStates should be represented as
// typecast nils to avoid unnecessary allocation.
type taskRunState interface {
	// execute executes the code associated with this state over the given task
	// and returns the following state. If execute returns nil, the task
	// goroutine should exit.
	//
	// It is valid to tail-call a following state's execute to avoid the
	// overhead of converting the following state to an interface object and
	// checking for stops, provided that the tail-call cannot recurse.
	execute(*Task) taskRunState
}
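// Illustrative aside, not part of the original file: a minimal sketch of the
// two conventions described above, using hypothetical states. Because the
// state carries no data, a typecast nil serves as its instance, and calling
// execute on that nil receiver is safe since the receiver is never
// dereferenced:
//
//	type runExample struct{}
//
//	func (*runExample) execute(t *Task) taskRunState {
//		// ... do one step of work on t ...
//
//		// Normal transition: Task.run checks for stops before executing
//		// the returned state.
//		return (*runExampleNext)(nil)
//
//		// Tail-call transition (valid only if it cannot recurse):
//		// return (*runExampleNext)(nil).execute(t)
//	}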
// run runs the task goroutine.
//
// threadID is a dummy value set to the task's TID in the root PID namespace
// to make it visible in stack dumps. The goroutine for a given task can be
// identified by searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
	t.goid.Store(goid.Get())

	refs.CleanupSync.Add(1)
	defer refs.CleanupSync.Done()

	// Construct t.blockingTimer here. We do this here because we can't
	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
	// kernel.timekeeper.SetClocks() hasn't been called yet.
	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = blockingTimerChan

	// Activate our address space.
	t.Activate()
	// The corresponding t.Deactivate occurs in the exit path
	// (runExitMain.execute) so that when
	// Platform.CooperativelySharesAddressSpace() == true, we give up the
	// AddressSpace before the task goroutine finishes executing.

	// If this is a newly-started task, it should check for participation in
	// group stops. If this is a task resuming after restore, it was
	// interrupted by saving. In either case, the task is initially
	// interrupted.
	t.interruptSelf()

	for {
		// Explanation for this ordering:
		//
		// - A freshly-started task that is stopped should not do anything
		//   before it enters the stop.
		//
		// - If taskRunState.execute returns nil, the task goroutine should
		//   exit without checking for a stop.
		//
		// - Task.Start won't start Task.run if t.runState is nil, so this
		//   ordering is safe.
		t.doStop()
		t.runState = t.runState.execute(t)
		if t.runState == nil {
			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
			t.goroutineStopped.Done()
			t.tg.liveGoroutines.Done()
			t.tg.pidns.owner.liveGoroutines.Done()
			t.tg.pidns.owner.runningGoroutines.Done()
			t.p.Release()

			// Deferring this store triggers a false positive in the race
			// detector (https://github.com/golang/go/issues/42599).
			t.goid.Store(0)
			// Keep the argument alive because stack traces for dead
			// variables may not be correct.
			runtime.KeepAlive(threadID)
			return
		}
	}
}

// doStop is called by Task.run to block until the task is not stopped.
func (t *Task) doStop() {
	if t.stopCount.Load() == 0 {
		return
	}
	t.Deactivate()
	// NOTE(b/30316266): t.Activate() must be called without any locks held, so
	// this defer must precede the defer for unlocking the signal mutex.
	defer t.Activate()
	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.runningGoroutines.Add(-1)
	defer t.tg.pidns.owner.runningGoroutines.Add(1)
	t.goroutineStopped.Add(-1)
	defer t.goroutineStopped.Add(1)
	for t.stopCount.RacyLoad() > 0 {
		t.endStopCond.Wait()
	}
}
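// Illustrative aside, not part of the original file: the code that ends a
// stop lives elsewhere in this package. A minimal standalone sketch of the
// counter-plus-condvar idiom that doStop waits on, with hypothetical names:
//
//	var (
//		mu        sync.Mutex // analogous to signalHandlers.mu
//		stopCount atomic.Int64
//		endStop   = sync.NewCond(&mu)
//	)
//
//	// endStopLocked is called with mu held, once per ended stop.
//	func endStopLocked() {
//		if stopCount.Add(-1) == 0 {
//			endStop.Broadcast() // wakes goroutines blocked in the Wait loop
//		}
//	}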
// The runApp state checks for interrupts before executing untrusted
// application code.
//
// +stateify savable
type runApp struct{}

func (app *runApp) execute(t *Task) taskRunState {
	if t.interrupted() {
		// Checkpointing instructs tasks to stop by sending an interrupt, so we
		// must check for stops before entering runInterrupt (instead of
		// tail-calling it).
		return (*runInterrupt)(nil)
	}

	// Execute any task work callbacks before returning to user space.
	if t.taskWorkCount.Load() > 0 {
		t.taskWorkMu.Lock()
		queue := t.taskWork
		t.taskWork = nil
		t.taskWorkCount.Store(0)
		t.taskWorkMu.Unlock()

		// Do not hold taskWorkMu while executing task work, which may register
		// more work.
		for _, work := range queue {
			work.TaskWork(t)
		}
	}
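
	// Illustrative aside, not part of the original file: work reaches this
	// queue via Task.RegisterWork (see task_work.go). A hypothetical worker:
	//
	//	type logWorker struct{}
	//
	//	func (logWorker) TaskWork(t *Task) {
	//		t.Debugf("about to return to user space")
	//	}
	//
	//	// May be queued from any goroutine; it runs in the drain loop above
	//	// before the next switch to the application:
	//	t.RegisterWork(logWorker{})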

	// We're about to switch to the application again. If there's still an
	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
	// restart the syscall that was interrupted. If there's a saved signal
	// mask, restore it. (Note that restoring the saved signal mask may unblock
	// a pending signal, causing another interruption, but that signal should
	// not interact with the interrupted syscall.)
	if t.haveSyscallReturn {
		if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
			t.Warningf("Unable to pull a full state: %v", err)
			t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
			return (*runExit)(nil)
		}

		if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok {
			if sre == linuxerr.ERESTART_RESTARTBLOCK {
				t.Debugf("Restarting syscall %d with restart block: not interrupted by handled signal", t.Arch().SyscallNo())
				t.Arch().RestartSyscallWithRestartBlock()
			} else {
				t.Debugf("Restarting syscall %d: not interrupted by handled signal", t.Arch().SyscallNo())
				t.Arch().RestartSyscall()
			}
		}
		t.haveSyscallReturn = false
	}
	if t.haveSavedSignalMask {
		t.SetSignalMask(t.savedSignalMask)
		t.haveSavedSignalMask = false
		if t.interrupted() {
			return (*runInterrupt)(nil)
		}
	}

	// Apply restartable sequences.
	if t.rseqPreempted {
		t.rseqPreempted = false
		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
			t.rseqCPU = int32(hostcpu.GetCPU())
			if err := t.rseqCopyOutCPU(); err != nil {
				t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
				t.forceSignal(linux.SIGSEGV, false)
				t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
				// Re-enter the task run loop for signal delivery.
				return (*runApp)(nil)
			}
			if err := t.oldRSeqCopyOutCPU(); err != nil {
				t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
				t.forceSignal(linux.SIGSEGV, false)
				t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
				// Re-enter the task run loop for signal delivery.
				return (*runApp)(nil)
			}
		}
		t.rseqInterrupt()
	}

	// Check if we need to enable single-stepping. Tracers expect that the
	// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
	// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
	// includes our ptrace platform, by the way), so we should only clear the
	// single-step flag if we're responsible for setting it. (clearSinglestep
	// is therefore analogous to Linux's TIF_FORCED_TF.)
	//
	// Strictly speaking, we should also not clear the single-step flag if we
	// single-step through an instruction that sets the single-step flag
	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
	// own TF. (Famous last words, I know.)
	clearSinglestep := false
	if t.hasTracer() {
		t.tg.pidns.owner.mu.RLock()
		if t.ptraceSinglestep {
			clearSinglestep = !t.Arch().SingleStep()
			t.Arch().SetSingleStep()
		}
		t.tg.pidns.owner.mu.RUnlock()
	}

	region := trace.StartRegion(t.traceContext, runRegion)
	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
	info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
	region.End()

	if clearSinglestep {
		t.Arch().ClearSingleStep()
	}
	if t.hasTracer() {
		if e := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); e != nil {
			t.Warningf("Unable to pull a full state: %v", e)
			err = e
		}
	}

	switch err {
	case nil:
		// Handle application system call.
		return t.doSyscall()

	case platform.ErrContextInterrupt:
		// Interrupted by platform.Context.Interrupt(). Re-enter the run
		// loop to figure out why.
		return (*runApp)(nil)

	case platform.ErrContextSignal:
		// Looks like a signal has been delivered to us. If it's a synchronous
		// signal (SEGV, SIGBUS, etc.), it should be sent to the application
		// thread that received it.
		sig := linux.Signal(info.Signo)

		// Was it a fault that we should handle internally? If so, this wasn't
		// an application-generated signal and we should continue execution
		// normally.
		if at.Any() {
			faultCounter.Increment()

			region := trace.StartRegion(t.traceContext, faultRegion)
			addr := hostarch.Addr(info.Addr())
			err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack()))
			region.End()
			if err == nil {
				// The fault was handled appropriately.
				// We can resume running the application.
				return (*runApp)(nil)
			}

			// Is this a vsyscall that we need to emulate?
			//
			// Note that we don't track vsyscalls as part of a
			// specific trace region. This is because regions don't
			// stack, and the actual system call will count as a
			// region. We should be able to easily identify
			// vsyscalls by having a <fault><syscall> pair.
			if at.Execute {
				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
					return t.doVsyscall(addr, sysno)
				}
			}

			// Faults are common; log only at debug level.
			t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v sig=%v err=%v", addr, t.Arch().IP(), at, sig, err)
			t.DebugDumpState()

			// Continue to signal handling.
			//
			// If the fault was a bus error, convert the SIGSEGV to a SIGBUS.
			// All other info bits stay the same (address, etc.).
			if _, ok := err.(*memmap.BusError); ok {
				sig = linux.SIGBUS
				info.Signo = int32(linux.SIGBUS)
			}
		}

		switch sig {
		case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
			// Synchronous signal. Send it to ourselves. Assume the signal is
			// legitimate and force it (work around the signal being ignored or
			// blocked) like Linux does. Conveniently, this is even the correct
			// behavior for SIGTRAP from single-stepping.
			t.forceSignal(linux.Signal(sig), false /* unconditional */)
			t.SendSignal(info)

		case platform.SignalInterrupt:
			// Assume that a call to platform.Context.Interrupt() misfired.

		case linux.SIGPROF:
			// It's a profiling interrupt: there's not much we can do. We've
			// already paid a decent cost by intercepting the signal; at this
			// point we simply ignore it.

		default:
			// Asynchronous signal. Let the system deal with it.
			t.k.sendExternalSignal(info, "application")
		}

		return (*runApp)(nil)

	case platform.ErrContextCPUPreempted:
		// Ensure that rseq critical sections are interrupted and per-thread
		// CPU values are updated before the next platform.Context.Switch().
		t.rseqPreempted = true
		return (*runApp)(nil)

	default:
		// What happened? Can't continue.
		t.Warningf("Unexpected SwitchToApp error: %v", err)
		t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
		return (*runExit)(nil)
	}
}

// assertTaskGoroutine panics if the caller is not running on t's task
// goroutine.
func (t *Task) assertTaskGoroutine() {
	if got, want := goid.Get(), t.goid.Load(); got != want {
		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
	}
}

// GoroutineID returns the ID of t's task goroutine.
func (t *Task) GoroutineID() int64 {
	return t.goid.Load()
}

// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
	t.goroutineStopped.Wait()
}

// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
func (tg *ThreadGroup) WaitExited() {
	tg.liveGoroutines.Wait()
}
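// Illustrative aside, not part of the original file: a hypothetical external
// caller, assuming the usual Kernel.CreateProcess signature:
//
//	tg, _, err := k.CreateProcess(args)
//	if err != nil {
//		return err
//	}
//	// ... start the thread group running ...
//	tg.WaitExited() // returns once every task goroutine in tg has exited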
// Yield yields the processor for the calling task.
func (t *Task) Yield() {
	t.yieldCount.Add(1)
	runtime.Gosched()
}
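// Illustrative aside, not part of the original file: a hypothetical caller in
// the style of a sched_yield(2) implementation:
//
//	func schedYield(t *kernel.Task) error {
//		t.Yield() // a scheduling hint; Gosched yields without blocking
//		return nil
//	}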