github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_run.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"runtime"
	"runtime/trace"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/goid"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/refs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/hostcpu"
	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
)

// A taskRunState is a reified state in the task state machine. See README.md
// for details. The canonical list of all run states, as well as transitions
// between them, is given in run_states.dot.
//
// The set of possible states is enumerable and completely defined by the
// kernel package, so taskRunState would ideally be represented by a
// discriminated union. However, Go does not support sum types.
//
// Hence, as with TaskStop, data-free taskRunStates should be represented as
// typecast nils to avoid unnecessary allocation.
type taskRunState interface {
	// execute executes the code associated with this state over the given task
	// and returns the following state. If execute returns nil, the task
	// goroutine should exit.
	//
	// It is valid to tail-call a following state's execute to avoid the
	// overhead of converting the following state to an interface object and
	// checking for stops, provided that the tail-call cannot recurse.
	execute(*Task) taskRunState
}
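
// As an illustration of the conventions described above (a minimal sketch;
// runExampleState is hypothetical and not a state defined in this package):
// a data-free run state is a zero-sized struct, it is passed around as a
// typed nil so that no allocation is needed, and a following state's execute
// may be tail-called when the call cannot recurse.
//
//	// +stateify savable
//	type runExampleState struct{} // hypothetical, for illustration only
//
//	func (*runExampleState) execute(t *Task) taskRunState {
//		if t.interrupted() {
//			// Return the next state as a typed nil; Task.run checks for
//			// stops before executing it.
//			return (*runInterrupt)(nil)
//		}
//		// Non-recursive tail-call: skips the interface conversion and the
//		// stop check performed by Task.run.
//		return (*runApp)(nil).execute(t)
//	}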

// run runs the task goroutine.
//
// threadID is a dummy value set to the task's TID in the root PID namespace,
// making it visible in stack dumps. The goroutine for a given task can be
// identified by searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
	t.goid.Store(goid.Get())

	refs.CleanupSync.Add(1)
	defer refs.CleanupSync.Done()

	// Construct t.blockingTimer here. We do this here because we can't
	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
	// kernel.timekeeper.SetClocks() hasn't been called yet.
	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = blockingTimerChan

	// Activate our address space.
	t.Activate()
	// The corresponding t.Deactivate occurs in the exit path
	// (runExitMain.execute) so that when
	// Platform.CooperativelySharesAddressSpace() == true, we give up the
	// AddressSpace before the task goroutine finishes executing.

	// If this is a newly-started task, it should check for participation in
	// group stops. If this is a task resuming after restore, it was
	// interrupted by saving. In either case, the task is initially
	// interrupted.
	t.interruptSelf()

	for {
		// Explanation for this ordering:
		//
		// - A freshly-started task that is stopped should not do anything
		// before it enters the stop.
		//
		// - If taskRunState.execute returns nil, the task goroutine should
		// exit without checking for a stop.
		//
		// - Task.Start won't start Task.run if t.runState is nil, so this
		// ordering is safe.
		t.doStop()
		t.runState = t.runState.execute(t)
		if t.runState == nil {
			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
			t.goroutineStopped.Done()
			t.tg.liveGoroutines.Done()
			t.tg.pidns.owner.liveGoroutines.Done()
			t.tg.pidns.owner.runningGoroutines.Done()
			t.p.Release()

			// Deferring this store triggers a false positive in the race
			// detector (https://github.com/golang/go/issues/42599).
			t.goid.Store(0)
			// Keep the argument alive because stack traces for dead variables
			// may not be correct.
			runtime.KeepAlive(threadID)
			return
		}
	}
}

// doStop is called by Task.run to block until the task is not stopped.
func (t *Task) doStop() {
	if t.stopCount.Load() == 0 {
		return
	}
	t.Deactivate()
	// NOTE(b/30316266): t.Activate() must be called without any locks held, so
	// this defer must precede the defer for unlocking the signal mutex.
	defer t.Activate()
	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.runningGoroutines.Add(-1)
	defer t.tg.pidns.owner.runningGoroutines.Add(1)
	t.goroutineStopped.Add(-1)
	defer t.goroutineStopped.Add(1)
	for t.stopCount.RacyLoad() > 0 {
		t.endStopCond.Wait()
	}
}

// The runApp state checks for interrupts before executing untrusted
// application code.
//
// +stateify savable
type runApp struct{}

func (app *runApp) execute(t *Task) taskRunState {
	if t.interrupted() {
		// Checkpointing instructs tasks to stop by sending an interrupt, so we
		// must check for stops before entering runInterrupt (instead of
		// tail-calling it).
		return (*runInterrupt)(nil)
	}

	// Execute any task work callbacks before returning to user space.
	if t.taskWorkCount.Load() > 0 {
		t.taskWorkMu.Lock()
		queue := t.taskWork
		t.taskWork = nil
		t.taskWorkCount.Store(0)
		t.taskWorkMu.Unlock()

		// Do not hold taskWorkMu while executing task work, which may register
		// more work.
		for _, work := range queue {
			work.TaskWork(t)
		}
	}

	// We're about to switch to the application again. If there's still an
	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
	// restart the syscall that was interrupted. If there's a saved signal
	// mask, restore it. (Note that restoring the saved signal mask may unblock
	// a pending signal, causing another interruption, but that signal should
	// not interact with the interrupted syscall.)
	if t.haveSyscallReturn {
		if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
			t.Warningf("Unable to pull a full state: %v", err)
			t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
			return (*runExit)(nil)
		}

		if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok {
			if sre == linuxerr.ERESTART_RESTARTBLOCK {
				t.Debugf("Restarting syscall %d with restart block: not interrupted by handled signal", t.Arch().SyscallNo())
				t.Arch().RestartSyscallWithRestartBlock()
			} else {
				t.Debugf("Restarting syscall %d: not interrupted by handled signal", t.Arch().SyscallNo())
				t.Arch().RestartSyscall()
			}
		}
		t.haveSyscallReturn = false
	}
	if t.haveSavedSignalMask {
		t.SetSignalMask(t.savedSignalMask)
		t.haveSavedSignalMask = false
		if t.interrupted() {
			return (*runInterrupt)(nil)
		}
	}

	// Apply restartable sequences.
	if t.rseqPreempted {
		t.rseqPreempted = false
		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
			// Linux writes the CPU on every preemption. We only do
			// so if it changed. Thus we may delay delivery of
			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
			cpu := int32(hostcpu.GetCPU())
			if t.rseqCPU != cpu {
				t.rseqCPU = cpu
				if err := t.rseqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
				if err := t.oldRSeqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
			}
		}
		t.rseqInterrupt()
	}

	// Check if we need to enable single-stepping. Tracers expect that the
	// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
	// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
	// includes our ptrace platform, by the way), so we should only clear the
	// single-step flag if we're responsible for setting it. (clearSinglestep
	// is therefore analogous to Linux's TIF_FORCED_TF.)
	//
	// Strictly speaking, we should also not clear the single-step flag if we
	// single-step through an instruction that sets the single-step flag
	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
	// own TF. (Famous last words, I know.)
	clearSinglestep := false
	if t.hasTracer() {
		t.tg.pidns.owner.mu.RLock()
		if t.ptraceSinglestep {
			clearSinglestep = !t.Arch().SingleStep()
			t.Arch().SetSingleStep()
		}
		t.tg.pidns.owner.mu.RUnlock()
	}

	region := trace.StartRegion(t.traceContext, runRegion)
	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
	info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
	region.End()

	if clearSinglestep {
		t.Arch().ClearSingleStep()
	}
	if t.hasTracer() {
		if e := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); e != nil {
			t.Warningf("Unable to pull a full state: %v", e)
			err = e
		}
	}

	switch err {
	case nil:
		// Handle application system call.
		return t.doSyscall()

	case platform.ErrContextInterrupt:
		// Interrupted by platform.Context.Interrupt(). Re-enter the run
		// loop to figure out why.
		return (*runApp)(nil)

	case platform.ErrContextSignal:
		// Looks like a signal has been delivered to us. If it's a synchronous
		// signal (SIGSEGV, SIGBUS, etc.), it should be sent to the application
		// thread that received it.
		sig := linux.Signal(info.Signo)

		// Was it a fault that we should handle internally? If so, this wasn't
		// an application-generated signal and we should continue execution
		// normally.
		if at.Any() {
			faultCounter.Increment()

			region := trace.StartRegion(t.traceContext, faultRegion)
			addr := hostarch.Addr(info.Addr())
			err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack()))
			region.End()
			if err == nil {
				// The fault was handled appropriately.
				// We can resume running the application.
				return (*runApp)(nil)
			}

			// Is this a vsyscall that we need to emulate?
			//
			// Note that we don't track vsyscalls as part of a
			// specific trace region. This is because regions don't
			// stack, and the actual system call will count as a
			// region. We should be able to easily identify
			// vsyscalls by having a <fault><syscall> pair.
			if at.Execute {
				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
					return t.doVsyscall(addr, sysno)
				}
			}

			// Faults are common, log only at debug level.
			t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v sig=%v err=%v", addr, t.Arch().IP(), at, sig, err)
			t.DebugDumpState()

			// Continue to signal handling.
			//
			// Convert a BusError to a SIGBUS (instead of a SIGSEGV). All
			// other info bits stay the same (address, etc.).
			if _, ok := err.(*memmap.BusError); ok {
				sig = linux.SIGBUS
				info.Signo = int32(linux.SIGBUS)
			}
		}

		switch sig {
		case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
			// Synchronous signal. Send it to ourselves. Assume the signal is
			// legitimate and force it (work around the signal being ignored or
			// blocked) like Linux does. Conveniently, this is even the correct
			// behavior for SIGTRAP from single-stepping.
			t.forceSignal(linux.Signal(sig), false /* unconditional */)
			t.SendSignal(info)

		case platform.SignalInterrupt:
			// Assume that a call to platform.Context.Interrupt() misfired.

		case linux.SIGPROF:
			// It's a profiling interrupt: there's not much
			// we can do. We've already paid a decent cost
			// by intercepting the signal, at this point we
			// simply ignore it.

		default:
			// Asynchronous signal. Let the system deal with it.
			t.k.sendExternalSignal(info, "application")
		}

		return (*runApp)(nil)

	case platform.ErrContextCPUPreempted:
		// Ensure that rseq critical sections are interrupted and per-thread
		// CPU values are updated before the next platform.Context.Switch().
		t.rseqPreempted = true
		return (*runApp)(nil)

	default:
		// What happened? Can't continue.
		t.Warningf("Unexpected SwitchToApp error: %v", err)
		t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
		return (*runExit)(nil)
	}
}

// assertTaskGoroutine panics if the caller is not running on t's task
// goroutine.
func (t *Task) assertTaskGoroutine() {
	if got, want := goid.Get(), t.goid.Load(); got != want {
		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
	}
}

// GoroutineID returns the ID of t's task goroutine.
func (t *Task) GoroutineID() int64 {
	return t.goid.Load()
}

// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
	t.goroutineStopped.Wait()
}

// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
func (tg *ThreadGroup) WaitExited() {
	tg.liveGoroutines.Wait()
}

// Yield yields the processor for the calling task.
func (t *Task) Yield() {
	t.yieldCount.Add(1)
	runtime.Gosched()
}
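
// A minimal usage sketch for ThreadGroup.WaitExited from outside this
// package (illustrative only: k, args, and the surrounding Kernel calls are
// assumptions about the caller, not APIs defined in this file):
//
//	tg, _, err := k.CreateProcess(args) // args is a populated CreateProcessArgs
//	if err != nil {
//		return err
//	}
//	k.StartProcess(tg) // start the new thread group running (assumed start call)
//	tg.WaitExited()    // block until every task goroutine in tg has exited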