github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_run.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"bytes"
	"fmt"
	"runtime"
	"runtime/trace"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/goid"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// A taskRunState is a reified state in the task state machine. See README.md
// for details. The canonical list of all run states, as well as transitions
// between them, is given in run_states.dot.
//
// The set of possible states is enumerable and completely defined by the
// kernel package, so taskRunState would ideally be represented by a
// discriminated union. However, Go does not support sum types.
//
// Hence, as with TaskStop, data-free taskRunStates should be represented as
// typecast nils to avoid unnecessary allocation.
type taskRunState interface {
	// execute executes the code associated with this state over the given task
	// and returns the following state. If execute returns nil, the task
	// goroutine should exit.
	//
	// It is valid to tail-call a following state's execute to avoid the
	// overhead of converting the following state to an interface object and
	// checking for stops, provided that the tail-call cannot recurse.
	execute(*Task) taskRunState
}
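
// For illustration only: a data-free run state is a field-less struct whose
// execute method performs that state's work and returns the next state; it is
// passed around as a typecast nil so no allocation is needed. A minimal
// hypothetical sketch (runApp below is a real example of this pattern):
//
//	type runExampleState struct{}
//
//	func (*runExampleState) execute(t *Task) taskRunState {
//		// ... state-specific work on t ...
//		return (*runApp)(nil) // hand off to the next state without allocating
//	}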

// run runs the task goroutine.
//
// threadID is a dummy value set to the task's TID in the root PID namespace
// to make it visible in stack dumps. A goroutine for a given task can be
// identified by searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
	atomic.StoreInt64(&t.goid, goid.Get())

	// Construct t.blockingTimer here. We do this here because we can't
	// reconstruct t.blockingTimer during restore in Task.afterLoad(), because
	// kernel.timekeeper.SetClocks() hasn't been called yet.
	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = blockingTimerChan

	// Activate our address space.
	t.Activate()
	// The corresponding t.Deactivate occurs in the exit path
	// (runExitMain.execute) so that when
	// Platform.CooperativelySharesAddressSpace() == true, we give up the
	// AddressSpace before the task goroutine finishes executing.

	// If this is a newly-started task, it should check for participation in
	// group stops. If this is a task resuming after restore, it was
	// interrupted by saving. In either case, the task is initially
	// interrupted.
	t.interruptSelf()

	for {
		// Explanation for this ordering:
		//
		// - A freshly-started task that is stopped should not do anything
		//   before it enters the stop.
		//
		// - If taskRunState.execute returns nil, the task goroutine should
		//   exit without checking for a stop.
		//
		// - Task.Start won't start Task.run if t.runState is nil, so this
		//   ordering is safe.
		t.doStop()
		t.runState = t.runState.execute(t)
		if t.runState == nil {
			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
			t.goroutineStopped.Done()
			t.tg.liveGoroutines.Done()
			t.tg.pidns.owner.liveGoroutines.Done()
			t.tg.pidns.owner.runningGoroutines.Done()
			t.p.Release()

			// Deferring this store triggers a false positive in the race
			// detector (https://github.com/golang/go/issues/42599).
			atomic.StoreInt64(&t.goid, 0)
			// Keep the argument alive because stack traces for dead variables
			// may not be correct.
			runtime.KeepAlive(threadID)
			return
		}
	}
}

// doStop is called by Task.run to block until the task is not stopped.
func (t *Task) doStop() {
	if atomic.LoadInt32(&t.stopCount) == 0 {
		return
	}
	t.Deactivate()
	// NOTE(b/30316266): t.Activate() must be called without any locks held, so
	// this defer must precede the defer for unlocking the signal mutex.
	defer t.Activate()
	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.runningGoroutines.Add(-1)
	defer t.tg.pidns.owner.runningGoroutines.Add(1)
	t.goroutineStopped.Add(-1)
	defer t.goroutineStopped.Add(1)
	for t.stopCount > 0 {
		t.endStopCond.Wait()
	}
}

func (*runApp) handleCPUIDInstruction(t *Task) error {
	if len(arch.CPUIDInstruction) == 0 {
		// CPUID emulation isn't supported, but this code can still be reached
		// because the ptrace platform returns ErrContextSignalCPUID on page
		// faults too. Look at pkg/sentry/platform/ptrace/ptrace.go:context.Switch
		// for more details.
		return platform.ErrContextSignal
	}
	// Is this a CPUID instruction?
	region := trace.StartRegion(t.traceContext, cpuidRegion)
	expected := arch.CPUIDInstruction[:]
	found := make([]byte, len(expected))
	_, err := t.CopyInBytes(hostarch.Addr(t.Arch().IP()), found)
	if err == nil && bytes.Equal(expected, found) {
		// Skip the cpuid instruction.
		t.Arch().CPUIDEmulate(t)
		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
		region.End()

		return nil
	}
	region.End() // Not an actual CPUID, but the copy-in was still required.
	return platform.ErrContextSignal
}

// The runApp state checks for interrupts before executing untrusted
// application code.
//
// +stateify savable
type runApp struct{}

func (app *runApp) execute(t *Task) taskRunState {
	if t.interrupted() {
		// Checkpointing instructs tasks to stop by sending an interrupt, so we
		// must check for stops before entering runInterrupt (instead of
		// tail-calling it).
		return (*runInterrupt)(nil)
	}

	// Execute any task work callbacks before returning to user space.
	if atomic.LoadInt32(&t.taskWorkCount) > 0 {
		t.taskWorkMu.Lock()
		queue := t.taskWork
		t.taskWork = nil
		atomic.StoreInt32(&t.taskWorkCount, 0)
		t.taskWorkMu.Unlock()

		// Do not hold taskWorkMu while executing task work, which may register
		// more work.
		for _, work := range queue {
			work.TaskWork(t)
		}
	}
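
	// For illustration only: an item on the queue drained above is anything
	// implementing the TaskWorker interface (see task_work.go); it is
	// registered with t.RegisterWork and runs here, on the task goroutine,
	// before the task returns to user space. A minimal sketch with a
	// hypothetical type:
	//
	//	type exampleWork struct{}
	//
	//	func (exampleWork) TaskWork(t *Task) {
	//		t.Debugf("running on the task goroutine before returning to user space")
	//	}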

	// We're about to switch to the application again. If there's still an
	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
	// restart the syscall that was interrupted. If there's a saved signal
	// mask, restore it. (Note that restoring the saved signal mask may unblock
	// a pending signal, causing another interruption, but that signal should
	// not interact with the interrupted syscall.)
	if t.haveSyscallReturn {
		if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
			if sre == syserror.ERESTART_RESTARTBLOCK {
				t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscallWithRestartBlock()
			} else {
				t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscall()
			}
		}
		t.haveSyscallReturn = false
	}
	if t.haveSavedSignalMask {
		t.SetSignalMask(t.savedSignalMask)
		t.haveSavedSignalMask = false
		if t.interrupted() {
			return (*runInterrupt)(nil)
		}
	}
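
	// For illustration only: the restart errno handled above is typically set
	// by a blocking syscall implementation when it is interrupted, roughly
	// along the lines of the following sketch (the surrounding syscall body
	// is hypothetical; syserror.ERESTARTSYS is the usual choice):
	//
	//	if err := t.Block(ch); err != nil {
	//		return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
	//	}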

	// Apply restartable sequences.
	if t.rseqPreempted {
		t.rseqPreempted = false
		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
			// Linux writes the CPU on every preemption. We only do
			// so if it changed. Thus we may delay delivery of
			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
			cpu := int32(hostcpu.GetCPU())
			if t.rseqCPU != cpu {
				t.rseqCPU = cpu
				if err := t.rseqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
				if err := t.oldRSeqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
			}
		}
		t.rseqInterrupt()
	}

	// Check if we need to enable single-stepping. Tracers expect that the
	// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
	// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
	// includes our ptrace platform, by the way), so we should only clear the
	// single-step flag if we're responsible for setting it. (clearSinglestep
	// is therefore analogous to Linux's TIF_FORCED_TF.)
	//
	// Strictly speaking, we should also not clear the single-step flag if we
	// single-step through an instruction that sets the single-step flag
	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
	// own TF. (Famous last words, I know.)
	clearSinglestep := false
	if t.hasTracer() {
		t.tg.pidns.owner.mu.RLock()
		if t.ptraceSinglestep {
			clearSinglestep = !t.Arch().SingleStep()
			t.Arch().SetSingleStep()
		}
		t.tg.pidns.owner.mu.RUnlock()
	}

	region := trace.StartRegion(t.traceContext, runRegion)
	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
	info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
	region.End()

	if clearSinglestep {
		t.Arch().ClearSingleStep()
	}

	switch err {
	case nil:
		// Handle application system call.
		return t.doSyscall()

	case platform.ErrContextInterrupt:
		// Interrupted by platform.Context.Interrupt(). Re-enter the run
		// loop to figure out why.
		return (*runApp)(nil)

	case platform.ErrContextSignalCPUID:
		if err := app.handleCPUIDInstruction(t); err == nil {
			// Resume execution.
			return (*runApp)(nil)
		}

		// The instruction at the given RIP was not a CPUID, so we fall
		// through to the default signal delivery behavior below.
		fallthrough

	case platform.ErrContextSignal:
		// Looks like a signal has been delivered to us. If it's a synchronous
		// signal (SIGSEGV, SIGBUS, etc.), it should be sent to the application
		// thread that received it.
		sig := linux.Signal(info.Signo)

		// Was it a fault that we should handle internally? If so, this wasn't
		// an application-generated signal and we should continue execution
		// normally.
		if at.Any() {
			region := trace.StartRegion(t.traceContext, faultRegion)
			addr := hostarch.Addr(info.Addr())
			err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack()))
			region.End()
			if err == nil {
				// The fault was handled appropriately.
				// We can resume running the application.
				return (*runApp)(nil)
			}

			// Is this a vsyscall that we need to emulate?
			//
			// Note that we don't track vsyscalls as part of a
			// specific trace region. This is because regions don't
			// stack, and the actual system call will count as a
			// region. We should be able to easily identify
			// vsyscalls by having a <fault><syscall> pair.
			if at.Execute {
				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
					return t.doVsyscall(addr, sysno)
				}
			}

			// Faults are common, log only at debug level.
			t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
			t.DebugDumpState()

			// Continue to signal handling.
			//
			// Convert a BusError to a SIGBUS (instead of the default SIGSEGV).
			// All other info bits stay the same (address, etc.).
			if _, ok := err.(*memmap.BusError); ok {
				sig = linux.SIGBUS
				info.Signo = int32(linux.SIGBUS)
			}
		}

		switch sig {
		case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
			// Synchronous signal. Send it to ourselves. Assume the signal is
			// legitimate and force it (work around the signal being ignored or
			// blocked) like Linux does. Conveniently, this is even the correct
			// behavior for SIGTRAP from single-stepping.
			t.forceSignal(linux.Signal(sig), false /* unconditional */)
			t.SendSignal(info)

		case platform.SignalInterrupt:
			// Assume that a call to platform.Context.Interrupt() misfired.

		case linux.SIGPROF:
			// It's a profiling interrupt: there's not much we can do. We've
			// already paid a decent cost by intercepting the signal; at this
			// point we simply ignore it.

		default:
			// Asynchronous signal. Let the system deal with it.
			t.k.sendExternalSignal(info, "application")
		}

		return (*runApp)(nil)

	case platform.ErrContextCPUPreempted:
		// Ensure that rseq critical sections are interrupted and per-thread
		// CPU values are updated before the next platform.Context.Switch().
		t.rseqPreempted = true
		return (*runApp)(nil)

	default:
		// What happened? Can't continue.
		t.Warningf("Unexpected SwitchToApp error: %v", err)
		t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
		return (*runExit)(nil)
	}
}

// assertTaskGoroutine panics if the caller is not running on t's task
// goroutine.
func (t *Task) assertTaskGoroutine() {
	if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want {
		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
	}
}
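
// For illustration only: helpers that mutate state owned by the task
// goroutine can open with this assertion to catch misuse early. A
// hypothetical sketch:
//
//	func (t *Task) updateGoroutineLocalState() {
//		t.assertTaskGoroutine()
//		// ... safe to touch fields that only the task goroutine may access ...
//	}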

// GoroutineID returns the ID of t's task goroutine.
func (t *Task) GoroutineID() int64 {
	return atomic.LoadInt64(&t.goid)
}

// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
	t.goroutineStopped.Wait()
}

// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
func (tg *ThreadGroup) WaitExited() {
	tg.liveGoroutines.Wait()
}

// Yield yields the processor for the calling task.
func (t *Task) Yield() {
	atomic.AddUint64(&t.yieldCount, 1)
	runtime.Gosched()
}
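
// For illustration only: the external-caller pattern that WaitExited (above)
// exists for looks roughly like the following sketch, assuming the
// Kernel.CreateProcess and Kernel.StartProcess APIs defined in kernel.go:
//
//	tg, _, err := k.CreateProcess(args)
//	if err != nil {
//		return err
//	}
//	k.StartProcess(tg)
//	tg.WaitExited() // returns once every task goroutine in tg has exited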