github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_run.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"bytes"
	"fmt"
	"runtime"
	"runtime/trace"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/goid"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/hostcpu"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// A taskRunState is a reified state in the task state machine. See README.md
// for details. The canonical list of all run states, as well as transitions
// between them, is given in run_states.dot.
//
// The set of possible states is enumerable and completely defined by the
// kernel package, so taskRunState would ideally be represented by a
// discriminated union. However, Go does not support sum types.
//
// Hence, as with TaskStop, data-free taskRunStates should be represented as
// typecast nils to avoid unnecessary allocation.
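//
// For illustration only (runExample is not a state defined by this package),
// a minimal data-free run state would look like:
//
//	type runExample struct{}
//
//	func (*runExample) execute(t *Task) taskRunState {
//		// ... act on t ...
//		// Transition by returning the next state as a typecast nil.
//		return (*runApp)(nil)
//	}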
type taskRunState interface {
	// execute executes the code associated with this state over the given task
	// and returns the following state. If execute returns nil, the task
	// goroutine should exit.
	//
	// It is valid to tail-call a following state's execute to avoid the
	// overhead of converting the following state to an interface object and
	// checking for stops, provided that the tail-call cannot recurse.
	execute(*Task) taskRunState
}

// run runs the task goroutine.
//
// threadID is a dummy value set to the task's TID in the root PID namespace
// to make it visible in stack dumps. The goroutine for a given task can be
// identified by searching for Task.run()'s argument value.
func (t *Task) run(threadID uintptr) {
	atomic.StoreInt64(&t.goid, goid.Get())

	// Construct t.blockingTimer here rather than in Task.afterLoad(): it
	// can't be reconstructed during restore because
	// kernel.timekeeper.SetClocks() hasn't been called yet at that point.
	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
	defer t.blockingTimer.Destroy()
	t.blockingTimerChan = blockingTimerChan
	// Activate our address space.
	t.Activate()
	// The corresponding t.Deactivate occurs in the exit path
	// (runExitMain.execute) so that when
	// Platform.CooperativelySharesAddressSpace() == true, we give up the
	// AddressSpace before the task goroutine finishes executing.

	// If this is a newly-started task, it should check for participation in
	// group stops. If this is a task resuming after restore, it was
	// interrupted by saving. In either case, the task is initially
	// interrupted.
	t.interruptSelf()

	for {
		// Explanation for this ordering:
		//
		// - A freshly-started task that is stopped should not do anything
		// before it enters the stop.
		//
		// - If taskRunState.execute returns nil, the task goroutine should
		// exit without checking for a stop.
		//
		// - Task.Start won't start Task.run if t.runState is nil, so this
		// ordering is safe.
		t.doStop()
		t.runState = t.runState.execute(t)
		if t.runState == nil {
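			// The task goroutine is exiting: update accounting and goroutine
			// counts, and release the platform context before returning.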
			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
			t.goroutineStopped.Done()
			t.tg.liveGoroutines.Done()
			t.tg.pidns.owner.liveGoroutines.Done()
			t.tg.pidns.owner.runningGoroutines.Done()
			t.p.Release()

			// Deferring this store triggers a false positive in the race
			// detector (https://github.com/golang/go/issues/42599).
			atomic.StoreInt64(&t.goid, 0)
			// Keep the argument alive because stack traces for dead variables
			// may not be correct.
			runtime.KeepAlive(threadID)
			return
		}
	}
}

// doStop is called by Task.run to block until the task is not stopped.
func (t *Task) doStop() {
	if atomic.LoadInt32(&t.stopCount) == 0 {
		return
	}
	t.Deactivate()
	// NOTE(b/30316266): t.Activate() must be called without any locks held, so
	// this defer must precede the defer for unlocking the signal mutex.
	defer t.Activate()
	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
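	// Update the goroutine counters for the duration of the stop; the
	// deferred Adds undo this bookkeeping when the stop ends.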
	t.tg.pidns.owner.runningGoroutines.Add(-1)
	defer t.tg.pidns.owner.runningGoroutines.Add(1)
	t.goroutineStopped.Add(-1)
	defer t.goroutineStopped.Add(1)
	for t.stopCount > 0 {
		t.endStopCond.Wait()
	}
}

func (*runApp) handleCPUIDInstruction(t *Task) error {
	if len(arch.CPUIDInstruction) == 0 {
		// CPUID emulation isn't supported, but this code can still be
		// reached, because the ptrace platform also returns
		// ErrContextSignalCPUID on page faults. See
		// pkg/sentry/platform/ptrace/ptrace.go:context.Switch for details.
		return platform.ErrContextSignal
	}
	// Is this a CPUID instruction?
	region := trace.StartRegion(t.traceContext, cpuidRegion)
	expected := arch.CPUIDInstruction[:]
	found := make([]byte, len(expected))
	_, err := t.CopyInBytes(hostarch.Addr(t.Arch().IP()), found)
	if err == nil && bytes.Equal(expected, found) {
		// Skip the cpuid instruction.
		t.Arch().CPUIDEmulate(t)
		t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
		region.End()

		return nil
	}
	region.End() // Not an actual CPUID instruction, but the copy-in was still required.
	return platform.ErrContextSignal
}

// The runApp state checks for interrupts before executing untrusted
// application code.
//
// +stateify savable
type runApp struct{}

func (app *runApp) execute(t *Task) taskRunState {
	if t.interrupted() {
		// Checkpointing instructs tasks to stop by sending an interrupt, so we
		// must check for stops before entering runInterrupt (instead of
		// tail-calling it).
		return (*runInterrupt)(nil)
	}

	// Execute any task work callbacks before returning to user space.
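	// Work items are queued on t.taskWork and run here, on the task
	// goroutine, just before switching back to the application (similar in
	// spirit to Linux's task_work mechanism).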
	if atomic.LoadInt32(&t.taskWorkCount) > 0 {
		t.taskWorkMu.Lock()
		queue := t.taskWork
		t.taskWork = nil
		atomic.StoreInt32(&t.taskWorkCount, 0)
		t.taskWorkMu.Unlock()

		// Do not hold taskWorkMu while executing task work, which may register
		// more work.
		for _, work := range queue {
			work.TaskWork(t)
		}
	}

	// We're about to switch to the application again. If there's still an
	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
	// restart the syscall that was interrupted. If there's a saved signal
	// mask, restore it. (Note that restoring the saved signal mask may unblock
	// a pending signal, causing another interruption, but that signal should
	// not interact with the interrupted syscall.)
	if t.haveSyscallReturn {
		if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
			if sre == syserror.ERESTART_RESTARTBLOCK {
				t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscallWithRestartBlock()
			} else {
				t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre)
				t.Arch().RestartSyscall()
			}
		}
		t.haveSyscallReturn = false
	}
	if t.haveSavedSignalMask {
		t.SetSignalMask(t.savedSignalMask)
		t.haveSavedSignalMask = false
		if t.interrupted() {
			return (*runInterrupt)(nil)
		}
	}

	// Apply restartable sequences.
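	// rseq(2) lets the application register a per-thread region in which the
	// kernel publishes the current CPU number and aborts in-progress critical
	// sections after preemption or signal delivery; t.rseqPreempted indicates
	// that the task may have been preempted or migrated since the last update
	// (see the ErrContextCPUPreempted case below).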
	if t.rseqPreempted {
		t.rseqPreempted = false
		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
			// Linux writes the CPU on every preemption. We only do
			// so if it changed. Thus we may delay delivery of
			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
			cpu := int32(hostcpu.GetCPU())
			if t.rseqCPU != cpu {
				t.rseqCPU = cpu
				if err := t.rseqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
				if err := t.oldRSeqCopyOutCPU(); err != nil {
					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
					t.forceSignal(linux.SIGSEGV, false)
					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
					// Re-enter the task run loop for signal delivery.
					return (*runApp)(nil)
				}
			}
		}
		t.rseqInterrupt()
	}

	// Check if we need to enable single-stepping. Tracers expect that the
	// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
	// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
	// includes our ptrace platform, by the way), so we should only clear the
	// single-step flag if we're responsible for setting it. (clearSinglestep
	// is therefore analogous to Linux's TIF_FORCED_TF.)
	//
	// Strictly speaking, we should also not clear the single-step flag if we
	// single-step through an instruction that sets the single-step flag
	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
	// own TF. (Famous last words, I know.)
	clearSinglestep := false
	if t.hasTracer() {
		t.tg.pidns.owner.mu.RLock()
		if t.ptraceSinglestep {
			clearSinglestep = !t.Arch().SingleStep()
			t.Arch().SetSingleStep()
		}
		t.tg.pidns.owner.mu.RUnlock()
	}

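	// Switch to the application. t.p.Switch returns when the application
	// performs a system call, is interrupted, faults or receives a signal, or
	// is preempted; the switch statement below handles each case.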
	region := trace.StartRegion(t.traceContext, runRegion)
	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
	info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
	region.End()

	if clearSinglestep {
		t.Arch().ClearSingleStep()
	}

	switch err {
	case nil:
		// Handle application system call.
		return t.doSyscall()

	case platform.ErrContextInterrupt:
		// Interrupted by platform.Context.Interrupt(). Re-enter the run
		// loop to figure out why.
		return (*runApp)(nil)

	case platform.ErrContextSignalCPUID:
		if err := app.handleCPUIDInstruction(t); err == nil {
			// Resume execution.
			return (*runApp)(nil)
		}

		// The instruction at the given RIP was not a CPUID, so fall through
		// to the default signal delivery behavior below.
		fallthrough

	case platform.ErrContextSignal:
		// Looks like a signal has been delivered to us. If it's a synchronous
		// signal (SEGV, SIGBUS, etc.), it should be sent to the application
		// thread that received it.
		sig := linux.Signal(info.Signo)

		// Was it a fault that we should handle internally? If so, this wasn't
		// an application-generated signal and we should continue execution
		// normally.
		if at.Any() {
			region := trace.StartRegion(t.traceContext, faultRegion)
			addr := hostarch.Addr(info.Addr())
			err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack()))
			region.End()
			if err == nil {
				// The fault was handled appropriately.
				// We can resume running the application.
				return (*runApp)(nil)
			}

			// Is this a vsyscall that we need to emulate?
			//
			// Note that we don't track vsyscalls as part of a
			// specific trace region. This is because regions don't
			// stack, and the actual system call will count as a
			// region. We should be able to easily identify
			// vsyscalls by having a <fault><syscall> pair.
			if at.Execute {
				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
					return t.doVsyscall(addr, sysno)
				}
			}

			// Faults are common; log only at debug level.
			t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err)
			t.DebugDumpState()

			// Continue to signal handling.
			//
			// If the fault failed with a memmap.BusError, convert the signal
			// from SIGSEGV to SIGBUS. All other info bits (address, etc.)
			// stay the same.
			if _, ok := err.(*memmap.BusError); ok {
				sig = linux.SIGBUS
				info.Signo = int32(linux.SIGBUS)
			}
		}

		switch sig {
		case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
			// Synchronous signal. Send it to ourselves. Assume the signal is
			// legitimate and force it (work around the signal being ignored or
			// blocked) like Linux does. Conveniently, this is even the correct
			// behavior for SIGTRAP from single-stepping.
			t.forceSignal(linux.Signal(sig), false /* unconditional */)
			t.SendSignal(info)

		case platform.SignalInterrupt:
			// Assume that a call to platform.Context.Interrupt() misfired.

		case linux.SIGPROF:
			// It's a profiling interrupt: there's not much we can do. We've
			// already paid a decent cost by intercepting the signal, so at
			// this point we simply ignore it.

		default:
			// Asynchronous signal. Let the system deal with it.
			t.k.sendExternalSignal(info, "application")
		}

		return (*runApp)(nil)

	case platform.ErrContextCPUPreempted:
		// Ensure that rseq critical sections are interrupted and per-thread
		// CPU values are updated before the next platform.Context.Switch().
		t.rseqPreempted = true
		return (*runApp)(nil)

	default:
		// What happened? Can't continue.
		t.Warningf("Unexpected SwitchToApp error: %v", err)
		t.PrepareExit(ExitStatus{Code: ExtractErrno(err, -1)})
		return (*runExit)(nil)
	}
}

// assertTaskGoroutine panics if the caller is not running on t's task
// goroutine.
func (t *Task) assertTaskGoroutine() {
	if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want {
		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
	}
}

// GoroutineID returns the ID of t's task goroutine.
func (t *Task) GoroutineID() int64 {
	return atomic.LoadInt64(&t.goid)
}

// waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
func (t *Task) waitGoroutineStoppedOrExited() {
	t.goroutineStopped.Wait()
}

// WaitExited blocks until all task goroutines in tg have exited.
//
// WaitExited does not correspond to anything in Linux; it's provided so that
// external callers of Kernel.CreateProcess can wait for the created thread
// group to terminate.
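//
// A sketch of typical embedder usage (the CreateProcess arguments and the
// steps that start the process are elided):
//
//	tg, _, err := k.CreateProcess(args)
//	if err != nil {
//		return err
//	}
//	// ... start the process and run the kernel ...
//	tg.WaitExited()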
func (tg *ThreadGroup) WaitExited() {
	tg.liveGoroutines.Wait()
}

// Yield yields the processor for the calling task.
func (t *Task) Yield() {
	atomic.AddUint64(&t.yieldCount, 1)
	runtime.Gosched()
}