github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_run.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"fmt"
    19  	"runtime"
    20  	"runtime/trace"
    21  
    22  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    23  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    24  	"github.com/MerlinKodo/gvisor/pkg/goid"
    25  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    26  	"github.com/MerlinKodo/gvisor/pkg/refs"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/hostcpu"
    28  	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
    31  )
    32  
    33  // A taskRunState is a reified state in the task state machine. See README.md
    34  // for details. The canonical list of all run states, as well as transitions
    35  // between them, is given in run_states.dot.
    36  //
    37  // The set of possible states is enumerable and completely defined by the
    38  // kernel package, so taskRunState would ideally be represented by a
    39  // discriminated union. However, Go does not support sum types.
    40  //
    41  // Hence, as with TaskStop, data-free taskRunStates should be represented as
    42  // typecast nils to avoid unnecessary allocation.
    43  type taskRunState interface {
    44  	// execute executes the code associated with this state over the given task
    45  	// and returns the following state. If execute returns nil, the task
    46  	// goroutine should exit.
    47  	//
    48  	// It is valid to tail-call a following state's execute to avoid the
    49  	// overhead of converting the following state to an interface object and
    50  	// checking for stops, provided that the tail-call cannot recurse.
    51  	execute(*Task) taskRunState
    52  }
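
// An illustrative sketch of the typed-nil convention described above, using
// hypothetical names (runExample and runNext are not real states; real ones
// such as runApp below have the same shape):
//
//	// +stateify savable
//	type runExample struct{}
//
//	func (*runExample) execute(t *Task) taskRunState {
//		// ... act on t ...
//		// Returning nil here would make the task goroutine exit; returning
//		// a typed nil hands off to the next state without allocating.
//		return (*runNext)(nil)
//	}
//
// Task.run below drives these states in a loop, checking for stops before
// each execute and exiting when execute returns nil.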
    53  
    54  // run runs the task goroutine.
    55  //
    56  // threadID is a dummy value set to the task's TID in the root PID namespace
    57  // to make it visible in stack dumps. The goroutine for a given task can be
    58  // identified by searching for Task.run()'s argument value.
    59  func (t *Task) run(threadID uintptr) {
    60  	t.goid.Store(goid.Get())
    61  
    62  	refs.CleanupSync.Add(1)
    63  	defer refs.CleanupSync.Done()
    64  
    65  	// Construct t.blockingTimer here. We can't reconstruct it during restore
    66  	// in Task.afterLoad(), because kernel.timekeeper.SetClocks() hasn't been
    67  	// called yet at that point.
    68  	blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier()
    69  	t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier)
    70  	defer t.blockingTimer.Destroy()
    71  	t.blockingTimerChan = blockingTimerChan
    72  
    73  	// Activate our address space.
    74  	t.Activate()
    75  	// The corresponding t.Deactivate occurs in the exit path
    76  	// (runExitMain.execute) so that when
    77  	// Platform.CooperativelySharesAddressSpace() == true, we give up the
    78  	// AddressSpace before the task goroutine finishes executing.
    79  
    80  	// If this is a newly-started task, it should check for participation in
    81  	// group stops. If this is a task resuming after restore, it was
    82  	// interrupted by saving. In either case, the task is initially
    83  	// interrupted.
    84  	t.interruptSelf()
    85  
    86  	for {
    87  		// Explanation for this ordering:
    88  		//
    89  		//	- A freshly-started task that is stopped should not do anything
    90  		//		before it enters the stop.
    91  		//
    92  		//	- If taskRunState.execute returns nil, the task goroutine should
    93  		//		exit without checking for a stop.
    94  		//
    95  		//	- Task.Start won't start Task.run if t.runState is nil, so this
    96  		//		ordering is safe.
    97  		t.doStop()
    98  		t.runState = t.runState.execute(t)
    99  		if t.runState == nil {
   100  			t.accountTaskGoroutineEnter(TaskGoroutineNonexistent)
   101  			t.goroutineStopped.Done()
   102  			t.tg.liveGoroutines.Done()
   103  			t.tg.pidns.owner.liveGoroutines.Done()
   104  			t.tg.pidns.owner.runningGoroutines.Done()
   105  			t.p.Release()
   106  
   107  			// Deferring this store triggers a false positive in the race
   108  			// detector (https://github.com/golang/go/issues/42599).
   109  			t.goid.Store(0)
   110  			// Keep the argument alive because stack traces for dead variables may not be correct.
   111  			runtime.KeepAlive(threadID)
   112  			return
   113  		}
   114  	}
   115  }
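
// The threadID parameter above exists only so the TID is visible in goroutine
// stack dumps. A minimal sketch of the same trick, with hypothetical names:
//
//	func worker(tag uintptr) { // tag's value shows up in stack dumps
//		doWork()
//		// Keep tag live to the end of the function so dumps taken late in
//		// its lifetime still show a meaningful value.
//		runtime.KeepAlive(tag)
//	}
//
//	go worker(uintptr(tid))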
   116  
   117  // doStop is called by Task.run to block until the task is not stopped.
   118  func (t *Task) doStop() {
   119  	if t.stopCount.Load() == 0 {
   120  		return
   121  	}
   122  	t.Deactivate()
   123  	// NOTE(b/30316266): t.Activate() must be called without any locks held, so
   124  	// this defer must precede the defer for unlocking the signal mutex.
   125  	defer t.Activate()
   126  	t.accountTaskGoroutineEnter(TaskGoroutineStopped)
   127  	defer t.accountTaskGoroutineLeave(TaskGoroutineStopped)
   128  	t.tg.signalHandlers.mu.Lock()
   129  	defer t.tg.signalHandlers.mu.Unlock()
   130  	t.tg.pidns.owner.runningGoroutines.Add(-1)
   131  	defer t.tg.pidns.owner.runningGoroutines.Add(1)
   132  	t.goroutineStopped.Add(-1)
   133  	defer t.goroutineStopped.Add(1)
   134  	for t.stopCount.RacyLoad() > 0 {
   135  		t.endStopCond.Wait()
   136  	}
   137  }
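
// The wait above is a standard counted condition-variable pattern: a count of
// outstanding stops protected by the signal-handlers mutex, plus a sync.Cond
// (endStopCond) that is woken when the count may have dropped to zero. A
// generic sketch with hypothetical names:
//
//	mu.Lock()
//	for pendingStops > 0 {
//		cond.Wait() // cond created with sync.NewCond(&mu)
//	}
//	mu.Unlock()
//
//	// ...and whoever removes the last stop calls:
//	cond.Broadcast()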
   138  
   139  // The runApp state checks for interrupts before executing untrusted
   140  // application code.
   141  //
   142  // +stateify savable
   143  type runApp struct{}
   144  
   145  func (app *runApp) execute(t *Task) taskRunState {
   146  	if t.interrupted() {
   147  		// Checkpointing instructs tasks to stop by sending an interrupt, so we
   148  		// must check for stops before entering runInterrupt (instead of
   149  		// tail-calling it).
   150  		return (*runInterrupt)(nil)
   151  	}
   152  
   153  	// Execute any task work callbacks before returning to user space.
   154  	if t.taskWorkCount.Load() > 0 {
   155  		t.taskWorkMu.Lock()
   156  		queue := t.taskWork
   157  		t.taskWork = nil
   158  		t.taskWorkCount.Store(0)
   159  		t.taskWorkMu.Unlock()
   160  
   161  		// Do not hold taskWorkMu while executing task work, which may register
   162  		// more work.
   163  		for _, work := range queue {
   164  			work.TaskWork(t)
   165  		}
   166  	}
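
	// The swap above is the usual "drain under lock, run unlocked" queue
	// pattern: callbacks registered while the drained queue runs land on the
	// fresh (nil) t.taskWork slice and are picked up on a later pass. A
	// generic sketch with hypothetical names:
	//
	//	mu.Lock()
	//	q := pending
	//	pending = nil
	//	mu.Unlock()
	//	for _, cb := range q {
	//		cb() // may re-lock mu and append new work to pending
	//	}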
   167  
   168  	// We're about to switch to the application again. If there's still an
   169  	// unhandled SyscallRestartErrno that wasn't translated to an EINTR,
   170  	// restart the syscall that was interrupted. If there's a saved signal
   171  	// mask, restore it. (Note that restoring the saved signal mask may unblock
   172  	// a pending signal, causing another interruption, but that signal should
   173  	// not interact with the interrupted syscall.)
   174  	if t.haveSyscallReturn {
   175  		if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
   176  			t.Warningf("Unable to pull a full state: %v", err)
   177  			t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
   178  			return (*runExit)(nil)
   179  		}
   180  
   181  		if sre, ok := linuxerr.SyscallRestartErrorFromReturn(t.Arch().Return()); ok {
   182  			if sre == linuxerr.ERESTART_RESTARTBLOCK {
   183  				t.Debugf("Restarting syscall %d with restart block: not interrupted by handled signal", t.Arch().SyscallNo())
   184  				t.Arch().RestartSyscallWithRestartBlock()
   185  			} else {
   186  				t.Debugf("Restarting syscall %d: not interrupted by handled signal", t.Arch().SyscallNo())
   187  				t.Arch().RestartSyscall()
   188  			}
   189  		}
   190  		t.haveSyscallReturn = false
   191  	}
   192  	if t.haveSavedSignalMask {
   193  		t.SetSignalMask(t.savedSignalMask)
   194  		t.haveSavedSignalMask = false
   195  		if t.interrupted() {
   196  			return (*runInterrupt)(nil)
   197  		}
   198  	}
   199  
   200  	// Apply restartable sequences.
   201  	if t.rseqPreempted {
   202  		t.rseqPreempted = false
   203  		if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
   204  			// Linux writes the CPU on every preemption. We only do
   205  			// so if it changed. Thus we may delay delivery of
   206  			// SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
   207  			cpu := int32(hostcpu.GetCPU())
   208  			if t.rseqCPU != cpu {
   209  				t.rseqCPU = cpu
   210  				if err := t.rseqCopyOutCPU(); err != nil {
   211  					t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
   212  					t.forceSignal(linux.SIGSEGV, false)
   213  					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   214  					// Re-enter the task run loop for signal delivery.
   215  					return (*runApp)(nil)
   216  				}
   217  				if err := t.oldRSeqCopyOutCPU(); err != nil {
   218  					t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
   219  					t.forceSignal(linux.SIGSEGV, false)
   220  					t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
   221  					// Re-enter the task run loop for signal delivery.
   222  					return (*runApp)(nil)
   223  				}
   224  			}
   225  		}
   226  		t.rseqInterrupt()
   227  	}
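
	// For reference, the userspace-visible effect of the copy-out above is
	// roughly the following, assuming the Linux UAPI struct rseq layout
	// (cpu_id_start and cpu_id are the fields Linux also refreshes on
	// preemption):
	//
	//	rseq.cpu_id_start = uint32(cpu)
	//	rseq.cpu_id = uint32(cpu)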
   228  
   229  	// Check if we need to enable single-stepping. Tracers expect that the
   230  	// kernel preserves the value of the single-step flag set by PTRACE_SETREGS
   231  	// whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this
   232  	// includes our ptrace platform, by the way), so we should only clear the
   233  	// single-step flag if we're responsible for setting it. (clearSinglestep
   234  	// is therefore analogous to Linux's TIF_FORCED_TF.)
   235  	//
   236  	// Strictly speaking, we should also not clear the single-step flag if we
   237  	// single-step through an instruction that sets the single-step flag
   238  	// (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their
   239  	// own TF. (Famous last words, I know.)
   240  	clearSinglestep := false
   241  	if t.hasTracer() {
   242  		t.tg.pidns.owner.mu.RLock()
   243  		if t.ptraceSinglestep {
   244  			clearSinglestep = !t.Arch().SingleStep()
   245  			t.Arch().SetSingleStep()
   246  		}
   247  		t.tg.pidns.owner.mu.RUnlock()
   248  	}
   249  
   250  	region := trace.StartRegion(t.traceContext, runRegion)
   251  	t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
   252  	info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU)
   253  	t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
   254  	region.End()
   255  
   256  	if clearSinglestep {
   257  		t.Arch().ClearSingleStep()
   258  	}
   259  	if t.hasTracer() {
   260  		if e := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); e != nil {
   261  			t.Warningf("Unable to pull a full state: %v", e)
   262  			err = e
   263  		}
   264  	}
   265  
   266  	switch err {
   267  	case nil:
   268  		// Handle application system call.
   269  		return t.doSyscall()
   270  
   271  	case platform.ErrContextInterrupt:
   272  		// Interrupted by platform.Context.Interrupt(). Re-enter the run
   273  		// loop to figure out why.
   274  		return (*runApp)(nil)
   275  
   276  	case platform.ErrContextSignal:
   277  		// Looks like a signal has been delivered to us. If it's a synchronous
   278  		// signal (SIGSEGV, SIGBUS, etc.), it should be sent to the application
   279  		// thread that received it.
   280  		sig := linux.Signal(info.Signo)
   281  
   282  		// Was it a fault that we should handle internally? If so, this wasn't
   283  		// an application-generated signal and we should continue execution
   284  		// normally.
   285  		if at.Any() {
   286  			faultCounter.Increment()
   287  
   288  			region := trace.StartRegion(t.traceContext, faultRegion)
   289  			addr := hostarch.Addr(info.Addr())
   290  			err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack()))
   291  			region.End()
   292  			if err == nil {
   293  				// The fault was handled appropriately.
   294  				// We can resume running the application.
   295  				return (*runApp)(nil)
   296  			}
   297  
   298  			// Is this a vsyscall that we need to emulate?
   299  			//
   300  			// Note that we don't track vsyscalls as part of a
   301  			// specific trace region. This is because regions don't
   302  			// stack, and the actual system call will count as a
   303  			// region. We should be able to easily identify
   304  			// vsyscalls by having a <fault><syscall> pair.
   305  			if at.Execute {
   306  				if sysno, ok := t.image.st.LookupEmulate(addr); ok {
   307  					return t.doVsyscall(addr, sysno)
   308  				}
   309  			}
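
			// Background on the lookup above: the legacy x86-64 vsyscall page
			// places its entries at fixed addresses, so an execute fault on
			// one of them identifies the syscall to emulate, roughly:
			//
			//	0xffffffffff600000 -> gettimeofday
			//	0xffffffffff600400 -> time
			//	0xffffffffff600800 -> getcpu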
   310  
   311  			// Faults are common; log only at debug level.
   312  			t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v sig=%v err=%v", addr, t.Arch().IP(), at, sig, err)
   313  			t.DebugDumpState()
   314  
   315  			// Continue to signal handling.
   316  			//
   317  			// If HandleUserFault failed with a BusError, deliver SIGBUS instead
   318  			// of SIGSEGV; all other info fields (address, etc.) stay the same.
   319  			if _, ok := err.(*memmap.BusError); ok {
   320  				sig = linux.SIGBUS
   321  				info.Signo = int32(linux.SIGBUS)
   322  			}
   323  		}
   324  
   325  		switch sig {
   326  		case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP:
   327  			// Synchronous signal. Send it to ourselves. Assume the signal is
   328  			// legitimate and force it (work around the signal being ignored or
   329  			// blocked) like Linux does. Conveniently, this is even the correct
   330  			// behavior for SIGTRAP from single-stepping.
   331  			t.forceSignal(linux.Signal(sig), false /* unconditional */)
   332  			t.SendSignal(info)
   333  
   334  		case platform.SignalInterrupt:
   335  			// Assume that a call to platform.Context.Interrupt() misfired.
   336  
   337  		case linux.SIGPROF:
   338  			// It's a profiling interrupt: there's not much
   339  			// we can do. We've already paid a decent cost
   340  			// by intercepting the signal; at this point we
   341  			// simply ignore it.
   342  
   343  		default:
   344  			// Asynchronous signal. Let the system deal with it.
   345  			t.k.sendExternalSignal(info, "application")
   346  		}
   347  
   348  		return (*runApp)(nil)
   349  
   350  	case platform.ErrContextCPUPreempted:
   351  		// Ensure that rseq critical sections are interrupted and per-thread
   352  		// CPU values are updated before the next platform.Context.Switch().
   353  		t.rseqPreempted = true
   354  		return (*runApp)(nil)
   355  
   356  	default:
   357  		// What happened? Can't continue.
   358  		t.Warningf("Unexpected SwitchToApp error: %v", err)
   359  		t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1))))
   360  		return (*runExit)(nil)
   361  	}
   362  }
   363  
   364  // assertTaskGoroutine panics if the caller is not running on t's task
   365  // goroutine.
   366  func (t *Task) assertTaskGoroutine() {
   367  	if got, want := goid.Get(), t.goid.Load(); got != want {
   368  		panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want))
   369  	}
   370  }
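
// A typical (hypothetical) use of assertTaskGoroutine is as a precondition in
// methods that may only run on the task's own goroutine:
//
//	func (t *Task) touchTaskLocalState() {
//		t.assertTaskGoroutine()
//		// ... safe to access state confined to the task goroutine ...
//	}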
   371  
   372  // GoroutineID returns the ID of t's task goroutine.
   373  func (t *Task) GoroutineID() int64 {
   374  	return t.goid.Load()
   375  }
   376  
   377  // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits.
   378  func (t *Task) waitGoroutineStoppedOrExited() {
   379  	t.goroutineStopped.Wait()
   380  }
   381  
   382  // WaitExited blocks until all task goroutines in tg have exited.
   383  //
   384  // WaitExited does not correspond to anything in Linux; it's provided so that
   385  // external callers of Kernel.CreateProcess can wait for the created thread
   386  // group to terminate.
   387  func (tg *ThreadGroup) WaitExited() {
   388  	tg.liveGoroutines.Wait()
   389  }
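
// A sketch of the intended external use, assuming the usual shape of
// Kernel.CreateProcess (the exact start step is elided here):
//
//	tg, _, err := k.CreateProcess(args)
//	if err != nil {
//		return err
//	}
//	// ... start the process via the kernel's start API ...
//	tg.WaitExited() // blocks until every task goroutine in tg has exited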
   390  
   391  // Yield yields the processor for the calling task.
   392  func (t *Task) Yield() {
   393  	t.yieldCount.Add(1)
   394  	runtime.Gosched()
   395  }