github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/subprocess.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package systrap
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"runtime"
    21  	"sync"
    22  	"sync/atomic"
    23  
    24  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    25  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    26  	"github.com/MerlinKodo/gvisor/pkg/log"
    27  	"github.com/MerlinKodo/gvisor/pkg/pool"
    28  	"github.com/MerlinKodo/gvisor/pkg/seccomp"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
    31  	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
    32  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
    33  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/sysmsg"
    34  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform/systrap/usertrap"
    35  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    36  	"golang.org/x/sys/unix"
    37  )
    38  
var (
	// globalPool tracks all subprocesses in their various states: active or
	// available for reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the address at which the stub is first attempted
	// to be linked.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits and
	// is computed as TaskSize >> 7 rounded down to a page boundary.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because
	// * we don't want to reveal a stub address.
	// * it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)
    63  
// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// syscallIgnoreInterrupt re-issues the injected system call whenever one of
// these values is returned.
//
// These constants are only used in subprocess.go.
const (
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)
    73  
// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	// tgid is the thread group ID; it is passed to tgkill together with
	// tid.
	tgid int32
	// tid is the thread ID used as the target of ptrace and wait4 calls.
	tid int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers
}
    89  
// requestThread is used to request a new sysmsg thread. It is sent on
// subprocess.requests; the newly created (and already detached) thread is
// sent back on the thread channel.
type requestThread struct {
	thread chan *thread
}
    95  
// requestStub is used to request a new stub process. It is sent on
// subprocess.requests; the thread returned by createStub is sent back on the
// done channel.
type requestStub struct {
	done chan *thread
}
   100  
// maxSysmsgThreads specifies the maximum number of system threads that a
// subprocess can create in context decoupled mode. It is initialized once, at
// package initialization, from the current GOMAXPROCS setting.
// TODO(b/268366549): Replace maxSystemThreads below.
var maxSysmsgThreads = runtime.GOMAXPROCS(0)
   105  
const (
	// maxSystemThreads specifies the maximum number of system threads that a
	// subprocess may create in order to process the contexts. It is the
	// limit of subprocess.sysmsgStackPool.
	maxSystemThreads = 4096
	// maxGuestContexts specifies the maximum number of task contexts that a
	// subprocess can handle. It is the limit of
	// subprocess.threadContextPool.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being worked on by
	// any sysmsg thread. Note that it has the same numeric value as
	// invalidContextID.
	invalidThreadID uint32 = 0xfefefefe
)
   119  
// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads. The values sent
	// on it are requestThread or requestStub; they are served by
	// handlePtraceSyscallRequest.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	// NOTE(review): the exact set of fields guarded by mu is not spelled
	// out here; confirm against the users of mu before relying on it.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// context.lastFaultSP == this subprocess.
	faultedContexts map[*context]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the ThreadContext memory region start
	// within the sentry address space. It is set by mapSharedRegions.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	// syscallThreadMu is held while syscallThread is being initialized and
	// while a ptrace request is being served on it.
	syscallThreadMu sync.Mutex
	// syscallThread is the thread used to inject system calls into the
	// subprocess.
	syscallThread *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use a
	// counter instead of using len(sysmsgThreads) because we need to synchronize
	// how many threads get created _before_ the creation happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch back to
	// user mode. It is set by mapSharedRegions.
	contextQueue *contextQueue
}
   173  
   174  func (s *subprocess) initSyscallThread(ptraceThread *thread) error {
   175  	s.syscallThreadMu.Lock()
   176  	defer s.syscallThreadMu.Unlock()
   177  
   178  	id, ok := s.sysmsgStackPool.Get()
   179  	if !ok {
   180  		panic("unable to allocate a sysmsg stub thread")
   181  	}
   182  
   183  	ptraceThread.sysmsgStackID = id
   184  	t := syscallThread{
   185  		subproc: s,
   186  		thread:  ptraceThread,
   187  	}
   188  
   189  	if err := t.init(); err != nil {
   190  		panic(fmt.Sprintf("failed to create a syscall thread"))
   191  	}
   192  	s.syscallThread = &t
   193  
   194  	s.syscallThread.detach()
   195  
   196  	return nil
   197  }
   198  
   199  // handlePtraceSyscallRequest executes system calls that can't be run via
   200  // syscallThread without using ptrace. Look at the description of syscallThread
   201  // to get more details about its limitations.
   202  func (s *subprocess) handlePtraceSyscallRequest(req any) {
   203  	s.syscallThreadMu.Lock()
   204  	defer s.syscallThreadMu.Unlock()
   205  	runtime.LockOSThread()
   206  	defer runtime.UnlockOSThread()
   207  	s.syscallThread.attach()
   208  	defer s.syscallThread.detach()
   209  
   210  	ptraceThread := s.syscallThread.thread
   211  
   212  	switch req.(type) {
   213  	case requestThread:
   214  		r := req.(requestThread)
   215  		t, err := ptraceThread.clone()
   216  		if err != nil {
   217  			// Should not happen: not recoverable.
   218  			panic(fmt.Sprintf("error initializing first thread: %v", err))
   219  		}
   220  
   221  		// Since the new thread was created with
   222  		// clone(CLONE_PTRACE), it will begin execution with
   223  		// SIGSTOP pending and with this thread as its tracer.
   224  		// (Hopefully nobody tgkilled it with a signal <
   225  		// SIGSTOP before the SIGSTOP was delivered, in which
   226  		// case that signal would be delivered before SIGSTOP.)
   227  		if sig := t.wait(stopped); sig != unix.SIGSTOP {
   228  			panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
   229  		}
   230  
   231  		t.initRegs = ptraceThread.initRegs
   232  		// Set the parent death signal to SIGKILL.
   233  		_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
   234  			arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
   235  			arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
   236  			arch.SyscallArgument{Value: 0},
   237  			arch.SyscallArgument{Value: 0},
   238  			arch.SyscallArgument{Value: 0},
   239  			arch.SyscallArgument{Value: 0},
   240  		)
   241  		if err != nil {
   242  			panic(fmt.Sprintf("prctl: %v", err))
   243  		}
   244  
   245  		id, ok := s.sysmsgStackPool.Get()
   246  		if !ok {
   247  			panic("unable to allocate a sysmsg stub thread")
   248  		}
   249  		t.sysmsgStackID = id
   250  
   251  		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
   252  			panic(fmt.Sprintf("tkill failed: %v", e))
   253  		}
   254  
   255  		// Detach the thread.
   256  		t.detach()
   257  
   258  		// Return the thread.
   259  		r.thread <- t
   260  	case requestStub:
   261  		r := req.(requestStub)
   262  		t, err := ptraceThread.createStub()
   263  		if err != nil {
   264  			panic(fmt.Sprintf("unable to create a stub process: %s", err))
   265  		}
   266  		r.done <- t
   267  
   268  	}
   269  }
   270  
   271  // newSubprocess returns a usable subprocess.
   272  //
   273  // This will either be a newly created subprocess, or one from the global pool.
   274  // The create function will be called in the latter case, which is guaranteed
   275  // to happen with the runtime thread locked.
   276  func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile) (*subprocess, error) {
   277  	if sp := globalPool.fetchAvailable(); sp != nil {
   278  		sp.subprocessRefs.InitRefs()
   279  		sp.usertrap = usertrap.New()
   280  		return sp, nil
   281  	}
   282  
   283  	// The following goroutine is responsible for creating the first traced
   284  	// thread, and responding to requests to make additional threads in the
   285  	// traced process. The process will be killed and reaped when the
   286  	// request channel is closed, which happens in Release below.
   287  	requests := make(chan any)
   288  
   289  	// Ready.
   290  	sp := &subprocess{
   291  		requests:          requests,
   292  		faultedContexts:   make(map[*context]struct{}),
   293  		sysmsgStackPool:   pool.Pool{Start: 0, Limit: maxSystemThreads},
   294  		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
   295  		memoryFile:        memoryFile,
   296  		sysmsgThreads:     make(map[uint32]*sysmsgThread),
   297  	}
   298  	sp.subprocessRefs.InitRefs()
   299  	runtime.LockOSThread()
   300  	defer runtime.UnlockOSThread()
   301  
   302  	// Initialize the syscall thread.
   303  	ptraceThread, err := create()
   304  	if err != nil {
   305  		return nil, err
   306  	}
   307  	sp.sysmsgInitRegs = ptraceThread.initRegs
   308  
   309  	if err := sp.initSyscallThread(ptraceThread); err != nil {
   310  		return nil, err
   311  	}
   312  
   313  	go func() { // S/R-SAFE: Platform-related.
   314  
   315  		// Wait for requests to create threads.
   316  		for req := range requests {
   317  			sp.handlePtraceSyscallRequest(req)
   318  		}
   319  
   320  		// Requests should never be closed.
   321  		panic("unreachable")
   322  	}()
   323  
   324  	sp.unmap()
   325  	sp.usertrap = usertrap.New()
   326  	sp.mapSharedRegions()
   327  	sp.mapPrivateRegions()
   328  
   329  	// Create the initial sysmsg thread.
   330  	atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
   331  	if err := sp.createSysmsgThread(); err != nil {
   332  		return nil, err
   333  	}
   334  	sp.numSysmsgThreads++
   335  
   336  	return sp, nil
   337  }
   338  
   339  // mapSharedRegions maps the shared regions that are used between the subprocess
   340  // and ALL of the subsequently created sysmsg threads into both the sentry and
   341  // the syscall thread.
   342  //
   343  // Should be called before any sysmsg threads are created.
   344  // Initializes s.contextQueue and s.threadContextRegion.
   345  func (s *subprocess) mapSharedRegions() {
   346  	if s.contextQueue != nil || s.threadContextRegion != 0 {
   347  		panic("contextQueue or threadContextRegion was already initialized")
   348  	}
   349  
   350  	opts := pgalloc.AllocOpts{
   351  		Kind: usage.System,
   352  		Dir:  pgalloc.TopDown,
   353  	}
   354  
   355  	// Map shared regions into the sentry.
   356  	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
   357  	contextQueue.init()
   358  
   359  	// Map thread context region into the syscall thread.
   360  	_, err := s.syscallThread.syscall(
   361  		unix.SYS_MMAP,
   362  		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
   363  		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
   364  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   365  		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
   366  		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
   367  		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
   368  	if err != nil {
   369  		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
   370  	}
   371  
   372  	s.contextQueue = contextQueue
   373  
   374  	// Map thread context region into the sentry.
   375  	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
   376  	if err != nil {
   377  		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region"))
   378  	}
   379  	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
   380  		unix.SYS_MMAP,
   381  		0,
   382  		uintptr(threadContextFR.Length()),
   383  		unix.PROT_WRITE|unix.PROT_READ,
   384  		unix.MAP_SHARED|unix.MAP_FILE,
   385  		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
   386  	if errno != 0 {
   387  		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
   388  	}
   389  
   390  	// Map thread context region into the syscall thread.
   391  	if _, err := s.syscallThread.syscall(
   392  		unix.SYS_MMAP,
   393  		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
   394  		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
   395  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   396  		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
   397  		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
   398  		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
   399  		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
   400  	}
   401  
   402  	s.threadContextRegion = sentryThreadContextRegionAddr
   403  }
   404  
   405  func (s *subprocess) mapPrivateRegions() {
   406  	_, err := s.syscallThread.syscall(
   407  		unix.SYS_MMAP,
   408  		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
   409  		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
   410  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   411  		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
   412  		arch.SyscallArgument{Value: 0},
   413  		arch.SyscallArgument{Value: 0})
   414  	if err != nil {
   415  		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
   416  	}
   417  }
   418  
   419  // unmap unmaps non-stub regions of the process.
   420  //
   421  // This will panic on failure (which should never happen).
   422  func (s *subprocess) unmap() {
   423  	s.Unmap(0, uint64(stubStart))
   424  	if maximumUserAddress != stubEnd {
   425  		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
   426  	}
   427  }
   428  
// Release releases the caller's reference to the subprocess; it does not
// kill it.
//
// We can't safely co-ordinate the detaching of all the tracees (since the
// tracers are random runtime threads, and the process won't exit until
// tracers have been notified).
//
// Therefore we simply unmap everything in the subprocess and, once the last
// reference is dropped, return it to the globalPool. This has the added
// benefit of reducing creation time for new subprocesses.
func (s *subprocess) Release() {
	s.unmap()
	s.DecRef(s.release)
}
   442  
// release returns the subprocess to the global pool by marking it available.
// It is passed to DecRef by Release.
func (s *subprocess) release() {
	globalPool.markAvailable(s)
}
   447  
   448  // newThread creates a new traced thread.
   449  //
   450  // Precondition: the OS thread must be locked.
   451  func (s *subprocess) newThread() *thread {
   452  	// Ask the first thread to create a new one.
   453  	var r requestThread
   454  	r.thread = make(chan *thread)
   455  	s.requests <- r
   456  	t := <-r.thread
   457  
   458  	// Attach the subprocess to this one.
   459  	t.attach()
   460  
   461  	// Return the new thread, which is now bound.
   462  	return t
   463  }
   464  
   465  // attach attaches to the thread.
   466  func (t *thread) attach() {
   467  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
   468  		panic(fmt.Sprintf("unable to attach: %v", errno))
   469  	}
   470  
   471  	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
   472  	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
   473  	// newSubprocess), so we always expect to see signal-delivery-stop with
   474  	// SIGSTOP.
   475  	if sig := t.wait(stopped); sig != unix.SIGSTOP {
   476  		panic(fmt.Sprintf("wait failed: expected SIGSTOP, got %v", sig))
   477  	}
   478  
   479  	// Initialize options.
   480  	t.init()
   481  }
   482  
   483  func (t *thread) grabInitRegs() {
   484  	// Grab registers.
   485  	//
   486  	// Note that we adjust the current register RIP value to be just before
   487  	// the current system call executed. This depends on the definition of
   488  	// the stub itself.
   489  	if err := t.getRegs(&t.initRegs); err != nil {
   490  		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
   491  	}
   492  	t.adjustInitRegsRip()
   493  	t.initRegs.SetStackPointer(0)
   494  }
   495  
   496  // detach detaches from the thread.
   497  //
   498  // Because the SIGSTOP is not suppressed, the thread will enter group-stop.
   499  func (t *thread) detach() {
   500  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
   501  		panic(fmt.Sprintf("can't detach new clone: %v", errno))
   502  	}
   503  }
   504  
// waitOutcome selects which kind of state change thread.wait expects to
// observe.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed (it exited or was
	// terminated by a signal).
	killed
)
   515  
   516  func (t *thread) Debugf(format string, v ...any) {
   517  	prefix := fmt.Sprintf("%8d:", t.tid)
   518  	log.DebugfAtDepth(1, prefix+format, v...)
   519  }
   520  
   521  func (t *thread) dumpAndPanic(message string) {
   522  	var regs arch.Registers
   523  	message += "\n"
   524  	if err := t.getRegs(&regs); err == nil {
   525  		message += dumpRegs(&regs)
   526  	} else {
   527  		log.Warningf("unable to get registers: %v", err)
   528  	}
   529  	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
   530  	panic(message)
   531  }
   532  
   533  func (t *thread) dumpRegs(message string) {
   534  	var regs arch.Registers
   535  	message += "\n"
   536  	if err := t.getRegs(&regs); err == nil {
   537  		message += dumpRegs(&regs)
   538  	} else {
   539  		log.Warningf("unable to get registers: %v", err)
   540  	}
   541  	log.Infof("%s", message)
   542  }
   543  
   544  func (t *thread) unexpectedStubExit() {
   545  	msg, err := t.getEventMessage()
   546  	status := unix.WaitStatus(msg)
   547  	if status.Signaled() && status.Signal() == unix.SIGKILL {
   548  		// SIGKILL can be only sent by a user or OOM-killer. In both
   549  		// these cases, we don't need to panic. There is no reasons to
   550  		// think that something wrong in gVisor.
   551  		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
   552  		pid := os.Getpid()
   553  		unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL))
   554  	}
   555  	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
   556  }
   557  
   558  // wait waits for a stop event.
   559  //
   560  // Precondition: outcome is a valid waitOutcome.
   561  func (t *thread) wait(outcome waitOutcome) unix.Signal {
   562  	var status unix.WaitStatus
   563  
   564  	for {
   565  		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
   566  		if err == unix.EINTR || err == unix.EAGAIN {
   567  			// Wait was interrupted; wait again.
   568  			continue
   569  		} else if err != nil {
   570  			panic(fmt.Sprintf("ptrace wait failed: %v", err))
   571  		}
   572  		if int(r) != int(t.tid) {
   573  			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
   574  		}
   575  		switch outcome {
   576  		case stopped:
   577  			if !status.Stopped() {
   578  				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
   579  			}
   580  			stopSig := status.StopSignal()
   581  			if stopSig == 0 {
   582  				continue // Spurious stop.
   583  			}
   584  			if stopSig == unix.SIGTRAP {
   585  				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
   586  					t.unexpectedStubExit()
   587  				}
   588  				// Re-encode the trap cause the way it's expected.
   589  				return stopSig | unix.Signal(status.TrapCause()<<8)
   590  			}
   591  			// Not a trap signal.
   592  			return stopSig
   593  		case killed:
   594  			if !status.Exited() && !status.Signaled() {
   595  				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
   596  			}
   597  			return unix.Signal(status.ExitStatus())
   598  		default:
   599  			// Should not happen.
   600  			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
   601  		}
   602  	}
   603  }
   604  
   605  // destroy kills the thread.
   606  //
   607  // Note that this should not be used in the general case; the death of threads
   608  // will typically cause the death of the parent. This is a utility method for
   609  // manually created threads.
   610  func (t *thread) destroy() {
   611  	t.detach()
   612  	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
   613  	t.wait(killed)
   614  }
   615  
   616  // init initializes trace options.
   617  func (t *thread) init() {
   618  	// Set the TRACESYSGOOD option to differentiate real SIGTRAP.
   619  	// set PTRACE_O_EXITKILL to ensure that the unexpected exit of the
   620  	// sentry will immediately kill the associated stubs.
   621  	_, _, errno := unix.RawSyscall6(
   622  		unix.SYS_PTRACE,
   623  		unix.PTRACE_SETOPTIONS,
   624  		uintptr(t.tid),
   625  		0,
   626  		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
   627  		0, 0)
   628  	if errno != 0 {
   629  		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
   630  	}
   631  }
   632  
   633  // syscall executes a system call cycle in the traced context.
   634  //
   635  // This is _not_ for use by application system calls, rather it is for use when
   636  // a system call must be injected into the remote context (e.g. mmap, munmap).
   637  // Note that clones are handled separately.
   638  func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
   639  	// Set registers.
   640  	if err := t.setRegs(regs); err != nil {
   641  		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
   642  	}
   643  
   644  	for {
   645  		// Execute the syscall instruction. The task has to stop on the
   646  		// trap instruction which is right after the syscall
   647  		// instruction.
   648  		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
   649  			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
   650  		}
   651  
   652  		sig := t.wait(stopped)
   653  		if sig == unix.SIGTRAP {
   654  			// Reached syscall-enter-stop.
   655  			break
   656  		} else {
   657  			// Some other signal caused a thread stop; ignore.
   658  			if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
   659  				log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
   660  			}
   661  			continue
   662  		}
   663  	}
   664  
   665  	// Grab registers.
   666  	if err := t.getRegs(regs); err != nil {
   667  		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
   668  	}
   669  	return syscallReturnValue(regs)
   670  }
   671  
   672  // syscallIgnoreInterrupt ignores interrupts on the system call thread and
   673  // restarts the syscall if the kernel indicates that should happen.
   674  func (t *thread) syscallIgnoreInterrupt(
   675  	initRegs *arch.Registers,
   676  	sysno uintptr,
   677  	args ...arch.SyscallArgument) (uintptr, error) {
   678  	for {
   679  		regs := createSyscallRegs(initRegs, sysno, args...)
   680  		rval, err := t.syscall(&regs)
   681  		switch err {
   682  		case ERESTARTSYS:
   683  			continue
   684  		case ERESTARTNOINTR:
   685  			continue
   686  		case ERESTARTNOHAND:
   687  			continue
   688  		default:
   689  			return rval, err
   690  		}
   691  	}
   692  }
   693  
   694  // NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
   695  func (t *thread) NotifyInterrupt() {
   696  	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
   697  }
   698  
   699  func (s *subprocess) incAwakeContexts() {
   700  	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
   701  	if nr > uint32(maxSysmsgThreads) {
   702  		return
   703  	}
   704  	nr = nrMaxAwakeStubThreads.Add(1)
   705  	if nr > fastPathContextLimit {
   706  		dispatcher.disableStubFastPath()
   707  	}
   708  }
   709  
   710  func (s *subprocess) decAwakeContexts() {
   711  	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
   712  	if nr >= uint32(maxSysmsgThreads) {
   713  		return
   714  	}
   715  	nrMaxAwakeStubThreads.Add(^uint32(0))
   716  }
   717  
   718  // switchToApp is called from the main SwitchToApp entrypoint.
   719  //
   720  // This function returns true on a system call, false on a signal.
   721  // The second return value is true if a syscall instruction can be replaced on
   722  // a function call.
   723  func (s *subprocess) switchToApp(c *context, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err error) {
   724  	// Reset necessary registers.
   725  	regs := &ac.StateData().Regs
   726  	s.resetSysemuRegs(regs)
   727  	ctx := c.sharedContext
   728  	ctx.shared.Regs = regs.PtraceRegs
   729  	restoreArchSpecificState(ctx.shared, ac)
   730  
   731  	// Check for interrupts, and ensure that future interrupts signal the context.
   732  	if !c.interrupt.Enable(c.sharedContext) {
   733  		// Pending interrupt; simulate.
   734  		ctx.clearInterrupt()
   735  		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
   736  		return false, false, nil
   737  	}
   738  	defer func() {
   739  		ctx.clearInterrupt()
   740  		c.interrupt.Disable()
   741  	}()
   742  
   743  	restoreFPState(ctx, c, ac)
   744  
   745  	// Place the context onto the context queue.
   746  	if ctx.sleeping {
   747  		ctx.sleeping = false
   748  		s.incAwakeContexts()
   749  	}
   750  	stubFastPathEnabled := dispatcher.stubFastPathEnabled()
   751  	ctx.setState(sysmsg.ContextStateNone)
   752  	s.contextQueue.add(ctx, stubFastPathEnabled)
   753  	s.waitOnState(ctx, stubFastPathEnabled)
   754  
   755  	// Check if there's been an error.
   756  	threadID := ctx.threadID()
   757  	if threadID != invalidThreadID {
   758  		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
   759  			msg := sysThread.msg
   760  			panic(fmt.Sprintf("stub thread %d failed: err 0x%x line %d: %s", sysThread.thread.tid, msg.Err, msg.Line, msg))
   761  		}
   762  		log.Warningf("systrap: found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID)
   763  	}
   764  
   765  	// Copy register state locally.
   766  	regs.PtraceRegs = ctx.shared.Regs
   767  	retrieveArchSpecificState(ctx.shared, ac)
   768  	c.needToPullFullState = true
   769  	// We have a signal. We verify however, that the signal was
   770  	// either delivered from the kernel or from this process. We
   771  	// don't respect other signals.
   772  	c.signalInfo = ctx.shared.SignalInfo
   773  	ctxState := ctx.state()
   774  	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
   775  		ctxState = sysmsg.ContextStateSyscall
   776  		shouldPatchSyscall = true
   777  	}
   778  
   779  	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
   780  		if maybePatchSignalInfo(regs, &c.signalInfo) {
   781  			return false, false, nil
   782  		}
   783  		updateSyscallRegs(regs)
   784  		return true, shouldPatchSyscall, nil
   785  	} else if ctxState != sysmsg.ContextStateFault {
   786  		panic(fmt.Sprintf("unknown context state: %v", ctxState))
   787  	}
   788  
   789  	return false, false, nil
   790  }
   791  
// waitOnState blocks until the state of ctx is no longer
// sysmsg.ContextStateNone, i.e. until the queued context has been processed.
//
// The wait starts on the dispatcher ("fast path") and may switch to sleeping
// on the context state ("slow path") when the dispatcher reports
// sharedContextSlowPath. Along the way a sysmsg thread is kicked when needed;
// ctx.kicked records that this was done or is no longer necessary. Before
// returning, the acked flag is reset and the sentry fast path is re-enabled
// for ctx.
func (s *subprocess) waitOnState(ctx *sharedContext, stubFastPathEnabled bool) {
	ctx.kicked = false
	slowPath := false
	// Record when the wait started.
	start := cputicks()
	ctx.startWaitingTS = start
	// Kick a sysmsg thread right away when the stub fast path is disabled
	// or when no stub thread is currently active.
	if !stubFastPathEnabled || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				// The dispatcher asks for a kick; skip it if one
				// was already issued or the context was acked.
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				// Switch to the slow path for the rest of this
				// wait.
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake then it knows it's being
			// worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			ctx.sleepOnState(curState)
		}
	}

	ctx.resetAcked()
	ctx.enableSentryFastPath()
}
   834  
   835  // canKickSysmsgThread returns true if a new thread can be kicked.
   836  // The second return value is the expected number of threads after kicking a
   837  // new one.
   838  func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
   839  	// numActiveContexts and numActiveThreads can be changed from stub
   840  	// threads that handles the contextQueue without any locks. The idea
   841  	// here is that any stub thread that gets CPU time can make some
   842  	// progress. In stub threads, we can use only spinlock-like
   843  	// synchronizations, but they don't work well because a thread that
   844  	// holds a lock can be preempted by another thread that is waiting for
   845  	// the same lock.
   846  	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
   847  	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
   848  	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)
   849  
   850  	nrActiveThreads += nrThreadsToWakeup + 1
   851  	if nrActiveThreads > nrActiveContexts {
   852  		// This can happen when one or more stub threads are
   853  		// waiting for cpu time. The host probably has more
   854  		// running tasks than a number of cpu-s.
   855  		return false, nrActiveThreads
   856  	}
   857  	return true, nrActiveThreads
   858  }
   859  
   860  func (s *subprocess) kickSysmsgThread() bool {
   861  	kick, _ := s.canKickSysmsgThread()
   862  	if !kick {
   863  		return false
   864  	}
   865  
   866  	s.sysmsgThreadsMu.Lock()
   867  	kick, nrThreads := s.canKickSysmsgThread()
   868  	if !kick {
   869  		s.sysmsgThreadsMu.Unlock()
   870  		return false
   871  	}
   872  	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
   873  	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
   874  		s.numSysmsgThreads++
   875  		s.sysmsgThreadsMu.Unlock()
   876  		if err := s.createSysmsgThread(); err != nil {
   877  			log.Warningf("Unable to create a new stub thread: %s", err)
   878  			s.sysmsgThreadsMu.Lock()
   879  			s.numSysmsgThreads--
   880  			s.sysmsgThreadsMu.Unlock()
   881  		}
   882  	} else {
   883  		s.sysmsgThreadsMu.Unlock()
   884  	}
   885  	s.contextQueue.wakeupSysmsgThread()
   886  
   887  	return false
   888  }
   889  
   890  // syscall executes the given system call without handling interruptions.
   891  func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
   892  	s.syscallThreadMu.Lock()
   893  	defer s.syscallThreadMu.Unlock()
   894  
   895  	return s.syscallThread.syscall(sysno, args...)
   896  }
   897  
   898  // MapFile implements platform.AddressSpace.MapFile.
   899  func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
   900  	var flags int
   901  	if precommit {
   902  		flags |= unix.MAP_POPULATE
   903  	}
   904  	_, err := s.syscall(
   905  		unix.SYS_MMAP,
   906  		arch.SyscallArgument{Value: uintptr(addr)},
   907  		arch.SyscallArgument{Value: uintptr(fr.Length())},
   908  		arch.SyscallArgument{Value: uintptr(at.Prot())},
   909  		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
   910  		arch.SyscallArgument{Value: uintptr(f.FD())},
   911  		arch.SyscallArgument{Value: uintptr(fr.Start)})
   912  	return err
   913  }
   914  
   915  // Unmap implements platform.AddressSpace.Unmap.
   916  func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
   917  	ar, ok := addr.ToRange(length)
   918  	if !ok {
   919  		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
   920  	}
   921  	s.mu.Lock()
   922  	for c := range s.faultedContexts {
   923  		c.mu.Lock()
   924  		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
   925  			// Forget the last fault so that if c faults again, the fault isn't
   926  			// incorrectly reported as a write fault. If this is being called
   927  			// due to munmap() of the corresponding vma, handling of the second
   928  			// fault will fail anyway.
   929  			c.lastFaultSP = nil
   930  			delete(s.faultedContexts, c)
   931  		}
   932  		c.mu.Unlock()
   933  	}
   934  	s.mu.Unlock()
   935  	_, err := s.syscall(
   936  		unix.SYS_MUNMAP,
   937  		arch.SyscallArgument{Value: uintptr(addr)},
   938  		arch.SyscallArgument{Value: uintptr(length)})
   939  	if err != nil {
   940  		// We never expect this to happen.
   941  		panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
   942  	}
   943  }
   944  
   945  func (s *subprocess) PullFullState(c *context, ac *arch.Context64) error {
   946  	if !c.sharedContext.isActiveInSubprocess(s) {
   947  		panic("Attempted to PullFullState for context that is not used in subprocess")
   948  	}
   949  	saveFPState(c.sharedContext, ac)
   950  	return nil
   951  }
   952  
   953  var sysmsgThreadPriority int
   954  
   955  func initSysmsgThreadPriority() {
   956  	prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
   957  	if err != nil {
   958  		panic("unable to get current scheduling priority")
   959  	}
   960  	// Sysmsg threads are executed with a priority one lower than the Sentry.
   961  	sysmsgThreadPriority = 20 - prio + 1
   962  }
   963  
   964  // createSysmsgThread creates a new sysmsg thread.
   965  // The thread starts processing any available context in the context queue.
   966  func (s *subprocess) createSysmsgThread() error {
   967  	// Create a new seccomp process.
   968  	var r requestThread
   969  	r.thread = make(chan *thread)
   970  	s.requests <- r
   971  	p := <-r.thread
   972  
   973  	runtime.LockOSThread()
   974  	defer runtime.UnlockOSThread()
   975  	p.attach()
   976  
   977  	// Skip SIGSTOP.
   978  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
   979  		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
   980  	}
   981  	sig := p.wait(stopped)
   982  	if sig != unix.SIGSTOP {
   983  		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
   984  	}
   985  
   986  	// Allocate a new stack for the BPF process.
   987  	opts := pgalloc.AllocOpts{
   988  		Kind: usage.System,
   989  		Dir:  pgalloc.TopDown,
   990  	}
   991  	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
   992  	if err != nil {
   993  		// TODO(b/144063246): Need to fail the clone system call.
   994  		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
   995  	}
   996  	sysThread := &sysmsgThread{
   997  		thread:     p,
   998  		subproc:    s,
   999  		stackRange: fr,
  1000  	}
  1001  	// Use the sysmsgStackID as a handle on this thread instead of host tid in
  1002  	// order to be able to reliably specify invalidThreadID.
  1003  	threadID := uint32(p.sysmsgStackID)
  1004  
  1005  	// Map the stack into the sentry.
  1006  	sentryStackAddr, _, errno := unix.RawSyscall6(
  1007  		unix.SYS_MMAP,
  1008  		0,
  1009  		sysmsg.PerThreadSharedStackSize,
  1010  		unix.PROT_WRITE|unix.PROT_READ,
  1011  		unix.MAP_SHARED|unix.MAP_FILE,
  1012  		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
  1013  	if errno != 0 {
  1014  		panic(fmt.Sprintf("mmap failed: %v", errno))
  1015  	}
  1016  
  1017  	// Before installing the stub syscall filters, we need to call a few
  1018  	// system calls (e.g. sigaltstack, sigaction) which have in-memory
  1019  	// arguments.  We need to prevent changing these parameters by other
  1020  	// stub threads, so lets map the future BPF stack as read-only and
  1021  	// fill syscall arguments from the Sentry.
  1022  	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
  1023  	err = sysThread.mapStack(sysmsgStackAddr, true)
  1024  	if err != nil {
  1025  		panic(fmt.Sprintf("mmap failed: %v", err))
  1026  	}
  1027  
  1028  	sysThread.init(sentryStackAddr, sysmsgStackAddr)
  1029  
  1030  	// Map the stack into the BPF process.
  1031  	err = sysThread.mapStack(sysmsgStackAddr, false)
  1032  	if err != nil {
  1033  		s.memoryFile.DecRef(fr)
  1034  		panic(fmt.Sprintf("mmap failed: %v", err))
  1035  	}
  1036  
  1037  	// Map the stack into the BPF process.
  1038  	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
  1039  	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
  1040  	if err != nil {
  1041  		s.memoryFile.DecRef(fr)
  1042  		panic(fmt.Sprintf("mmap failed: %v", err))
  1043  	}
  1044  
  1045  	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
  1046  	sysThread.msg.Init(threadID)
  1047  	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
  1048  	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
  1049  	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))
  1050  
  1051  	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)
  1052  
  1053  	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
  1054  		log.Warningf("Unable to change priority of a stub thread: %s", err)
  1055  	}
  1056  
  1057  	// Install a pre-compiled seccomp rules for the BPF process.
  1058  	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
  1059  		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
  1060  		arch.SyscallArgument{Value: uintptr(1)},
  1061  		arch.SyscallArgument{Value: uintptr(0)},
  1062  		arch.SyscallArgument{Value: uintptr(0)},
  1063  		arch.SyscallArgument{Value: uintptr(0)},
  1064  		arch.SyscallArgument{Value: uintptr(0)})
  1065  	if err != nil {
  1066  		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
  1067  	}
  1068  
  1069  	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
  1070  		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
  1071  		arch.SyscallArgument{Value: uintptr(0)},
  1072  		arch.SyscallArgument{Value: stubSysmsgRules})
  1073  	if err != nil {
  1074  		panic(fmt.Sprintf("seccomp failed: %v", err))
  1075  	}
  1076  
  1077  	// Prepare to start the BPF process.
  1078  	tregs := &arch.Registers{}
  1079  	s.resetSysemuRegs(tregs)
  1080  	setArchSpecificRegs(sysThread, tregs)
  1081  	if err := p.setRegs(tregs); err != nil {
  1082  		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
  1083  	}
  1084  	archSpecificSysmsgThreadInit(sysThread)
  1085  	// Skip SIGSTOP.
  1086  	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
  1087  		panic(fmt.Sprintf("tkill failed: %v", e))
  1088  	}
  1089  	// Resume the BPF process.
  1090  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
  1091  		panic(fmt.Sprintf("can't detach new clone: %v", errno))
  1092  	}
  1093  
  1094  	s.sysmsgThreadsMu.Lock()
  1095  	s.sysmsgThreads[threadID] = sysThread
  1096  	s.sysmsgThreadsMu.Unlock()
  1097  
  1098  	return nil
  1099  }
  1100  
  1101  // PreFork implements platform.AddressSpace.PreFork.
  1102  // We need to take the usertrap lock to be sure that fork() will not be in the
  1103  // middle of applying a binary patch.
  1104  func (s *subprocess) PreFork() {
  1105  	s.usertrap.PreFork()
  1106  }
  1107  
  1108  // PostFork implements platform.AddressSpace.PostFork.
  1109  func (s *subprocess) PostFork() {
  1110  	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
  1111  }
  1112  
  1113  // activateContext activates the context in this subprocess.
  1114  // No-op if the context is already active within the subprocess; if not,
  1115  // deactivates it from its last subprocess.
  1116  func (s *subprocess) activateContext(c *context) error {
  1117  	if !c.sharedContext.isActiveInSubprocess(s) {
  1118  		c.sharedContext.release()
  1119  		c.sharedContext = nil
  1120  
  1121  		shared, err := s.getSharedContext()
  1122  		if err != nil {
  1123  			return err
  1124  		}
  1125  		c.sharedContext = shared
  1126  	}
  1127  	return nil
  1128  }