github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/systrap/subprocess.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package systrap
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"runtime"
    21  	"sync"
    22  	"sync/atomic"
    23  
    24  	"golang.org/x/sys/unix"
    25  	"github.com/metacubex/gvisor/pkg/abi/linux"
    26  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    27  	"github.com/metacubex/gvisor/pkg/hostarch"
    28  	"github.com/metacubex/gvisor/pkg/log"
    29  	"github.com/metacubex/gvisor/pkg/pool"
    30  	"github.com/metacubex/gvisor/pkg/seccomp"
    31  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    32  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    33  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    34  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    35  	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/sysmsg"
    36  	"github.com/metacubex/gvisor/pkg/sentry/platform/systrap/usertrap"
    37  	"github.com/metacubex/gvisor/pkg/sentry/usage"
    38  )
    39  
var (
	// globalPool tracks all subprocesses in various state: active or available for
	// reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the initial attempt link address for the stub.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits.
	// The mask keeps the offset page-aligned.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because
	// * we don't want to reveal a stub address.
	// * it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)
    64  
// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// These constants are only used in subprocess.go.
const (
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)
    74  
// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	// tgid is the thread group ID (host PID) of the stub process.
	tgid int32
	// tid is the host thread ID within tgid.
	tid int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers
}
    90  
// requestThread is used to request a new sysmsg thread. A thread identifier will
// be sent into the thread channel; nil is sent if creation fails.
type requestThread struct {
	thread chan *thread
}
    96  
// requestStub is used to request a new stub process. The first thread of the
// new stub process (or nil on failure) is sent into done.
type requestStub struct {
	done chan *thread
}
   101  
// maxSysmsgThreads specifies the maximum number of system threads that a
// subprocess can create in context decoupled mode.
// Initialized from GOMAXPROCS at startup.
// TODO(b/268366549): Replace maxSystemThreads below.
var maxSysmsgThreads = runtime.GOMAXPROCS(0)
   106  
const (
	// maxSystemThreads specifies the maximum number of system threads that a
	// subprocess may create in order to process the contexts.
	maxSystemThreads = 4096
	// maxGuestContexts specifies the maximum number of task contexts that a
	// subprocess can handle.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being worked on by
	// any sysmsg thread.
	invalidThreadID uint32 = 0xfefefefe
)
   120  
// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads. It carries
	// requestThread and requestStub values; see handlePtraceSyscallRequest.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// platformContext.lastFaultSP == this subprocess.
	faultedContexts map[*platformContext]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the ThreadContext memory region start
	// within the sentry address space.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	// syscallThreadMu protects syscallThread.
	syscallThreadMu sync.Mutex
	syscallThread   *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use a
	// counter instead of using len(sysmsgThreads) because we need to synchronize
	// how many threads get created _before_ the creation happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch back to
	// user mode.
	contextQueue *contextQueue

	// dead indicates whether the subprocess is alive or not.
	dead atomicbitops.Bool
}
   177  
   178  func (s *subprocess) initSyscallThread(ptraceThread *thread) error {
   179  	s.syscallThreadMu.Lock()
   180  	defer s.syscallThreadMu.Unlock()
   181  
   182  	id, ok := s.sysmsgStackPool.Get()
   183  	if !ok {
   184  		panic("unable to allocate a sysmsg stub thread")
   185  	}
   186  
   187  	ptraceThread.sysmsgStackID = id
   188  	t := syscallThread{
   189  		subproc: s,
   190  		thread:  ptraceThread,
   191  	}
   192  
   193  	if err := t.init(); err != nil {
   194  		panic(fmt.Sprintf("failed to create a syscall thread"))
   195  	}
   196  	s.syscallThread = &t
   197  
   198  	s.syscallThread.detach()
   199  
   200  	return nil
   201  }
   202  
   203  func handlePtraceSyscallRequestError(req any, format string, values ...any) {
   204  	switch req.(type) {
   205  	case requestThread:
   206  		req.(requestThread).thread <- nil
   207  	case requestStub:
   208  		req.(requestStub).done <- nil
   209  	}
   210  	log.Warningf("handlePtraceSyscallRequest failed: "+format, values...)
   211  }
   212  
   213  // handlePtraceSyscallRequest executes system calls that can't be run via
   214  // syscallThread without using ptrace. Look at the description of syscallThread
   215  // to get more details about its limitations.
   216  func (s *subprocess) handlePtraceSyscallRequest(req any) {
   217  	s.syscallThreadMu.Lock()
   218  	defer s.syscallThreadMu.Unlock()
   219  	runtime.LockOSThread()
   220  	defer runtime.UnlockOSThread()
   221  	if err := s.syscallThread.attach(); err != nil {
   222  		handlePtraceSyscallRequestError(req, err.Error())
   223  		return
   224  	}
   225  	defer s.syscallThread.detach()
   226  
   227  	ptraceThread := s.syscallThread.thread
   228  
   229  	switch r := req.(type) {
   230  	case requestThread:
   231  		t, err := ptraceThread.clone()
   232  		if err != nil {
   233  			handlePtraceSyscallRequestError(req, "error initializing thread: %v", err)
   234  			return
   235  		}
   236  
   237  		// Since the new thread was created with
   238  		// clone(CLONE_PTRACE), it will begin execution with
   239  		// SIGSTOP pending and with this thread as its tracer.
   240  		// (Hopefully nobody tgkilled it with a signal <
   241  		// SIGSTOP before the SIGSTOP was delivered, in which
   242  		// case that signal would be delivered before SIGSTOP.)
   243  		if sig := t.wait(stopped); sig != unix.SIGSTOP {
   244  			handlePtraceSyscallRequestError(req, "error waiting for new clone: expected SIGSTOP, got %v", sig)
   245  			return
   246  		}
   247  
   248  		t.initRegs = ptraceThread.initRegs
   249  		// Set the parent death signal to SIGKILL.
   250  		_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
   251  			arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
   252  			arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
   253  			arch.SyscallArgument{Value: 0},
   254  			arch.SyscallArgument{Value: 0},
   255  			arch.SyscallArgument{Value: 0},
   256  			arch.SyscallArgument{Value: 0},
   257  		)
   258  		if err != nil {
   259  			handlePtraceSyscallRequestError(req, "prctl: %v", err)
   260  			return
   261  		}
   262  
   263  		id, ok := s.sysmsgStackPool.Get()
   264  		if !ok {
   265  			handlePtraceSyscallRequestError(req, "unable to allocate a sysmsg stub thread")
   266  			return
   267  		}
   268  		t.sysmsgStackID = id
   269  
   270  		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
   271  			handlePtraceSyscallRequestError(req, "tkill failed: %v", e)
   272  			return
   273  		}
   274  
   275  		// Detach the thread.
   276  		t.detach()
   277  
   278  		// Return the thread.
   279  		r.thread <- t
   280  	case requestStub:
   281  		t, err := ptraceThread.createStub()
   282  		if err != nil {
   283  			handlePtraceSyscallRequestError(req, "unable to create a stub process: %v", err)
   284  			return
   285  		}
   286  		r.done <- t
   287  
   288  	}
   289  }
   290  
// newSubprocess returns a usable subprocess.
//
// This will either be a newly created subprocess, or one from the global pool.
// The create function will be called in the latter case, which is guaranteed
// to happen with the runtime thread locked.
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile) (*subprocess, error) {
	// Fast path: reuse a pooled subprocess; only the refcount and usertrap
	// state need to be re-initialized.
	if sp := globalPool.fetchAvailable(); sp != nil {
		sp.subprocessRefs.InitRefs()
		sp.usertrap = usertrap.New()
		return sp, nil
	}

	// The following goroutine is responsible for creating the first traced
	// thread, and responding to requests to make additional threads in the
	// traced process.
	//
	// NOTE(review): the goroutine below panics if requests is ever closed,
	// and Release returns the subprocess to the global pool rather than
	// closing the channel — the "killed and reaped on close" behavior this
	// comment used to describe appears historical; confirm.
	requests := make(chan any)

	// Ready.
	sp := &subprocess{
		requests:          requests,
		faultedContexts:   make(map[*platformContext]struct{}),
		sysmsgStackPool:   pool.Pool{Start: 0, Limit: maxSystemThreads},
		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
		memoryFile:        memoryFile,
		sysmsgThreads:     make(map[uint32]*sysmsgThread),
	}
	sp.subprocessRefs.InitRefs()
	// create() must run with the runtime thread locked (see doc comment).
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Initialize the syscall thread.
	ptraceThread, err := create()
	if err != nil {
		return nil, err
	}
	sp.sysmsgInitRegs = ptraceThread.initRegs

	if err := sp.initSyscallThread(ptraceThread); err != nil {
		return nil, err
	}

	go func() { // S/R-SAFE: Platform-related.

		// Wait for requests to create threads.
		for req := range requests {
			sp.handlePtraceSyscallRequest(req)
		}

		// Requests should never be closed.
		panic("unreachable")
	}()

	// Unmap the non-stub address space, then set up the shared and private
	// regions used by sysmsg threads.
	sp.unmap()
	sp.usertrap = usertrap.New()
	sp.mapSharedRegions()
	sp.mapPrivateRegions()

	// Create the initial sysmsg thread.
	atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
	if err := sp.createSysmsgThread(); err != nil {
		return nil, err
	}
	sp.numSysmsgThreads++

	return sp, nil
}
   358  
   359  // mapSharedRegions maps the shared regions that are used between the subprocess
   360  // and ALL of the subsequently created sysmsg threads into both the sentry and
   361  // the syscall thread.
   362  //
   363  // Should be called before any sysmsg threads are created.
   364  // Initializes s.contextQueue and s.threadContextRegion.
   365  func (s *subprocess) mapSharedRegions() {
   366  	if s.contextQueue != nil || s.threadContextRegion != 0 {
   367  		panic("contextQueue or threadContextRegion was already initialized")
   368  	}
   369  
   370  	opts := pgalloc.AllocOpts{
   371  		Kind: usage.System,
   372  		Dir:  pgalloc.TopDown,
   373  	}
   374  
   375  	// Map shared regions into the sentry.
   376  	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
   377  	contextQueue.init()
   378  
   379  	// Map thread context region into the syscall thread.
   380  	_, err := s.syscallThread.syscall(
   381  		unix.SYS_MMAP,
   382  		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
   383  		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
   384  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   385  		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
   386  		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
   387  		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
   388  	if err != nil {
   389  		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
   390  	}
   391  
   392  	s.contextQueue = contextQueue
   393  
   394  	// Map thread context region into the sentry.
   395  	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
   396  	if err != nil {
   397  		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region"))
   398  	}
   399  	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
   400  		unix.SYS_MMAP,
   401  		0,
   402  		uintptr(threadContextFR.Length()),
   403  		unix.PROT_WRITE|unix.PROT_READ,
   404  		unix.MAP_SHARED|unix.MAP_FILE,
   405  		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
   406  	if errno != 0 {
   407  		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
   408  	}
   409  
   410  	// Map thread context region into the syscall thread.
   411  	if _, err := s.syscallThread.syscall(
   412  		unix.SYS_MMAP,
   413  		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
   414  		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
   415  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   416  		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
   417  		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
   418  		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
   419  		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
   420  	}
   421  
   422  	s.threadContextRegion = sentryThreadContextRegionAddr
   423  }
   424  
// mapPrivateRegions maps regions private to each stub process: currently only
// the anonymous spinning-queue region, mapped at a fixed stub address in the
// syscall thread. Panics on failure.
func (s *subprocess) mapPrivateRegions() {
	_, err := s.syscallThread.syscall(
		unix.SYS_MMAP,
		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
		arch.SyscallArgument{Value: 0},
		arch.SyscallArgument{Value: 0})
	if err != nil {
		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
	}
}
   438  
// unmap unmaps non-stub regions of the process: everything below stubStart and
// (when the stub doesn't extend to the end of the address space) everything
// between stubEnd and maximumUserAddress.
//
// This will panic on failure (which should never happen).
func (s *subprocess) unmap() {
	s.Unmap(0, uint64(stubStart))
	if maximumUserAddress != stubEnd {
		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
	}
}
   448  
// Release kills the subprocess.
//
// Just kidding! We can't safely coordinate the detaching of all the
// tracees (since the tracers are random runtime threads, and the process
// won't exit until tracers have been notified).
//
// Therefore we simply unmap everything in the subprocess and return it to the
// globalPool. This has the added benefit of reducing creation time for new
// subprocesses.
func (s *subprocess) Release() {
	// A dead subprocess has nothing to recycle.
	if !s.alive() {
		return
	}
	s.unmap()
	s.DecRef(s.release)
}
   465  
   466  // release returns the subprocess to the global pool.
   467  func (s *subprocess) release() {
   468  	if s.alive() {
   469  		globalPool.markAvailable(s)
   470  	}
   471  }
   472  
// attach attaches to the thread with PTRACE_ATTACH, waits for the resulting
// SIGSTOP stop, and sets the trace options via t.init.
func (t *thread) attach() error {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
		return fmt.Errorf("unable to attach: %v", errno)
	}

	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
	// newSubprocess), so we always expect to see signal-delivery-stop with
	// SIGSTOP.
	if sig := t.wait(stopped); sig != unix.SIGSTOP {
		return fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
	}

	// Initialize options.
	t.init()
	return nil
}
   491  
// grabInitRegs snapshots the thread's registers into t.initRegs, adjusted so
// they can be reused as the base register set for injected system calls.
func (t *thread) grabInitRegs() {
	// Grab registers.
	//
	// Note that we adjust the current register RIP value to be just before
	// the current system call executed. This depends on the definition of
	// the stub itself.
	if err := t.getRegs(&t.initRegs); err != nil {
		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
	}
	t.adjustInitRegsRip()
	// The stack pointer is cleared; callers provide their own when needed.
	t.initRegs.SetStackPointer(0)
}
   504  
// detach detaches from the thread, delivering the queued SIGSTOP.
//
// Because the SIGSTOP is not suppressed, the thread will enter group-stop.
func (t *thread) detach() {
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}
}
   513  
// waitOutcome is used for wait below. It selects which wait status the caller
// expects: a stop event or process death.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed.
	killed
)
   524  
   525  func (t *thread) Debugf(format string, v ...any) {
   526  	prefix := fmt.Sprintf("%8d:", t.tid)
   527  	log.DebugfAtDepth(1, prefix+format, v...)
   528  }
   529  
   530  func (t *thread) dumpAndPanic(message string) {
   531  	var regs arch.Registers
   532  	message += "\n"
   533  	if err := t.getRegs(&regs); err == nil {
   534  		message += dumpRegs(&regs)
   535  	} else {
   536  		log.Warningf("unable to get registers: %v", err)
   537  	}
   538  	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
   539  	panic(message)
   540  }
   541  
   542  func (t *thread) dumpRegs(message string) {
   543  	var regs arch.Registers
   544  	message += "\n"
   545  	if err := t.getRegs(&regs); err == nil {
   546  		message += dumpRegs(&regs)
   547  	} else {
   548  		log.Warningf("unable to get registers: %v", err)
   549  	}
   550  	log.Infof("%s", message)
   551  }
   552  
// unexpectedStubExit handles a PTRACE_EVENT_EXIT from the stub: a SIGKILL-ed
// stub takes down the whole sentry process (by re-sending SIGKILL to
// ourselves); any other exit cause panics with diagnostics.
func (t *thread) unexpectedStubExit() {
	msg, err := t.getEventMessage()
	status := unix.WaitStatus(msg)
	if status.Signaled() && status.Signal() == unix.SIGKILL {
		// SIGKILL can be only sent by a user or OOM-killer. In both
		// these cases, we don't need to panic. There is no reason to
		// think that something is wrong in gVisor.
		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
		pid := os.Getpid()
		// Kill ourselves; this line does not return on success, so the
		// dumpAndPanic below only runs for non-SIGKILL exits (or if the
		// tgkill itself fails).
		unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL))
	}
	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
}
   566  
// wait waits for a stop event.
//
// For outcome == stopped it returns the stop signal (with the ptrace trap
// cause encoded in the high bits for SIGTRAP); for outcome == killed it
// returns the exit status as a signal.
//
// Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal {
	var status unix.WaitStatus

	for {
		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
		if err == unix.EINTR || err == unix.EAGAIN {
			// Wait was interrupted; wait again.
			continue
		} else if err != nil {
			panic(fmt.Sprintf("ptrace wait failed: %v", err))
		}
		if int(r) != int(t.tid) {
			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
		}
		switch outcome {
		case stopped:
			if !status.Stopped() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
			}
			stopSig := status.StopSignal()
			if stopSig == 0 {
				continue // Spurious stop.
			}
			if stopSig == unix.SIGTRAP {
				// A PTRACE_EVENT_EXIT trap means the stub is dying
				// unexpectedly; this does not return.
				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
					t.unexpectedStubExit()
				}
				// Re-encode the trap cause the way it's expected.
				return stopSig | unix.Signal(status.TrapCause()<<8)
			}
			// Not a trap signal.
			return stopSig
		case killed:
			if !status.Exited() && !status.Signaled() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
			}
			return unix.Signal(status.ExitStatus())
		default:
			// Should not happen.
			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
		}
	}
}
   613  
// destroy kills the thread: detach (delivering the pending SIGSTOP), SIGKILL
// it, and wait for it to die.
//
// Note that this should not be used in the general case; the death of threads
// will typically cause the death of the parent. This is a utility method for
// manually created threads.
func (t *thread) destroy() {
	t.detach()
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
	t.wait(killed)
}
   624  
// init initializes trace options on an attached thread.
func (t *thread) init() {
	// Set the TRACESYSGOOD option to differentiate real SIGTRAP.
	// Set PTRACE_O_EXITKILL to ensure that the unexpected exit of the
	// sentry will immediately kill the associated stubs.
	// Set PTRACE_O_TRACEEXIT so stub deaths surface as PTRACE_EVENT_EXIT
	// traps (handled in wait/unexpectedStubExit).
	_, _, errno := unix.RawSyscall6(
		unix.SYS_PTRACE,
		unix.PTRACE_SETOPTIONS,
		uintptr(t.tid),
		0,
		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
		0, 0)
	if errno != 0 {
		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
	}
}
   641  
   642  // syscall executes a system call cycle in the traced context.
   643  //
   644  // This is _not_ for use by application system calls, rather it is for use when
   645  // a system call must be injected into the remote context (e.g. mmap, munmap).
   646  // Note that clones are handled separately.
   647  func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
   648  	// Set registers.
   649  	if err := t.setRegs(regs); err != nil {
   650  		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
   651  	}
   652  
   653  	for {
   654  		// Execute the syscall instruction. The task has to stop on the
   655  		// trap instruction which is right after the syscall
   656  		// instruction.
   657  		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
   658  			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
   659  		}
   660  
   661  		sig := t.wait(stopped)
   662  		if sig == unix.SIGTRAP {
   663  			// Reached syscall-enter-stop.
   664  			break
   665  		} else {
   666  			// Some other signal caused a thread stop; ignore.
   667  			if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
   668  				log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
   669  			}
   670  			continue
   671  		}
   672  	}
   673  
   674  	// Grab registers.
   675  	if err := t.getRegs(regs); err != nil {
   676  		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
   677  	}
   678  	return syscallReturnValue(regs)
   679  }
   680  
   681  // syscallIgnoreInterrupt ignores interrupts on the system call thread and
   682  // restarts the syscall if the kernel indicates that should happen.
   683  func (t *thread) syscallIgnoreInterrupt(
   684  	initRegs *arch.Registers,
   685  	sysno uintptr,
   686  	args ...arch.SyscallArgument) (uintptr, error) {
   687  	for {
   688  		regs := createSyscallRegs(initRegs, sysno, args...)
   689  		rval, err := t.syscall(&regs)
   690  		switch err {
   691  		case ERESTARTSYS:
   692  			continue
   693  		case ERESTARTNOINTR:
   694  			continue
   695  		case ERESTARTNOHAND:
   696  			continue
   697  		default:
   698  			return rval, err
   699  		}
   700  	}
   701  }
   702  
// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt. It delivers
// the platform interrupt signal directly to the thread.
func (t *thread) NotifyInterrupt() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
}
   707  
// incAwakeContexts increments the number of awake contexts in this
// subprocess, and bumps the global fast-path thread target while the count
// stays at or below maxSysmsgThreads (beyond that, extra awake contexts don't
// get additional stub threads).
func (s *subprocess) incAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
	if nr > uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(1)
}
   715  
// decAwakeContexts decrements the number of awake contexts (the add of
// ^uint32(0) is a wrapping subtraction of 1), undoing incAwakeContexts'
// contribution to the global fast-path thread target. The >= comparison
// mirrors incAwakeContexts' > on the pre-decrement value.
func (s *subprocess) decAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
	if nr >= uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(^uint32(0))
}
   723  
// switchToApp is called from the main SwitchToApp entrypoint.
//
// This function returns true on a system call, false on a signal.
// The second return value is true if a syscall instruction can be replaced on
// a function call.
func (s *subprocess) switchToApp(c *platformContext, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err *platform.ContextError) {
	// Reset necessary registers and publish the register state into the
	// shared (sentry/stub) context memory.
	regs := &ac.StateData().Regs
	s.resetSysemuRegs(regs)
	ctx := c.sharedContext
	ctx.shared.Regs = regs.PtraceRegs
	restoreArchSpecificState(ctx.shared, ac)

	// Check for interrupts, and ensure that future interrupts signal the context.
	if !c.interrupt.Enable(c.sharedContext) {
		// Pending interrupt; simulate.
		ctx.clearInterrupt()
		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
		return false, false, nil
	}
	defer func() {
		ctx.clearInterrupt()
		c.interrupt.Disable()
	}()

	restoreFPState(ctx, c, ac)

	// Place the context onto the context queue.
	if ctx.sleeping {
		ctx.sleeping = false
		s.incAwakeContexts()
	}
	ctx.setState(sysmsg.ContextStateNone)
	if err := s.contextQueue.add(ctx); err != nil {
		return false, false, err
	}

	// Block until the context leaves ContextStateNone.
	if err := s.waitOnState(ctx); err != nil {
		return false, false, corruptedSharedMemoryErr(err.Error())
	}

	// Check if there's been an error. A valid thread ID here means a sysmsg
	// thread reported a failure for this context.
	threadID := ctx.threadID()
	if threadID != invalidThreadID {
		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
			return false, false, sysThread.msg.ConvertSysmsgErr()
		}
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID))
	}

	// Copy register state locally.
	regs.PtraceRegs = ctx.shared.Regs
	retrieveArchSpecificState(ctx.shared, ac)
	c.needToPullFullState = true
	// We have a signal. We verify however, that the signal was
	// either delivered from the kernel or from this process. We
	// don't respect other signals.
	c.signalInfo = ctx.shared.SignalInfo
	ctxState := ctx.state()
	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
		ctxState = sysmsg.ContextStateSyscall
		shouldPatchSyscall = true
	}

	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
		if maybePatchSignalInfo(regs, &c.signalInfo) {
			return false, false, nil
		}
		updateSyscallRegs(regs)
		return true, shouldPatchSyscall, nil
	} else if ctxState != sysmsg.ContextStateFault {
		// Any state other than syscall/trap/fault indicates shared-memory
		// corruption.
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("unknown context state: %v", ctxState))
	}

	return false, false, nil
}
   800  
// waitOnState blocks until ctx leaves sysmsg.ContextStateNone.
//
// It first spins via the dispatcher fast path, kicking a sysmsg thread when
// needed, and falls back to sleeping on the context state (slow path) when
// the dispatcher requests it. Returns an error only if sleepOnState fails.
func (s *subprocess) waitOnState(ctx *sharedContext) error {
	ctx.kicked = false
	slowPath := false
	// Kick a thread up-front if the fast path is off or nothing is running.
	if !s.contextQueue.fastPathEnabled() || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				// Kick at most once; an acked context is already
				// being picked up, so no kick is needed.
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake then it knows it's being
			// worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			if err := ctx.sleepOnState(curState); err != nil {
				return err
			}
		}
	}

	ctx.recordLatency()
	ctx.resetLatencyMeasures()
	ctx.enableSentryFastPath()

	return nil
}
   846  
   847  // canKickSysmsgThread returns true if a new thread can be kicked.
   848  // The second return value is the expected number of threads after kicking a
   849  // new one.
   850  func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
   851  	// numActiveContexts and numActiveThreads can be changed from stub
   852  	// threads that handles the contextQueue without any locks. The idea
   853  	// here is that any stub thread that gets CPU time can make some
   854  	// progress. In stub threads, we can use only spinlock-like
   855  	// synchronizations, but they don't work well because a thread that
   856  	// holds a lock can be preempted by another thread that is waiting for
   857  	// the same lock.
   858  	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
   859  	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
   860  	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)
   861  
   862  	nrActiveThreads += nrThreadsToWakeup + 1
   863  	if nrActiveThreads > nrActiveContexts {
   864  		// This can happen when one or more stub threads are
   865  		// waiting for cpu time. The host probably has more
   866  		// running tasks than a number of cpu-s.
   867  		return false, nrActiveThreads
   868  	}
   869  	return true, nrActiveThreads
   870  }
   871  
   872  // kickSysmsgThread returns true if it was able to wake up or create a new sysmsg
   873  // stub thread.
   874  func (s *subprocess) kickSysmsgThread() bool {
   875  	kick, _ := s.canKickSysmsgThread()
   876  	if !kick {
   877  		return false
   878  	}
   879  
   880  	s.sysmsgThreadsMu.Lock()
   881  	kick, nrThreads := s.canKickSysmsgThread()
   882  	if !kick {
   883  		s.sysmsgThreadsMu.Unlock()
   884  		return false
   885  	}
   886  	numTimesStubKicked.Increment()
   887  	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
   888  	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
   889  		s.numSysmsgThreads++
   890  		s.sysmsgThreadsMu.Unlock()
   891  		if err := s.createSysmsgThread(); err != nil {
   892  			log.Warningf("Unable to create a new stub thread: %s", err)
   893  			s.sysmsgThreadsMu.Lock()
   894  			s.numSysmsgThreads--
   895  			s.sysmsgThreadsMu.Unlock()
   896  		}
   897  	} else {
   898  		s.sysmsgThreadsMu.Unlock()
   899  	}
   900  	s.contextQueue.wakeupSysmsgThread()
   901  
   902  	return true
   903  }
   904  
// syscall executes the given system call without handling interruptions.
// It serializes access to the dedicated syscall thread, which performs the
// call inside the stub process.
func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
	s.syscallThreadMu.Lock()
	defer s.syscallThreadMu.Unlock()

	return s.syscallThread.syscall(sysno, args...)
}
   912  
   913  // MapFile implements platform.AddressSpace.MapFile.
   914  func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
   915  	var flags int
   916  	if precommit {
   917  		flags |= unix.MAP_POPULATE
   918  	}
   919  	_, err := s.syscall(
   920  		unix.SYS_MMAP,
   921  		arch.SyscallArgument{Value: uintptr(addr)},
   922  		arch.SyscallArgument{Value: uintptr(fr.Length())},
   923  		arch.SyscallArgument{Value: uintptr(at.Prot())},
   924  		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
   925  		arch.SyscallArgument{Value: uintptr(f.FD())},
   926  		arch.SyscallArgument{Value: uintptr(fr.Start)})
   927  	return err
   928  }
   929  
   930  // Unmap implements platform.AddressSpace.Unmap.
   931  func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
   932  	ar, ok := addr.ToRange(length)
   933  	if !ok {
   934  		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
   935  	}
   936  	s.mu.Lock()
   937  	for c := range s.faultedContexts {
   938  		c.mu.Lock()
   939  		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
   940  			// Forget the last fault so that if c faults again, the fault isn't
   941  			// incorrectly reported as a write fault. If this is being called
   942  			// due to munmap() of the corresponding vma, handling of the second
   943  			// fault will fail anyway.
   944  			c.lastFaultSP = nil
   945  			delete(s.faultedContexts, c)
   946  		}
   947  		c.mu.Unlock()
   948  	}
   949  	s.mu.Unlock()
   950  	_, err := s.syscall(
   951  		unix.SYS_MUNMAP,
   952  		arch.SyscallArgument{Value: uintptr(addr)},
   953  		arch.SyscallArgument{Value: uintptr(length)})
   954  	if err != nil && err != errDeadSubprocess {
   955  		// We never expect this to happen.
   956  		panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
   957  	}
   958  }
   959  
// PullFullState copies the full architectural state (including FP state)
// from the context's shared memory region into ac. The context must
// currently be bound to this subprocess; its shared region is only
// meaningful while it is.
func (s *subprocess) PullFullState(c *platformContext, ac *arch.Context64) error {
	if !c.sharedContext.isActiveInSubprocess(s) {
		panic("Attempted to PullFullState for context that is not used in subprocess")
	}
	saveFPState(c.sharedContext, ac)
	return nil
}
   967  
var (
	// sysmsgThreadPriorityOnce guards the one-time initialization of
	// sysmsgThreadPriority in initSysmsgThreadPriority.
	sysmsgThreadPriorityOnce sync.Once

	// sysmsgThreadPriority is the scheduling priority assigned to sysmsg
	// stub threads (one lower than the Sentry's own priority).
	sysmsgThreadPriority int
)
   972  
   973  // initSysmsgThreadPriority looks at the current priority of the process
   974  // and updates `sysmsgThreadPriority` accordingly.
   975  func initSysmsgThreadPriority() {
   976  	sysmsgThreadPriorityOnce.Do(func() {
   977  		prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
   978  		if err != nil {
   979  			panic("unable to get current scheduling priority")
   980  		}
   981  		// Sysmsg threads are executed with a priority one lower than the Sentry.
   982  		sysmsgThreadPriority = 20 - prio + 1
   983  	})
   984  }
   985  
// createSysmsgThread creates a new sysmsg thread.
// The thread starts processing any available context in the context queue.
//
// The sequence is: clone a new stub thread, attach to it with ptrace,
// allocate and map its per-thread stacks (into both the sentry and the stub),
// initialize its sysmsg message block, install the seccomp filter, set its
// registers to enter the syshandler blob, and finally detach so it can run.
func (s *subprocess) createSysmsgThread() error {
	// Create a new seccomp process.
	var r requestThread
	r.thread = make(chan *thread)
	s.requests <- r
	p := <-r.thread
	if p == nil {
		return fmt.Errorf("createSysmsgThread: failed to get clone")
	}

	// ptrace attach/wait below must all happen from the same OS thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	if err := p.attach(); err != nil {
		return err
	}

	// Skip SIGSTOP.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
	}
	sig := p.wait(stopped)
	if sig != unix.SIGSTOP {
		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
	}

	// Allocate a new stack for the BPF process.
	opts := pgalloc.AllocOpts{
		Kind: usage.System,
		Dir:  pgalloc.TopDown,
	}
	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
	if err != nil {
		// TODO(b/144063246): Need to fail the clone system call.
		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
	}
	sysThread := &sysmsgThread{
		thread:     p,
		subproc:    s,
		stackRange: fr,
	}
	// Use the sysmsgStackID as a handle on this thread instead of host tid in
	// order to be able to reliably specify invalidThreadID.
	threadID := uint32(p.sysmsgStackID)

	// Map the stack into the sentry.
	sentryStackAddr, _, errno := unix.RawSyscall6(
		unix.SYS_MMAP,
		0,
		sysmsg.PerThreadSharedStackSize,
		unix.PROT_WRITE|unix.PROT_READ,
		unix.MAP_SHARED|unix.MAP_FILE,
		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
	if errno != 0 {
		panic(fmt.Sprintf("mmap failed: %v", errno))
	}

	// Before installing the stub syscall filters, we need to call a few
	// system calls (e.g. sigaltstack, sigaction) which have in-memory
	// arguments.  We need to prevent changing these parameters by other
	// stub threads, so lets map the future BPF stack as read-only and
	// fill syscall arguments from the Sentry.
	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
	err = sysThread.mapStack(sysmsgStackAddr, true)
	if err != nil {
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	sysThread.init(sentryStackAddr, sysmsgStackAddr)

	// Remap the shared stack into the BPF process, now writable.
	err = sysThread.mapStack(sysmsgStackAddr, false)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	// Map the private (per-thread, non-shared) stack into the BPF process.
	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
	if err != nil {
		s.memoryFile.DecRef(fr)
		panic(fmt.Sprintf("mmap failed: %v", err))
	}

	// Wire up the message block that the sentry and the stub thread share.
	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
	sysThread.msg.Init(threadID)
	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))

	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)

	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
		log.Warningf("Unable to change priority of a stub thread: %s", err)
	}

	// Install a pre-compiled seccomp rules for the BPF process.
	// PR_SET_NO_NEW_PRIVS is required to install a filter without CAP_SYS_ADMIN.
	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
		arch.SyscallArgument{Value: uintptr(1)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: uintptr(0)})
	if err != nil {
		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
	}

	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
		arch.SyscallArgument{Value: uintptr(0)},
		arch.SyscallArgument{Value: stubSysmsgRules})
	if err != nil {
		panic(fmt.Sprintf("seccomp failed: %v", err))
	}

	// Prepare to start the BPF process.
	tregs := &arch.Registers{}
	s.resetSysemuRegs(tregs)
	setArchSpecificRegs(sysThread, tregs)
	if err := p.setRegs(tregs); err != nil {
		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
	}
	archSpecificSysmsgThreadInit(sysThread)
	// Send SIGCONT so the thread resumes past the earlier SIGSTOP.
	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
		panic(fmt.Sprintf("tkill failed: %v", e))
	}
	// Resume the BPF process.
	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
		panic(fmt.Sprintf("can't detach new clone: %v", errno))
	}

	s.sysmsgThreadsMu.Lock()
	s.sysmsgThreads[threadID] = sysThread
	s.sysmsgThreadsMu.Unlock()

	return nil
}
  1127  
// PreFork implements platform.AddressSpace.PreFork.
// We need to take the usertrap lock to be sure that fork() will not be in the
// middle of applying a binary patch.
func (s *subprocess) PreFork() {
	s.usertrap.PreFork()
}
  1134  
// PostFork implements platform.AddressSpace.PostFork.
// It releases the usertrap lock taken by PreFork.
func (s *subprocess) PostFork() {
	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
}
  1139  
  1140  // activateContext activates the context in this subprocess.
  1141  // No-op if the context is already active within the subprocess; if not,
  1142  // deactivates it from its last subprocess.
  1143  func (s *subprocess) activateContext(c *platformContext) error {
  1144  	if !c.sharedContext.isActiveInSubprocess(s) {
  1145  		c.sharedContext.release()
  1146  		c.sharedContext = nil
  1147  
  1148  		shared, err := s.getSharedContext()
  1149  		if err != nil {
  1150  			return err
  1151  		}
  1152  		c.sharedContext = shared
  1153  	}
  1154  	return nil
  1155  }