gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/subprocess.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package systrap
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"runtime"
    21  	"sync"
    22  	"sync/atomic"
    23  
    24  	"golang.org/x/sys/unix"
    25  	"gvisor.dev/gvisor/pkg/abi/linux"
    26  	"gvisor.dev/gvisor/pkg/atomicbitops"
    27  	"gvisor.dev/gvisor/pkg/hostarch"
    28  	"gvisor.dev/gvisor/pkg/log"
    29  	"gvisor.dev/gvisor/pkg/pool"
    30  	"gvisor.dev/gvisor/pkg/seccomp"
    31  	"gvisor.dev/gvisor/pkg/sentry/arch"
    32  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    33  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    34  	"gvisor.dev/gvisor/pkg/sentry/platform"
    35  	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg"
    36  	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap"
    37  	"gvisor.dev/gvisor/pkg/sentry/usage"
    38  )
    39  
var (
	// globalPool tracks all subprocesses in various state: active or available for
	// reuse.
	globalPool = subprocessPool{}

	// maximumUserAddress is the largest possible user address.
	maximumUserAddress = linux.TaskSize

	// stubInitAddress is the initial attempt link address for the stub.
	stubInitAddress = linux.TaskSize

	// maxRandomOffsetOfStubAddress is the maximum offset for randomizing a
	// stub address. It is set to the default value of mm.mmap_rnd_bits.
	//
	// The mask keeps the offset page-aligned.
	//
	// Note: Tools like ThreadSanitizer don't like when the memory layout
	// is changed significantly.
	maxRandomOffsetOfStubAddress = (linux.TaskSize >> 7) & ^(uintptr(hostarch.PageSize) - 1)

	// maxStubUserAddress is the largest possible user address for
	// processes running inside gVisor. It is fixed because
	// * we don't want to reveal a stub address.
	// * it has to be the same across checkpoint/restore.
	maxStubUserAddress = maximumUserAddress - maxRandomOffsetOfStubAddress
)
    64  
// Linux kernel errnos which "should never be seen by user programs", but will
// be revealed to ptrace syscall exit tracing.
//
// These constants are only used in subprocess.go.
const (
	// The numeric values (512-514) mirror the Linux-internal restart errnos.
	ERESTARTSYS    = unix.Errno(512)
	ERESTARTNOINTR = unix.Errno(513)
	ERESTARTNOHAND = unix.Errno(514)
)
    74  
// thread is a traced thread; it is a thread identifier.
//
// This is a convenience type for defining ptrace operations.
type thread struct {
	// tgid is the thread group ID (stub process PID) of the traced thread.
	tgid int32
	// tid is the host thread ID; all ptrace and tgkill operations target it.
	tid  int32

	// sysmsgStackID is a stack ID in subprocess.sysmsgStackPool.
	sysmsgStackID uint64

	// initRegs are the initial registers for the first thread.
	//
	// These are used for the register set for system calls.
	initRegs arch.Registers

	// logPrefix caches the "[tgid:tid] " string prepended to log messages;
	// it is lazily initialized by loadLogPrefix.
	logPrefix atomic.Pointer[string]
}
    92  
// requestThread is used to request a new sysmsg thread. A thread identifier will
// be sent into the thread channel.
type requestThread struct {
	// thread receives the new thread, or nil on failure.
	thread chan *thread
}
    98  
// requestStub is used to request a new stub process.
type requestStub struct {
	// done receives the stub's initial thread, or nil on failure.
	done chan *thread
}
   103  
// maxSysmsgThreads is the maximum number of sysmsg threads that a subprocess
// can create. It is based on GOMAXPROCS and set once, so it must be set after
// GOMAXPROCS has been adjusted (see loader.go:Args.NumCPU).
var maxSysmsgThreads = 0

// maxChildThreads is the max number of all child system threads that a
// subprocess can create, including sysmsg threads.
//
// NOTE(review): both variables appear to be configured once at startup and
// read-only afterwards — confirm against the loader before relying on this.
var maxChildThreads = 0
   112  
const (
	// maxGuestContexts specifies the maximum number of task contexts that a
	// subprocess can handle.
	maxGuestContexts = 4095
	// invalidContextID specifies an invalid ID.
	invalidContextID uint32 = 0xfefefefe
	// invalidThreadID is used to indicate that a context is not being worked on by
	// any sysmsg thread.
	//
	// Note: it shares the same sentinel value as invalidContextID.
	invalidThreadID uint32 = 0xfefefefe
)
   123  
// subprocess is a collection of threads being traced.
type subprocess struct {
	platform.NoAddressSpaceIO
	subprocessRefs

	// requests is used to signal creation of new threads.
	requests chan any

	// sysmsgInitRegs is used to reset sysemu regs.
	sysmsgInitRegs arch.Registers

	// mu protects the following fields.
	mu sync.Mutex

	// faultedContexts is the set of contexts for which it's possible that
	// platformContext.lastFaultSP == this subprocess.
	faultedContexts map[*platformContext]struct{}

	// sysmsgStackPool is a pool of available sysmsg stacks.
	sysmsgStackPool pool.Pool

	// threadContextPool is a pool of available sysmsg.ThreadContext IDs.
	threadContextPool pool.Pool

	// threadContextRegion defines the ThreadContext memory region start
	// within the sentry address space.
	threadContextRegion uintptr

	// memoryFile is used to allocate a sysmsg stack which is shared
	// between a stub process and the Sentry.
	memoryFile *pgalloc.MemoryFile

	// usertrap is the state of the usertrap table which contains syscall
	// trampolines.
	usertrap *usertrap.State

	// syscallThreadMu protects syscallThread.
	syscallThreadMu sync.Mutex
	syscallThread   *syscallThread

	// sysmsgThreadsMu protects sysmsgThreads and numSysmsgThreads
	sysmsgThreadsMu sync.Mutex
	// sysmsgThreads is a collection of all active sysmsg threads in the
	// subprocess.
	sysmsgThreads map[uint32]*sysmsgThread
	// numSysmsgThreads counts the number of active sysmsg threads; we use a
	// counter instead of using len(sysmsgThreads) because we need to synchronize
	// how many threads get created _before_ the creation happens.
	numSysmsgThreads int

	// contextQueue is a queue of all contexts that are ready to switch back to
	// user mode.
	contextQueue *contextQueue

	// dead is set to true once the subprocess is no longer usable; see the
	// alive/release logic below.
	dead atomicbitops.Bool
}
   180  
   181  func (s *subprocess) initSyscallThread(ptraceThread *thread, seccompNotify bool) error {
   182  	s.syscallThreadMu.Lock()
   183  	defer s.syscallThreadMu.Unlock()
   184  
   185  	id, ok := s.sysmsgStackPool.Get()
   186  	if !ok {
   187  		panic("unable to allocate a sysmsg stub thread")
   188  	}
   189  
   190  	ptraceThread.sysmsgStackID = id
   191  	t := syscallThread{
   192  		subproc: s,
   193  		thread:  ptraceThread,
   194  	}
   195  
   196  	if err := t.init(seccompNotify); err != nil {
   197  		panic(fmt.Sprintf("failed to create a syscall thread"))
   198  	}
   199  	s.syscallThread = &t
   200  
   201  	s.syscallThread.detach()
   202  
   203  	return nil
   204  }
   205  
   206  func handlePtraceSyscallRequestError(req any, format string, values ...any) {
   207  	switch req.(type) {
   208  	case requestThread:
   209  		req.(requestThread).thread <- nil
   210  	case requestStub:
   211  		req.(requestStub).done <- nil
   212  	}
   213  	log.Warningf("handlePtraceSyscallRequest failed: "+format, values...)
   214  }
   215  
   216  // handlePtraceSyscallRequest executes system calls that can't be run via
   217  // syscallThread without using ptrace. Look at the description of syscallThread
   218  // to get more details about its limitations.
   219  func (s *subprocess) handlePtraceSyscallRequest(req any) {
   220  	s.syscallThreadMu.Lock()
   221  	defer s.syscallThreadMu.Unlock()
   222  	runtime.LockOSThread()
   223  	defer runtime.UnlockOSThread()
   224  	if err := s.syscallThread.attach(); err != nil {
   225  		handlePtraceSyscallRequestError(req, err.Error())
   226  		return
   227  	}
   228  	defer s.syscallThread.detach()
   229  
   230  	ptraceThread := s.syscallThread.thread
   231  
   232  	switch r := req.(type) {
   233  	case requestThread:
   234  		t, err := ptraceThread.clone()
   235  		if err != nil {
   236  			handlePtraceSyscallRequestError(req, "error initializing thread: %v", err)
   237  			return
   238  		}
   239  
   240  		// Since the new thread was created with
   241  		// clone(CLONE_PTRACE), it will begin execution with
   242  		// SIGSTOP pending and with this thread as its tracer.
   243  		// (Hopefully nobody tgkilled it with a signal <
   244  		// SIGSTOP before the SIGSTOP was delivered, in which
   245  		// case that signal would be delivered before SIGSTOP.)
   246  		if sig := t.wait(stopped); sig != unix.SIGSTOP {
   247  			handlePtraceSyscallRequestError(req, "error waiting for new clone: expected SIGSTOP, got %v", sig)
   248  			return
   249  		}
   250  
   251  		t.initRegs = ptraceThread.initRegs
   252  		// Set the parent death signal to SIGKILL.
   253  		_, err = t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_PRCTL,
   254  			arch.SyscallArgument{Value: linux.PR_SET_PDEATHSIG},
   255  			arch.SyscallArgument{Value: uintptr(unix.SIGKILL)},
   256  			arch.SyscallArgument{Value: 0},
   257  			arch.SyscallArgument{Value: 0},
   258  			arch.SyscallArgument{Value: 0},
   259  			arch.SyscallArgument{Value: 0},
   260  		)
   261  		if err != nil {
   262  			handlePtraceSyscallRequestError(req, "prctl: %v", err)
   263  			return
   264  		}
   265  
   266  		id, ok := s.sysmsgStackPool.Get()
   267  		if !ok {
   268  			handlePtraceSyscallRequestError(req, "unable to allocate a sysmsg stub thread")
   269  			return
   270  		}
   271  		t.sysmsgStackID = id
   272  
   273  		if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(t.tgid), uintptr(t.tid), uintptr(unix.SIGSTOP)); e != 0 {
   274  			handlePtraceSyscallRequestError(req, "tkill failed: %v", e)
   275  			return
   276  		}
   277  
   278  		// Detach the thread.
   279  		t.detach()
   280  
   281  		// Return the thread.
   282  		r.thread <- t
   283  	case requestStub:
   284  		t, err := ptraceThread.createStub()
   285  		if err != nil {
   286  			handlePtraceSyscallRequestError(req, "unable to create a stub process: %v", err)
   287  			return
   288  		}
   289  		r.done <- t
   290  
   291  	}
   292  }
   293  
// newSubprocess returns a usable subprocess.
//
// This will either be a newly created subprocess, or one from the global pool.
// The create function will be called in the latter case, which is guaranteed
// to happen with the runtime thread locked.
//
// seccompNotify selects the way of communicating with syscall threads.
// If it is false, futexes are used. Otherwise, seccomp-unotify is used.
// seccomp-unotify can't be used for the source pool process, because it is a
// parent of all other stub processes, but only one filter can be installed
// with SECCOMP_FILTER_FLAG_NEW_LISTENER.
func newSubprocess(create func() (*thread, error), memoryFile *pgalloc.MemoryFile, seccompNotify bool) (*subprocess, error) {
	// Fast path: reuse a pooled subprocess; only refs and the usertrap
	// state need re-initialization.
	if sp := globalPool.fetchAvailable(); sp != nil {
		sp.subprocessRefs.InitRefs()
		sp.usertrap = usertrap.New()
		return sp, nil
	}

	// The following goroutine is responsible for creating the first traced
	// thread, and responding to requests to make additional threads in the
	// traced process. The process will be killed and reaped when the
	// request channel is closed, which happens in Release below.
	requests := make(chan any)

	// Ready.
	sp := &subprocess{
		requests:          requests,
		faultedContexts:   make(map[*platformContext]struct{}),
		sysmsgStackPool:   pool.Pool{Start: 0, Limit: uint64(maxChildThreads)},
		threadContextPool: pool.Pool{Start: 0, Limit: maxGuestContexts},
		memoryFile:        memoryFile,
		sysmsgThreads:     make(map[uint32]*sysmsgThread),
	}
	sp.subprocessRefs.InitRefs()
	// Lock the OS thread: ptrace requires that all operations on a tracee
	// come from the same host thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Initialize the syscall thread.
	ptraceThread, err := create()
	if err != nil {
		return nil, err
	}
	sp.sysmsgInitRegs = ptraceThread.initRegs

	if err := sp.initSyscallThread(ptraceThread, seccompNotify); err != nil {
		return nil, err
	}

	go func() { // S/R-SAFE: Platform-related.

		// Wait for requests to create threads.
		for req := range requests {
			sp.handlePtraceSyscallRequest(req)
		}

		// Requests should never be closed.
		panic("unreachable")
	}()

	// Clear non-stub mappings, then set up the shared/private regions used
	// to communicate with sysmsg threads.
	sp.unmap()
	sp.usertrap = usertrap.New()
	sp.mapSharedRegions()
	sp.mapPrivateRegions()

	// The main stub doesn't need sysmsg threads.
	if seccompNotify {
		// Create the initial sysmsg thread.
		atomic.AddUint32(&sp.contextQueue.numThreadsToWakeup, 1)
		if err := sp.createSysmsgThread(); err != nil {
			return nil, err
		}
		sp.numSysmsgThreads++
	}

	return sp, nil
}
   370  
   371  // mapSharedRegions maps the shared regions that are used between the subprocess
   372  // and ALL of the subsequently created sysmsg threads into both the sentry and
   373  // the syscall thread.
   374  //
   375  // Should be called before any sysmsg threads are created.
   376  // Initializes s.contextQueue and s.threadContextRegion.
   377  func (s *subprocess) mapSharedRegions() {
   378  	if s.contextQueue != nil || s.threadContextRegion != 0 {
   379  		panic("contextQueue or threadContextRegion was already initialized")
   380  	}
   381  
   382  	opts := pgalloc.AllocOpts{
   383  		Kind: usage.System,
   384  		Dir:  pgalloc.TopDown,
   385  	}
   386  
   387  	// Map shared regions into the sentry.
   388  	contextQueueFR, contextQueue := mmapContextQueueForSentry(s.memoryFile, opts)
   389  	contextQueue.init()
   390  
   391  	// Map thread context region into the syscall thread.
   392  	_, err := s.syscallThread.syscall(
   393  		unix.SYS_MMAP,
   394  		arch.SyscallArgument{Value: uintptr(stubContextQueueRegion)},
   395  		arch.SyscallArgument{Value: uintptr(contextQueueFR.Length())},
   396  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   397  		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
   398  		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
   399  		arch.SyscallArgument{Value: uintptr(contextQueueFR.Start)})
   400  	if err != nil {
   401  		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
   402  	}
   403  
   404  	s.contextQueue = contextQueue
   405  
   406  	// Map thread context region into the sentry.
   407  	threadContextFR, err := s.memoryFile.Allocate(uint64(stubContextRegionLen), opts)
   408  	if err != nil {
   409  		panic(fmt.Sprintf("failed to allocate a new subprocess context memory region"))
   410  	}
   411  	sentryThreadContextRegionAddr, _, errno := unix.RawSyscall6(
   412  		unix.SYS_MMAP,
   413  		0,
   414  		uintptr(threadContextFR.Length()),
   415  		unix.PROT_WRITE|unix.PROT_READ,
   416  		unix.MAP_SHARED|unix.MAP_FILE,
   417  		uintptr(s.memoryFile.FD()), uintptr(threadContextFR.Start))
   418  	if errno != 0 {
   419  		panic(fmt.Sprintf("mmap failed for subprocess context memory region: %v", errno))
   420  	}
   421  
   422  	// Map thread context region into the syscall thread.
   423  	if _, err := s.syscallThread.syscall(
   424  		unix.SYS_MMAP,
   425  		arch.SyscallArgument{Value: uintptr(stubContextRegion)},
   426  		arch.SyscallArgument{Value: uintptr(threadContextFR.Length())},
   427  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   428  		arch.SyscallArgument{Value: uintptr(unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED)},
   429  		arch.SyscallArgument{Value: uintptr(s.memoryFile.FD())},
   430  		arch.SyscallArgument{Value: uintptr(threadContextFR.Start)}); err != nil {
   431  		panic(fmt.Sprintf("failed to mmap context queue region into syscall thread: %v", err))
   432  	}
   433  
   434  	s.threadContextRegion = sentryThreadContextRegionAddr
   435  }
   436  
   437  func (s *subprocess) mapPrivateRegions() {
   438  	_, err := s.syscallThread.syscall(
   439  		unix.SYS_MMAP,
   440  		arch.SyscallArgument{Value: uintptr(stubSpinningThreadQueueAddr)},
   441  		arch.SyscallArgument{Value: uintptr(sysmsg.SpinningQueueMemSize)},
   442  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   443  		arch.SyscallArgument{Value: uintptr(unix.MAP_PRIVATE | unix.MAP_ANONYMOUS | unix.MAP_FIXED)},
   444  		arch.SyscallArgument{Value: 0},
   445  		arch.SyscallArgument{Value: 0})
   446  	if err != nil {
   447  		panic(fmt.Sprintf("failed to mmap spinning queue region into syscall thread: %v", err))
   448  	}
   449  }
   450  
   451  // unmap unmaps non-stub regions of the process.
   452  //
   453  // This will panic on failure (which should never happen).
   454  func (s *subprocess) unmap() {
   455  	s.Unmap(0, uint64(stubStart))
   456  	if maximumUserAddress != stubEnd {
   457  		s.Unmap(hostarch.Addr(stubEnd), uint64(maximumUserAddress-stubEnd))
   458  	}
   459  }
   460  
   461  // Release kills the subprocess.
   462  //
   463  // Just kidding! We can't safely coordinate the detaching of all the
   464  // tracees (since the tracers are random runtime threads, and the process
   465  // won't exit until tracers have been notifier).
   466  //
   467  // Therefore we simply unmap everything in the subprocess and return it to the
   468  // globalPool. This has the added benefit of reducing creation time for new
   469  // subprocesses.
   470  func (s *subprocess) Release() {
   471  	if !s.alive() {
   472  		return
   473  	}
   474  	s.unmap()
   475  	s.DecRef(s.release)
   476  }
   477  
   478  // release returns the subprocess to the global pool.
   479  func (s *subprocess) release() {
   480  	if s.alive() {
   481  		globalPool.markAvailable(s)
   482  		return
   483  	}
   484  	if s.syscallThread != nil && s.syscallThread.seccompNotify != nil {
   485  		s.syscallThread.seccompNotify.Close()
   486  	}
   487  }
   488  
   489  // attach attaches to the thread.
   490  func (t *thread) attach() error {
   491  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_ATTACH, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
   492  		return fmt.Errorf("unable to attach: %v", errno)
   493  	}
   494  
   495  	// PTRACE_ATTACH sends SIGSTOP, and wakes the tracee if it was already
   496  	// stopped from the SIGSTOP queued by CLONE_PTRACE (see inner loop of
   497  	// newSubprocess), so we always expect to see signal-delivery-stop with
   498  	// SIGSTOP.
   499  	if sig := t.wait(stopped); sig != unix.SIGSTOP {
   500  		return fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
   501  	}
   502  
   503  	// Initialize options.
   504  	t.init()
   505  	return nil
   506  }
   507  
   508  func (t *thread) grabInitRegs() {
   509  	// Grab registers.
   510  	//
   511  	// Note that we adjust the current register RIP value to be just before
   512  	// the current system call executed. This depends on the definition of
   513  	// the stub itself.
   514  	if err := t.getRegs(&t.initRegs); err != nil {
   515  		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
   516  	}
   517  	t.adjustInitRegsRip()
   518  	t.initRegs.SetStackPointer(0)
   519  }
   520  
   521  // detach detaches from the thread.
   522  //
   523  // Because the SIGSTOP is not suppressed, the thread will enter group-stop.
   524  func (t *thread) detach() {
   525  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(t.tid), 0, uintptr(unix.SIGSTOP), 0, 0); errno != 0 {
   526  		panic(fmt.Sprintf("can't detach new clone: %v", errno))
   527  	}
   528  }
   529  
// waitOutcome is used for wait below.
//
// It selects which terminal state wait expects the thread to reach.
type waitOutcome int

const (
	// stopped indicates that the process was stopped.
	stopped waitOutcome = iota

	// killed indicates that the process was killed.
	killed
)
   540  
   541  func (t *thread) loadLogPrefix() *string {
   542  	p := t.logPrefix.Load()
   543  	if p == nil {
   544  		prefix := fmt.Sprintf("[% 4d:% 4d] ", t.tgid, t.tid)
   545  		t.logPrefix.Store(&prefix)
   546  		p = &prefix
   547  	}
   548  	return p
   549  }
   550  
   551  // Debugf logs with the debugging severity.
   552  func (t *thread) Debugf(format string, v ...any) {
   553  	if log.IsLogging(log.Debug) {
   554  		log.DebugfAtDepth(1, *t.loadLogPrefix()+format, v...)
   555  	}
   556  }
   557  
   558  // Warningf logs with the warning severity.
   559  func (t *thread) Warningf(format string, v ...any) {
   560  	if log.IsLogging(log.Warning) {
   561  		log.WarningfAtDepth(1, *t.loadLogPrefix()+format, v...)
   562  	}
   563  }
   564  
   565  func (t *thread) dumpAndPanic(message string) {
   566  	var regs arch.Registers
   567  	message += "\n"
   568  	if err := t.getRegs(&regs); err == nil {
   569  		message += dumpRegs(&regs)
   570  	} else {
   571  		log.Warningf("unable to get registers: %v", err)
   572  	}
   573  	message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
   574  	panic(message)
   575  }
   576  
   577  func (t *thread) dumpRegs(message string) {
   578  	var regs arch.Registers
   579  	message += "\n"
   580  	if err := t.getRegs(&regs); err == nil {
   581  		message += dumpRegs(&regs)
   582  	} else {
   583  		log.Warningf("unable to get registers: %v", err)
   584  	}
   585  	log.Infof("%s", message)
   586  }
   587  
// unexpectedStubExit handles a PTRACE_EVENT_EXIT from a stub thread that was
// not expected to die. It never returns: it either kills the sentry process
// (when the stub was SIGKILL-ed externally) or panics with diagnostics.
func (t *thread) unexpectedStubExit() {
	msg, err := t.getEventMessage()
	status := unix.WaitStatus(msg)
	if status.Signaled() && status.Signal() == unix.SIGKILL {
		// SIGKILL can only be sent by a user or the OOM-killer. In both
		// these cases, we don't need to panic; there is no reason to
		// think that something is wrong in gVisor.
		log.Warningf("The ptrace stub process %v has been killed by SIGKILL.", t.tgid)
		pid := os.Getpid()
		// Kill the whole sentry process; execution does not continue
		// past this tgkill.
		unix.Tgkill(pid, pid, unix.Signal(unix.SIGKILL))
	}
	t.dumpAndPanic(fmt.Sprintf("wait failed: the process %d:%d exited: %x (err %v)", t.tgid, t.tid, msg, err))
}
   601  
// wait waits for a stop event.
//
// For outcome == stopped it returns the stop signal (with the ptrace trap
// cause re-encoded into the high bits for SIGTRAP); for outcome == killed it
// returns the exit status as a signal value.
//
// Precondition: outcome is a valid waitOutcome.
func (t *thread) wait(outcome waitOutcome) unix.Signal {
	var status unix.WaitStatus

	for {
		r, err := unix.Wait4(int(t.tid), &status, unix.WALL|unix.WUNTRACED, nil)
		if err == unix.EINTR || err == unix.EAGAIN {
			// Wait was interrupted; wait again.
			continue
		} else if err != nil {
			panic(fmt.Sprintf("ptrace wait failed: %v", err))
		}
		if int(r) != int(t.tid) {
			// Wait4 was given a specific tid; anything else is a bug.
			panic(fmt.Sprintf("ptrace wait returned %v, expected %v", r, t.tid))
		}
		switch outcome {
		case stopped:
			if !status.Stopped() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
			}
			stopSig := status.StopSignal()
			if stopSig == 0 {
				continue // Spurious stop.
			}
			if stopSig == unix.SIGTRAP {
				// A trap-cause of PTRACE_EVENT_EXIT means the stub is
				// dying unexpectedly; unexpectedStubExit does not return.
				if status.TrapCause() == unix.PTRACE_EVENT_EXIT {
					t.unexpectedStubExit()
				}
				// Re-encode the trap cause the way it's expected.
				return stopSig | unix.Signal(status.TrapCause()<<8)
			}
			// Not a trap signal.
			return stopSig
		case killed:
			if !status.Exited() && !status.Signaled() {
				t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
			}
			return unix.Signal(status.ExitStatus())
		default:
			// Should not happen.
			t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
		}
	}
}
   648  
// kill kills the thread by sending it SIGKILL.
func (t *thread) kill() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
}
   653  
// destroy kills and waits on the thread.
//
// Note that this should not be used in the general case; the death of threads
// will typically cause the death of the parent. This is a utility method for
// manually created threads.
func (t *thread) destroy() {
	// Order matters: detach first so the SIGKILL is delivered and the
	// thread can be reaped by wait(killed) below.
	t.detach()
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(unix.SIGKILL))
	t.wait(killed)
}
   664  
   665  // init initializes trace options.
   666  func (t *thread) init() {
   667  	// Set the TRACESYSGOOD option to differentiate real SIGTRAP.
   668  	// set PTRACE_O_EXITKILL to ensure that the unexpected exit of the
   669  	// sentry will immediately kill the associated stubs.
   670  	_, _, errno := unix.RawSyscall6(
   671  		unix.SYS_PTRACE,
   672  		unix.PTRACE_SETOPTIONS,
   673  		uintptr(t.tid),
   674  		0,
   675  		unix.PTRACE_O_TRACESYSGOOD|unix.PTRACE_O_TRACEEXIT|unix.PTRACE_O_EXITKILL,
   676  		0, 0)
   677  	if errno != 0 {
   678  		panic(fmt.Sprintf("ptrace set options failed: %v", errno))
   679  	}
   680  }
   681  
   682  // syscall executes a system call cycle in the traced context.
   683  //
   684  // This is _not_ for use by application system calls, rather it is for use when
   685  // a system call must be injected into the remote context (e.g. mmap, munmap).
   686  // Note that clones are handled separately.
   687  func (t *thread) syscall(regs *arch.Registers) (uintptr, error) {
   688  	// Set registers.
   689  	if err := t.setRegs(regs); err != nil {
   690  		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
   691  	}
   692  
   693  	for {
   694  		// Execute the syscall instruction. The task has to stop on the
   695  		// trap instruction which is right after the syscall
   696  		// instruction.
   697  		if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
   698  			panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
   699  		}
   700  
   701  		sig := t.wait(stopped)
   702  		if sig == unix.SIGTRAP {
   703  			// Reached syscall-enter-stop.
   704  			break
   705  		} else {
   706  			// Some other signal caused a thread stop; ignore.
   707  			if sig != unix.SIGSTOP && sig != unix.SIGCHLD {
   708  				log.Warningf("The thread %d:%d has been interrupted by %d", t.tgid, t.tid, sig)
   709  			}
   710  			continue
   711  		}
   712  	}
   713  
   714  	// Grab registers.
   715  	if err := t.getRegs(regs); err != nil {
   716  		panic(fmt.Sprintf("ptrace get regs failed: %v", err))
   717  	}
   718  	return syscallReturnValue(regs)
   719  }
   720  
   721  // syscallIgnoreInterrupt ignores interrupts on the system call thread and
   722  // restarts the syscall if the kernel indicates that should happen.
   723  func (t *thread) syscallIgnoreInterrupt(
   724  	initRegs *arch.Registers,
   725  	sysno uintptr,
   726  	args ...arch.SyscallArgument) (uintptr, error) {
   727  	for {
   728  		regs := createSyscallRegs(initRegs, sysno, args...)
   729  		rval, err := t.syscall(&regs)
   730  		switch err {
   731  		case ERESTARTSYS:
   732  			continue
   733  		case ERESTARTNOINTR:
   734  			continue
   735  		case ERESTARTNOHAND:
   736  			continue
   737  		default:
   738  			return rval, err
   739  		}
   740  	}
   741  }
   742  
// NotifyInterrupt implements interrupt.Receiver.NotifyInterrupt.
//
// It delivers the platform interrupt signal to the traced thread.
func (t *thread) NotifyInterrupt() {
	unix.Tgkill(int(t.tgid), int(t.tid), unix.Signal(platform.SignalInterrupt))
}
   747  
// incAwakeContexts bumps the number of awake contexts, and mirrors the
// increase into the global fast-path counter while the count is still at or
// below maxSysmsgThreads (further awake contexts can't add stub threads).
func (s *subprocess) incAwakeContexts() {
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, 1)
	if nr > uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(1)
}
   755  
// decAwakeContexts is the inverse of incAwakeContexts: it decrements the
// awake-context count and, only when the new count has dropped below
// maxSysmsgThreads, decrements the global fast-path counter (the asymmetric
// >= mirrors the > used on the increment side).
func (s *subprocess) decAwakeContexts() {
	// Adding ^uint32(0) is a wrapping decrement by one.
	nr := atomic.AddUint32(&s.contextQueue.numAwakeContexts, ^uint32(0))
	if nr >= uint32(maxSysmsgThreads) {
		return
	}
	fastpath.nrMaxAwakeStubThreads.Add(^uint32(0))
}
   763  
// switchToApp is called from the main SwitchToApp entrypoint.
//
// This function returns true on a system call, false on a signal.
// The second return value is true if a syscall instruction can be replaced on
// a function call.
func (s *subprocess) switchToApp(c *platformContext, ac *arch.Context64) (isSyscall bool, shouldPatchSyscall bool, err *platform.ContextError) {
	// Reset necessary registers.
	regs := &ac.StateData().Regs
	s.resetSysemuRegs(regs)
	ctx := c.sharedContext
	ctx.shared.Regs = regs.PtraceRegs
	restoreArchSpecificState(ctx.shared, ac)

	// Check for interrupts, and ensure that future interrupts signal the context.
	if !c.interrupt.Enable(c.sharedContext) {
		// Pending interrupt; simulate.
		ctx.clearInterrupt()
		c.signalInfo = linux.SignalInfo{Signo: int32(platform.SignalInterrupt)}
		return false, false, nil
	}
	defer func() {
		ctx.clearInterrupt()
		c.interrupt.Disable()
	}()

	restoreFPState(ctx, c, ac)

	// Place the context onto the context queue.
	if ctx.sleeping {
		ctx.sleeping = false
		s.incAwakeContexts()
	}
	ctx.setState(sysmsg.ContextStateNone)
	if err := s.contextQueue.add(ctx); err != nil {
		return false, false, err
	}

	// Block until a sysmsg thread has finished running the context.
	if err := s.waitOnState(ctx); err != nil {
		return false, false, corruptedSharedMemoryErr(err.Error())
	}

	// Check if there's been an error. A still-valid thread ID here means the
	// sysmsg thread did not hand the context back cleanly.
	threadID := ctx.threadID()
	if threadID != invalidThreadID {
		if sysThread, ok := s.sysmsgThreads[threadID]; ok && sysThread.msg.Err != 0 {
			return false, false, sysThread.msg.ConvertSysmsgErr()
		}
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("found unexpected ThreadContext.ThreadID field, expected %d found %d", invalidThreadID, threadID))
	}

	// Copy register state locally.
	regs.PtraceRegs = ctx.shared.Regs
	retrieveArchSpecificState(ctx.shared, ac)
	c.needToPullFullState = true
	// We have a signal. We verify however, that the signal was
	// either delivered from the kernel or from this process. We
	// don't respect other signals.
	c.signalInfo = ctx.shared.SignalInfo
	ctxState := ctx.state()
	if ctxState == sysmsg.ContextStateSyscallCanBePatched {
		// Treat it as a normal syscall, but report that it is patchable.
		ctxState = sysmsg.ContextStateSyscall
		shouldPatchSyscall = true
	}

	if ctxState == sysmsg.ContextStateSyscall || ctxState == sysmsg.ContextStateSyscallTrap {
		if maybePatchSignalInfo(regs, &c.signalInfo) {
			return false, false, nil
		}
		updateSyscallRegs(regs)
		return true, shouldPatchSyscall, nil
	} else if ctxState != sysmsg.ContextStateFault {
		// Only syscall, syscall-trap, and fault states are expected here.
		return false, false, corruptedSharedMemoryErr(fmt.Sprintf("unknown context state: %v", ctxState))
	}

	return false, false, nil
}
   840  
// waitOnState blocks until ctx leaves ContextStateNone, i.e. until a stub
// thread has picked it up and produced a result. It first waits on the
// dispatcher fast path and falls back to sleeping on the context state when
// the dispatcher requests the slow path. It returns an error only if
// sleeping on the state fails.
func (s *subprocess) waitOnState(ctx *sharedContext) error {
	ctx.kicked = false
	slowPath := false
	// If the stub fast path is disabled, or no stub thread is currently
	// active, nothing will pick the context up on its own — kick one now.
	if !s.contextQueue.fastPathEnabled() || atomic.LoadUint32(&s.contextQueue.numActiveThreads) == 0 {
		ctx.kicked = s.kickSysmsgThread()
	}
	for curState := ctx.state(); curState == sysmsg.ContextStateNone; curState = ctx.state() {
		if !slowPath {
			events := dispatcher.waitFor(ctx)
			if events&sharedContextKicked != 0 {
				// The dispatcher suggests kicking a stub thread. Do it at
				// most once, and skip it entirely if a stub thread already
				// acknowledged this context.
				if ctx.kicked {
					continue
				}
				if ctx.isAcked() {
					ctx.kicked = true
					continue
				}
				s.kickSysmsgThread()
				ctx.kicked = true
				continue
			}
			if events&sharedContextSlowPath != 0 {
				// The dispatcher gave up on the fast path; switch to
				// sleeping until the state changes.
				ctx.disableSentryFastPath()
				slowPath = true
				continue
			}
		} else {
			// If the context already received a handshake then it knows it's being
			// worked on.
			if !ctx.kicked && !ctx.isAcked() {
				ctx.kicked = s.kickSysmsgThread()
			}

			if err := ctx.sleepOnState(curState); err != nil {
				return err
			}
		}
	}

	ctx.recordLatency()
	ctx.resetLatencyMeasures()
	ctx.enableSentryFastPath()

	return nil
}
   886  
   887  // canKickSysmsgThread returns true if a new thread can be kicked.
   888  // The second return value is the expected number of threads after kicking a
   889  // new one.
   890  func (s *subprocess) canKickSysmsgThread() (bool, uint32) {
   891  	// numActiveContexts and numActiveThreads can be changed from stub
   892  	// threads that handles the contextQueue without any locks. The idea
   893  	// here is that any stub thread that gets CPU time can make some
   894  	// progress. In stub threads, we can use only spinlock-like
   895  	// synchronizations, but they don't work well because a thread that
   896  	// holds a lock can be preempted by another thread that is waiting for
   897  	// the same lock.
   898  	nrActiveThreads := atomic.LoadUint32(&s.contextQueue.numActiveThreads)
   899  	nrThreadsToWakeup := atomic.LoadUint32(&s.contextQueue.numThreadsToWakeup)
   900  	nrActiveContexts := atomic.LoadUint32(&s.contextQueue.numActiveContexts)
   901  
   902  	nrActiveThreads += nrThreadsToWakeup + 1
   903  	if nrActiveThreads > nrActiveContexts {
   904  		// This can happen when one or more stub threads are
   905  		// waiting for cpu time. The host probably has more
   906  		// running tasks than a number of cpu-s.
   907  		return false, nrActiveThreads
   908  	}
   909  	return true, nrActiveThreads
   910  }
   911  
// kickSysmsgThread returns true if it was able to wake up or create a new sysmsg
// stub thread.
func (s *subprocess) kickSysmsgThread() bool {
	// Cheap lock-free pre-check; bail out early if a kick is pointless.
	kick, _ := s.canKickSysmsgThread()
	if !kick {
		return false
	}

	s.sysmsgThreadsMu.Lock()
	// Re-check under the lock: another goroutine may have kicked or created
	// a thread in the meantime.
	kick, nrThreads := s.canKickSysmsgThread()
	if !kick {
		s.sysmsgThreadsMu.Unlock()
		return false
	}
	numTimesStubKicked.Increment()
	atomic.AddUint32(&s.contextQueue.numThreadsToWakeup, 1)
	// Create a brand-new stub thread only while below both the hard limit
	// and the expected demand; otherwise just wake an existing one below.
	if s.numSysmsgThreads < maxSysmsgThreads && s.numSysmsgThreads < int(nrThreads) {
		s.numSysmsgThreads++
		s.sysmsgThreadsMu.Unlock()
		if err := s.createSysmsgThread(); err != nil {
			log.Warningf("Unable to create a new stub thread: %s", err)
			// Roll back the optimistic increment above.
			s.sysmsgThreadsMu.Lock()
			s.numSysmsgThreads--
			s.sysmsgThreadsMu.Unlock()
		}
	} else {
		s.sysmsgThreadsMu.Unlock()
	}
	s.contextQueue.wakeupSysmsgThread()

	return true
}
   944  
   945  // syscall executes the given system call without handling interruptions.
   946  func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
   947  	s.syscallThreadMu.Lock()
   948  	defer s.syscallThreadMu.Unlock()
   949  
   950  	return s.syscallThread.syscall(sysno, args...)
   951  }
   952  
   953  // MapFile implements platform.AddressSpace.MapFile.
   954  func (s *subprocess) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
   955  	var flags int
   956  	if precommit {
   957  		flags |= unix.MAP_POPULATE
   958  	}
   959  	_, err := s.syscall(
   960  		unix.SYS_MMAP,
   961  		arch.SyscallArgument{Value: uintptr(addr)},
   962  		arch.SyscallArgument{Value: uintptr(fr.Length())},
   963  		arch.SyscallArgument{Value: uintptr(at.Prot())},
   964  		arch.SyscallArgument{Value: uintptr(flags | unix.MAP_SHARED | unix.MAP_FIXED)},
   965  		arch.SyscallArgument{Value: uintptr(f.FD())},
   966  		arch.SyscallArgument{Value: uintptr(fr.Start)})
   967  	return err
   968  }
   969  
   970  // Unmap implements platform.AddressSpace.Unmap.
   971  func (s *subprocess) Unmap(addr hostarch.Addr, length uint64) {
   972  	ar, ok := addr.ToRange(length)
   973  	if !ok {
   974  		panic(fmt.Sprintf("addr %#x + length %#x overflows", addr, length))
   975  	}
   976  	s.mu.Lock()
   977  	for c := range s.faultedContexts {
   978  		c.mu.Lock()
   979  		if c.lastFaultSP == s && ar.Contains(c.lastFaultAddr) {
   980  			// Forget the last fault so that if c faults again, the fault isn't
   981  			// incorrectly reported as a write fault. If this is being called
   982  			// due to munmap() of the corresponding vma, handling of the second
   983  			// fault will fail anyway.
   984  			c.lastFaultSP = nil
   985  			delete(s.faultedContexts, c)
   986  		}
   987  		c.mu.Unlock()
   988  	}
   989  	s.mu.Unlock()
   990  	_, err := s.syscall(
   991  		unix.SYS_MUNMAP,
   992  		arch.SyscallArgument{Value: uintptr(addr)},
   993  		arch.SyscallArgument{Value: uintptr(length)})
   994  	if err != nil && err != errDeadSubprocess {
   995  		// We never expect this to happen.
   996  		panic(fmt.Sprintf("munmap(%x, %x)) failed: %v", addr, length, err))
   997  	}
   998  }
   999  
  1000  func (s *subprocess) PullFullState(c *platformContext, ac *arch.Context64) error {
  1001  	if !c.sharedContext.isActiveInSubprocess(s) {
  1002  		panic("Attempted to PullFullState for context that is not used in subprocess")
  1003  	}
  1004  	saveFPState(c.sharedContext, ac)
  1005  	return nil
  1006  }
  1007  
var (
	// sysmsgThreadPriorityOnce guards the one-time computation of
	// sysmsgThreadPriority in initSysmsgThreadPriority.
	sysmsgThreadPriorityOnce sync.Once
	// sysmsgThreadPriority is the scheduling priority assigned to sysmsg
	// stub threads; set once by initSysmsgThreadPriority.
	sysmsgThreadPriority     int
)
  1012  
  1013  // initSysmsgThreadPriority looks at the current priority of the process
  1014  // and updates `sysmsgThreadPriority` accordingly.
  1015  func initSysmsgThreadPriority() {
  1016  	sysmsgThreadPriorityOnce.Do(func() {
  1017  		prio, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
  1018  		if err != nil {
  1019  			panic("unable to get current scheduling priority")
  1020  		}
  1021  		// Sysmsg threads are executed with a priority one lower than the Sentry.
  1022  		sysmsgThreadPriority = 20 - prio + 1
  1023  	})
  1024  }
  1025  
  1026  // createSysmsgThread creates a new sysmsg thread.
  1027  // The thread starts processing any available context in the context queue.
  1028  func (s *subprocess) createSysmsgThread() error {
  1029  	// Create a new seccomp process.
  1030  	var r requestThread
  1031  	r.thread = make(chan *thread)
  1032  	s.requests <- r
  1033  	p := <-r.thread
  1034  	if p == nil {
  1035  		return fmt.Errorf("createSysmsgThread: failed to get clone")
  1036  	}
  1037  
  1038  	runtime.LockOSThread()
  1039  	defer runtime.UnlockOSThread()
  1040  	if err := p.attach(); err != nil {
  1041  		return err
  1042  	}
  1043  
  1044  	// Skip SIGSTOP.
  1045  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_CONT, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
  1046  		panic(fmt.Sprintf("ptrace cont failed: %v", errno))
  1047  	}
  1048  	sig := p.wait(stopped)
  1049  	if sig != unix.SIGSTOP {
  1050  		panic(fmt.Sprintf("error waiting for new clone: expected SIGSTOP, got %v", sig))
  1051  	}
  1052  
  1053  	// Allocate a new stack for the BPF process.
  1054  	opts := pgalloc.AllocOpts{
  1055  		Kind: usage.System,
  1056  		Dir:  pgalloc.TopDown,
  1057  	}
  1058  	fr, err := s.memoryFile.Allocate(uint64(sysmsg.PerThreadSharedStackSize), opts)
  1059  	if err != nil {
  1060  		// TODO(b/144063246): Need to fail the clone system call.
  1061  		panic(fmt.Sprintf("failed to allocate a new stack: %v", err))
  1062  	}
  1063  	sysThread := &sysmsgThread{
  1064  		thread:     p,
  1065  		subproc:    s,
  1066  		stackRange: fr,
  1067  	}
  1068  	// Use the sysmsgStackID as a handle on this thread instead of host tid in
  1069  	// order to be able to reliably specify invalidThreadID.
  1070  	threadID := uint32(p.sysmsgStackID)
  1071  
  1072  	// Map the stack into the sentry.
  1073  	sentryStackAddr, _, errno := unix.RawSyscall6(
  1074  		unix.SYS_MMAP,
  1075  		0,
  1076  		sysmsg.PerThreadSharedStackSize,
  1077  		unix.PROT_WRITE|unix.PROT_READ,
  1078  		unix.MAP_SHARED|unix.MAP_FILE,
  1079  		uintptr(s.memoryFile.FD()), uintptr(fr.Start))
  1080  	if errno != 0 {
  1081  		panic(fmt.Sprintf("mmap failed: %v", errno))
  1082  	}
  1083  
  1084  	// Before installing the stub syscall filters, we need to call a few
  1085  	// system calls (e.g. sigaltstack, sigaction) which have in-memory
  1086  	// arguments.  We need to prevent changing these parameters by other
  1087  	// stub threads, so lets map the future BPF stack as read-only and
  1088  	// fill syscall arguments from the Sentry.
  1089  	sysmsgStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadSharedStackOffset
  1090  	err = sysThread.mapStack(sysmsgStackAddr, true)
  1091  	if err != nil {
  1092  		panic(fmt.Sprintf("mmap failed: %v", err))
  1093  	}
  1094  
  1095  	sysThread.init(sentryStackAddr, sysmsgStackAddr)
  1096  
  1097  	// Map the stack into the BPF process.
  1098  	err = sysThread.mapStack(sysmsgStackAddr, false)
  1099  	if err != nil {
  1100  		s.memoryFile.DecRef(fr)
  1101  		panic(fmt.Sprintf("mmap failed: %v", err))
  1102  	}
  1103  
  1104  	// Map the stack into the BPF process.
  1105  	privateStackAddr := sysThread.sysmsgPerThreadMemAddr() + sysmsg.PerThreadPrivateStackOffset
  1106  	err = sysThread.mapPrivateStack(privateStackAddr, sysmsg.PerThreadPrivateStackSize)
  1107  	if err != nil {
  1108  		s.memoryFile.DecRef(fr)
  1109  		panic(fmt.Sprintf("mmap failed: %v", err))
  1110  	}
  1111  
  1112  	sysThread.setMsg(sysmsg.StackAddrToMsg(sentryStackAddr))
  1113  	sysThread.msg.Init(threadID)
  1114  	sysThread.msg.Self = uint64(sysmsgStackAddr + sysmsg.MsgOffsetFromSharedStack)
  1115  	sysThread.msg.SyshandlerStack = uint64(sysmsg.StackAddrToSyshandlerStack(sysThread.sysmsgPerThreadMemAddr()))
  1116  	sysThread.msg.Syshandler = uint64(stubSysmsgStart + uintptr(sysmsg.Sighandler_blob_offset____export_syshandler))
  1117  
  1118  	sysThread.msg.State.Set(sysmsg.ThreadStateInitializing)
  1119  
  1120  	if err := unix.Setpriority(unix.PRIO_PROCESS, int(p.tid), sysmsgThreadPriority); err != nil {
  1121  		log.Warningf("Unable to change priority of a stub thread: %s", err)
  1122  	}
  1123  
  1124  	// Install a pre-compiled seccomp rules for the BPF process.
  1125  	_, err = p.syscallIgnoreInterrupt(&p.initRegs, unix.SYS_PRCTL,
  1126  		arch.SyscallArgument{Value: uintptr(linux.PR_SET_NO_NEW_PRIVS)},
  1127  		arch.SyscallArgument{Value: uintptr(1)},
  1128  		arch.SyscallArgument{Value: uintptr(0)},
  1129  		arch.SyscallArgument{Value: uintptr(0)},
  1130  		arch.SyscallArgument{Value: uintptr(0)},
  1131  		arch.SyscallArgument{Value: uintptr(0)})
  1132  	if err != nil {
  1133  		panic(fmt.Sprintf("prctl(PR_SET_NO_NEW_PRIVS) failed: %v", err))
  1134  	}
  1135  
  1136  	_, err = p.syscallIgnoreInterrupt(&p.initRegs, seccomp.SYS_SECCOMP,
  1137  		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
  1138  		arch.SyscallArgument{Value: uintptr(0)},
  1139  		arch.SyscallArgument{Value: stubSysmsgRules})
  1140  	if err != nil {
  1141  		panic(fmt.Sprintf("seccomp failed: %v", err))
  1142  	}
  1143  
  1144  	// Prepare to start the BPF process.
  1145  	tregs := &arch.Registers{}
  1146  	s.resetSysemuRegs(tregs)
  1147  	setArchSpecificRegs(sysThread, tregs)
  1148  	if err := p.setRegs(tregs); err != nil {
  1149  		panic(fmt.Sprintf("ptrace set regs failed: %v", err))
  1150  	}
  1151  	archSpecificSysmsgThreadInit(sysThread)
  1152  	// Skip SIGSTOP.
  1153  	if _, _, e := unix.RawSyscall(unix.SYS_TGKILL, uintptr(p.tgid), uintptr(p.tid), uintptr(unix.SIGCONT)); e != 0 {
  1154  		panic(fmt.Sprintf("tkill failed: %v", e))
  1155  	}
  1156  	// Resume the BPF process.
  1157  	if _, _, errno := unix.RawSyscall6(unix.SYS_PTRACE, unix.PTRACE_DETACH, uintptr(p.tid), 0, 0, 0, 0); errno != 0 {
  1158  		panic(fmt.Sprintf("can't detach new clone: %v", errno))
  1159  	}
  1160  
  1161  	s.sysmsgThreadsMu.Lock()
  1162  	s.sysmsgThreads[threadID] = sysThread
  1163  	s.sysmsgThreadsMu.Unlock()
  1164  
  1165  	return nil
  1166  }
  1167  
// PreFork implements platform.AddressSpace.PreFork.
// We need to take the usertrap lock to be sure that fork() will not be in the
// middle of applying a binary patch.
func (s *subprocess) PreFork() {
	// The lock taken here is released in PostFork.
	s.usertrap.PreFork()
}
  1174  
// PostFork implements platform.AddressSpace.PostFork.
// It releases the usertrap lock taken by PreFork.
func (s *subprocess) PostFork() {
	s.usertrap.PostFork() // +checklocksforce: PreFork acquires, above.
}
  1179  
  1180  // activateContext activates the context in this subprocess.
  1181  // No-op if the context is already active within the subprocess; if not,
  1182  // deactivates it from its last subprocess.
  1183  func (s *subprocess) activateContext(c *platformContext) error {
  1184  	if !c.sharedContext.isActiveInSubprocess(s) {
  1185  		c.sharedContext.release()
  1186  		c.sharedContext = nil
  1187  
  1188  		shared, err := s.getSharedContext()
  1189  		if err != nil {
  1190  			return err
  1191  		}
  1192  		c.sharedContext = shared
  1193  	}
  1194  	return nil
  1195  }