github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/systrap/subprocess_linux.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package systrap
    19  
    20  import (
    21  	"fmt"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/metacubex/gvisor/pkg/abi/linux"
    25  	"github.com/metacubex/gvisor/pkg/bpf"
    26  	"github.com/metacubex/gvisor/pkg/seccomp"
    27  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    28  )
    29  
// syscallEvent is the signal value 0x80.
//
// NOTE(review): presumably this is the marker bit that ptrace ORs into
// SIGTRAP for syscall stops when PTRACE_O_TRACESYSGOOD is set. The constant is
// not referenced in this part of the file, so confirm against its users.
const syscallEvent unix.Signal = 0x80
    31  
    32  // createStub creates a fresh stub processes.
    33  //
    34  // Precondition: the runtime OS thread must be locked.
    35  func createStub() (*thread, error) {
    36  	// When creating the new child process, we specify SIGKILL as the
    37  	// signal to deliver when the child exits. We never expect a subprocess
    38  	// to exit; they are pooled and reused. This is done to ensure that if
    39  	// a subprocess is OOM-killed, this process (and all other stubs,
    40  	// transitively) will be killed as well. It's simply not possible to
    41  	// safely handle a single stub getting killed: the exact state of
    42  	// execution is unknown and not recoverable.
    43  	return attachedThread(unix.CLONE_FILES|uintptr(unix.SIGCHLD), linux.SECCOMP_RET_TRAP)
    44  }
    45  
    46  // attachedThread returns a new attached thread.
    47  //
    48  // Precondition: the runtime OS thread must be locked.
    49  func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
    50  	// Create a BPF program that allows only the system calls needed by the
    51  	// stub and all its children. This is used to create child stubs
    52  	// (below), so we must include the ability to fork, but otherwise lock
    53  	// down available calls only to what is needed.
    54  	rules := []seccomp.RuleSet{}
    55  	if defaultAction != linux.SECCOMP_RET_ALLOW {
    56  		ruleSet := seccomp.RuleSet{
    57  			Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
    58  				unix.SYS_CLONE: seccomp.Or{
    59  					// Allow creation of new subprocesses (used by the master).
    60  					seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.CLONE_PARENT | unix.SIGCHLD)},
    61  					seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGCHLD)},
    62  					// Allow creation of new sysmsg thread.
    63  					seccomp.PerArg{seccomp.EqualTo(
    64  						unix.CLONE_FILES |
    65  							unix.CLONE_FS |
    66  							unix.CLONE_VM |
    67  							unix.CLONE_PTRACE |
    68  							linux.SIGKILL)},
    69  					// Allow creation of new threads within a single address space (used by address spaces).
    70  					seccomp.PerArg{seccomp.EqualTo(
    71  						unix.CLONE_FILES |
    72  							unix.CLONE_FS |
    73  							unix.CLONE_SIGHAND |
    74  							unix.CLONE_THREAD |
    75  							unix.CLONE_PTRACE |
    76  							unix.CLONE_VM)},
    77  				},
    78  
    79  				// For the initial process creation.
    80  				unix.SYS_WAIT4: seccomp.MatchAll{},
    81  				unix.SYS_EXIT:  seccomp.MatchAll{},
    82  
    83  				// For the stub prctl dance (all).
    84  				unix.SYS_PRCTL: seccomp.Or{
    85  					seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
    86  					seccomp.PerArg{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)},
    87  				},
    88  				unix.SYS_GETPPID: seccomp.MatchAll{},
    89  
    90  				// For the stub to stop itself (all).
    91  				unix.SYS_GETPID: seccomp.MatchAll{},
    92  				unix.SYS_KILL: seccomp.PerArg{
    93  					seccomp.AnyValue{},
    94  					seccomp.EqualTo(unix.SIGSTOP),
    95  				},
    96  
    97  				// Injected to support the address space operations.
    98  				unix.SYS_MMAP:   seccomp.MatchAll{},
    99  				unix.SYS_MUNMAP: seccomp.MatchAll{},
   100  
   101  				// For sysmsg threads. Look at sysmsg/sighandler.c for more details.
   102  				unix.SYS_RT_SIGRETURN: seccomp.MatchAll{},
   103  				unix.SYS_SCHED_YIELD:  seccomp.MatchAll{},
   104  				unix.SYS_FUTEX: seccomp.Or{
   105  					seccomp.PerArg{
   106  						seccomp.AnyValue{},
   107  						seccomp.EqualTo(linux.FUTEX_WAIT),
   108  						seccomp.AnyValue{},
   109  						seccomp.AnyValue{},
   110  					},
   111  					seccomp.PerArg{
   112  						seccomp.AnyValue{},
   113  						seccomp.EqualTo(linux.FUTEX_WAKE),
   114  						seccomp.AnyValue{},
   115  						seccomp.AnyValue{},
   116  					},
   117  				},
   118  				unix.SYS_SIGALTSTACK: seccomp.MatchAll{},
   119  				unix.SYS_TKILL: seccomp.PerArg{
   120  					seccomp.AnyValue{},
   121  					seccomp.EqualTo(unix.SIGSTOP),
   122  				},
   123  				unix.SYS_GETTID: seccomp.MatchAll{},
   124  				seccomp.SYS_SECCOMP: seccomp.PerArg{
   125  					seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER),
   126  					seccomp.EqualTo(0),
   127  					seccomp.AnyValue{},
   128  				},
   129  			}),
   130  			Action: linux.SECCOMP_RET_ALLOW,
   131  		}
   132  		rules = append(rules, ruleSet)
   133  		rules = appendArchSeccompRules(rules)
   134  	}
   135  	instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{
   136  		DefaultAction: defaultAction,
   137  		BadArchAction: defaultAction,
   138  	})
   139  	if err != nil {
   140  		return nil, err
   141  	}
   142  
   143  	return forkStub(flags, instrs)
   144  }
   145  
// forkStub clones a new stub process using the given clone flags and, in the
// child, installs the seccomp program instrs before entering the stub code.
//
// In the parent it waits for the child to stop with SIGSTOP, attaches to it,
// records its initial registers, issues a munmap through the child thread
// for the range that starts at stubROMapEnd and is
// maximumUserAddress - stubROMapEnd bytes long, and returns the child's first
// thread. Any failure along the way is returned as an error.
//
// The child path never returns: on any setup failure it exits with the errno
// as its exit status, otherwise it ends in stubCall.
//
// In the child, this function must not acquire any locks, because they might
// have been locked at the time of the fork. This means no rescheduling, no
// malloc calls, and no new stack segments.  For the same reason the compiler
// does not race-instrument it.
//
//go:norace
func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) {
	// Declare all variables up front in order to ensure that there's no
	// need for allocations between beforeFork & afterFork.
	var (
		pid   uintptr
		ppid  uintptr
		errno unix.Errno
	)

	// Remember our own pid, which will be the child's ppid, for the
	// pdeathsig race. It is handed to the stub entry point below.
	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)

	// Among other things, beforeFork masks all signals.
	beforeFork()

	// Do the clone.
	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
	if errno != 0 {
		// Clone failed, so we are still the only process; undo beforeFork
		// before reporting the error.
		afterFork()
		return nil, errno
	}

	// Is this the parent?
	if pid != 0 {
		// Among other things, restore signal mask.
		afterFork()

		// Initialize the first thread. The child is a new process, so
		// its thread group id and thread id are both the returned pid.
		t := &thread{
			tgid: int32(pid),
			tid:  int32(pid),
		}
		// The child is expected to stop with SIGSTOP; anything else
		// is treated as a failure.
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
		}
		if err := t.attach(); err != nil {
			return nil, err
		}
		t.grabInitRegs()
		// Unmap everything from stubROMapEnd up to maximumUserAddress
		// in the child.
		_, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP,
			arch.SyscallArgument{Value: stubROMapEnd},
			arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd})
		if err != nil {
			return nil, err
		}

		return t, nil
	}

	// Everything below runs in the child only. Failures cannot be
	// returned to anyone, so each one exits with the errno as the status.

	// Move the stub to a new session (and thus a new process group). This
	// prevents the stub from getting PTY job control signals intended only
	// for the sentry process. We must call this before restoring signal
	// mask.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// afterForkInChild resets all signals to their default dispositions
	// and restores the signal mask to its pre-fork state.
	afterForkInChild()

	if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Explicitly unmask all signals to ensure that the tracer can see
	// them.
	if errno := unmaskAllSignals(); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Set an aggressive BPF filter for the stub and all its children. The
	// program is built by the caller and passed in as instrs.
	if errno := seccomp.SetFilterInChild(instrs); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Enable cpuid-faulting.
	enableCpuidFault()

	// Call the stub; should not return.
	stubCall(stubInitProcess, ppid)
	panic("unreachable")
}
   236  
   237  // createStub creates a stub processes as a child of an existing subprocesses.
   238  //
   239  // Precondition: the runtime OS thread must be locked.
   240  func (t *thread) createStub() (*thread, error) {
   241  	// There's no need to lock the runtime thread here, as this can only be
   242  	// called from a context that is already locked.
   243  
   244  	// Pass the expected PPID to the child via R15.
   245  	regs := t.initRegs
   246  	initChildProcessPPID(&regs, t.tgid)
   247  
   248  	// Call fork in a subprocess.
   249  	//
   250  	// The new child must set up PDEATHSIG to ensure it dies if this
   251  	// process dies. Since this process could die at any time, this cannot
   252  	// be done via instrumentation from here.
   253  	//
   254  	// Instead, we create the child untraced, which will do the PDEATHSIG
   255  	// setup and then SIGSTOP itself for our attach below.
   256  	//
   257  	// See above re: SIGKILL.
   258  	pid, err := t.syscallIgnoreInterrupt(
   259  		&regs,
   260  		unix.SYS_CLONE,
   261  		arch.SyscallArgument{Value: uintptr(unix.CLONE_FILES | unix.CLONE_PARENT | uintptr(unix.SIGCHLD))},
   262  		arch.SyscallArgument{Value: 0},
   263  		arch.SyscallArgument{Value: 0},
   264  		arch.SyscallArgument{Value: 0},
   265  		arch.SyscallArgument{Value: 0},
   266  		arch.SyscallArgument{Value: 0})
   267  	if err != nil {
   268  		return nil, fmt.Errorf("creating stub process: %v", err)
   269  	}
   270  
   271  	// Wait for child to enter group-stop, so we don't stop its
   272  	// bootstrapping work with t.attach below.
   273  	//
   274  	// We unfortunately don't have a handy part of memory to write the wait
   275  	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
   276  	// If the child actually exited, the attach below will fail.
   277  	_, err = unix.Wait4(int(pid), nil, unix.WALL|unix.WUNTRACED, nil)
   278  	if err != nil {
   279  		return nil, fmt.Errorf("waiting on stub process: %v", err)
   280  	}
   281  
   282  	childT := &thread{
   283  		tgid: int32(pid),
   284  		tid:  int32(pid),
   285  	}
   286  
   287  	return childT, nil
   288  }
   289  
   290  func (s *subprocess) createStub() (*thread, error) {
   291  	req := requestStub{}
   292  	req.done = make(chan *thread, 1)
   293  	s.requests <- req
   294  
   295  	childT := <-req.done
   296  	if childT == nil {
   297  		return nil, fmt.Errorf("createStub: failed to get clone")
   298  	}
   299  	if err := childT.attach(); err != nil {
   300  		return nil, err
   301  	}
   302  	childT.grabInitRegs()
   303  
   304  	return childT, nil
   305  }