gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/subprocess_linux.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package systrap
    19  
    20  import (
    21  	"fmt"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"gvisor.dev/gvisor/pkg/abi/linux"
    25  	"gvisor.dev/gvisor/pkg/bpf"
    26  	"gvisor.dev/gvisor/pkg/seccomp"
    27  	"gvisor.dev/gvisor/pkg/sentry/arch"
    28  )
    29  
// syscallEvent is a signal value with only bit 7 (0x80) set.
//
// NOTE(review): presumably this is the marker that ptrace ORs into the stop
// signal for syscall-stops when PTRACE_O_TRACESYSGOOD is enabled. It is not
// referenced in this part of the file, so confirm against its users.
const syscallEvent unix.Signal = 0x80
    31  
    32  // createStub creates a fresh stub processes.
    33  //
    34  // Precondition: the runtime OS thread must be locked.
    35  func createStub() (*thread, error) {
    36  	// When creating the new child process, we specify SIGKILL as the
    37  	// signal to deliver when the child exits. We never expect a subprocess
    38  	// to exit; they are pooled and reused. This is done to ensure that if
    39  	// a subprocess is OOM-killed, this process (and all other stubs,
    40  	// transitively) will be killed as well. It's simply not possible to
    41  	// safely handle a single stub getting killed: the exact state of
    42  	// execution is unknown and not recoverable.
    43  	return attachedThread(unix.CLONE_FILES|uintptr(unix.SIGCHLD), linux.SECCOMP_RET_TRAP)
    44  }
    45  
    46  // attachedThread returns a new attached thread.
    47  //
    48  // Precondition: the runtime OS thread must be locked.
    49  func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
    50  	// Create a BPF program that allows only the system calls needed by the
    51  	// stub and all its children. This is used to create child stubs
    52  	// (below), so we must include the ability to fork, but otherwise lock
    53  	// down available calls only to what is needed.
    54  	rules := []seccomp.RuleSet{}
    55  	if defaultAction != linux.SECCOMP_RET_ALLOW {
    56  		ruleSet := seccomp.RuleSet{
    57  			Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
    58  				unix.SYS_CLONE: seccomp.Or{
    59  					// Allow creation of new subprocesses (used by the master).
    60  					seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.CLONE_PARENT | unix.SIGCHLD)},
    61  					seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGCHLD)},
    62  					// Allow creation of new sysmsg thread.
    63  					seccomp.PerArg{seccomp.EqualTo(
    64  						unix.CLONE_FILES |
    65  							unix.CLONE_FS |
    66  							unix.CLONE_VM |
    67  							unix.CLONE_PTRACE |
    68  							linux.SIGKILL)},
    69  					// Allow creation of new threads within a single address space (used by address spaces).
    70  					seccomp.PerArg{seccomp.EqualTo(
    71  						unix.CLONE_FILES |
    72  							unix.CLONE_FS |
    73  							unix.CLONE_SIGHAND |
    74  							unix.CLONE_THREAD |
    75  							unix.CLONE_PTRACE |
    76  							unix.CLONE_VM)},
    77  				},
    78  
    79  				// For the initial process creation.
    80  				unix.SYS_WAIT4: seccomp.MatchAll{},
    81  				unix.SYS_EXIT:  seccomp.MatchAll{},
    82  
    83  				// For the stub prctl dance (all).
    84  				unix.SYS_PRCTL: seccomp.Or{
    85  					seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
    86  					seccomp.PerArg{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)},
    87  				},
    88  				unix.SYS_GETPPID: seccomp.MatchAll{},
    89  
    90  				// For the stub to stop itself (all).
    91  				unix.SYS_GETPID: seccomp.MatchAll{},
    92  				unix.SYS_KILL: seccomp.PerArg{
    93  					seccomp.AnyValue{},
    94  					seccomp.EqualTo(unix.SIGSTOP),
    95  				},
    96  
    97  				// Injected to support the address space operations.
    98  				unix.SYS_MMAP:   seccomp.MatchAll{},
    99  				unix.SYS_MUNMAP: seccomp.MatchAll{},
   100  
   101  				// For sysmsg threads. Look at sysmsg/sighandler.c for more details.
   102  				unix.SYS_RT_SIGRETURN: seccomp.MatchAll{},
   103  				unix.SYS_SCHED_YIELD:  seccomp.MatchAll{},
   104  				unix.SYS_FUTEX: seccomp.Or{
   105  					seccomp.PerArg{
   106  						seccomp.AnyValue{},
   107  						seccomp.EqualTo(linux.FUTEX_WAIT),
   108  						seccomp.AnyValue{},
   109  						seccomp.AnyValue{},
   110  					},
   111  					seccomp.PerArg{
   112  						seccomp.AnyValue{},
   113  						seccomp.EqualTo(linux.FUTEX_WAKE),
   114  						seccomp.AnyValue{},
   115  						seccomp.AnyValue{},
   116  					},
   117  				},
   118  				unix.SYS_SIGALTSTACK: seccomp.MatchAll{},
   119  				unix.SYS_TKILL: seccomp.PerArg{
   120  					seccomp.AnyValue{},
   121  					seccomp.EqualTo(unix.SIGSTOP),
   122  				},
   123  				unix.SYS_GETTID:     seccomp.MatchAll{},
   124  				unix.SYS_EXIT_GROUP: seccomp.MatchAll{},
   125  				seccomp.SYS_SECCOMP: seccomp.Or{
   126  					seccomp.PerArg{
   127  						seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER),
   128  						seccomp.EqualTo(0),
   129  						seccomp.AnyValue{},
   130  					},
   131  					seccomp.PerArg{
   132  						seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER),
   133  						seccomp.EqualTo(linux.SECCOMP_FILTER_FLAG_NEW_LISTENER),
   134  						seccomp.AnyValue{},
   135  					},
   136  				},
   137  			}),
   138  			Action: linux.SECCOMP_RET_ALLOW,
   139  		}
   140  		rules = append(rules, ruleSet)
   141  		rules = appendArchSeccompRules(rules)
   142  	}
   143  	instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{
   144  		DefaultAction: defaultAction,
   145  		BadArchAction: defaultAction,
   146  	})
   147  	if err != nil {
   148  		return nil, err
   149  	}
   150  
   151  	return forkStub(flags, instrs)
   152  }
   153  
// forkStub clones a new stub process with the given clone flags and, in the
// child, installs the seccomp program instrs before jumping into the stub
// code.
//
// In the parent it waits for the child to stop with SIGSTOP, attaches to it,
// captures its initial registers and returns the resulting thread. The child
// branch never returns: it either exits with an errno as its exit status or
// transfers control to the stub via stubCall.
//
// In the child, this function must not acquire any locks, because they might
// have been locked at the time of the fork. This means no rescheduling, no
// malloc calls, and no new stack segments. For the same reason the compiler
// does not race-instrument it.
//
//go:norace
func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) {
	// Declare all variables up front in order to ensure that there's no
	// need for allocations between beforeFork & afterFork.
	var (
		pid   uintptr
		ppid  uintptr
		errno unix.Errno
	)

	// Remember our own pid: it is the parent pid the child expects to see,
	// and it is handed to the stub below to deal with the pdeathsig race.
	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)

	// Among other things, beforeFork masks all signals.
	beforeFork()

	// Do the clone.
	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
	if errno != 0 {
		// No child was created; undo beforeFork and report the errno.
		afterFork()
		return nil, errno
	}

	// Is this the parent?
	if pid != 0 {
		// Among other things, restore signal mask.
		afterFork()

		// Initialize the first thread. The new process has a single
		// thread, so its thread-group id and thread id are both pid.
		t := &thread{
			tgid: int32(pid),
			tid:  int32(pid),
		}
		// NOTE(review): the error returns below leave the cloned child in
		// place; nothing here kills or reaps it. Confirm that callers treat
		// these errors as fatal.
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
		}
		if err := t.attach(); err != nil {
			return nil, err
		}
		t.grabInitRegs()
		// Issue munmap(stubROMapEnd, maximumUserAddress-stubROMapEnd) through
		// the new thread, i.e. drop every mapping from stubROMapEnd up to
		// maximumUserAddress. Presumably this runs in the child's address
		// space; verify against syscallIgnoreInterrupt.
		_, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP,
			arch.SyscallArgument{Value: stubROMapEnd},
			arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd})
		if err != nil {
			return nil, err
		}

		return t, nil
	}

	// Everything below runs in the child. Each failure terminates the child
	// with the errno as its exit status.

	// Move the stub to a new session (and thus a new process group). This
	// prevents the stub from getting PTY job control signals intended only
	// for the sentry process. We must call this before restoring signal
	// mask.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// afterForkInChild resets all signals to their default dispositions
	// and restores the signal mask to its pre-fork state.
	afterForkInChild()

	// Set up the sysmsg signal actions, passing the stub's sysmsg start
	// address.
	if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Explicitly unmask all signals to ensure that the tracer can see
	// them.
	if errno := unmaskAllSignals(); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Set an aggressive BPF filter for the stub and all its children. The
	// program is supplied by the caller in instrs.
	if errno := seccomp.SetFilterInChild(instrs); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Enable cpuid-faulting.
	enableCpuidFault()

	// Call the stub, handing it the expected parent pid; should not return.
	stubCall(stubInitProcess, ppid)
	panic("unreachable")
}
   244  
   245  // createStub creates a stub processes as a child of an existing subprocesses.
   246  //
   247  // Precondition: the runtime OS thread must be locked.
   248  func (t *thread) createStub() (*thread, error) {
   249  	// There's no need to lock the runtime thread here, as this can only be
   250  	// called from a context that is already locked.
   251  
   252  	// Pass the expected PPID to the child via R15.
   253  	regs := t.initRegs
   254  	initChildProcessPPID(&regs, t.tgid)
   255  
   256  	// Call fork in a subprocess.
   257  	//
   258  	// The new child must set up PDEATHSIG to ensure it dies if this
   259  	// process dies. Since this process could die at any time, this cannot
   260  	// be done via instrumentation from here.
   261  	//
   262  	// Instead, we create the child untraced, which will do the PDEATHSIG
   263  	// setup and then SIGSTOP itself for our attach below.
   264  	//
   265  	// See above re: SIGKILL.
   266  	pid, err := t.syscallIgnoreInterrupt(
   267  		&regs,
   268  		unix.SYS_CLONE,
   269  		arch.SyscallArgument{Value: uintptr(unix.CLONE_FILES | unix.CLONE_PARENT | uintptr(unix.SIGCHLD))},
   270  		arch.SyscallArgument{Value: 0},
   271  		arch.SyscallArgument{Value: 0},
   272  		arch.SyscallArgument{Value: 0},
   273  		arch.SyscallArgument{Value: 0},
   274  		arch.SyscallArgument{Value: 0})
   275  	if err != nil {
   276  		return nil, fmt.Errorf("creating stub process: %v", err)
   277  	}
   278  
   279  	// Wait for child to enter group-stop, so we don't stop its
   280  	// bootstrapping work with t.attach below.
   281  	//
   282  	// We unfortunately don't have a handy part of memory to write the wait
   283  	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
   284  	// If the child actually exited, the attach below will fail.
   285  	_, err = unix.Wait4(int(pid), nil, unix.WALL|unix.WUNTRACED, nil)
   286  	if err != nil {
   287  		return nil, fmt.Errorf("waiting on stub process: %v", err)
   288  	}
   289  
   290  	childT := &thread{
   291  		tgid: int32(pid),
   292  		tid:  int32(pid),
   293  	}
   294  
   295  	return childT, nil
   296  }
   297  
   298  func (s *subprocess) createStub() (*thread, error) {
   299  	req := requestStub{}
   300  	req.done = make(chan *thread, 1)
   301  	s.requests <- req
   302  
   303  	childT := <-req.done
   304  	if childT == nil {
   305  		return nil, fmt.Errorf("createStub: failed to get clone")
   306  	}
   307  	if err := childT.attach(); err != nil {
   308  		return nil, err
   309  	}
   310  	childT.grabInitRegs()
   311  
   312  	return childT, nil
   313  }