github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/ptrace/subprocess_linux.go

github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/ptrace/subprocess_linux.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package ptrace
    19  
    20  import (
    21  	"fmt"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/metacubex/gvisor/pkg/abi/linux"
    25  	"github.com/metacubex/gvisor/pkg/bpf"
    26  	"github.com/metacubex/gvisor/pkg/hosttid"
    27  	"github.com/metacubex/gvisor/pkg/log"
    28  	"github.com/metacubex/gvisor/pkg/seccomp"
    29  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    30  )
    31  
// syscallEvent is a signal-typed constant with only bit 7 (0x80) set.
//
// NOTE(review): presumably this is the flag the kernel ORs into SIGTRAP to
// mark syscall stops when PTRACE_O_TRACESYSGOOD is enabled; it is not
// referenced in this part of the file, so confirm against its users.
const syscallEvent unix.Signal = 0x80
    33  
    34  // createStub creates a fresh stub processes.
    35  //
    36  // Precondition: the runtime OS thread must be locked.
    37  func createStub() (*thread, error) {
    38  	// The exact interactions of ptrace and seccomp are complex, and
    39  	// changed in recent kernel versions. Before commit 93e35efb8de45, the
    40  	// seccomp check is done before the ptrace emulation check. This means
    41  	// that any calls not matching this list will trigger the seccomp
    42  	// default action instead of notifying ptrace.
    43  	//
    44  	// After commit 93e35efb8de45, the seccomp check is done after the
    45  	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
    46  	// will never run for emulation. Seccomp will only run for injected
    47  	// system calls, and thus we can use RET_KILL as our violation action.
    48  	var defaultAction linux.BPFAction
    49  	if probeSeccomp() {
    50  		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
    51  		defaultAction = linux.SECCOMP_RET_KILL_THREAD
    52  	} else {
    53  		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
    54  		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
    55  		defaultAction = linux.SECCOMP_RET_ALLOW
    56  	}
    57  
    58  	// When creating the new child process, we specify SIGKILL as the
    59  	// signal to deliver when the child exits. We never expect a subprocess
    60  	// to exit; they are pooled and reused. This is done to ensure that if
    61  	// a subprocess is OOM-killed, this process (and all other stubs,
    62  	// transitively) will be killed as well. It's simply not possible to
    63  	// safely handle a single stub getting killed: the exact state of
    64  	// execution is unknown and not recoverable.
    65  	//
    66  	// In addition, we set the PTRACE_O_TRACEEXIT option to log more
    67  	// information about a stub process when it receives a fatal signal.
    68  	return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction)
    69  }
    70  
    71  // attachedThread returns a new attached thread.
    72  //
    73  // Precondition: the runtime OS thread must be locked.
    74  func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
    75  	// Create a BPF program that allows only the system calls needed by the
    76  	// stub and all its children. This is used to create child stubs
    77  	// (below), so we must include the ability to fork, but otherwise lock
    78  	// down available calls only to what is needed.
    79  	rules := []seccomp.RuleSet{}
    80  	if defaultAction != linux.SECCOMP_RET_ALLOW {
    81  		rules = append(rules, seccomp.RuleSet{
    82  			Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{
    83  				unix.SYS_CLONE: seccomp.Or{
    84  					// Allow creation of new subprocesses (used by the master).
    85  					seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)},
    86  					// Allow creation of new threads within a single address space (used by address spaces).
    87  					seccomp.PerArg{
    88  						seccomp.EqualTo(
    89  							unix.CLONE_FILES |
    90  								unix.CLONE_FS |
    91  								unix.CLONE_SIGHAND |
    92  								unix.CLONE_THREAD |
    93  								unix.CLONE_PTRACE |
    94  								unix.CLONE_VM)},
    95  				},
    96  
    97  				// For the initial process creation.
    98  				unix.SYS_WAIT4: seccomp.MatchAll{},
    99  				unix.SYS_EXIT:  seccomp.MatchAll{},
   100  
   101  				// For the stub prctl dance (all).
   102  				unix.SYS_PRCTL:   seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
   103  				unix.SYS_GETPPID: seccomp.MatchAll{},
   104  
   105  				// For the stub to stop itself (all).
   106  				unix.SYS_GETPID: seccomp.MatchAll{},
   107  				unix.SYS_KILL:   seccomp.PerArg{seccomp.AnyValue{}, seccomp.EqualTo(unix.SIGSTOP)},
   108  
   109  				// Injected to support the address space operations.
   110  				unix.SYS_MMAP:   seccomp.MatchAll{},
   111  				unix.SYS_MUNMAP: seccomp.MatchAll{},
   112  			}),
   113  			Action: linux.SECCOMP_RET_ALLOW,
   114  		})
   115  	}
   116  	rules = appendArchSeccompRules(rules, defaultAction)
   117  	instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{
   118  		DefaultAction: defaultAction,
   119  		BadArchAction: defaultAction,
   120  	})
   121  	if err != nil {
   122  		return nil, err
   123  	}
   124  
   125  	return forkStub(flags, instrs)
   126  }
   127  
// forkStub clones the calling process with the given flags and turns the
// child into a stub running under the seccomp program instrs.
//
// In the parent, it waits for the child to stop with SIGSTOP, attaches to it,
// grabs its initial registers and returns the resulting thread. A clone
// failure is returned as the raw errno; any stop signal other than SIGSTOP is
// reported as an error.
//
// In the child, this function never returns: each setup failure exits with the
// errno as the exit status, and on success control is handed to the stub.
//
// In the child, this function must not acquire any locks, because they might
// have been locked at the time of the fork. This means no rescheduling, no
// malloc calls, and no new stack segments. For the same reason, the compiler
// must not race-instrument it.
//
//go:norace
func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) {
	// Declare all variables up front in order to ensure that there's no
	// need for allocations between beforeFork & afterFork.
	var (
		pid   uintptr
		ppid  uintptr
		errno unix.Errno
	)

	// Remember the current ppid for the pdeathsig race. This is the pid of
	// the calling process, i.e. the parent of the child created below; it
	// is handed to the stub at the end of the child path.
	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)

	// Among other things, beforeFork masks all signals.
	beforeFork()

	// Do the clone. Only the flags argument is set; all other arguments
	// are zero.
	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
	if errno != 0 {
		// No child was created; undo beforeFork and report the errno.
		afterFork()
		return nil, errno
	}

	// Is this the parent?
	if pid != 0 {
		// Among other things, restore signal mask.
		afterFork()

		// Initialize the first thread. The child is single-threaded at
		// this point, so its thread group ID and thread ID are both pid.
		t := &thread{
			tgid: int32(pid),
			tid:  int32(pid),
			cpu:  ^uint32(0), // All bits set. NOTE(review): presumably "no CPU assigned"; confirm.
		}
		// The child is expected to stop itself with SIGSTOP; anything
		// else means the stub did not come up as expected.
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
		}
		t.attach()
		t.grabInitRegs()

		return t, nil
	}

	// Everything below runs in the child only.

	// Move the stub to a new session (and thus a new process group). This
	// prevents the stub from getting PTY job control signals intended only
	// for the sentry process. We must call this before restoring signal
	// mask.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// afterForkInChild resets all signals to their default dispositions
	// and restores the signal mask to its pre-fork state.
	afterForkInChild()

	// Explicitly unmask all signals to ensure that the tracer can see
	// them.
	if errno := unmaskAllSignals(); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Set an aggressive BPF filter for the stub and all its children. The
	// program in instrs is supplied by the caller.
	if errno := seccomp.SetFilterInChild(instrs); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Enable cpuid-faulting.
	enableCpuidFault()

	// Call the stub; should not return. The ppid recorded before the clone
	// is passed along as its argument.
	stubCall(stubStart, ppid)
	panic("unreachable")
}
   207  
   208  // createStub creates a stub processes as a child of an existing subprocesses.
   209  //
   210  // Precondition: the runtime OS thread must be locked.
   211  func (s *subprocess) createStub() (*thread, error) {
   212  	// There's no need to lock the runtime thread here, as this can only be
   213  	// called from a context that is already locked.
   214  	currentTID := int32(hosttid.Current())
   215  	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
   216  
   217  	// Pass the expected PPID to the child via R15.
   218  	regs := t.initRegs
   219  	initChildProcessPPID(&regs, t.tgid)
   220  
   221  	// Call fork in a subprocess.
   222  	//
   223  	// The new child must set up PDEATHSIG to ensure it dies if this
   224  	// process dies. Since this process could die at any time, this cannot
   225  	// be done via instrumentation from here.
   226  	//
   227  	// Instead, we create the child untraced, which will do the PDEATHSIG
   228  	// setup and then SIGSTOP itself for our attach below.
   229  	//
   230  	// See above re: SIGKILL.
   231  	pid, err := t.syscallIgnoreInterrupt(
   232  		&regs,
   233  		unix.SYS_CLONE,
   234  		arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)},
   235  		arch.SyscallArgument{Value: 0},
   236  		arch.SyscallArgument{Value: 0},
   237  		arch.SyscallArgument{Value: 0},
   238  		arch.SyscallArgument{Value: 0},
   239  		arch.SyscallArgument{Value: 0})
   240  	if err != nil {
   241  		return nil, fmt.Errorf("creating stub process: %v", err)
   242  	}
   243  
   244  	// Wait for child to enter group-stop, so we don't stop its
   245  	// bootstrapping work with t.attach below.
   246  	//
   247  	// We unfortunately don't have a handy part of memory to write the wait
   248  	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
   249  	// If the child actually exited, the attach below will fail.
   250  	_, err = t.syscallIgnoreInterrupt(
   251  		&t.initRegs,
   252  		unix.SYS_WAIT4,
   253  		arch.SyscallArgument{Value: uintptr(pid)},
   254  		arch.SyscallArgument{Value: 0},
   255  		arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED},
   256  		arch.SyscallArgument{Value: 0},
   257  		arch.SyscallArgument{Value: 0},
   258  		arch.SyscallArgument{Value: 0})
   259  	if err != nil {
   260  		return nil, fmt.Errorf("waiting on stub process: %v", err)
   261  	}
   262  
   263  	childT := &thread{
   264  		tgid: int32(pid),
   265  		tid:  int32(pid),
   266  		cpu:  ^uint32(0),
   267  	}
   268  	childT.attach()
   269  
   270  	return childT, nil
   271  }