github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/ptrace/subprocess_linux.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package ptrace
    19  
    20  import (
    21  	"fmt"
    22  
    23  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    24  	"github.com/MerlinKodo/gvisor/pkg/hosttid"
    25  	"github.com/MerlinKodo/gvisor/pkg/log"
    26  	"github.com/MerlinKodo/gvisor/pkg/seccomp"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    28  	"golang.org/x/sys/unix"
    29  )
    30  
    31  const syscallEvent unix.Signal = 0x80
    32  
    33  // createStub creates a fresh stub processes.
    34  //
    35  // Precondition: the runtime OS thread must be locked.
    36  func createStub() (*thread, error) {
    37  	// The exact interactions of ptrace and seccomp are complex, and
    38  	// changed in recent kernel versions. Before commit 93e35efb8de45, the
    39  	// seccomp check is done before the ptrace emulation check. This means
    40  	// that any calls not matching this list will trigger the seccomp
    41  	// default action instead of notifying ptrace.
    42  	//
    43  	// After commit 93e35efb8de45, the seccomp check is done after the
    44  	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
    45  	// will never run for emulation. Seccomp will only run for injected
    46  	// system calls, and thus we can use RET_KILL as our violation action.
    47  	var defaultAction linux.BPFAction
    48  	if probeSeccomp() {
    49  		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
    50  		defaultAction = linux.SECCOMP_RET_KILL_THREAD
    51  	} else {
    52  		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
    53  		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
    54  		defaultAction = linux.SECCOMP_RET_ALLOW
    55  	}
    56  
    57  	// When creating the new child process, we specify SIGKILL as the
    58  	// signal to deliver when the child exits. We never expect a subprocess
    59  	// to exit; they are pooled and reused. This is done to ensure that if
    60  	// a subprocess is OOM-killed, this process (and all other stubs,
    61  	// transitively) will be killed as well. It's simply not possible to
    62  	// safely handle a single stub getting killed: the exact state of
    63  	// execution is unknown and not recoverable.
    64  	//
    65  	// In addition, we set the PTRACE_O_TRACEEXIT option to log more
    66  	// information about a stub process when it receives a fatal signal.
    67  	return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction)
    68  }
    69  
    70  // attachedThread returns a new attached thread.
    71  //
    72  // Precondition: the runtime OS thread must be locked.
    73  func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
    74  	// Create a BPF program that allows only the system calls needed by the
    75  	// stub and all its children. This is used to create child stubs
    76  	// (below), so we must include the ability to fork, but otherwise lock
    77  	// down available calls only to what is needed.
    78  	rules := []seccomp.RuleSet{}
    79  	if defaultAction != linux.SECCOMP_RET_ALLOW {
    80  		rules = append(rules, seccomp.RuleSet{
    81  			Rules: seccomp.SyscallRules{
    82  				unix.SYS_CLONE: []seccomp.Rule{
    83  					// Allow creation of new subprocesses (used by the master).
    84  					{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)},
    85  					// Allow creation of new threads within a single address space (used by addresss spaces).
    86  					{seccomp.EqualTo(
    87  						unix.CLONE_FILES |
    88  							unix.CLONE_FS |
    89  							unix.CLONE_SIGHAND |
    90  							unix.CLONE_THREAD |
    91  							unix.CLONE_PTRACE |
    92  							unix.CLONE_VM)},
    93  				},
    94  
    95  				// For the initial process creation.
    96  				unix.SYS_WAIT4: {},
    97  				unix.SYS_EXIT:  {},
    98  
    99  				// For the stub prctl dance (all).
   100  				unix.SYS_PRCTL: []seccomp.Rule{
   101  					{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
   102  				},
   103  				unix.SYS_GETPPID: {},
   104  
   105  				// For the stub to stop itself (all).
   106  				unix.SYS_GETPID: {},
   107  				unix.SYS_KILL: []seccomp.Rule{
   108  					{seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)},
   109  				},
   110  
   111  				// Injected to support the address space operations.
   112  				unix.SYS_MMAP:   {},
   113  				unix.SYS_MUNMAP: {},
   114  			},
   115  			Action: linux.SECCOMP_RET_ALLOW,
   116  		})
   117  	}
   118  	rules = appendArchSeccompRules(rules, defaultAction)
   119  	instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
   120  	if err != nil {
   121  		return nil, err
   122  	}
   123  
   124  	return forkStub(flags, instrs)
   125  }
   126  
   127  // In the child, this function must not acquire any locks, because they might
   128  // have been locked at the time of the fork. This means no rescheduling, no
   129  // malloc calls, and no new stack segments.  For the same reason compiler does
   130  // not race instrument it.
   131  //
   132  //go:norace
   133  func forkStub(flags uintptr, instrs []linux.BPFInstruction) (*thread, error) {
   134  	// Declare all variables up front in order to ensure that there's no
   135  	// need for allocations between beforeFork & afterFork.
   136  	var (
   137  		pid   uintptr
   138  		ppid  uintptr
   139  		errno unix.Errno
   140  	)
   141  
   142  	// Remember the current ppid for the pdeathsig race.
   143  	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)
   144  
   145  	// Among other things, beforeFork masks all signals.
   146  	beforeFork()
   147  
   148  	// Do the clone.
   149  	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
   150  	if errno != 0 {
   151  		afterFork()
   152  		return nil, errno
   153  	}
   154  
   155  	// Is this the parent?
   156  	if pid != 0 {
   157  		// Among other things, restore signal mask.
   158  		afterFork()
   159  
   160  		// Initialize the first thread.
   161  		t := &thread{
   162  			tgid: int32(pid),
   163  			tid:  int32(pid),
   164  			cpu:  ^uint32(0),
   165  		}
   166  		if sig := t.wait(stopped); sig != unix.SIGSTOP {
   167  			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
   168  		}
   169  		t.attach()
   170  		t.grabInitRegs()
   171  
   172  		return t, nil
   173  	}
   174  
   175  	// Move the stub to a new session (and thus a new process group). This
   176  	// prevents the stub from getting PTY job control signals intended only
   177  	// for the sentry process. We must call this before restoring signal
   178  	// mask.
   179  	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
   180  		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
   181  	}
   182  
   183  	// afterForkInChild resets all signals to their default dispositions
   184  	// and restores the signal mask to its pre-fork state.
   185  	afterForkInChild()
   186  
   187  	// Explicitly unmask all signals to ensure that the tracer can see
   188  	// them.
   189  	if errno := unmaskAllSignals(); errno != 0 {
   190  		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
   191  	}
   192  
   193  	// Set an aggressive BPF filter for the stub and all it's children. See
   194  	// the description of the BPF program built above.
   195  	if errno := seccomp.SetFilterInChild(instrs); errno != 0 {
   196  		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
   197  	}
   198  
   199  	// Enable cpuid-faulting.
   200  	enableCpuidFault()
   201  
   202  	// Call the stub; should not return.
   203  	stubCall(stubStart, ppid)
   204  	panic("unreachable")
   205  }
   206  
   207  // createStub creates a stub processes as a child of an existing subprocesses.
   208  //
   209  // Precondition: the runtime OS thread must be locked.
   210  func (s *subprocess) createStub() (*thread, error) {
   211  	// There's no need to lock the runtime thread here, as this can only be
   212  	// called from a context that is already locked.
   213  	currentTID := int32(hosttid.Current())
   214  	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
   215  
   216  	// Pass the expected PPID to the child via R15.
   217  	regs := t.initRegs
   218  	initChildProcessPPID(&regs, t.tgid)
   219  
   220  	// Call fork in a subprocess.
   221  	//
   222  	// The new child must set up PDEATHSIG to ensure it dies if this
   223  	// process dies. Since this process could die at any time, this cannot
   224  	// be done via instrumentation from here.
   225  	//
   226  	// Instead, we create the child untraced, which will do the PDEATHSIG
   227  	// setup and then SIGSTOP itself for our attach below.
   228  	//
   229  	// See above re: SIGKILL.
   230  	pid, err := t.syscallIgnoreInterrupt(
   231  		&regs,
   232  		unix.SYS_CLONE,
   233  		arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)},
   234  		arch.SyscallArgument{Value: 0},
   235  		arch.SyscallArgument{Value: 0},
   236  		arch.SyscallArgument{Value: 0},
   237  		arch.SyscallArgument{Value: 0},
   238  		arch.SyscallArgument{Value: 0})
   239  	if err != nil {
   240  		return nil, fmt.Errorf("creating stub process: %v", err)
   241  	}
   242  
   243  	// Wait for child to enter group-stop, so we don't stop its
   244  	// bootstrapping work with t.attach below.
   245  	//
   246  	// We unfortunately don't have a handy part of memory to write the wait
   247  	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
   248  	// If the child actually exited, the attach below will fail.
   249  	_, err = t.syscallIgnoreInterrupt(
   250  		&t.initRegs,
   251  		unix.SYS_WAIT4,
   252  		arch.SyscallArgument{Value: uintptr(pid)},
   253  		arch.SyscallArgument{Value: 0},
   254  		arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED},
   255  		arch.SyscallArgument{Value: 0},
   256  		arch.SyscallArgument{Value: 0},
   257  		arch.SyscallArgument{Value: 0})
   258  	if err != nil {
   259  		return nil, fmt.Errorf("waiting on stub process: %v", err)
   260  	}
   261  
   262  	childT := &thread{
   263  		tgid: int32(pid),
   264  		tid:  int32(pid),
   265  		cpu:  ^uint32(0),
   266  	}
   267  	childT.attach()
   268  
   269  	return childT, nil
   270  }