github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/ptrace/subprocess_linux.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // +build linux
    16  
    17  package ptrace
    18  
    19  import (
    20  	"fmt"
    21  
    22  	"golang.org/x/sys/unix"
    23  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    24  	"github.com/SagerNet/gvisor/pkg/log"
    25  	"github.com/SagerNet/gvisor/pkg/procid"
    26  	"github.com/SagerNet/gvisor/pkg/seccomp"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    28  )
    29  
// syscallEvent is a signal value with only bit 7 (0x80) set.
//
// NOTE(review): presumably this is the bit the kernel ORs into SIGTRAP for
// syscall stops when PTRACE_O_TRACESYSGOOD is enabled; the code that consumes
// this constant is not in this file, so confirm against its users.
const syscallEvent unix.Signal = 0x80
    31  
    32  // createStub creates a fresh stub processes.
    33  //
    34  // Precondition: the runtime OS thread must be locked.
    35  func createStub() (*thread, error) {
    36  	// The exact interactions of ptrace and seccomp are complex, and
    37  	// changed in recent kernel versions. Before commit 93e35efb8de45, the
    38  	// seccomp check is done before the ptrace emulation check. This means
    39  	// that any calls not matching this list will trigger the seccomp
    40  	// default action instead of notifying ptrace.
    41  	//
    42  	// After commit 93e35efb8de45, the seccomp check is done after the
    43  	// ptrace emulation check. This simplifies using SYSEMU, since seccomp
    44  	// will never run for emulation. Seccomp will only run for injected
    45  	// system calls, and thus we can use RET_KILL as our violation action.
    46  	var defaultAction linux.BPFAction
    47  	if probeSeccomp() {
    48  		log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
    49  		defaultAction = linux.SECCOMP_RET_KILL_THREAD
    50  	} else {
    51  		// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
    52  		log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
    53  		defaultAction = linux.SECCOMP_RET_ALLOW
    54  	}
    55  
    56  	// When creating the new child process, we specify SIGKILL as the
    57  	// signal to deliver when the child exits. We never expect a subprocess
    58  	// to exit; they are pooled and reused. This is done to ensure that if
    59  	// a subprocess is OOM-killed, this process (and all other stubs,
    60  	// transitively) will be killed as well. It's simply not possible to
    61  	// safely handle a single stub getting killed: the exact state of
    62  	// execution is unknown and not recoverable.
    63  	//
    64  	// In addition, we set the PTRACE_O_TRACEEXIT option to log more
    65  	// information about a stub process when it receives a fatal signal.
    66  	return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction)
    67  }
    68  
// attachedThread clones a new stub child and returns a thread attached to it.
//
// flags is passed verbatim as the first argument of the raw clone syscall.
// defaultAction is the seccomp action used for the stub's filter: when it is
// anything other than SECCOMP_RET_ALLOW, an explicit allow list of syscalls is
// installed in front of it; it is also handed to appendArchSeccompRules and
// passed as both trailing action arguments of seccomp.BuildProgram.
//
// In the parent, the function waits for the child to stop with SIGSTOP,
// attaches to it, records its initial registers and returns the new thread.
// An error is returned if the seccomp program cannot be built, if clone
// fails (the errno is returned), or if the child's first stop is not SIGSTOP.
//
// In the child, the function never returns: it sets up the session, signal
// state and seccomp filter and then jumps into the stub via stubCall. Any
// failure in the child terminates it with the errno as exit status.
//
// Precondition: the runtime OS thread must be locked.
func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
	// Create a BPF program that allows only the system calls needed by the
	// stub and all its children. This is used to create child stubs
	// (below), so we must include the ability to fork, but otherwise lock
	// down available calls only to what is needed.
	//
	// When defaultAction is SECCOMP_RET_ALLOW the allow list is skipped
	// entirely, since everything would be allowed anyway.
	rules := []seccomp.RuleSet{}
	if defaultAction != linux.SECCOMP_RET_ALLOW {
		rules = append(rules, seccomp.RuleSet{
			Rules: seccomp.SyscallRules{
				unix.SYS_CLONE: []seccomp.Rule{
					// Allow creation of new subprocesses (used by the master).
					{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)},
					// Allow creation of new threads within a single address space (used by address spaces).
					{seccomp.EqualTo(
						unix.CLONE_FILES |
							unix.CLONE_FS |
							unix.CLONE_SIGHAND |
							unix.CLONE_THREAD |
							unix.CLONE_PTRACE |
							unix.CLONE_VM)},
				},

				// For the initial process creation.
				unix.SYS_WAIT4: {},
				unix.SYS_EXIT:  {},

				// For the stub prctl dance (all).
				unix.SYS_PRCTL: []seccomp.Rule{
					{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
				},
				unix.SYS_GETPPID: {},

				// For the stub to stop itself (all).
				unix.SYS_GETPID: {},
				unix.SYS_KILL: []seccomp.Rule{
					{seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)},
				},

				// Injected to support the address space operations.
				unix.SYS_MMAP:   {},
				unix.SYS_MUNMAP: {},
			},
			Action: linux.SECCOMP_RET_ALLOW,
		})
	}
	rules = appendArchSeccompRules(rules, defaultAction)
	// The program is built before the fork so that the child only has to
	// install the already-prepared instructions.
	instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
	if err != nil {
		return nil, err
	}

	// Declare all variables up front in order to ensure that there's no
	// need for allocations between beforeFork & afterFork.
	var (
		pid   uintptr
		ppid  uintptr
		errno unix.Errno
	)

	// Remember the current ppid for the pdeathsig race. This is the pid of
	// the calling process, i.e. the parent of the child created below; it
	// is handed to the stub via stubCall.
	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)

	// Among other things, beforeFork masks all signals.
	beforeFork()

	// Do the clone.
	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
	if errno != 0 {
		// Clone failed, so we are still the only process: undo
		// beforeFork before reporting the errno.
		afterFork()
		return nil, errno
	}

	// Is this the parent?
	if pid != 0 {
		// Among other things, restore signal mask.
		afterFork()

		// Initialize the first thread. The child was cloned as a new
		// process, so its thread group ID and thread ID are both pid.
		t := &thread{
			tgid: int32(pid),
			tid:  int32(pid),
			cpu:  ^uint32(0),
		}
		// The first stop observed for the child must be SIGSTOP;
		// anything else is reported as an error.
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
		}
		t.attach()
		t.grabInitRegs()

		return t, nil
	}

	// Everything below runs in the child only. Errors are reported by
	// exiting with the errno through a raw exit syscall.

	// Move the stub to a new session (and thus a new process group). This
	// prevents the stub from getting PTY job control signals intended only
	// for the sentry process. We must call this before restoring signal
	// mask.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// afterForkInChild resets all signals to their default dispositions
	// and restores the signal mask to its pre-fork state.
	afterForkInChild()

	// Explicitly unmask all signals to ensure that the tracer can see
	// them.
	if errno := unmaskAllSignals(); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Set an aggressive BPF filter for the stub and all its children. See
	// the description of the BPF program built above.
	if errno := seccomp.SetFilter(instrs); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Enable cpuid-faulting.
	enableCpuidFault()

	// Call the stub; should not return.
	stubCall(stubStart, ppid)
	panic("unreachable")
}
   195  
   196  // createStub creates a stub processes as a child of an existing subprocesses.
   197  //
   198  // Precondition: the runtime OS thread must be locked.
   199  func (s *subprocess) createStub() (*thread, error) {
   200  	// There's no need to lock the runtime thread here, as this can only be
   201  	// called from a context that is already locked.
   202  	currentTID := int32(procid.Current())
   203  	t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
   204  
   205  	// Pass the expected PPID to the child via R15.
   206  	regs := t.initRegs
   207  	initChildProcessPPID(&regs, t.tgid)
   208  
   209  	// Call fork in a subprocess.
   210  	//
   211  	// The new child must set up PDEATHSIG to ensure it dies if this
   212  	// process dies. Since this process could die at any time, this cannot
   213  	// be done via instrumentation from here.
   214  	//
   215  	// Instead, we create the child untraced, which will do the PDEATHSIG
   216  	// setup and then SIGSTOP itself for our attach below.
   217  	//
   218  	// See above re: SIGKILL.
   219  	pid, err := t.syscallIgnoreInterrupt(
   220  		&regs,
   221  		unix.SYS_CLONE,
   222  		arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)},
   223  		arch.SyscallArgument{Value: 0},
   224  		arch.SyscallArgument{Value: 0},
   225  		arch.SyscallArgument{Value: 0},
   226  		arch.SyscallArgument{Value: 0},
   227  		arch.SyscallArgument{Value: 0})
   228  	if err != nil {
   229  		return nil, fmt.Errorf("creating stub process: %v", err)
   230  	}
   231  
   232  	// Wait for child to enter group-stop, so we don't stop its
   233  	// bootstrapping work with t.attach below.
   234  	//
   235  	// We unfortunately don't have a handy part of memory to write the wait
   236  	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
   237  	// If the child actually exited, the attach below will fail.
   238  	_, err = t.syscallIgnoreInterrupt(
   239  		&t.initRegs,
   240  		unix.SYS_WAIT4,
   241  		arch.SyscallArgument{Value: uintptr(pid)},
   242  		arch.SyscallArgument{Value: 0},
   243  		arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED},
   244  		arch.SyscallArgument{Value: 0},
   245  		arch.SyscallArgument{Value: 0},
   246  		arch.SyscallArgument{Value: 0})
   247  	if err != nil {
   248  		return nil, fmt.Errorf("waiting on stub process: %v", err)
   249  	}
   250  
   251  	childT := &thread{
   252  		tgid: int32(pid),
   253  		tid:  int32(pid),
   254  		cpu:  ^uint32(0),
   255  	}
   256  	childT.attach()
   257  
   258  	return childT, nil
   259  }