github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/systrap/subprocess_linux.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //go:build linux
    16  // +build linux
    17  
    18  package systrap
    19  
    20  import (
    21  	"fmt"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/seccomp"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    27  )
    28  
// syscallEvent is the signal value 0x80.
//
// NOTE(review): presumably this is the bit that ptrace ORs into SIGTRAP for
// syscall stops when PTRACE_O_TRACESYSGOOD is set; it is not referenced in
// this part of the file, so confirm against its users elsewhere in the
// package.
const syscallEvent unix.Signal = 0x80
    30  
    31  // createStub creates a fresh stub processes.
    32  //
    33  // Precondition: the runtime OS thread must be locked.
    34  func createStub() (*thread, error) {
    35  	// When creating the new child process, we specify SIGKILL as the
    36  	// signal to deliver when the child exits. We never expect a subprocess
    37  	// to exit; they are pooled and reused. This is done to ensure that if
    38  	// a subprocess is OOM-killed, this process (and all other stubs,
    39  	// transitively) will be killed as well. It's simply not possible to
    40  	// safely handle a single stub getting killed: the exact state of
    41  	// execution is unknown and not recoverable.
    42  	return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, linux.SECCOMP_RET_TRAP)
    43  }
    44  
    45  // attachedThread returns a new attached thread.
    46  //
    47  // Precondition: the runtime OS thread must be locked.
    48  func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
    49  	// Create a BPF program that allows only the system calls needed by the
    50  	// stub and all its children. This is used to create child stubs
    51  	// (below), so we must include the ability to fork, but otherwise lock
    52  	// down available calls only to what is needed.
    53  	rules := []seccomp.RuleSet{}
    54  	if defaultAction != linux.SECCOMP_RET_ALLOW {
    55  		ruleSet := seccomp.RuleSet{
    56  			Rules: seccomp.SyscallRules{
    57  				unix.SYS_CLONE: []seccomp.Rule{
    58  					// Allow creation of new subprocesses (used by the master).
    59  					{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)},
    60  					// Allow creation of new sysmsg thread.
    61  					{seccomp.EqualTo(
    62  						unix.CLONE_FILES |
    63  							unix.CLONE_FS |
    64  							unix.CLONE_VM |
    65  							unix.CLONE_PTRACE)},
    66  					// Allow creation of new threads within a single address space (used by addresss spaces).
    67  					{seccomp.EqualTo(
    68  						unix.CLONE_FILES |
    69  							unix.CLONE_FS |
    70  							unix.CLONE_SIGHAND |
    71  							unix.CLONE_THREAD |
    72  							unix.CLONE_PTRACE |
    73  							unix.CLONE_VM)},
    74  				},
    75  
    76  				// For the initial process creation.
    77  				unix.SYS_WAIT4: {},
    78  				unix.SYS_EXIT:  {},
    79  
    80  				// For the stub prctl dance (all).
    81  				unix.SYS_PRCTL: []seccomp.Rule{
    82  					{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
    83  					{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)},
    84  				},
    85  				unix.SYS_GETPPID: {},
    86  
    87  				// For the stub to stop itself (all).
    88  				unix.SYS_GETPID: {},
    89  				unix.SYS_KILL: []seccomp.Rule{
    90  					{seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)},
    91  				},
    92  
    93  				// Injected to support the address space operations.
    94  				unix.SYS_MMAP:   {},
    95  				unix.SYS_MUNMAP: {},
    96  
    97  				// For sysmsg threads. Look at sysmsg/sighandler.c for more details.
    98  				unix.SYS_RT_SIGRETURN: {},
    99  				unix.SYS_SCHED_YIELD:  {},
   100  				unix.SYS_FUTEX: {
   101  					seccomp.Rule{
   102  						seccomp.MatchAny{},
   103  						seccomp.EqualTo(linux.FUTEX_WAIT),
   104  						seccomp.MatchAny{},
   105  						seccomp.MatchAny{},
   106  					},
   107  					seccomp.Rule{
   108  						seccomp.MatchAny{},
   109  						seccomp.EqualTo(linux.FUTEX_WAKE),
   110  						seccomp.MatchAny{},
   111  						seccomp.MatchAny{},
   112  					},
   113  				},
   114  				unix.SYS_SIGALTSTACK: {},
   115  				unix.SYS_TKILL: {
   116  					{seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)},
   117  				},
   118  				unix.SYS_GETTID: {},
   119  				seccomp.SYS_SECCOMP: {
   120  					{seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), seccomp.EqualTo(0), seccomp.MatchAny{}},
   121  				},
   122  			},
   123  			Action: linux.SECCOMP_RET_ALLOW,
   124  		}
   125  		rules = append(rules, ruleSet)
   126  		rules = appendArchSeccompRules(rules)
   127  	}
   128  	instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
   129  	if err != nil {
   130  		return nil, err
   131  	}
   132  
   133  	return forkStub(flags, instrs)
   134  }
   135  
// forkStub clones a new stub process using the given clone flags and installs
// the seccomp program instrs in it.
//
// In the parent, it waits for the child to stop with SIGSTOP, attaches to it,
// grabs its initial registers and issues a munmap of
// [stubROMapEnd, maximumUserAddress) through the new thread, then returns
// that thread. If the clone itself fails, the errno is returned.
//
// In the child, it never returns: the child sets itself up (new session,
// signal state, seccomp filter, cpuid faulting) and jumps into the stub code.
// If any setup step fails, the child exits with the errno as its exit status.
//
// In the child, this function must not acquire any locks, because they might
// have been locked at the time of the fork. This means no rescheduling, no
// malloc calls, and no new stack segments. For the same reason the compiler
// does not race instrument it.
//
//go:norace
func forkStub(flags uintptr, instrs []linux.BPFInstruction) (*thread, error) {
	// Declare all variables up front in order to ensure that there's no
	// need for allocations between beforeFork & afterFork.
	var (
		pid   uintptr
		ppid  uintptr
		errno unix.Errno
	)

	// Remember the current pid; from the child's point of view this is the
	// expected ppid, which is handed to the stub below for the pdeathsig
	// race.
	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)

	// Among other things, beforeFork masks all signals.
	beforeFork()

	// Do the clone.
	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
	if errno != 0 {
		// Still in the (only) parent: undo beforeFork before reporting.
		afterFork()
		return nil, errno
	}

	// Is this the parent?
	if pid != 0 {
		// Among other things, restore signal mask.
		afterFork()

		// Initialize the first thread. The child is a new process, so its
		// thread group ID and thread ID are both the returned pid.
		t := &thread{
			tgid: int32(pid),
			tid:  int32(pid),
		}
		// The child is expected to stop itself with SIGSTOP before we attach.
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
		}
		t.attach()
		t.grabInitRegs()
		// Unmap everything from stubROMapEnd up to maximumUserAddress via a
		// munmap issued through the new thread.
		_, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP,
			arch.SyscallArgument{Value: stubROMapEnd},
			arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd})
		if err != nil {
			return nil, err
		}

		return t, nil
	}

	// Everything below runs in the child only.

	// Move the stub to a new session (and thus a new process group). This
	// prevents the stub from getting PTY job control signals intended only
	// for the sentry process. We must call this before restoring signal
	// mask.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// afterForkInChild resets all signals to their default dispositions
	// and restores the signal mask to its pre-fork state.
	afterForkInChild()

	// Install the sysmsg signal actions, passing the stub's sysmsg start
	// address; exit with the errno on failure.
	if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Explicitly unmask all signals to ensure that the tracer can see
	// them.
	if errno := unmaskAllSignals(); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Set an aggressive BPF filter for the stub and all its children. The
	// program is the one the caller built and passed in as instrs.
	if errno := seccomp.SetFilterInChild(instrs); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Enable cpuid-faulting.
	enableCpuidFault()

	// Call the stub; should not return.
	stubCall(stubInitProcess, ppid)
	panic("unreachable")
}
   224  
   225  // createStub creates a stub processes as a child of an existing subprocesses.
   226  //
   227  // Precondition: the runtime OS thread must be locked.
   228  func (t *thread) createStub() (*thread, error) {
   229  	// There's no need to lock the runtime thread here, as this can only be
   230  	// called from a context that is already locked.
   231  
   232  	// Pass the expected PPID to the child via R15.
   233  	regs := t.initRegs
   234  	initChildProcessPPID(&regs, t.tgid)
   235  
   236  	// Call fork in a subprocess.
   237  	//
   238  	// The new child must set up PDEATHSIG to ensure it dies if this
   239  	// process dies. Since this process could die at any time, this cannot
   240  	// be done via instrumentation from here.
   241  	//
   242  	// Instead, we create the child untraced, which will do the PDEATHSIG
   243  	// setup and then SIGSTOP itself for our attach below.
   244  	//
   245  	// See above re: SIGKILL.
   246  	pid, err := t.syscallIgnoreInterrupt(
   247  		&regs,
   248  		unix.SYS_CLONE,
   249  		arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)},
   250  		arch.SyscallArgument{Value: 0},
   251  		arch.SyscallArgument{Value: 0},
   252  		arch.SyscallArgument{Value: 0},
   253  		arch.SyscallArgument{Value: 0},
   254  		arch.SyscallArgument{Value: 0})
   255  	if err != nil {
   256  		return nil, fmt.Errorf("creating stub process: %v", err)
   257  	}
   258  
   259  	// Wait for child to enter group-stop, so we don't stop its
   260  	// bootstrapping work with t.attach below.
   261  	//
   262  	// We unfortunately don't have a handy part of memory to write the wait
   263  	// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
   264  	// If the child actually exited, the attach below will fail.
   265  	_, err = t.syscallIgnoreInterrupt(
   266  		&t.initRegs,
   267  		unix.SYS_WAIT4,
   268  		arch.SyscallArgument{Value: uintptr(pid)},
   269  		arch.SyscallArgument{Value: 0},
   270  		arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED},
   271  		arch.SyscallArgument{Value: 0},
   272  		arch.SyscallArgument{Value: 0},
   273  		arch.SyscallArgument{Value: 0})
   274  	if err != nil {
   275  		return nil, fmt.Errorf("waiting on stub process: %v", err)
   276  	}
   277  
   278  	childT := &thread{
   279  		tgid: int32(pid),
   280  		tid:  int32(pid),
   281  	}
   282  
   283  	return childT, nil
   284  }
   285  
   286  func (s *subprocess) createStub() (*thread, error) {
   287  	req := requestStub{}
   288  	req.done = make(chan *thread, 1)
   289  	s.requests <- req
   290  
   291  	childT := <-req.done
   292  	childT.attach()
   293  	childT.grabInitRegs()
   294  
   295  	return childT, nil
   296  }