github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/ptrace/subprocess_linux.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // +build linux 16 17 package ptrace 18 19 import ( 20 "fmt" 21 22 "golang.org/x/sys/unix" 23 "github.com/SagerNet/gvisor/pkg/abi/linux" 24 "github.com/SagerNet/gvisor/pkg/log" 25 "github.com/SagerNet/gvisor/pkg/procid" 26 "github.com/SagerNet/gvisor/pkg/seccomp" 27 "github.com/SagerNet/gvisor/pkg/sentry/arch" 28 ) 29 30 const syscallEvent unix.Signal = 0x80 31 32 // createStub creates a fresh stub processes. 33 // 34 // Precondition: the runtime OS thread must be locked. 35 func createStub() (*thread, error) { 36 // The exact interactions of ptrace and seccomp are complex, and 37 // changed in recent kernel versions. Before commit 93e35efb8de45, the 38 // seccomp check is done before the ptrace emulation check. This means 39 // that any calls not matching this list will trigger the seccomp 40 // default action instead of notifying ptrace. 41 // 42 // After commit 93e35efb8de45, the seccomp check is done after the 43 // ptrace emulation check. This simplifies using SYSEMU, since seccomp 44 // will never run for emulation. Seccomp will only run for injected 45 // system calls, and thus we can use RET_KILL as our violation action. 46 var defaultAction linux.BPFAction 47 if probeSeccomp() { 48 log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") 49 defaultAction = linux.SECCOMP_RET_KILL_THREAD 50 } else { 51 // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. 52 log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") 53 defaultAction = linux.SECCOMP_RET_ALLOW 54 } 55 56 // When creating the new child process, we specify SIGKILL as the 57 // signal to deliver when the child exits. We never expect a subprocess 58 // to exit; they are pooled and reused. This is done to ensure that if 59 // a subprocess is OOM-killed, this process (and all other stubs, 60 // transitively) will be killed as well. It's simply not possible to 61 // safely handle a single stub getting killed: the exact state of 62 // execution is unknown and not recoverable. 63 // 64 // In addition, we set the PTRACE_O_TRACEEXIT option to log more 65 // information about a stub process when it receives a fatal signal. 66 return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction) 67 } 68 69 // attachedThread returns a new attached thread. 70 // 71 // Precondition: the runtime OS thread must be locked. 72 func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { 73 // Create a BPF program that allows only the system calls needed by the 74 // stub and all its children. This is used to create child stubs 75 // (below), so we must include the ability to fork, but otherwise lock 76 // down available calls only to what is needed. 77 rules := []seccomp.RuleSet{} 78 if defaultAction != linux.SECCOMP_RET_ALLOW { 79 rules = append(rules, seccomp.RuleSet{ 80 Rules: seccomp.SyscallRules{ 81 unix.SYS_CLONE: []seccomp.Rule{ 82 // Allow creation of new subprocesses (used by the master). 83 {seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)}, 84 // Allow creation of new threads within a single address space (used by addresss spaces). 85 {seccomp.EqualTo( 86 unix.CLONE_FILES | 87 unix.CLONE_FS | 88 unix.CLONE_SIGHAND | 89 unix.CLONE_THREAD | 90 unix.CLONE_PTRACE | 91 unix.CLONE_VM)}, 92 }, 93 94 // For the initial process creation. 95 unix.SYS_WAIT4: {}, 96 unix.SYS_EXIT: {}, 97 98 // For the stub prctl dance (all). 99 unix.SYS_PRCTL: []seccomp.Rule{ 100 {seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, 101 }, 102 unix.SYS_GETPPID: {}, 103 104 // For the stub to stop itself (all). 105 unix.SYS_GETPID: {}, 106 unix.SYS_KILL: []seccomp.Rule{ 107 {seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)}, 108 }, 109 110 // Injected to support the address space operations. 111 unix.SYS_MMAP: {}, 112 unix.SYS_MUNMAP: {}, 113 }, 114 Action: linux.SECCOMP_RET_ALLOW, 115 }) 116 } 117 rules = appendArchSeccompRules(rules, defaultAction) 118 instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction) 119 if err != nil { 120 return nil, err 121 } 122 123 // Declare all variables up front in order to ensure that there's no 124 // need for allocations between beforeFork & afterFork. 125 var ( 126 pid uintptr 127 ppid uintptr 128 errno unix.Errno 129 ) 130 131 // Remember the current ppid for the pdeathsig race. 132 ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) 133 134 // Among other things, beforeFork masks all signals. 135 beforeFork() 136 137 // Do the clone. 138 pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) 139 if errno != 0 { 140 afterFork() 141 return nil, errno 142 } 143 144 // Is this the parent? 145 if pid != 0 { 146 // Among other things, restore signal mask. 147 afterFork() 148 149 // Initialize the first thread. 150 t := &thread{ 151 tgid: int32(pid), 152 tid: int32(pid), 153 cpu: ^uint32(0), 154 } 155 if sig := t.wait(stopped); sig != unix.SIGSTOP { 156 return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) 157 } 158 t.attach() 159 t.grabInitRegs() 160 161 return t, nil 162 } 163 164 // Move the stub to a new session (and thus a new process group). This 165 // prevents the stub from getting PTY job control signals intended only 166 // for the sentry process. We must call this before restoring signal 167 // mask. 168 if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { 169 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 170 } 171 172 // afterForkInChild resets all signals to their default dispositions 173 // and restores the signal mask to its pre-fork state. 174 afterForkInChild() 175 176 // Explicitly unmask all signals to ensure that the tracer can see 177 // them. 178 if errno := unmaskAllSignals(); errno != 0 { 179 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 180 } 181 182 // Set an aggressive BPF filter for the stub and all it's children. See 183 // the description of the BPF program built above. 184 if errno := seccomp.SetFilter(instrs); errno != 0 { 185 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 186 } 187 188 // Enable cpuid-faulting. 189 enableCpuidFault() 190 191 // Call the stub; should not return. 192 stubCall(stubStart, ppid) 193 panic("unreachable") 194 } 195 196 // createStub creates a stub processes as a child of an existing subprocesses. 197 // 198 // Precondition: the runtime OS thread must be locked. 199 func (s *subprocess) createStub() (*thread, error) { 200 // There's no need to lock the runtime thread here, as this can only be 201 // called from a context that is already locked. 202 currentTID := int32(procid.Current()) 203 t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) 204 205 // Pass the expected PPID to the child via R15. 206 regs := t.initRegs 207 initChildProcessPPID(®s, t.tgid) 208 209 // Call fork in a subprocess. 210 // 211 // The new child must set up PDEATHSIG to ensure it dies if this 212 // process dies. Since this process could die at any time, this cannot 213 // be done via instrumentation from here. 214 // 215 // Instead, we create the child untraced, which will do the PDEATHSIG 216 // setup and then SIGSTOP itself for our attach below. 217 // 218 // See above re: SIGKILL. 219 pid, err := t.syscallIgnoreInterrupt( 220 ®s, 221 unix.SYS_CLONE, 222 arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)}, 223 arch.SyscallArgument{Value: 0}, 224 arch.SyscallArgument{Value: 0}, 225 arch.SyscallArgument{Value: 0}, 226 arch.SyscallArgument{Value: 0}, 227 arch.SyscallArgument{Value: 0}) 228 if err != nil { 229 return nil, fmt.Errorf("creating stub process: %v", err) 230 } 231 232 // Wait for child to enter group-stop, so we don't stop its 233 // bootstrapping work with t.attach below. 234 // 235 // We unfortunately don't have a handy part of memory to write the wait 236 // status. If the wait succeeds, we'll assume that it was the SIGSTOP. 237 // If the child actually exited, the attach below will fail. 238 _, err = t.syscallIgnoreInterrupt( 239 &t.initRegs, 240 unix.SYS_WAIT4, 241 arch.SyscallArgument{Value: uintptr(pid)}, 242 arch.SyscallArgument{Value: 0}, 243 arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED}, 244 arch.SyscallArgument{Value: 0}, 245 arch.SyscallArgument{Value: 0}, 246 arch.SyscallArgument{Value: 0}) 247 if err != nil { 248 return nil, fmt.Errorf("waiting on stub process: %v", err) 249 } 250 251 childT := &thread{ 252 tgid: int32(pid), 253 tid: int32(pid), 254 cpu: ^uint32(0), 255 } 256 childT.attach() 257 258 return childT, nil 259 }