github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/ptrace/subprocess_linux.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package ptrace 19 20 import ( 21 "fmt" 22 23 "golang.org/x/sys/unix" 24 "github.com/metacubex/gvisor/pkg/abi/linux" 25 "github.com/metacubex/gvisor/pkg/bpf" 26 "github.com/metacubex/gvisor/pkg/hosttid" 27 "github.com/metacubex/gvisor/pkg/log" 28 "github.com/metacubex/gvisor/pkg/seccomp" 29 "github.com/metacubex/gvisor/pkg/sentry/arch" 30 ) 31 32 const syscallEvent unix.Signal = 0x80 33 34 // createStub creates a fresh stub processes. 35 // 36 // Precondition: the runtime OS thread must be locked. 37 func createStub() (*thread, error) { 38 // The exact interactions of ptrace and seccomp are complex, and 39 // changed in recent kernel versions. Before commit 93e35efb8de45, the 40 // seccomp check is done before the ptrace emulation check. This means 41 // that any calls not matching this list will trigger the seccomp 42 // default action instead of notifying ptrace. 43 // 44 // After commit 93e35efb8de45, the seccomp check is done after the 45 // ptrace emulation check. This simplifies using SYSEMU, since seccomp 46 // will never run for emulation. Seccomp will only run for injected 47 // system calls, and thus we can use RET_KILL as our violation action. 48 var defaultAction linux.BPFAction 49 if probeSeccomp() { 50 log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") 51 defaultAction = linux.SECCOMP_RET_KILL_THREAD 52 } else { 53 // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. 54 log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") 55 defaultAction = linux.SECCOMP_RET_ALLOW 56 } 57 58 // When creating the new child process, we specify SIGKILL as the 59 // signal to deliver when the child exits. We never expect a subprocess 60 // to exit; they are pooled and reused. This is done to ensure that if 61 // a subprocess is OOM-killed, this process (and all other stubs, 62 // transitively) will be killed as well. It's simply not possible to 63 // safely handle a single stub getting killed: the exact state of 64 // execution is unknown and not recoverable. 65 // 66 // In addition, we set the PTRACE_O_TRACEEXIT option to log more 67 // information about a stub process when it receives a fatal signal. 68 return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction) 69 } 70 71 // attachedThread returns a new attached thread. 72 // 73 // Precondition: the runtime OS thread must be locked. 74 func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { 75 // Create a BPF program that allows only the system calls needed by the 76 // stub and all its children. This is used to create child stubs 77 // (below), so we must include the ability to fork, but otherwise lock 78 // down available calls only to what is needed. 79 rules := []seccomp.RuleSet{} 80 if defaultAction != linux.SECCOMP_RET_ALLOW { 81 rules = append(rules, seccomp.RuleSet{ 82 Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ 83 unix.SYS_CLONE: seccomp.Or{ 84 // Allow creation of new subprocesses (used by the master). 85 seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)}, 86 // Allow creation of new threads within a single address space (used by address spaces). 87 seccomp.PerArg{ 88 seccomp.EqualTo( 89 unix.CLONE_FILES | 90 unix.CLONE_FS | 91 unix.CLONE_SIGHAND | 92 unix.CLONE_THREAD | 93 unix.CLONE_PTRACE | 94 unix.CLONE_VM)}, 95 }, 96 97 // For the initial process creation. 98 unix.SYS_WAIT4: seccomp.MatchAll{}, 99 unix.SYS_EXIT: seccomp.MatchAll{}, 100 101 // For the stub prctl dance (all). 102 unix.SYS_PRCTL: seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, 103 unix.SYS_GETPPID: seccomp.MatchAll{}, 104 105 // For the stub to stop itself (all). 106 unix.SYS_GETPID: seccomp.MatchAll{}, 107 unix.SYS_KILL: seccomp.PerArg{seccomp.AnyValue{}, seccomp.EqualTo(unix.SIGSTOP)}, 108 109 // Injected to support the address space operations. 110 unix.SYS_MMAP: seccomp.MatchAll{}, 111 unix.SYS_MUNMAP: seccomp.MatchAll{}, 112 }), 113 Action: linux.SECCOMP_RET_ALLOW, 114 }) 115 } 116 rules = appendArchSeccompRules(rules, defaultAction) 117 instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ 118 DefaultAction: defaultAction, 119 BadArchAction: defaultAction, 120 }) 121 if err != nil { 122 return nil, err 123 } 124 125 return forkStub(flags, instrs) 126 } 127 128 // In the child, this function must not acquire any locks, because they might 129 // have been locked at the time of the fork. This means no rescheduling, no 130 // malloc calls, and no new stack segments. For the same reason compiler does 131 // not race instrument it. 132 // 133 //go:norace 134 func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) { 135 // Declare all variables up front in order to ensure that there's no 136 // need for allocations between beforeFork & afterFork. 137 var ( 138 pid uintptr 139 ppid uintptr 140 errno unix.Errno 141 ) 142 143 // Remember the current ppid for the pdeathsig race. 144 ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) 145 146 // Among other things, beforeFork masks all signals. 147 beforeFork() 148 149 // Do the clone. 150 pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) 151 if errno != 0 { 152 afterFork() 153 return nil, errno 154 } 155 156 // Is this the parent? 157 if pid != 0 { 158 // Among other things, restore signal mask. 159 afterFork() 160 161 // Initialize the first thread. 162 t := &thread{ 163 tgid: int32(pid), 164 tid: int32(pid), 165 cpu: ^uint32(0), 166 } 167 if sig := t.wait(stopped); sig != unix.SIGSTOP { 168 return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) 169 } 170 t.attach() 171 t.grabInitRegs() 172 173 return t, nil 174 } 175 176 // Move the stub to a new session (and thus a new process group). This 177 // prevents the stub from getting PTY job control signals intended only 178 // for the sentry process. We must call this before restoring signal 179 // mask. 180 if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { 181 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 182 } 183 184 // afterForkInChild resets all signals to their default dispositions 185 // and restores the signal mask to its pre-fork state. 186 afterForkInChild() 187 188 // Explicitly unmask all signals to ensure that the tracer can see 189 // them. 190 if errno := unmaskAllSignals(); errno != 0 { 191 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 192 } 193 194 // Set an aggressive BPF filter for the stub and all it's children. See 195 // the description of the BPF program built above. 196 if errno := seccomp.SetFilterInChild(instrs); errno != 0 { 197 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 198 } 199 200 // Enable cpuid-faulting. 201 enableCpuidFault() 202 203 // Call the stub; should not return. 204 stubCall(stubStart, ppid) 205 panic("unreachable") 206 } 207 208 // createStub creates a stub processes as a child of an existing subprocesses. 209 // 210 // Precondition: the runtime OS thread must be locked. 211 func (s *subprocess) createStub() (*thread, error) { 212 // There's no need to lock the runtime thread here, as this can only be 213 // called from a context that is already locked. 214 currentTID := int32(hosttid.Current()) 215 t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) 216 217 // Pass the expected PPID to the child via R15. 218 regs := t.initRegs 219 initChildProcessPPID(®s, t.tgid) 220 221 // Call fork in a subprocess. 222 // 223 // The new child must set up PDEATHSIG to ensure it dies if this 224 // process dies. Since this process could die at any time, this cannot 225 // be done via instrumentation from here. 226 // 227 // Instead, we create the child untraced, which will do the PDEATHSIG 228 // setup and then SIGSTOP itself for our attach below. 229 // 230 // See above re: SIGKILL. 231 pid, err := t.syscallIgnoreInterrupt( 232 ®s, 233 unix.SYS_CLONE, 234 arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)}, 235 arch.SyscallArgument{Value: 0}, 236 arch.SyscallArgument{Value: 0}, 237 arch.SyscallArgument{Value: 0}, 238 arch.SyscallArgument{Value: 0}, 239 arch.SyscallArgument{Value: 0}) 240 if err != nil { 241 return nil, fmt.Errorf("creating stub process: %v", err) 242 } 243 244 // Wait for child to enter group-stop, so we don't stop its 245 // bootstrapping work with t.attach below. 246 // 247 // We unfortunately don't have a handy part of memory to write the wait 248 // status. If the wait succeeds, we'll assume that it was the SIGSTOP. 249 // If the child actually exited, the attach below will fail. 250 _, err = t.syscallIgnoreInterrupt( 251 &t.initRegs, 252 unix.SYS_WAIT4, 253 arch.SyscallArgument{Value: uintptr(pid)}, 254 arch.SyscallArgument{Value: 0}, 255 arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED}, 256 arch.SyscallArgument{Value: 0}, 257 arch.SyscallArgument{Value: 0}, 258 arch.SyscallArgument{Value: 0}) 259 if err != nil { 260 return nil, fmt.Errorf("waiting on stub process: %v", err) 261 } 262 263 childT := &thread{ 264 tgid: int32(pid), 265 tid: int32(pid), 266 cpu: ^uint32(0), 267 } 268 childT.attach() 269 270 return childT, nil 271 }