gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/subprocess_linux.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package systrap 19 20 import ( 21 "fmt" 22 23 "golang.org/x/sys/unix" 24 "gvisor.dev/gvisor/pkg/abi/linux" 25 "gvisor.dev/gvisor/pkg/bpf" 26 "gvisor.dev/gvisor/pkg/seccomp" 27 "gvisor.dev/gvisor/pkg/sentry/arch" 28 ) 29 30 const syscallEvent unix.Signal = 0x80 31 32 // createStub creates a fresh stub processes. 33 // 34 // Precondition: the runtime OS thread must be locked. 35 func createStub() (*thread, error) { 36 // When creating the new child process, we specify SIGKILL as the 37 // signal to deliver when the child exits. We never expect a subprocess 38 // to exit; they are pooled and reused. This is done to ensure that if 39 // a subprocess is OOM-killed, this process (and all other stubs, 40 // transitively) will be killed as well. It's simply not possible to 41 // safely handle a single stub getting killed: the exact state of 42 // execution is unknown and not recoverable. 43 return attachedThread(unix.CLONE_FILES|uintptr(unix.SIGCHLD), linux.SECCOMP_RET_TRAP) 44 } 45 46 // attachedThread returns a new attached thread. 47 // 48 // Precondition: the runtime OS thread must be locked. 49 func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { 50 // Create a BPF program that allows only the system calls needed by the 51 // stub and all its children. This is used to create child stubs 52 // (below), so we must include the ability to fork, but otherwise lock 53 // down available calls only to what is needed. 54 rules := []seccomp.RuleSet{} 55 if defaultAction != linux.SECCOMP_RET_ALLOW { 56 ruleSet := seccomp.RuleSet{ 57 Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ 58 unix.SYS_CLONE: seccomp.Or{ 59 // Allow creation of new subprocesses (used by the master). 60 seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.CLONE_PARENT | unix.SIGCHLD)}, 61 seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGCHLD)}, 62 // Allow creation of new sysmsg thread. 63 seccomp.PerArg{seccomp.EqualTo( 64 unix.CLONE_FILES | 65 unix.CLONE_FS | 66 unix.CLONE_VM | 67 unix.CLONE_PTRACE | 68 linux.SIGKILL)}, 69 // Allow creation of new threads within a single address space (used by address spaces). 70 seccomp.PerArg{seccomp.EqualTo( 71 unix.CLONE_FILES | 72 unix.CLONE_FS | 73 unix.CLONE_SIGHAND | 74 unix.CLONE_THREAD | 75 unix.CLONE_PTRACE | 76 unix.CLONE_VM)}, 77 }, 78 79 // For the initial process creation. 80 unix.SYS_WAIT4: seccomp.MatchAll{}, 81 unix.SYS_EXIT: seccomp.MatchAll{}, 82 83 // For the stub prctl dance (all). 84 unix.SYS_PRCTL: seccomp.Or{ 85 seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, 86 seccomp.PerArg{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)}, 87 }, 88 unix.SYS_GETPPID: seccomp.MatchAll{}, 89 90 // For the stub to stop itself (all). 91 unix.SYS_GETPID: seccomp.MatchAll{}, 92 unix.SYS_KILL: seccomp.PerArg{ 93 seccomp.AnyValue{}, 94 seccomp.EqualTo(unix.SIGSTOP), 95 }, 96 97 // Injected to support the address space operations. 98 unix.SYS_MMAP: seccomp.MatchAll{}, 99 unix.SYS_MUNMAP: seccomp.MatchAll{}, 100 101 // For sysmsg threads. Look at sysmsg/sighandler.c for more details. 102 unix.SYS_RT_SIGRETURN: seccomp.MatchAll{}, 103 unix.SYS_SCHED_YIELD: seccomp.MatchAll{}, 104 unix.SYS_FUTEX: seccomp.Or{ 105 seccomp.PerArg{ 106 seccomp.AnyValue{}, 107 seccomp.EqualTo(linux.FUTEX_WAIT), 108 seccomp.AnyValue{}, 109 seccomp.AnyValue{}, 110 }, 111 seccomp.PerArg{ 112 seccomp.AnyValue{}, 113 seccomp.EqualTo(linux.FUTEX_WAKE), 114 seccomp.AnyValue{}, 115 seccomp.AnyValue{}, 116 }, 117 }, 118 unix.SYS_SIGALTSTACK: seccomp.MatchAll{}, 119 unix.SYS_TKILL: seccomp.PerArg{ 120 seccomp.AnyValue{}, 121 seccomp.EqualTo(unix.SIGSTOP), 122 }, 123 unix.SYS_GETTID: seccomp.MatchAll{}, 124 unix.SYS_EXIT_GROUP: seccomp.MatchAll{}, 125 seccomp.SYS_SECCOMP: seccomp.Or{ 126 seccomp.PerArg{ 127 seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), 128 seccomp.EqualTo(0), 129 seccomp.AnyValue{}, 130 }, 131 seccomp.PerArg{ 132 seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), 133 seccomp.EqualTo(linux.SECCOMP_FILTER_FLAG_NEW_LISTENER), 134 seccomp.AnyValue{}, 135 }, 136 }, 137 }), 138 Action: linux.SECCOMP_RET_ALLOW, 139 } 140 rules = append(rules, ruleSet) 141 rules = appendArchSeccompRules(rules) 142 } 143 instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ 144 DefaultAction: defaultAction, 145 BadArchAction: defaultAction, 146 }) 147 if err != nil { 148 return nil, err 149 } 150 151 return forkStub(flags, instrs) 152 } 153 154 // In the child, this function must not acquire any locks, because they might 155 // have been locked at the time of the fork. This means no rescheduling, no 156 // malloc calls, and no new stack segments. For the same reason compiler does 157 // not race instrument it. 158 // 159 //go:norace 160 func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) { 161 // Declare all variables up front in order to ensure that there's no 162 // need for allocations between beforeFork & afterFork. 163 var ( 164 pid uintptr 165 ppid uintptr 166 errno unix.Errno 167 ) 168 169 // Remember the current ppid for the pdeathsig race. 170 ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) 171 172 // Among other things, beforeFork masks all signals. 173 beforeFork() 174 175 // Do the clone. 176 pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) 177 if errno != 0 { 178 afterFork() 179 return nil, errno 180 } 181 182 // Is this the parent? 183 if pid != 0 { 184 // Among other things, restore signal mask. 185 afterFork() 186 187 // Initialize the first thread. 188 t := &thread{ 189 tgid: int32(pid), 190 tid: int32(pid), 191 } 192 if sig := t.wait(stopped); sig != unix.SIGSTOP { 193 return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) 194 } 195 if err := t.attach(); err != nil { 196 return nil, err 197 } 198 t.grabInitRegs() 199 _, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP, 200 arch.SyscallArgument{Value: stubROMapEnd}, 201 arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd}) 202 if err != nil { 203 return nil, err 204 } 205 206 return t, nil 207 } 208 209 // Move the stub to a new session (and thus a new process group). This 210 // prevents the stub from getting PTY job control signals intended only 211 // for the sentry process. We must call this before restoring signal 212 // mask. 213 if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { 214 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 215 } 216 217 // afterForkInChild resets all signals to their default dispositions 218 // and restores the signal mask to its pre-fork state. 219 afterForkInChild() 220 221 if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 { 222 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 223 } 224 225 // Explicitly unmask all signals to ensure that the tracer can see 226 // them. 227 if errno := unmaskAllSignals(); errno != 0 { 228 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 229 } 230 231 // Set an aggressive BPF filter for the stub and all it's children. See 232 // the description of the BPF program built above. 233 if errno := seccomp.SetFilterInChild(instrs); errno != 0 { 234 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 235 } 236 237 // Enable cpuid-faulting. 238 enableCpuidFault() 239 240 // Call the stub; should not return. 241 stubCall(stubInitProcess, ppid) 242 panic("unreachable") 243 } 244 245 // createStub creates a stub processes as a child of an existing subprocesses. 246 // 247 // Precondition: the runtime OS thread must be locked. 248 func (t *thread) createStub() (*thread, error) { 249 // There's no need to lock the runtime thread here, as this can only be 250 // called from a context that is already locked. 251 252 // Pass the expected PPID to the child via R15. 253 regs := t.initRegs 254 initChildProcessPPID(®s, t.tgid) 255 256 // Call fork in a subprocess. 257 // 258 // The new child must set up PDEATHSIG to ensure it dies if this 259 // process dies. Since this process could die at any time, this cannot 260 // be done via instrumentation from here. 261 // 262 // Instead, we create the child untraced, which will do the PDEATHSIG 263 // setup and then SIGSTOP itself for our attach below. 264 // 265 // See above re: SIGKILL. 266 pid, err := t.syscallIgnoreInterrupt( 267 ®s, 268 unix.SYS_CLONE, 269 arch.SyscallArgument{Value: uintptr(unix.CLONE_FILES | unix.CLONE_PARENT | uintptr(unix.SIGCHLD))}, 270 arch.SyscallArgument{Value: 0}, 271 arch.SyscallArgument{Value: 0}, 272 arch.SyscallArgument{Value: 0}, 273 arch.SyscallArgument{Value: 0}, 274 arch.SyscallArgument{Value: 0}) 275 if err != nil { 276 return nil, fmt.Errorf("creating stub process: %v", err) 277 } 278 279 // Wait for child to enter group-stop, so we don't stop its 280 // bootstrapping work with t.attach below. 281 // 282 // We unfortunately don't have a handy part of memory to write the wait 283 // status. If the wait succeeds, we'll assume that it was the SIGSTOP. 284 // If the child actually exited, the attach below will fail. 285 _, err = unix.Wait4(int(pid), nil, unix.WALL|unix.WUNTRACED, nil) 286 if err != nil { 287 return nil, fmt.Errorf("waiting on stub process: %v", err) 288 } 289 290 childT := &thread{ 291 tgid: int32(pid), 292 tid: int32(pid), 293 } 294 295 return childT, nil 296 } 297 298 func (s *subprocess) createStub() (*thread, error) { 299 req := requestStub{} 300 req.done = make(chan *thread, 1) 301 s.requests <- req 302 303 childT := <-req.done 304 if childT == nil { 305 return nil, fmt.Errorf("createStub: failed to get clone") 306 } 307 if err := childT.attach(); err != nil { 308 return nil, err 309 } 310 childT.grabInitRegs() 311 312 return childT, nil 313 }