github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/subprocess_linux.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package systrap 19 20 import ( 21 "fmt" 22 23 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 24 "github.com/MerlinKodo/gvisor/pkg/seccomp" 25 "github.com/MerlinKodo/gvisor/pkg/sentry/arch" 26 "golang.org/x/sys/unix" 27 ) 28 29 const syscallEvent unix.Signal = 0x80 30 31 // createStub creates a fresh stub processes. 32 // 33 // Precondition: the runtime OS thread must be locked. 34 func createStub() (*thread, error) { 35 // When creating the new child process, we specify SIGKILL as the 36 // signal to deliver when the child exits. We never expect a subprocess 37 // to exit; they are pooled and reused. This is done to ensure that if 38 // a subprocess is OOM-killed, this process (and all other stubs, 39 // transitively) will be killed as well. It's simply not possible to 40 // safely handle a single stub getting killed: the exact state of 41 // execution is unknown and not recoverable. 42 return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, linux.SECCOMP_RET_TRAP) 43 } 44 45 // attachedThread returns a new attached thread. 46 // 47 // Precondition: the runtime OS thread must be locked. 
func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
	// Create a BPF program that allows only the system calls needed by the
	// stub and all its children. This is used to create child stubs
	// (below), so we must include the ability to fork, but otherwise lock
	// down available calls only to what is needed.
	//
	// When defaultAction is SECCOMP_RET_ALLOW no rules are added at all and
	// the program is built from an empty rule list.
	rules := []seccomp.RuleSet{}
	if defaultAction != linux.SECCOMP_RET_ALLOW {
		ruleSet := seccomp.RuleSet{
			Rules: seccomp.SyscallRules{
				unix.SYS_CLONE: []seccomp.Rule{
					// Allow creation of new subprocesses (used by the master).
					{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)},
					// Allow creation of new sysmsg thread.
					{seccomp.EqualTo(
						unix.CLONE_FILES |
							unix.CLONE_FS |
							unix.CLONE_VM |
							unix.CLONE_PTRACE)},
					// Allow creation of new threads within a single address space (used by address spaces).
					{seccomp.EqualTo(
						unix.CLONE_FILES |
							unix.CLONE_FS |
							unix.CLONE_SIGHAND |
							unix.CLONE_THREAD |
							unix.CLONE_PTRACE |
							unix.CLONE_VM)},
				},

				// For the initial process creation.
				unix.SYS_WAIT4: {},
				unix.SYS_EXIT:  {},

				// For the stub prctl dance (all).
				unix.SYS_PRCTL: []seccomp.Rule{
					{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)},
					{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)},
				},
				unix.SYS_GETPPID: {},

				// For the stub to stop itself (all).
				unix.SYS_GETPID: {},
				unix.SYS_KILL: []seccomp.Rule{
					{seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)},
				},

				// Injected to support the address space operations.
				unix.SYS_MMAP:   {},
				unix.SYS_MUNMAP: {},

				// For sysmsg threads. Look at sysmsg/sighandler.c for more details.
				unix.SYS_RT_SIGRETURN: {},
				unix.SYS_SCHED_YIELD:  {},
				unix.SYS_FUTEX: {
					seccomp.Rule{
						seccomp.MatchAny{},
						seccomp.EqualTo(linux.FUTEX_WAIT),
						seccomp.MatchAny{},
						seccomp.MatchAny{},
					},
					seccomp.Rule{
						seccomp.MatchAny{},
						seccomp.EqualTo(linux.FUTEX_WAKE),
						seccomp.MatchAny{},
						seccomp.MatchAny{},
					},
				},
				unix.SYS_SIGALTSTACK: {},
				unix.SYS_TKILL: {
					{seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)},
				},
				unix.SYS_GETTID: {},
				seccomp.SYS_SECCOMP: {
					{seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), seccomp.EqualTo(0), seccomp.MatchAny{}},
				},
			},
			// Syscalls matching the rules above are allowed; everything
			// else falls through to defaultAction.
			Action: linux.SECCOMP_RET_ALLOW,
		}
		rules = append(rules, ruleSet)
		// Let the architecture-specific code extend the rule list.
		rules = appendArchSeccompRules(rules)
	}
	// defaultAction is passed for both action arguments of BuildProgram.
	instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction)
	if err != nil {
		return nil, err
	}

	return forkStub(flags, instrs)
}

// forkStub clones a new stub process using the given clone flags and, in the
// child, installs the seccomp program instrs before jumping into the stub.
//
// In the parent it waits for the child to stop with SIGSTOP, attaches to it,
// captures its initial registers, issues a munmap through the new thread for
// the range from stubROMapEnd up to maximumUserAddress, and returns the
// resulting thread. If the clone fails, the errno is returned as the error.
//
// In the child, this function must not acquire any locks, because they might
// have been locked at the time of the fork. This means no rescheduling, no
// malloc calls, and no new stack segments. For the same reason the compiler
// does not race instrument it. Any failure in the child terminates it via
// exit with the errno as the exit status.
//
//go:norace
func forkStub(flags uintptr, instrs []linux.BPFInstruction) (*thread, error) {
	// Declare all variables up front in order to ensure that there's no
	// need for allocations between beforeFork & afterFork.
	var (
		pid   uintptr
		ppid  uintptr
		errno unix.Errno
	)

	// Remember the current ppid for the pdeathsig race. This is the PID of
	// the calling process, read before the clone; it is handed to the stub
	// entry point at the end of the child path.
	ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0)

	// Among other things, beforeFork masks all signals.
	beforeFork()

	// Do the clone.
	pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0)
	if errno != 0 {
		afterFork()
		return nil, errno
	}

	// Is this the parent?
	if pid != 0 {
		// Among other things, restore signal mask.
		afterFork()

		// Initialize the first thread.
		t := &thread{
			tgid: int32(pid),
			tid:  int32(pid),
		}
		// The child is expected to stop with SIGSTOP; any other signal is
		// reported as an error.
		if sig := t.wait(stopped); sig != unix.SIGSTOP {
			return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
		}
		t.attach()
		t.grabInitRegs()
		// Unmap everything from stubROMapEnd up to maximumUserAddress.
		_, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP,
			arch.SyscallArgument{Value: stubROMapEnd},
			arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd})
		if err != nil {
			return nil, err
		}

		return t, nil
	}

	// Everything below runs in the child only.

	// Move the stub to a new session (and thus a new process group). This
	// prevents the stub from getting PTY job control signals intended only
	// for the sentry process. We must call this before restoring signal
	// mask.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// afterForkInChild resets all signals to their default dispositions
	// and restores the signal mask to its pre-fork state.
	afterForkInChild()

	if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Explicitly unmask all signals to ensure that the tracer can see
	// them.
	if errno := unmaskAllSignals(); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Set an aggressive BPF filter for the stub and all its children. See
	// the description of the BPF program built above.
	if errno := seccomp.SetFilterInChild(instrs); errno != 0 {
		unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0)
	}

	// Enable cpuid-faulting.
	enableCpuidFault()

	// Call the stub; should not return.
	stubCall(stubInitProcess, ppid)
	panic("unreachable")
}

// createStub creates a stub process as a child of an existing subprocess.
226 // 227 // Precondition: the runtime OS thread must be locked. 228 func (t *thread) createStub() (*thread, error) { 229 // There's no need to lock the runtime thread here, as this can only be 230 // called from a context that is already locked. 231 232 // Pass the expected PPID to the child via R15. 233 regs := t.initRegs 234 initChildProcessPPID(®s, t.tgid) 235 236 // Call fork in a subprocess. 237 // 238 // The new child must set up PDEATHSIG to ensure it dies if this 239 // process dies. Since this process could die at any time, this cannot 240 // be done via instrumentation from here. 241 // 242 // Instead, we create the child untraced, which will do the PDEATHSIG 243 // setup and then SIGSTOP itself for our attach below. 244 // 245 // See above re: SIGKILL. 246 pid, err := t.syscallIgnoreInterrupt( 247 ®s, 248 unix.SYS_CLONE, 249 arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)}, 250 arch.SyscallArgument{Value: 0}, 251 arch.SyscallArgument{Value: 0}, 252 arch.SyscallArgument{Value: 0}, 253 arch.SyscallArgument{Value: 0}, 254 arch.SyscallArgument{Value: 0}) 255 if err != nil { 256 return nil, fmt.Errorf("creating stub process: %v", err) 257 } 258 259 // Wait for child to enter group-stop, so we don't stop its 260 // bootstrapping work with t.attach below. 261 // 262 // We unfortunately don't have a handy part of memory to write the wait 263 // status. If the wait succeeds, we'll assume that it was the SIGSTOP. 264 // If the child actually exited, the attach below will fail. 
265 _, err = t.syscallIgnoreInterrupt( 266 &t.initRegs, 267 unix.SYS_WAIT4, 268 arch.SyscallArgument{Value: uintptr(pid)}, 269 arch.SyscallArgument{Value: 0}, 270 arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED}, 271 arch.SyscallArgument{Value: 0}, 272 arch.SyscallArgument{Value: 0}, 273 arch.SyscallArgument{Value: 0}) 274 if err != nil { 275 return nil, fmt.Errorf("waiting on stub process: %v", err) 276 } 277 278 childT := &thread{ 279 tgid: int32(pid), 280 tid: int32(pid), 281 } 282 283 return childT, nil 284 } 285 286 func (s *subprocess) createStub() (*thread, error) { 287 req := requestStub{} 288 req.done = make(chan *thread, 1) 289 s.requests <- req 290 291 childT := <-req.done 292 childT.attach() 293 childT.grabInitRegs() 294 295 return childT, nil 296 }