github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/platform/systrap/subprocess_linux.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package systrap 19 20 import ( 21 "fmt" 22 23 "golang.org/x/sys/unix" 24 "github.com/metacubex/gvisor/pkg/abi/linux" 25 "github.com/metacubex/gvisor/pkg/bpf" 26 "github.com/metacubex/gvisor/pkg/seccomp" 27 "github.com/metacubex/gvisor/pkg/sentry/arch" 28 ) 29 30 const syscallEvent unix.Signal = 0x80 31 32 // createStub creates a fresh stub processes. 33 // 34 // Precondition: the runtime OS thread must be locked. 35 func createStub() (*thread, error) { 36 // When creating the new child process, we specify SIGKILL as the 37 // signal to deliver when the child exits. We never expect a subprocess 38 // to exit; they are pooled and reused. This is done to ensure that if 39 // a subprocess is OOM-killed, this process (and all other stubs, 40 // transitively) will be killed as well. It's simply not possible to 41 // safely handle a single stub getting killed: the exact state of 42 // execution is unknown and not recoverable. 43 return attachedThread(unix.CLONE_FILES|uintptr(unix.SIGCHLD), linux.SECCOMP_RET_TRAP) 44 } 45 46 // attachedThread returns a new attached thread. 47 // 48 // Precondition: the runtime OS thread must be locked. 49 func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { 50 // Create a BPF program that allows only the system calls needed by the 51 // stub and all its children. This is used to create child stubs 52 // (below), so we must include the ability to fork, but otherwise lock 53 // down available calls only to what is needed. 54 rules := []seccomp.RuleSet{} 55 if defaultAction != linux.SECCOMP_RET_ALLOW { 56 ruleSet := seccomp.RuleSet{ 57 Rules: seccomp.MakeSyscallRules(map[uintptr]seccomp.SyscallRule{ 58 unix.SYS_CLONE: seccomp.Or{ 59 // Allow creation of new subprocesses (used by the master). 60 seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.CLONE_PARENT | unix.SIGCHLD)}, 61 seccomp.PerArg{seccomp.EqualTo(unix.CLONE_FILES | unix.SIGCHLD)}, 62 // Allow creation of new sysmsg thread. 63 seccomp.PerArg{seccomp.EqualTo( 64 unix.CLONE_FILES | 65 unix.CLONE_FS | 66 unix.CLONE_VM | 67 unix.CLONE_PTRACE | 68 linux.SIGKILL)}, 69 // Allow creation of new threads within a single address space (used by address spaces). 70 seccomp.PerArg{seccomp.EqualTo( 71 unix.CLONE_FILES | 72 unix.CLONE_FS | 73 unix.CLONE_SIGHAND | 74 unix.CLONE_THREAD | 75 unix.CLONE_PTRACE | 76 unix.CLONE_VM)}, 77 }, 78 79 // For the initial process creation. 80 unix.SYS_WAIT4: seccomp.MatchAll{}, 81 unix.SYS_EXIT: seccomp.MatchAll{}, 82 83 // For the stub prctl dance (all). 84 unix.SYS_PRCTL: seccomp.Or{ 85 seccomp.PerArg{seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, 86 seccomp.PerArg{seccomp.EqualTo(linux.PR_SET_NO_NEW_PRIVS), seccomp.EqualTo(1)}, 87 }, 88 unix.SYS_GETPPID: seccomp.MatchAll{}, 89 90 // For the stub to stop itself (all). 91 unix.SYS_GETPID: seccomp.MatchAll{}, 92 unix.SYS_KILL: seccomp.PerArg{ 93 seccomp.AnyValue{}, 94 seccomp.EqualTo(unix.SIGSTOP), 95 }, 96 97 // Injected to support the address space operations. 98 unix.SYS_MMAP: seccomp.MatchAll{}, 99 unix.SYS_MUNMAP: seccomp.MatchAll{}, 100 101 // For sysmsg threads. Look at sysmsg/sighandler.c for more details. 102 unix.SYS_RT_SIGRETURN: seccomp.MatchAll{}, 103 unix.SYS_SCHED_YIELD: seccomp.MatchAll{}, 104 unix.SYS_FUTEX: seccomp.Or{ 105 seccomp.PerArg{ 106 seccomp.AnyValue{}, 107 seccomp.EqualTo(linux.FUTEX_WAIT), 108 seccomp.AnyValue{}, 109 seccomp.AnyValue{}, 110 }, 111 seccomp.PerArg{ 112 seccomp.AnyValue{}, 113 seccomp.EqualTo(linux.FUTEX_WAKE), 114 seccomp.AnyValue{}, 115 seccomp.AnyValue{}, 116 }, 117 }, 118 unix.SYS_SIGALTSTACK: seccomp.MatchAll{}, 119 unix.SYS_TKILL: seccomp.PerArg{ 120 seccomp.AnyValue{}, 121 seccomp.EqualTo(unix.SIGSTOP), 122 }, 123 unix.SYS_GETTID: seccomp.MatchAll{}, 124 seccomp.SYS_SECCOMP: seccomp.PerArg{ 125 seccomp.EqualTo(linux.SECCOMP_SET_MODE_FILTER), 126 seccomp.EqualTo(0), 127 seccomp.AnyValue{}, 128 }, 129 }), 130 Action: linux.SECCOMP_RET_ALLOW, 131 } 132 rules = append(rules, ruleSet) 133 rules = appendArchSeccompRules(rules) 134 } 135 instrs, _, err := seccomp.BuildProgram(rules, seccomp.ProgramOptions{ 136 DefaultAction: defaultAction, 137 BadArchAction: defaultAction, 138 }) 139 if err != nil { 140 return nil, err 141 } 142 143 return forkStub(flags, instrs) 144 } 145 146 // In the child, this function must not acquire any locks, because they might 147 // have been locked at the time of the fork. This means no rescheduling, no 148 // malloc calls, and no new stack segments. For the same reason compiler does 149 // not race instrument it. 150 // 151 //go:norace 152 func forkStub(flags uintptr, instrs []bpf.Instruction) (*thread, error) { 153 // Declare all variables up front in order to ensure that there's no 154 // need for allocations between beforeFork & afterFork. 155 var ( 156 pid uintptr 157 ppid uintptr 158 errno unix.Errno 159 ) 160 161 // Remember the current ppid for the pdeathsig race. 162 ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) 163 164 // Among other things, beforeFork masks all signals. 165 beforeFork() 166 167 // Do the clone. 168 pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) 169 if errno != 0 { 170 afterFork() 171 return nil, errno 172 } 173 174 // Is this the parent? 175 if pid != 0 { 176 // Among other things, restore signal mask. 177 afterFork() 178 179 // Initialize the first thread. 180 t := &thread{ 181 tgid: int32(pid), 182 tid: int32(pid), 183 } 184 if sig := t.wait(stopped); sig != unix.SIGSTOP { 185 return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) 186 } 187 if err := t.attach(); err != nil { 188 return nil, err 189 } 190 t.grabInitRegs() 191 _, err := t.syscallIgnoreInterrupt(&t.initRegs, unix.SYS_MUNMAP, 192 arch.SyscallArgument{Value: stubROMapEnd}, 193 arch.SyscallArgument{Value: maximumUserAddress - stubROMapEnd}) 194 if err != nil { 195 return nil, err 196 } 197 198 return t, nil 199 } 200 201 // Move the stub to a new session (and thus a new process group). This 202 // prevents the stub from getting PTY job control signals intended only 203 // for the sentry process. We must call this before restoring signal 204 // mask. 205 if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { 206 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 207 } 208 209 // afterForkInChild resets all signals to their default dispositions 210 // and restores the signal mask to its pre-fork state. 211 afterForkInChild() 212 213 if errno := sysmsgSigactions(stubSysmsgStart); errno != 0 { 214 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 215 } 216 217 // Explicitly unmask all signals to ensure that the tracer can see 218 // them. 219 if errno := unmaskAllSignals(); errno != 0 { 220 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 221 } 222 223 // Set an aggressive BPF filter for the stub and all it's children. See 224 // the description of the BPF program built above. 225 if errno := seccomp.SetFilterInChild(instrs); errno != 0 { 226 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 227 } 228 229 // Enable cpuid-faulting. 230 enableCpuidFault() 231 232 // Call the stub; should not return. 233 stubCall(stubInitProcess, ppid) 234 panic("unreachable") 235 } 236 237 // createStub creates a stub processes as a child of an existing subprocesses. 238 // 239 // Precondition: the runtime OS thread must be locked. 240 func (t *thread) createStub() (*thread, error) { 241 // There's no need to lock the runtime thread here, as this can only be 242 // called from a context that is already locked. 243 244 // Pass the expected PPID to the child via R15. 245 regs := t.initRegs 246 initChildProcessPPID(®s, t.tgid) 247 248 // Call fork in a subprocess. 249 // 250 // The new child must set up PDEATHSIG to ensure it dies if this 251 // process dies. Since this process could die at any time, this cannot 252 // be done via instrumentation from here. 253 // 254 // Instead, we create the child untraced, which will do the PDEATHSIG 255 // setup and then SIGSTOP itself for our attach below. 256 // 257 // See above re: SIGKILL. 258 pid, err := t.syscallIgnoreInterrupt( 259 ®s, 260 unix.SYS_CLONE, 261 arch.SyscallArgument{Value: uintptr(unix.CLONE_FILES | unix.CLONE_PARENT | uintptr(unix.SIGCHLD))}, 262 arch.SyscallArgument{Value: 0}, 263 arch.SyscallArgument{Value: 0}, 264 arch.SyscallArgument{Value: 0}, 265 arch.SyscallArgument{Value: 0}, 266 arch.SyscallArgument{Value: 0}) 267 if err != nil { 268 return nil, fmt.Errorf("creating stub process: %v", err) 269 } 270 271 // Wait for child to enter group-stop, so we don't stop its 272 // bootstrapping work with t.attach below. 273 // 274 // We unfortunately don't have a handy part of memory to write the wait 275 // status. If the wait succeeds, we'll assume that it was the SIGSTOP. 276 // If the child actually exited, the attach below will fail. 277 _, err = unix.Wait4(int(pid), nil, unix.WALL|unix.WUNTRACED, nil) 278 if err != nil { 279 return nil, fmt.Errorf("waiting on stub process: %v", err) 280 } 281 282 childT := &thread{ 283 tgid: int32(pid), 284 tid: int32(pid), 285 } 286 287 return childT, nil 288 } 289 290 func (s *subprocess) createStub() (*thread, error) { 291 req := requestStub{} 292 req.done = make(chan *thread, 1) 293 s.requests <- req 294 295 childT := <-req.done 296 if childT == nil { 297 return nil, fmt.Errorf("createStub: failed to get clone") 298 } 299 if err := childT.attach(); err != nil { 300 return nil, err 301 } 302 childT.grabInitRegs() 303 304 return childT, nil 305 }