github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/ptrace/subprocess_linux.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package ptrace 19 20 import ( 21 "fmt" 22 23 "golang.org/x/sys/unix" 24 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 25 "github.com/nicocha30/gvisor-ligolo/pkg/hosttid" 26 "github.com/nicocha30/gvisor-ligolo/pkg/log" 27 "github.com/nicocha30/gvisor-ligolo/pkg/seccomp" 28 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch" 29 ) 30 31 const syscallEvent unix.Signal = 0x80 32 33 // createStub creates a fresh stub processes. 34 // 35 // Precondition: the runtime OS thread must be locked. 36 func createStub() (*thread, error) { 37 // The exact interactions of ptrace and seccomp are complex, and 38 // changed in recent kernel versions. Before commit 93e35efb8de45, the 39 // seccomp check is done before the ptrace emulation check. This means 40 // that any calls not matching this list will trigger the seccomp 41 // default action instead of notifying ptrace. 42 // 43 // After commit 93e35efb8de45, the seccomp check is done after the 44 // ptrace emulation check. This simplifies using SYSEMU, since seccomp 45 // will never run for emulation. Seccomp will only run for injected 46 // system calls, and thus we can use RET_KILL as our violation action. 47 var defaultAction linux.BPFAction 48 if probeSeccomp() { 49 log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)") 50 defaultAction = linux.SECCOMP_RET_KILL_THREAD 51 } else { 52 // We must rely on SYSEMU behavior; tracing with SYSEMU is broken. 53 log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)") 54 defaultAction = linux.SECCOMP_RET_ALLOW 55 } 56 57 // When creating the new child process, we specify SIGKILL as the 58 // signal to deliver when the child exits. We never expect a subprocess 59 // to exit; they are pooled and reused. This is done to ensure that if 60 // a subprocess is OOM-killed, this process (and all other stubs, 61 // transitively) will be killed as well. It's simply not possible to 62 // safely handle a single stub getting killed: the exact state of 63 // execution is unknown and not recoverable. 64 // 65 // In addition, we set the PTRACE_O_TRACEEXIT option to log more 66 // information about a stub process when it receives a fatal signal. 67 return attachedThread(uintptr(unix.SIGKILL)|unix.CLONE_FILES, defaultAction) 68 } 69 70 // attachedThread returns a new attached thread. 71 // 72 // Precondition: the runtime OS thread must be locked. 73 func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) { 74 // Create a BPF program that allows only the system calls needed by the 75 // stub and all its children. This is used to create child stubs 76 // (below), so we must include the ability to fork, but otherwise lock 77 // down available calls only to what is needed. 78 rules := []seccomp.RuleSet{} 79 if defaultAction != linux.SECCOMP_RET_ALLOW { 80 rules = append(rules, seccomp.RuleSet{ 81 Rules: seccomp.SyscallRules{ 82 unix.SYS_CLONE: []seccomp.Rule{ 83 // Allow creation of new subprocesses (used by the master). 84 {seccomp.EqualTo(unix.CLONE_FILES | unix.SIGKILL)}, 85 // Allow creation of new threads within a single address space (used by addresss spaces). 86 {seccomp.EqualTo( 87 unix.CLONE_FILES | 88 unix.CLONE_FS | 89 unix.CLONE_SIGHAND | 90 unix.CLONE_THREAD | 91 unix.CLONE_PTRACE | 92 unix.CLONE_VM)}, 93 }, 94 95 // For the initial process creation. 96 unix.SYS_WAIT4: {}, 97 unix.SYS_EXIT: {}, 98 99 // For the stub prctl dance (all). 100 unix.SYS_PRCTL: []seccomp.Rule{ 101 {seccomp.EqualTo(unix.PR_SET_PDEATHSIG), seccomp.EqualTo(unix.SIGKILL)}, 102 }, 103 unix.SYS_GETPPID: {}, 104 105 // For the stub to stop itself (all). 106 unix.SYS_GETPID: {}, 107 unix.SYS_KILL: []seccomp.Rule{ 108 {seccomp.MatchAny{}, seccomp.EqualTo(unix.SIGSTOP)}, 109 }, 110 111 // Injected to support the address space operations. 112 unix.SYS_MMAP: {}, 113 unix.SYS_MUNMAP: {}, 114 }, 115 Action: linux.SECCOMP_RET_ALLOW, 116 }) 117 } 118 rules = appendArchSeccompRules(rules, defaultAction) 119 instrs, err := seccomp.BuildProgram(rules, defaultAction, defaultAction) 120 if err != nil { 121 return nil, err 122 } 123 124 return forkStub(flags, instrs) 125 } 126 127 // In the child, this function must not acquire any locks, because they might 128 // have been locked at the time of the fork. This means no rescheduling, no 129 // malloc calls, and no new stack segments. For the same reason compiler does 130 // not race instrument it. 131 // 132 //go:norace 133 func forkStub(flags uintptr, instrs []linux.BPFInstruction) (*thread, error) { 134 // Declare all variables up front in order to ensure that there's no 135 // need for allocations between beforeFork & afterFork. 136 var ( 137 pid uintptr 138 ppid uintptr 139 errno unix.Errno 140 ) 141 142 // Remember the current ppid for the pdeathsig race. 143 ppid, _, _ = unix.RawSyscall(unix.SYS_GETPID, 0, 0, 0) 144 145 // Among other things, beforeFork masks all signals. 146 beforeFork() 147 148 // Do the clone. 149 pid, _, errno = unix.RawSyscall6(unix.SYS_CLONE, flags, 0, 0, 0, 0, 0) 150 if errno != 0 { 151 afterFork() 152 return nil, errno 153 } 154 155 // Is this the parent? 156 if pid != 0 { 157 // Among other things, restore signal mask. 158 afterFork() 159 160 // Initialize the first thread. 161 t := &thread{ 162 tgid: int32(pid), 163 tid: int32(pid), 164 cpu: ^uint32(0), 165 } 166 if sig := t.wait(stopped); sig != unix.SIGSTOP { 167 return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig) 168 } 169 t.attach() 170 t.grabInitRegs() 171 172 return t, nil 173 } 174 175 // Move the stub to a new session (and thus a new process group). This 176 // prevents the stub from getting PTY job control signals intended only 177 // for the sentry process. We must call this before restoring signal 178 // mask. 179 if _, _, errno := unix.RawSyscall(unix.SYS_SETSID, 0, 0, 0); errno != 0 { 180 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 181 } 182 183 // afterForkInChild resets all signals to their default dispositions 184 // and restores the signal mask to its pre-fork state. 185 afterForkInChild() 186 187 // Explicitly unmask all signals to ensure that the tracer can see 188 // them. 189 if errno := unmaskAllSignals(); errno != 0 { 190 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 191 } 192 193 // Set an aggressive BPF filter for the stub and all it's children. See 194 // the description of the BPF program built above. 195 if errno := seccomp.SetFilterInChild(instrs); errno != 0 { 196 unix.RawSyscall(unix.SYS_EXIT, uintptr(errno), 0, 0) 197 } 198 199 // Enable cpuid-faulting. 200 enableCpuidFault() 201 202 // Call the stub; should not return. 203 stubCall(stubStart, ppid) 204 panic("unreachable") 205 } 206 207 // createStub creates a stub processes as a child of an existing subprocesses. 208 // 209 // Precondition: the runtime OS thread must be locked. 210 func (s *subprocess) createStub() (*thread, error) { 211 // There's no need to lock the runtime thread here, as this can only be 212 // called from a context that is already locked. 213 currentTID := int32(hosttid.Current()) 214 t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread) 215 216 // Pass the expected PPID to the child via R15. 217 regs := t.initRegs 218 initChildProcessPPID(®s, t.tgid) 219 220 // Call fork in a subprocess. 221 // 222 // The new child must set up PDEATHSIG to ensure it dies if this 223 // process dies. Since this process could die at any time, this cannot 224 // be done via instrumentation from here. 225 // 226 // Instead, we create the child untraced, which will do the PDEATHSIG 227 // setup and then SIGSTOP itself for our attach below. 228 // 229 // See above re: SIGKILL. 230 pid, err := t.syscallIgnoreInterrupt( 231 ®s, 232 unix.SYS_CLONE, 233 arch.SyscallArgument{Value: uintptr(unix.SIGKILL | unix.CLONE_FILES)}, 234 arch.SyscallArgument{Value: 0}, 235 arch.SyscallArgument{Value: 0}, 236 arch.SyscallArgument{Value: 0}, 237 arch.SyscallArgument{Value: 0}, 238 arch.SyscallArgument{Value: 0}) 239 if err != nil { 240 return nil, fmt.Errorf("creating stub process: %v", err) 241 } 242 243 // Wait for child to enter group-stop, so we don't stop its 244 // bootstrapping work with t.attach below. 245 // 246 // We unfortunately don't have a handy part of memory to write the wait 247 // status. If the wait succeeds, we'll assume that it was the SIGSTOP. 248 // If the child actually exited, the attach below will fail. 249 _, err = t.syscallIgnoreInterrupt( 250 &t.initRegs, 251 unix.SYS_WAIT4, 252 arch.SyscallArgument{Value: uintptr(pid)}, 253 arch.SyscallArgument{Value: 0}, 254 arch.SyscallArgument{Value: unix.WALL | unix.WUNTRACED}, 255 arch.SyscallArgument{Value: 0}, 256 arch.SyscallArgument{Value: 0}, 257 arch.SyscallArgument{Value: 0}) 258 if err != nil { 259 return nil, fmt.Errorf("waiting on stub process: %v", err) 260 } 261 262 childT := &thread{ 263 tgid: int32(pid), 264 tid: int32(pid), 265 cpu: ^uint32(0), 266 } 267 childT.attach() 268 269 return childT, nil 270 }