// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file implements the machinery behind the execve() syscall. In brief, a
// thread executes an execve() by killing all other threads in its thread
// group, assuming the leader's identity, and then switching process images.
//
// This design is effectively mandated by Linux. From ptrace(2):
//
// """
// execve(2) under ptrace
//     When one thread in a multithreaded process calls execve(2), the
//     kernel destroys all other threads in the process, and resets the
//     thread ID of the execing thread to the thread group ID (process ID).
//     (Or, to put things another way, when a multithreaded process does an
//     execve(2), at completion of the call, it appears as though the
//     execve(2) occurred in the thread group leader, regardless of which
//     thread did the execve(2).)  This resetting of the thread ID looks
//     very confusing to tracers:
//
//     *  All other threads stop in PTRACE_EVENT_EXIT stop, if the
//        PTRACE_O_TRACEEXIT option was turned on.  Then all other threads
//        except the thread group leader report death as if they exited via
//        _exit(2) with exit code 0.
//
//     *  The execing tracee changes its thread ID while it is in the
//        execve(2).  (Remember, under ptrace, the "pid" returned from
//        waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
//        That is, the tracee's thread ID is reset to be the same as its
//        process ID, which is the same as the thread group leader's thread
//        ID.
//
//     *  Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
//        option was turned on.
//
//     *  If the thread group leader has reported its PTRACE_EVENT_EXIT stop
//        by this time, it appears to the tracer that the dead thread leader
//        "reappears from nowhere".  (Note: the thread group leader does not
//        report death via WIFEXITED(status) until there is at least one
//        other live thread.  This eliminates the possibility that the
//        tracer will see it dying and then reappearing.)  If the thread
//        group leader was still alive, for the tracer this may look as if
//        thread group leader returns from a different system call than it
//        entered, or even "returned from a system call even though it was
//        not in any system call".  If the thread group leader was not
//        traced (or was traced by a different tracer), then during
//        execve(2) it will appear as if it has become a tracee of the
//        tracer of the execing tracee.
//
//     All of the above effects are the artifacts of the thread ID change in
//     the tracee.
// """

import (
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/mm"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
	pb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
)

// execStop is a TaskStop that a task sets on itself when it wants to execve
// and is waiting for the other tasks in its thread group to exit first.
79 // 80 // +stateify savable 81 type execStop struct{} 82 83 // Killable implements TaskStop.Killable. 84 func (*execStop) Killable() bool { return true } 85 86 // Execve implements the execve(2) syscall by killing all other tasks in its 87 // thread group and switching to newImage. Execve always takes ownership of 88 // newImage. 89 // 90 // If executable is not nil, it is the first executable file that was loaded in 91 // the process of obtaining newImage, and pathname is a path to it. 92 // 93 // Preconditions: The caller must be running Task.doSyscallInvoke on the task 94 // goroutine. 95 func (t *Task) Execve(newImage *TaskImage, argv, env []string, executable *vfs.FileDescription, pathname string) (*SyscallControl, error) { 96 cu := cleanup.Make(func() { 97 newImage.release(t) 98 }) 99 defer cu.Clean() 100 // We can't clearly hold kernel package locks while stat'ing executable. 101 if seccheck.Global.Enabled(seccheck.PointExecve) { 102 mask, info := getExecveSeccheckInfo(t, argv, env, executable, pathname) 103 if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error { 104 return c.Execve(t, mask, info) 105 }); err != nil { 106 return nil, err 107 } 108 } 109 110 t.tg.pidns.owner.mu.Lock() 111 defer t.tg.pidns.owner.mu.Unlock() 112 t.tg.signalHandlers.mu.Lock() 113 defer t.tg.signalHandlers.mu.Unlock() 114 115 if t.tg.exiting || t.tg.execing != nil { 116 // We lost to a racing group-exit, kill, or exec from another thread 117 // and should just exit. 118 return nil, linuxerr.EINTR 119 } 120 121 // Cancel any racing group stops. 122 t.tg.endGroupStopLocked(false) 123 124 // If the task has any siblings, they have to exit before the exec can 125 // continue. 126 t.tg.execing = t 127 if t.tg.tasks.Front() != t.tg.tasks.Back() { 128 // "[All] other threads except the thread group leader report death as 129 // if they exited via _exit(2) with exit code 0." 
- ptrace(2) 130 for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { 131 if t != sibling { 132 sibling.killLocked() 133 } 134 } 135 // The last sibling to exit will wake t. 136 t.beginInternalStopLocked((*execStop)(nil)) 137 } 138 139 cu.Release() 140 return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil 141 } 142 143 // The runSyscallAfterExecStop state continues execve(2) after all siblings of 144 // a thread in the execve syscall have exited. 145 // 146 // +stateify savable 147 type runSyscallAfterExecStop struct { 148 image *TaskImage 149 } 150 151 func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { 152 t.traceExecEvent(r.image) 153 t.tg.pidns.owner.mu.Lock() 154 t.tg.execing = nil 155 if t.killed() { 156 t.tg.pidns.owner.mu.Unlock() 157 r.image.release(t) 158 return (*runInterrupt)(nil) 159 } 160 // We are the thread group leader now. Save our old thread ID for 161 // PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this 162 // point it will get a PID of 0, but this is consistent with Linux. 163 oldTID := ThreadID(0) 164 if tracer := t.Tracer(); tracer != nil { 165 oldTID = tracer.tg.pidns.tids[t] 166 } 167 t.promoteLocked() 168 // "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle 169 // this first since POSIX timers are protected by the signal mutex, which 170 // we're about to change. Note that we have to stop and destroy timers 171 // without holding any mutexes to avoid circular lock ordering. 
172 var its []*IntervalTimer 173 t.tg.signalHandlers.mu.Lock() 174 for _, it := range t.tg.timers { 175 its = append(its, it) 176 } 177 t.tg.timers = make(map[linux.TimerID]*IntervalTimer) 178 t.tg.signalHandlers.mu.Unlock() 179 t.tg.pidns.owner.mu.Unlock() 180 for _, it := range its { 181 it.DestroyTimer() 182 } 183 t.tg.pidns.owner.mu.Lock() 184 // "During an execve(2), the dispositions of handled signals are reset to 185 // the default; the dispositions of ignored signals are left unchanged. ... 186 // [The] signal mask is preserved across execve(2). ... [The] pending 187 // signal set is preserved across an execve(2)." - signal(7) 188 // 189 // Details: 190 // 191 // - If the thread group is sharing its signal handlers with another thread 192 // group via CLONE_SIGHAND, execve forces the signal handlers to be copied 193 // (see Linux's fs/exec.c:de_thread). We're not reference-counting signal 194 // handlers, so we always make a copy. 195 // 196 // - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags, 197 // restorer (if present), and mask are always reset. (See Linux's 198 // fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.) 199 t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec() 200 t.endStopCond.L = &t.tg.signalHandlers.mu 201 // "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2) 202 t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE} 203 // "The termination signal is reset to SIGCHLD (see clone(2))." 204 t.tg.terminationSignal = linux.SIGCHLD 205 // execed indicates that the process can no longer join a process group 206 // in some scenarios (namely, the parent call setpgid(2) on the child). 207 // See the JoinProcessGroup function in sessions.go for more context. 208 t.tg.execed = true 209 // Maximum RSS is preserved across execve(2). 210 t.updateRSSLocked() 211 // Restartable sequence state is discarded. 
212 t.rseqPreempted = false 213 t.rseqCPU = -1 214 t.rseqAddr = 0 215 t.rseqSignature = 0 216 t.oldRSeqCPUAddr = 0 217 t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{}) 218 t.tg.pidns.owner.mu.Unlock() 219 220 oldFDTable := t.fdTable 221 t.fdTable = t.fdTable.Fork(t, int32(t.fdTable.CurrentMaxFDs())) 222 oldFDTable.DecRef(t) 223 224 // Remove FDs with the CloseOnExec flag set. 225 t.fdTable.RemoveIf(t, func(_ *vfs.FileDescription, flags FDFlags) bool { 226 return flags.CloseOnExec 227 }) 228 229 // Handle the robust futex list. 230 t.exitRobustList() 231 232 // NOTE(b/30815691): We currently do not implement privileged 233 // executables (set-user/group-ID bits and file capabilities). This 234 // allows us to unconditionally enable user dumpability on the new mm. 235 // See fs/exec.c:setup_new_exec. 236 r.image.MemoryManager.SetDumpability(mm.UserDumpable) 237 238 // Switch to the new process. 239 t.MemoryManager().Deactivate() 240 t.mu.Lock() 241 // Update credentials to reflect the execve. This should precede switching 242 // MMs to ensure that dumpability has been reset first, if needed. 243 t.updateCredsForExecLocked() 244 oldImage := t.image 245 t.image = *r.image 246 t.mu.Unlock() 247 248 // Don't hold t.mu while calling t.image.release(), that may 249 // attempt to acquire TaskImage.MemoryManager.mappingMu, a lock order 250 // violation. 251 oldImage.release(t) 252 253 t.unstopVforkParent() 254 t.p.FullStateChanged() 255 // NOTE(b/30316266): All locks must be dropped prior to calling Activate. 256 t.MemoryManager().Activate(t) 257 258 t.ptraceExec(oldTID) 259 return (*runSyscallExit)(nil) 260 } 261 262 // promoteLocked makes t the leader of its thread group. If t is already the 263 // thread group leader, promoteLocked is a no-op. 264 // 265 // Preconditions: 266 // - All other tasks in t's thread group, including the existing leader (if it 267 // is not t), have reached TaskExitZombie. 268 // - The TaskSet mutex must be locked for writing. 
269 func (t *Task) promoteLocked() { 270 oldLeader := t.tg.leader 271 if t == oldLeader { 272 return 273 } 274 // Swap the leader's TIDs with the execing task's. The latter will be 275 // released when the old leader is reaped below. 276 for ns := t.tg.pidns; ns != nil; ns = ns.parent { 277 oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader] 278 ns.tids[oldLeader] = oldTID 279 ns.tids[t] = leaderTID 280 ns.tasks[oldTID] = oldLeader 281 ns.tasks[leaderTID] = t 282 // Neither the ThreadGroup nor TGID change, so no need to 283 // update ns.tgids. 284 } 285 286 // Inherit the old leader's start time. 287 oldStartTime := oldLeader.StartTime() 288 t.mu.Lock() 289 t.startTime = oldStartTime 290 t.mu.Unlock() 291 292 t.tg.leader = t 293 t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t]) 294 t.updateInfoLocked() 295 // Reap the original leader. If it has a tracer, detach it instead of 296 // waiting for it to acknowledge the original leader's death. 297 oldLeader.exitParentNotified = true 298 oldLeader.exitParentAcked = true 299 if tracer := oldLeader.Tracer(); tracer != nil { 300 delete(tracer.ptraceTracees, oldLeader) 301 oldLeader.forgetTracerLocked() 302 // Notify the tracer that it will no longer be receiving these events 303 // from the tracee. 
304 tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue) 305 } 306 oldLeader.exitNotifyLocked(false) 307 } 308 309 func getExecveSeccheckInfo(t *Task, argv, env []string, executable *vfs.FileDescription, pathname string) (seccheck.FieldSet, *pb.ExecveInfo) { 310 fields := seccheck.Global.GetFieldSet(seccheck.PointExecve) 311 info := &pb.ExecveInfo{ 312 Argv: argv, 313 Env: env, 314 } 315 if executable != nil { 316 info.BinaryPath = pathname 317 if fields.Local.Contains(seccheck.FieldSentryExecveBinaryInfo) { 318 statOpts := vfs.StatOptions{ 319 Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID, 320 } 321 if stat, err := executable.Stat(t, statOpts); err == nil { 322 if stat.Mask&(linux.STATX_TYPE|linux.STATX_MODE) == (linux.STATX_TYPE | linux.STATX_MODE) { 323 info.BinaryMode = uint32(stat.Mode) 324 } 325 if stat.Mask&linux.STATX_UID != 0 { 326 info.BinaryUid = stat.UID 327 } 328 if stat.Mask&linux.STATX_GID != 0 { 329 info.BinaryGid = stat.GID 330 } 331 } 332 } 333 } 334 335 if !fields.Context.Empty() { 336 info.ContextData = &pb.ContextData{} 337 LoadSeccheckData(t, fields.Context, info.ContextData) 338 } 339 return fields, info 340 }