github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_exec.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 // This file implements the machinery behind the execve() syscall. In brief, a 18 // thread executes an execve() by killing all other threads in its thread 19 // group, assuming the leader's identity, and then switching process images. 20 // 21 // This design is effectively mandated by Linux. From ptrace(2): 22 // 23 // """ 24 // execve(2) under ptrace 25 // When one thread in a multithreaded process calls execve(2), the 26 // kernel destroys all other threads in the process, and resets the 27 // thread ID of the execing thread to the thread group ID (process ID). 28 // (Or, to put things another way, when a multithreaded process does an 29 // execve(2), at completion of the call, it appears as though the 30 // execve(2) occurred in the thread group leader, regardless of which 31 // thread did the execve(2).) This resetting of the thread ID looks 32 // very confusing to tracers: 33 // 34 // * All other threads stop in PTRACE_EVENT_EXIT stop, if the 35 // PTRACE_O_TRACEEXIT option was turned on. Then all other threads 36 // except the thread group leader report death as if they exited via 37 // _exit(2) with exit code 0. 38 // 39 // * The execing tracee changes its thread ID while it is in the 40 // execve(2). 
//     (Remember, under ptrace, the "pid" returned from
//     waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
//     That is, the tracee's thread ID is reset to be the same as its
//     process ID, which is the same as the thread group leader's thread
//     ID.
//
//   * Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
//     option was turned on.
//
//   * If the thread group leader has reported its PTRACE_EVENT_EXIT stop
//     by this time, it appears to the tracer that the dead thread leader
//     "reappears from nowhere". (Note: the thread group leader does not
//     report death via WIFEXITED(status) until there is at least one
//     other live thread. This eliminates the possibility that the
//     tracer will see it dying and then reappearing.) If the thread
//     group leader was still alive, for the tracer this may look as if
//     thread group leader returns from a different system call than it
//     entered, or even "returned from a system call even though it was
//     not in any system call". If the thread group leader was not
//     traced (or was traced by a different tracer), then during
//     execve(2) it will appear as if it has become a tracee of the
//     tracer of the execing tracee.
//
// All of the above effects are the artifacts of the thread ID change in
// the tracee.
// """

import (
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/mm"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// execStop is a TaskStop that a task sets on itself when it wants to execve
// and is waiting for the other tasks in its thread group to exit first.
//
// Task.Execve enters this stop (via beginInternalStopLocked) only when the
// thread group contains more than one task; per the comment there, the last
// sibling to exit wakes the execing task. The type carries no state and is
// always used as a typed nil pointer, (*execStop)(nil).
//
// +stateify savable
type execStop struct{}

// Killable implements TaskStop.Killable.
82 func (*execStop) Killable() bool { return true } 83 84 // Execve implements the execve(2) syscall by killing all other tasks in its 85 // thread group and switching to newImage. Execve always takes ownership of 86 // newImage. 87 // 88 // Preconditions: The caller must be running Task.doSyscallInvoke on the task 89 // goroutine. 90 func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) { 91 t.tg.pidns.owner.mu.Lock() 92 defer t.tg.pidns.owner.mu.Unlock() 93 t.tg.signalHandlers.mu.Lock() 94 defer t.tg.signalHandlers.mu.Unlock() 95 96 if t.tg.exiting || t.tg.execing != nil { 97 // We lost to a racing group-exit, kill, or exec from another thread 98 // and should just exit. 99 newImage.release() 100 return nil, syserror.EINTR 101 } 102 103 // Cancel any racing group stops. 104 t.tg.endGroupStopLocked(false) 105 106 // If the task has any siblings, they have to exit before the exec can 107 // continue. 108 t.tg.execing = t 109 if t.tg.tasks.Front() != t.tg.tasks.Back() { 110 // "[All] other threads except the thread group leader report death as 111 // if they exited via _exit(2) with exit code 0." - ptrace(2) 112 for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() { 113 if t != sibling { 114 sibling.killLocked() 115 } 116 } 117 // The last sibling to exit will wake t. 118 t.beginInternalStopLocked((*execStop)(nil)) 119 } 120 121 return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil 122 } 123 124 // The runSyscallAfterExecStop state continues execve(2) after all siblings of 125 // a thread in the execve syscall have exited. 
//
// +stateify savable
type runSyscallAfterExecStop struct {
	// image is the TaskImage handed to Task.Execve. execute either installs
	// it as the task's image or releases it if the task was killed first.
	image *TaskImage
}

// execute finishes the execve(2) that Task.Execve started and returns the
// next run state for t.
//
// If t was killed while waiting, the new image is released and
// (*runInterrupt)(nil) is returned. Otherwise t is promoted to thread group
// leader, state that does not survive an exec (POSIX timers, signal handler
// dispositions, the alternate signal stack, the termination signal, rseq
// state, close-on-exec FDs, the robust futex list) is reset, the new image
// replaces the old one, and (*runSyscallExit)(nil) is returned.
//
// The statement order below is significant: mutexes are dropped and retaken
// around calls that must not run under them, and the thread group's signal
// handler set (and hence its mutex) is replaced partway through.
func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
	t.traceExecEvent(r.image)
	t.tg.pidns.owner.mu.Lock()
	// Clear the in-progress marker set by Execve, whether or not the exec
	// goes ahead.
	t.tg.execing = nil
	if t.killed() {
		t.tg.pidns.owner.mu.Unlock()
		// The image was never installed, so it has to be released here.
		r.image.release()
		return (*runInterrupt)(nil)
	}
	// We are the thread group leader now. Save our old thread ID for
	// PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
	// point it will get a PID of 0, but this is consistent with Linux.
	//
	// The ID is looked up in the tracer's PID namespace and must be read
	// before promoteLocked swaps t's TIDs with the old leader's.
	oldTID := ThreadID(0)
	if tracer := t.Tracer(); tracer != nil {
		oldTID = tracer.tg.pidns.tids[t]
	}
	t.promoteLocked()
	// "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle
	// this first since POSIX timers are protected by the signal mutex, which
	// we're about to change. Note that we have to stop and destroy timers
	// without holding any mutexes to avoid circular lock ordering.
	var its []*IntervalTimer
	t.tg.signalHandlers.mu.Lock()
	for _, it := range t.tg.timers {
		its = append(its, it)
	}
	t.tg.timers = make(map[linux.TimerID]*IntervalTimer)
	t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.mu.Unlock()
	// Both mutexes are released at this point; destroy the collected timers.
	for _, it := range its {
		it.DestroyTimer()
	}
	t.tg.pidns.owner.mu.Lock()
	// "During an execve(2), the dispositions of handled signals are reset to
	// the default; the dispositions of ignored signals are left unchanged. ...
	// [The] signal mask is preserved across execve(2). ... [The] pending
	// signal set is preserved across an execve(2)." - signal(7)
	//
	// Details:
	//
	// - If the thread group is sharing its signal handlers with another thread
	// group via CLONE_SIGHAND, execve forces the signal handlers to be copied
	// (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
	// handlers, so we always make a copy.
	//
	// - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
	// restorer (if present), and mask are always reset. (See Linux's
	// fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
	t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
	// Point endStopCond at the mutex of the new signal handler set.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
	t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE}
	// "The termination signal is reset to SIGCHLD (see clone(2))."
	t.tg.terminationSignal = linux.SIGCHLD
	// execed indicates that the process can no longer join a process group
	// in some scenarios (namely, the parent calling setpgid(2) on the child).
	// See the JoinProcessGroup function in sessions.go for more context.
	t.tg.execed = true
	// Maximum RSS is preserved across execve(2).
	t.updateRSSLocked()
	// Restartable sequence state is discarded.
	t.rseqPreempted = false
	t.rseqCPU = -1
	t.rseqAddr = 0
	t.rseqSignature = 0
	t.oldRSeqCPUAddr = 0
	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
	t.tg.pidns.owner.mu.Unlock()

	// Switch to the table returned by Fork and drop our reference on the
	// previous one.
	oldFDTable := t.fdTable
	t.fdTable = t.fdTable.Fork(t)
	oldFDTable.DecRef(t)

	// Remove FDs with the CloseOnExec flag set.
	t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
		return flags.CloseOnExec
	})

	// Handle the robust futex list.
	t.exitRobustList()

	// NOTE(b/30815691): We currently do not implement privileged
	// executables (set-user/group-ID bits and file capabilities). This
	// allows us to unconditionally enable user dumpability on the new mm.
	// See fs/exec.c:setup_new_exec.
	r.image.MemoryManager.SetDumpability(mm.UserDumpable)

	// Switch to the new process.
	t.MemoryManager().Deactivate()
	t.mu.Lock()
	// Update credentials to reflect the execve. This should precede switching
	// MMs to ensure that dumpability has been reset first, if needed.
	t.updateCredsForExecLocked()
	// Release the outgoing image, then install a copy of the new one.
	t.image.release()
	t.image = *r.image
	t.mu.Unlock()
	t.unstopVforkParent()
	t.p.FullStateChanged()
	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
	t.MemoryManager().Activate(t)

	// Report the exec to ptrace using the pre-promotion TID saved above.
	t.ptraceExec(oldTID)
	return (*runSyscallExit)(nil)
}

// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
// Otherwise t and the old leader exchange thread IDs in t's PID namespace and
// every ancestor namespace, t takes over the old leader's start time, and the
// old leader is reaped (detaching its tracer, if it has one).
//
// Preconditions:
// * All other tasks in t's thread group, including the existing leader (if it
//   is not t), have reached TaskExitZombie.
// * The TaskSet mutex must be locked for writing.
func (t *Task) promoteLocked() {
	oldLeader := t.tg.leader
	if t == oldLeader {
		return
	}
	// Swap the leader's TIDs with the execing task's. The latter will be
	// released when the old leader is reaped below.
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
		ns.tids[oldLeader] = oldTID
		ns.tids[t] = leaderTID
		ns.tasks[oldTID] = oldLeader
		ns.tasks[leaderTID] = t
		// Neither the ThreadGroup nor TGID change, so no need to
		// update ns.tgids.
	}

	// Inherit the old leader's start time.
	oldStartTime := oldLeader.StartTime()
	t.mu.Lock()
	t.startTime = oldStartTime
	t.mu.Unlock()

	t.tg.leader = t
	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
	t.updateInfoLocked()
	// Reap the original leader. If it has a tracer, detach it instead of
	// waiting for it to acknowledge the original leader's death.
	//
	// Both exit-notification flags are set up front so that the
	// exitNotifyLocked call at the end does not wait on the parent.
	oldLeader.exitParentNotified = true
	oldLeader.exitParentAcked = true
	if tracer := oldLeader.Tracer(); tracer != nil {
		delete(tracer.ptraceTracees, oldLeader)
		oldLeader.forgetTracerLocked()
		// Notify the tracer that it will no longer be receiving these events
		// from the tracee.
		tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
	}
	oldLeader.exitNotifyLocked(false)
}