github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_exec.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  // This file implements the machinery behind the execve() syscall. In brief, a
    18  // thread executes an execve() by killing all other threads in its thread
    19  // group, assuming the leader's identity, and then switching process images.
    20  //
    21  // This design is effectively mandated by Linux. From ptrace(2):
    22  //
    23  // """
    24  // execve(2) under ptrace
    25  //     When one thread in a multithreaded process calls execve(2), the
    26  //     kernel destroys all other threads in the process, and resets the
    27  //     thread ID of the execing thread to the thread group ID (process ID).
    28  //     (Or, to put things another way, when a multithreaded process does an
    29  //     execve(2), at completion of the call, it appears as though the
    30  //     execve(2) occurred in the thread group leader, regardless of which
    31  //     thread did the execve(2).)  This resetting of the thread ID looks
    32  //     very confusing to tracers:
    33  //
    34  //     *  All other threads stop in PTRACE_EVENT_EXIT stop, if the
    35  //        PTRACE_O_TRACEEXIT option was turned on.  Then all other threads
    36  //        except the thread group leader report death as if they exited via
    37  //        _exit(2) with exit code 0.
    38  //
    39  //     *  The execing tracee changes its thread ID while it is in the
    40  //        execve(2).  (Remember, under ptrace, the "pid" returned from
    41  //        waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
    42  //        That is, the tracee's thread ID is reset to be the same as its
    43  //        process ID, which is the same as the thread group leader's thread
    44  //        ID.
    45  //
    46  //     *  Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
    47  //        option was turned on.
    48  //
    49  //     *  If the thread group leader has reported its PTRACE_EVENT_EXIT stop
    50  //        by this time, it appears to the tracer that the dead thread leader
    51  //        "reappears from nowhere".  (Note: the thread group leader does not
    52  //        report death via WIFEXITED(status) until there is at least one
    53  //        other live thread.  This eliminates the possibility that the
    54  //        tracer will see it dying and then reappearing.)  If the thread
    55  //        group leader was still alive, for the tracer this may look as if
    56  //        thread group leader returns from a different system call than it
    57  //        entered, or even "returned from a system call even though it was
    58  //        not in any system call".  If the thread group leader was not
    59  //        traced (or was traced by a different tracer), then during
    60  //        execve(2) it will appear as if it has become a tracee of the
    61  //        tracer of the execing tracee.
    62  //
    63  //     All of the above effects are the artifacts of the thread ID change in
    64  //     the tracee.
    65  // """
    66  
    67  import (
    68  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    69  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    70  	"github.com/SagerNet/gvisor/pkg/sentry/mm"
    71  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    72  	"github.com/SagerNet/gvisor/pkg/syserror"
    73  )
    74  
    75  // execStop is a TaskStop that a task sets on itself when it wants to execve
    76  // and is waiting for the other tasks in its thread group to exit first.
    77  //
    78  // +stateify savable
    79  type execStop struct{}
    80  
    81  // Killable implements TaskStop.Killable.
    82  func (*execStop) Killable() bool { return true }
    83  
    84  // Execve implements the execve(2) syscall by killing all other tasks in its
    85  // thread group and switching to newImage. Execve always takes ownership of
    86  // newImage.
    87  //
    88  // Preconditions: The caller must be running Task.doSyscallInvoke on the task
    89  // goroutine.
    90  func (t *Task) Execve(newImage *TaskImage) (*SyscallControl, error) {
    91  	t.tg.pidns.owner.mu.Lock()
    92  	defer t.tg.pidns.owner.mu.Unlock()
    93  	t.tg.signalHandlers.mu.Lock()
    94  	defer t.tg.signalHandlers.mu.Unlock()
    95  
    96  	if t.tg.exiting || t.tg.execing != nil {
    97  		// We lost to a racing group-exit, kill, or exec from another thread
    98  		// and should just exit.
    99  		newImage.release()
   100  		return nil, syserror.EINTR
   101  	}
   102  
   103  	// Cancel any racing group stops.
   104  	t.tg.endGroupStopLocked(false)
   105  
   106  	// If the task has any siblings, they have to exit before the exec can
   107  	// continue.
   108  	t.tg.execing = t
   109  	if t.tg.tasks.Front() != t.tg.tasks.Back() {
   110  		// "[All] other threads except the thread group leader report death as
   111  		// if they exited via _exit(2) with exit code 0." - ptrace(2)
   112  		for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
   113  			if t != sibling {
   114  				sibling.killLocked()
   115  			}
   116  		}
   117  		// The last sibling to exit will wake t.
   118  		t.beginInternalStopLocked((*execStop)(nil))
   119  	}
   120  
   121  	return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil
   122  }
   123  
   124  // The runSyscallAfterExecStop state continues execve(2) after all siblings of
   125  // a thread in the execve syscall have exited.
   126  //
   127  // +stateify savable
   128  type runSyscallAfterExecStop struct {
   129  	image *TaskImage
   130  }
   131  
   132  func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
   133  	t.traceExecEvent(r.image)
   134  	t.tg.pidns.owner.mu.Lock()
   135  	t.tg.execing = nil
   136  	if t.killed() {
   137  		t.tg.pidns.owner.mu.Unlock()
   138  		r.image.release()
   139  		return (*runInterrupt)(nil)
   140  	}
   141  	// We are the thread group leader now. Save our old thread ID for
   142  	// PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
   143  	// point it will get a PID of 0, but this is consistent with Linux.
   144  	oldTID := ThreadID(0)
   145  	if tracer := t.Tracer(); tracer != nil {
   146  		oldTID = tracer.tg.pidns.tids[t]
   147  	}
   148  	t.promoteLocked()
   149  	// "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle
   150  	// this first since POSIX timers are protected by the signal mutex, which
   151  	// we're about to change. Note that we have to stop and destroy timers
   152  	// without holding any mutexes to avoid circular lock ordering.
   153  	var its []*IntervalTimer
   154  	t.tg.signalHandlers.mu.Lock()
   155  	for _, it := range t.tg.timers {
   156  		its = append(its, it)
   157  	}
   158  	t.tg.timers = make(map[linux.TimerID]*IntervalTimer)
   159  	t.tg.signalHandlers.mu.Unlock()
   160  	t.tg.pidns.owner.mu.Unlock()
   161  	for _, it := range its {
   162  		it.DestroyTimer()
   163  	}
   164  	t.tg.pidns.owner.mu.Lock()
   165  	// "During an execve(2), the dispositions of handled signals are reset to
   166  	// the default; the dispositions of ignored signals are left unchanged. ...
   167  	// [The] signal mask is preserved across execve(2). ... [The] pending
   168  	// signal set is preserved across an execve(2)." - signal(7)
   169  	//
   170  	// Details:
   171  	//
   172  	// - If the thread group is sharing its signal handlers with another thread
   173  	// group via CLONE_SIGHAND, execve forces the signal handlers to be copied
   174  	// (see Linux's fs/exec.c:de_thread). We're not reference-counting signal
   175  	// handlers, so we always make a copy.
   176  	//
   177  	// - "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
   178  	// restorer (if present), and mask are always reset. (See Linux's
   179  	// fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
   180  	t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
   181  	t.endStopCond.L = &t.tg.signalHandlers.mu
   182  	// "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
   183  	t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE}
   184  	// "The termination signal is reset to SIGCHLD (see clone(2))."
   185  	t.tg.terminationSignal = linux.SIGCHLD
   186  	// execed indicates that the process can no longer join a process group
   187  	// in some scenarios (namely, the parent call setpgid(2) on the child).
   188  	// See the JoinProcessGroup function in sessions.go for more context.
   189  	t.tg.execed = true
   190  	// Maximum RSS is preserved across execve(2).
   191  	t.updateRSSLocked()
   192  	// Restartable sequence state is discarded.
   193  	t.rseqPreempted = false
   194  	t.rseqCPU = -1
   195  	t.rseqAddr = 0
   196  	t.rseqSignature = 0
   197  	t.oldRSeqCPUAddr = 0
   198  	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
   199  	t.tg.pidns.owner.mu.Unlock()
   200  
   201  	oldFDTable := t.fdTable
   202  	t.fdTable = t.fdTable.Fork(t)
   203  	oldFDTable.DecRef(t)
   204  
   205  	// Remove FDs with the CloseOnExec flag set.
   206  	t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool {
   207  		return flags.CloseOnExec
   208  	})
   209  
   210  	// Handle the robust futex list.
   211  	t.exitRobustList()
   212  
   213  	// NOTE(b/30815691): We currently do not implement privileged
   214  	// executables (set-user/group-ID bits and file capabilities). This
   215  	// allows us to unconditionally enable user dumpability on the new mm.
   216  	// See fs/exec.c:setup_new_exec.
   217  	r.image.MemoryManager.SetDumpability(mm.UserDumpable)
   218  
   219  	// Switch to the new process.
   220  	t.MemoryManager().Deactivate()
   221  	t.mu.Lock()
   222  	// Update credentials to reflect the execve. This should precede switching
   223  	// MMs to ensure that dumpability has been reset first, if needed.
   224  	t.updateCredsForExecLocked()
   225  	t.image.release()
   226  	t.image = *r.image
   227  	t.mu.Unlock()
   228  	t.unstopVforkParent()
   229  	t.p.FullStateChanged()
   230  	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
   231  	t.MemoryManager().Activate(t)
   232  
   233  	t.ptraceExec(oldTID)
   234  	return (*runSyscallExit)(nil)
   235  }
   236  
   237  // promoteLocked makes t the leader of its thread group. If t is already the
   238  // thread group leader, promoteLocked is a no-op.
   239  //
   240  // Preconditions:
   241  // * All other tasks in t's thread group, including the existing leader (if it
   242  //   is not t), have reached TaskExitZombie.
   243  // * The TaskSet mutex must be locked for writing.
   244  func (t *Task) promoteLocked() {
   245  	oldLeader := t.tg.leader
   246  	if t == oldLeader {
   247  		return
   248  	}
   249  	// Swap the leader's TIDs with the execing task's. The latter will be
   250  	// released when the old leader is reaped below.
   251  	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
   252  		oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
   253  		ns.tids[oldLeader] = oldTID
   254  		ns.tids[t] = leaderTID
   255  		ns.tasks[oldTID] = oldLeader
   256  		ns.tasks[leaderTID] = t
   257  		// Neither the ThreadGroup nor TGID change, so no need to
   258  		// update ns.tgids.
   259  	}
   260  
   261  	// Inherit the old leader's start time.
   262  	oldStartTime := oldLeader.StartTime()
   263  	t.mu.Lock()
   264  	t.startTime = oldStartTime
   265  	t.mu.Unlock()
   266  
   267  	t.tg.leader = t
   268  	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
   269  	t.updateInfoLocked()
   270  	// Reap the original leader. If it has a tracer, detach it instead of
   271  	// waiting for it to acknowledge the original leader's death.
   272  	oldLeader.exitParentNotified = true
   273  	oldLeader.exitParentAcked = true
   274  	if tracer := oldLeader.Tracer(); tracer != nil {
   275  		delete(tracer.ptraceTracees, oldLeader)
   276  		oldLeader.forgetTracerLocked()
   277  		// Notify the tracer that it will no longer be receiving these events
   278  		// from the tracee.
   279  		tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
   280  	}
   281  	oldLeader.exitNotifyLocked(false)
   282  }