github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_exec.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  // This file implements the machinery behind the execve() syscall. In brief, a
    18  // thread executes an execve() by killing all other threads in its thread
    19  // group, assuming the leader's identity, and then switching process images.
    20  //
    21  // This design is effectively mandated by Linux. From ptrace(2):
    22  //
    23  // """
    24  // execve(2) under ptrace
    25  //     When one thread in a multithreaded process calls execve(2), the
    26  //     kernel destroys all other threads in the process, and resets the
    27  //     thread ID of the execing thread to the thread group ID (process ID).
    28  //     (Or, to put things another way, when a multithreaded process does an
    29  //     execve(2), at completion of the call, it appears as though the
    30  //     execve(2) occurred in the thread group leader, regardless of which
    31  //     thread did the execve(2).)  This resetting of the thread ID looks
    32  //     very confusing to tracers:
    33  //
    34  //     *  All other threads stop in PTRACE_EVENT_EXIT stop, if the
    35  //        PTRACE_O_TRACEEXIT option was turned on.  Then all other threads
    36  //        except the thread group leader report death as if they exited via
    37  //        _exit(2) with exit code 0.
    38  //
    39  //     *  The execing tracee changes its thread ID while it is in the
    40  //        execve(2).  (Remember, under ptrace, the "pid" returned from
    41  //        waitpid(2), or fed into ptrace calls, is the tracee's thread ID.)
    42  //        That is, the tracee's thread ID is reset to be the same as its
    43  //        process ID, which is the same as the thread group leader's thread
    44  //        ID.
    45  //
    46  //     *  Then a PTRACE_EVENT_EXEC stop happens, if the PTRACE_O_TRACEEXEC
    47  //        option was turned on.
    48  //
    49  //     *  If the thread group leader has reported its PTRACE_EVENT_EXIT stop
    50  //        by this time, it appears to the tracer that the dead thread leader
    51  //        "reappears from nowhere".  (Note: the thread group leader does not
    52  //        report death via WIFEXITED(status) until there is at least one
    53  //        other live thread.  This eliminates the possibility that the
    54  //        tracer will see it dying and then reappearing.)  If the thread
    55  //        group leader was still alive, for the tracer this may look as if
    56  //        thread group leader returns from a different system call than it
    57  //        entered, or even "returned from a system call even though it was
    58  //        not in any system call".  If the thread group leader was not
    59  //        traced (or was traced by a different tracer), then during
    60  //        execve(2) it will appear as if it has become a tracee of the
    61  //        tracer of the execing tracee.
    62  //
    63  //     All of the above effects are the artifacts of the thread ID change in
    64  //     the tracee.
    65  // """
    66  
    67  import (
    68  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    69  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    70  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    71  	"github.com/MerlinKodo/gvisor/pkg/sentry/mm"
    72  	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
    73  	pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    74  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    75  )
    76  
    77  // execStop is a TaskStop that a task sets on itself when it wants to execve
    78  // and is waiting for the other tasks in its thread group to exit first.
    79  //
    80  // +stateify savable
    81  type execStop struct{}
    82  
    83  // Killable implements TaskStop.Killable.
    84  func (*execStop) Killable() bool { return true }
    85  
// Execve implements the execve(2) syscall by killing all other tasks in its
// thread group and switching to newImage. Execve always takes ownership of
// newImage.
//
// If executable is not nil, it is the first executable file that was loaded in
// the process of obtaining newImage, and pathname is a path to it.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
func (t *Task) Execve(newImage *TaskImage, argv, env []string, executable *vfs.FileDescription, pathname string) (*SyscallControl, error) {
	// If we return an error below, the new image must be released; cu.Release()
	// at the end cancels this and transfers ownership of newImage to
	// runSyscallAfterExecStop.
	cu := cleanup.Make(func() {
		newImage.release(t)
	})
	defer cu.Clean()
	// We can't clearly hold kernel package locks while stat'ing executable.
	if seccheck.Global.Enabled(seccheck.PointExecve) {
		mask, info := getExecveSeccheckInfo(t, argv, env, executable, pathname)
		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.Execve(t, mask, info)
		}); err != nil {
			return nil, err
		}
	}

	// Lock order: TaskSet mutex, then the thread group's signal mutex.
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()

	if t.tg.exiting || t.tg.execing != nil {
		// We lost to a racing group-exit, kill, or exec from another thread
		// and should just exit.
		return nil, linuxerr.EINTR
	}

	// Cancel any racing group stops.
	t.tg.endGroupStopLocked(false)

	// If the task has any siblings, they have to exit before the exec can
	// continue.
	t.tg.execing = t
	if t.tg.tasks.Front() != t.tg.tasks.Back() {
		// "[All] other threads except the thread group leader report death as
		// if they exited via _exit(2) with exit code 0." - ptrace(2)
		for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
			if t != sibling {
				sibling.killLocked()
			}
		}
		// The last sibling to exit will wake t.
		t.beginInternalStopLocked((*execStop)(nil))
	}

	// Ownership of newImage passes to runSyscallAfterExecStop, which resumes
	// the exec once the execStop (if any) ends.
	cu.Release()
	return &SyscallControl{next: &runSyscallAfterExecStop{newImage}, ignoreReturn: true}, nil
}
   142  
// The runSyscallAfterExecStop state continues execve(2) after all siblings of
// a thread in the execve syscall have exited.
//
// +stateify savable
type runSyscallAfterExecStop struct {
	// image is the new process image to switch to; Task.Execve transferred
	// ownership of it to this state.
	image *TaskImage
}
   150  
// execute resumes the execve(2) after any execStop has ended (i.e. after all
// sibling tasks have exited), performing the actual switch to the new process
// image. It implements taskRunState.execute.
func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
	t.traceExecEvent(r.image)
	t.tg.pidns.owner.mu.Lock()
	t.tg.execing = nil
	if t.killed() {
		// A racing kill won; release the unused image and take the exit path.
		t.tg.pidns.owner.mu.Unlock()
		r.image.release(t)
		return (*runInterrupt)(nil)
	}
	// We are the thread group leader now. Save our old thread ID for
	// PTRACE_EVENT_EXEC. This is racy in that if a tracer attaches after this
	// point it will get a PID of 0, but this is consistent with Linux.
	oldTID := ThreadID(0)
	if tracer := t.Tracer(); tracer != nil {
		oldTID = tracer.tg.pidns.tids[t]
	}
	t.promoteLocked()
	// "POSIX timers are not preserved (timer_create(2))." - execve(2). Handle
	// this first since POSIX timers are protected by the signal mutex, which
	// we're about to change. Note that we have to stop and destroy timers
	// without holding any mutexes to avoid circular lock ordering.
	var its []*IntervalTimer
	t.tg.signalHandlers.mu.Lock()
	for _, it := range t.tg.timers {
		its = append(its, it)
	}
	t.tg.timers = make(map[linux.TimerID]*IntervalTimer)
	t.tg.signalHandlers.mu.Unlock()
	t.tg.pidns.owner.mu.Unlock()
	// Destroy the collected timers with no mutexes held (see above).
	for _, it := range its {
		it.DestroyTimer()
	}
	t.tg.pidns.owner.mu.Lock()
	// "During an execve(2), the dispositions of handled signals are reset to
	// the default; the dispositions of ignored signals are left unchanged. ...
	// [The] signal mask is preserved across execve(2). ... [The] pending
	// signal set is preserved across an execve(2)." - signal(7)
	//
	// Details:
	//
	//	- If the thread group is sharing its signal handlers with another thread
	//		group via CLONE_SIGHAND, execve forces the signal handlers to be copied
	//		(see Linux's fs/exec.c:de_thread). We're not reference-counting signal
	//		handlers, so we always make a copy.
	//
	//	- "Disposition" only means sigaction::sa_handler/sa_sigaction; flags,
	//		restorer (if present), and mask are always reset. (See Linux's
	//		fs/exec.c:setup_new_exec => kernel/signal.c:flush_signal_handlers.)
	t.tg.signalHandlers = t.tg.signalHandlers.CopyForExec()
	// endStopCond is guarded by the signal mutex, which was just replaced
	// along with signalHandlers; repoint it at the new mutex.
	t.endStopCond.L = &t.tg.signalHandlers.mu
	// "Any alternate signal stack is not preserved (sigaltstack(2))." - execve(2)
	t.signalStack = linux.SignalStack{Flags: linux.SS_DISABLE}
	// "The termination signal is reset to SIGCHLD (see clone(2))."
	t.tg.terminationSignal = linux.SIGCHLD
	// execed indicates that the process can no longer join a process group
	// in some scenarios (namely, the parent call setpgid(2) on the child).
	// See the JoinProcessGroup function in sessions.go for more context.
	t.tg.execed = true
	// Maximum RSS is preserved across execve(2).
	t.updateRSSLocked()
	// Restartable sequence state is discarded.
	t.rseqPreempted = false
	t.rseqCPU = -1
	t.rseqAddr = 0
	t.rseqSignature = 0
	t.oldRSeqCPUAddr = 0
	t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
	t.tg.pidns.owner.mu.Unlock()

	// Switch to a private copy of the FD table, dropping our reference to the
	// old (possibly shared) table.
	oldFDTable := t.fdTable
	t.fdTable = t.fdTable.Fork(t, int32(t.fdTable.CurrentMaxFDs()))
	oldFDTable.DecRef(t)

	// Remove FDs with the CloseOnExec flag set.
	t.fdTable.RemoveIf(t, func(_ *vfs.FileDescription, flags FDFlags) bool {
		return flags.CloseOnExec
	})

	// Handle the robust futex list.
	t.exitRobustList()

	// NOTE(b/30815691): We currently do not implement privileged
	// executables (set-user/group-ID bits and file capabilities). This
	// allows us to unconditionally enable user dumpability on the new mm.
	// See fs/exec.c:setup_new_exec.
	r.image.MemoryManager.SetDumpability(mm.UserDumpable)

	// Switch to the new process.
	t.MemoryManager().Deactivate()
	t.mu.Lock()
	// Update credentials to reflect the execve. This should precede switching
	// MMs to ensure that dumpability has been reset first, if needed.
	t.updateCredsForExecLocked()
	oldImage := t.image
	t.image = *r.image
	t.mu.Unlock()

	// Don't hold t.mu while calling t.image.release(), that may
	// attempt to acquire TaskImage.MemoryManager.mappingMu, a lock order
	// violation.
	oldImage.release(t)

	t.unstopVforkParent()
	t.p.FullStateChanged()
	// NOTE(b/30316266): All locks must be dropped prior to calling Activate.
	t.MemoryManager().Activate(t)

	// Notify any tracer with the thread ID saved before promotion.
	t.ptraceExec(oldTID)
	return (*runSyscallExit)(nil)
}
   261  
// promoteLocked makes t the leader of its thread group. If t is already the
// thread group leader, promoteLocked is a no-op.
//
// Preconditions:
//   - All other tasks in t's thread group, including the existing leader (if it
//     is not t), have reached TaskExitZombie.
//   - The TaskSet mutex must be locked for writing.
func (t *Task) promoteLocked() {
	oldLeader := t.tg.leader
	if t == oldLeader {
		return
	}
	// Swap the leader's TIDs with the execing task's. The latter will be
	// released when the old leader is reaped below.
	//
	// The swap is performed in every PID namespace in which the thread group
	// is visible (t.tg.pidns and all of its ancestors).
	for ns := t.tg.pidns; ns != nil; ns = ns.parent {
		oldTID, leaderTID := ns.tids[t], ns.tids[oldLeader]
		ns.tids[oldLeader] = oldTID
		ns.tids[t] = leaderTID
		ns.tasks[oldTID] = oldLeader
		ns.tasks[leaderTID] = t
		// Neither the ThreadGroup nor TGID change, so no need to
		// update ns.tgids.
	}

	// Inherit the old leader's start time.
	oldStartTime := oldLeader.StartTime()
	t.mu.Lock()
	t.startTime = oldStartTime
	t.mu.Unlock()

	t.tg.leader = t
	t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
	t.updateInfoLocked()
	// Reap the original leader. If it has a tracer, detach it instead of
	// waiting for it to acknowledge the original leader's death.
	//
	// Marking the exit as already parent-notified and parent-acked lets
	// exitNotifyLocked below reap the old leader immediately.
	oldLeader.exitParentNotified = true
	oldLeader.exitParentAcked = true
	if tracer := oldLeader.Tracer(); tracer != nil {
		delete(tracer.ptraceTracees, oldLeader)
		oldLeader.forgetTracerLocked()
		// Notify the tracer that it will no longer be receiving these events
		// from the tracee.
		tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop | EventGroupContinue)
	}
	oldLeader.exitNotifyLocked(false)
}
   308  
   309  func getExecveSeccheckInfo(t *Task, argv, env []string, executable *vfs.FileDescription, pathname string) (seccheck.FieldSet, *pb.ExecveInfo) {
   310  	fields := seccheck.Global.GetFieldSet(seccheck.PointExecve)
   311  	info := &pb.ExecveInfo{
   312  		Argv: argv,
   313  		Env:  env,
   314  	}
   315  	if executable != nil {
   316  		info.BinaryPath = pathname
   317  		if fields.Local.Contains(seccheck.FieldSentryExecveBinaryInfo) {
   318  			statOpts := vfs.StatOptions{
   319  				Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
   320  			}
   321  			if stat, err := executable.Stat(t, statOpts); err == nil {
   322  				if stat.Mask&(linux.STATX_TYPE|linux.STATX_MODE) == (linux.STATX_TYPE | linux.STATX_MODE) {
   323  					info.BinaryMode = uint32(stat.Mode)
   324  				}
   325  				if stat.Mask&linux.STATX_UID != 0 {
   326  					info.BinaryUid = stat.UID
   327  				}
   328  				if stat.Mask&linux.STATX_GID != 0 {
   329  					info.BinaryGid = stat.GID
   330  				}
   331  			}
   332  		}
   333  	}
   334  
   335  	if !fields.Context.Empty() {
   336  		info.ContextData = &pb.ContextData{}
   337  		LoadSeccheckData(t, fields.Context, info.ContextData)
   338  	}
   339  	return fields, info
   340  }