github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_clone.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"sync/atomic"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    21  	"github.com/SagerNet/gvisor/pkg/bpf"
    22  	"github.com/SagerNet/gvisor/pkg/cleanup"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/hostarch"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/inet"
    26  	"github.com/SagerNet/gvisor/pkg/usermem"
    27  )
    28  
// SharingOptions controls what resources are shared by a new task created by
// Task.Clone, or an existing task affected by Task.Unshare.
//
// Each field, when true, requests a private copy of the corresponding
// resource; when false, the resource stays shared with the parent (Clone) or
// is left untouched (Unshare).
type SharingOptions struct {
	// If NewAddressSpace is true, the task should have an independent virtual
	// address space.
	//
	// Task.Clone rejects NewAddressSpace with EINVAL unless NewSignalHandlers
	// is also set. Task.Unshare always rejects it with EINVAL.
	NewAddressSpace bool

	// If NewSignalHandlers is true, the task should use an independent set of
	// signal handlers.
	//
	// Task.Clone rejects NewSignalHandlers with EINVAL unless NewThreadGroup
	// is also set. Task.Unshare always rejects it with EINVAL.
	NewSignalHandlers bool

	// If NewThreadGroup is true, the task should be the leader of its own
	// thread group. TerminationSignal is the signal that the thread group
	// will send to its parent when it exits. If NewThreadGroup is false,
	// TerminationSignal is ignored.
	//
	// In Task.Unshare, NewThreadGroup succeeds only if the task is the sole
	// member of its thread group.
	NewThreadGroup    bool
	TerminationSignal linux.Signal

	// If NewPIDNamespace is true:
	//
	// - In the context of Task.Clone, the new task should be the init task
	// (TID 1) in a new PID namespace.
	//
	// - In the context of Task.Unshare, the task should create a new PID
	// namespace, and all subsequent clones of the task should be members of
	// the new PID namespace.
	//
	// Task.Clone rejects NewPIDNamespace with EINVAL if NewThreadGroup is
	// false, or if a prior Task.Unshare already set a child PID namespace.
	NewPIDNamespace bool

	// If NewUserNamespace is true, the task should have an independent user
	// namespace.
	//
	// Task.Clone requires NewThreadGroup and NewFSContext to be set as well
	// (EINVAL otherwise). Both Task.Clone and Task.Unshare return EPERM if the
	// calling task is chrooted.
	NewUserNamespace bool

	// If NewNetworkNamespace is true, the task should have an independent
	// network namespace.
	NewNetworkNamespace bool

	// If NewFiles is true, the task should use an independent file descriptor
	// table.
	NewFiles bool

	// If NewFSContext is true, the task should have an independent FSContext.
	NewFSContext bool

	// If NewUTSNamespace is true, the task should have an independent UTS
	// namespace.
	NewUTSNamespace bool

	// If NewIPCNamespace is true, the task should have an independent IPC
	// namespace.
	NewIPCNamespace bool
}
    80  
// CloneOptions controls the behavior of Task.Clone.
type CloneOptions struct {
	// SharingOptions defines the set of resources that the new task will share
	// with its parent.
	SharingOptions

	// Stack is the initial stack pointer of the new task. If Stack is 0, the
	// new task will start with the same stack pointer as its parent.
	Stack hostarch.Addr

	// If SetTLS is true, set the new task's TLS (thread-local storage)
	// descriptor to TLS. If SetTLS is false, TLS is ignored.
	//
	// Task.Clone fails with EPERM if the architecture rejects the TLS value.
	SetTLS bool
	TLS    hostarch.Addr

	// If ChildClearTID is true, when the child exits, 0 is written to the
	// address ChildTID in the child's memory, and if the write is successful a
	// futex wake on the same address is performed.
	//
	// If ChildSetTID is true, the child's thread ID (in the child's PID
	// namespace) is written to address ChildTID in the child's memory. (As in
	// Linux, failed writes are silently ignored.)
	ChildClearTID bool
	ChildSetTID   bool
	ChildTID      hostarch.Addr

	// If ParentSetTID is true, the child's thread ID (in the parent's PID
	// namespace) is written to address ParentTID in the parent's memory. (As
	// in Linux, failed writes are silently ignored.)
	//
	// Older versions of the clone(2) man page state that CLONE_PARENT_SETTID
	// causes the child's thread ID to be written to ptid in both the parent
	// and child's memory, but this is a documentation error fixed by
	// 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID").
	ParentSetTID bool
	ParentTID    hostarch.Addr

	// If Vfork is true, place the parent in vforkStop until the cloned task
	// releases its TaskImage.
	//
	// Vfork also causes the child to inherit the parent's alternate signal
	// stack even when the address space is shared.
	Vfork bool

	// If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for
	// this clone(), and do not ptrace-attach the caller's tracer to the new
	// task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate).
	Untraced bool

	// If InheritTracer is true, ptrace-attach the caller's tracer to the new
	// task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported
	// for it. If both Untraced and InheritTracer are true, no event will be
	// reported, but tracer inheritance will still occur.
	InheritTracer bool
}
   133  
   134  // Clone implements the clone(2) syscall and returns the thread ID of the new
   135  // task in t's PID namespace. Clone may return both a non-zero thread ID and a
   136  // non-nil error.
   137  //
   138  // Preconditions: The caller must be running Task.doSyscallInvoke on the task
   139  // goroutine.
   140  func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
   141  	// Since signal actions may refer to application signal handlers by virtual
   142  	// address, any set of signal handlers must refer to the same address
   143  	// space.
   144  	if !opts.NewSignalHandlers && opts.NewAddressSpace {
   145  		return 0, nil, linuxerr.EINVAL
   146  	}
   147  	// In order for the behavior of thread-group-directed signals to be sane,
   148  	// all tasks in a thread group must share signal handlers.
   149  	if !opts.NewThreadGroup && opts.NewSignalHandlers {
   150  		return 0, nil, linuxerr.EINVAL
   151  	}
   152  	// All tasks in a thread group must be in the same PID namespace.
   153  	if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) {
   154  		return 0, nil, linuxerr.EINVAL
   155  	}
   156  	// The two different ways of specifying a new PID namespace are
   157  	// incompatible.
   158  	if opts.NewPIDNamespace && t.childPIDNamespace != nil {
   159  		return 0, nil, linuxerr.EINVAL
   160  	}
   161  	// Thread groups and FS contexts cannot span user namespaces.
   162  	if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) {
   163  		return 0, nil, linuxerr.EINVAL
   164  	}
   165  
   166  	// Pull task registers and FPU state, a cloned task will inherit the
   167  	// state of the current task.
   168  	t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch())
   169  
   170  	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
   171  	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
   172  	// be created first, giving the child (clone(2)) or caller (unshare(2))
   173  	// privileges over the remaining namespaces created by the call." -
   174  	// user_namespaces(7)
   175  	creds := t.Credentials()
   176  	userns := creds.UserNamespace
   177  	if opts.NewUserNamespace {
   178  		var err error
   179  		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
   180  		// the caller is in a chroot environment (i.e., the caller's root
   181  		// directory does not match the root directory of the mount namespace
   182  		// in which it resides)." - clone(2). Neither chroot(2) nor
   183  		// user_namespaces(7) document this.
   184  		if t.IsChrooted() {
   185  			return 0, nil, linuxerr.EPERM
   186  		}
   187  		userns, err = creds.NewChildUserNamespace()
   188  		if err != nil {
   189  			return 0, nil, err
   190  		}
   191  	}
   192  	if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
   193  		return 0, nil, linuxerr.EPERM
   194  	}
   195  
   196  	utsns := t.UTSNamespace()
   197  	if opts.NewUTSNamespace {
   198  		// Note that this must happen after NewUserNamespace so we get
   199  		// the new userns if there is one.
   200  		utsns = t.UTSNamespace().Clone(userns)
   201  	}
   202  
   203  	ipcns := t.IPCNamespace()
   204  	if opts.NewIPCNamespace {
   205  		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
   206  		// namespace"
   207  		ipcns = NewIPCNamespace(userns)
   208  	} else {
   209  		ipcns.IncRef()
   210  	}
   211  	cu := cleanup.Make(func() {
   212  		ipcns.DecRef(t)
   213  	})
   214  	defer cu.Clean()
   215  
   216  	netns := t.NetworkNamespace()
   217  	if opts.NewNetworkNamespace {
   218  		netns = inet.NewNamespace(netns)
   219  	}
   220  
   221  	// TODO(b/63601033): Implement CLONE_NEWNS.
   222  	mntnsVFS2 := t.mountNamespaceVFS2
   223  	if mntnsVFS2 != nil {
   224  		mntnsVFS2.IncRef()
   225  		cu.Add(func() {
   226  			mntnsVFS2.DecRef(t)
   227  		})
   228  	}
   229  
   230  	image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace)
   231  	if err != nil {
   232  		return 0, nil, err
   233  	}
   234  	cu.Add(func() {
   235  		image.release()
   236  	})
   237  	// clone() returns 0 in the child.
   238  	image.Arch.SetReturn(0)
   239  	if opts.Stack != 0 {
   240  		image.Arch.SetStack(uintptr(opts.Stack))
   241  	}
   242  	if opts.SetTLS {
   243  		if !image.Arch.SetTLS(uintptr(opts.TLS)) {
   244  			return 0, nil, linuxerr.EPERM
   245  		}
   246  	}
   247  
   248  	var fsContext *FSContext
   249  	if opts.NewFSContext {
   250  		fsContext = t.fsContext.Fork()
   251  	} else {
   252  		fsContext = t.fsContext
   253  		fsContext.IncRef()
   254  	}
   255  
   256  	var fdTable *FDTable
   257  	if opts.NewFiles {
   258  		fdTable = t.fdTable.Fork(t)
   259  	} else {
   260  		fdTable = t.fdTable
   261  		fdTable.IncRef()
   262  	}
   263  
   264  	pidns := t.tg.pidns
   265  	if t.childPIDNamespace != nil {
   266  		pidns = t.childPIDNamespace
   267  	} else if opts.NewPIDNamespace {
   268  		pidns = pidns.NewChild(userns)
   269  	}
   270  
   271  	tg := t.tg
   272  	rseqAddr := hostarch.Addr(0)
   273  	rseqSignature := uint32(0)
   274  	if opts.NewThreadGroup {
   275  		if tg.mounts != nil {
   276  			tg.mounts.IncRef()
   277  		}
   278  		sh := t.tg.signalHandlers
   279  		if opts.NewSignalHandlers {
   280  			sh = sh.Fork()
   281  		}
   282  		tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
   283  		tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj)
   284  		rseqAddr = t.rseqAddr
   285  		rseqSignature = t.rseqSignature
   286  	}
   287  
   288  	cfg := &TaskConfig{
   289  		Kernel:                  t.k,
   290  		ThreadGroup:             tg,
   291  		SignalMask:              t.SignalMask(),
   292  		TaskImage:               image,
   293  		FSContext:               fsContext,
   294  		FDTable:                 fdTable,
   295  		Credentials:             creds,
   296  		Niceness:                t.Niceness(),
   297  		NetworkNamespace:        netns,
   298  		AllowedCPUMask:          t.CPUMask(),
   299  		UTSNamespace:            utsns,
   300  		IPCNamespace:            ipcns,
   301  		AbstractSocketNamespace: t.abstractSockets,
   302  		MountNamespaceVFS2:      mntnsVFS2,
   303  		RSeqAddr:                rseqAddr,
   304  		RSeqSignature:           rseqSignature,
   305  		ContainerID:             t.ContainerID(),
   306  	}
   307  	if opts.NewThreadGroup {
   308  		cfg.Parent = t
   309  	} else {
   310  		cfg.InheritParent = t
   311  	}
   312  	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
   313  	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
   314  	// the cleanup for us.
   315  	cu.Release()
   316  	if err != nil {
   317  		return 0, nil, err
   318  	}
   319  
   320  	// "A child process created via fork(2) inherits a copy of its parent's
   321  	// alternate signal stack settings" - sigaltstack(2).
   322  	//
   323  	// However kernel/fork.c:copy_process() adds a limitation to this:
   324  	// "sigaltstack should be cleared when sharing the same VM".
   325  	if opts.NewAddressSpace || opts.Vfork {
   326  		nt.SetSignalStack(t.SignalStack())
   327  	}
   328  
   329  	if userns != creds.UserNamespace {
   330  		if err := nt.SetUserNamespace(userns); err != nil {
   331  			// This shouldn't be possible: userns was created from nt.creds, so
   332  			// nt should have CAP_SYS_ADMIN in userns.
   333  			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
   334  		}
   335  	}
   336  
   337  	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
   338  	// nt that it must receive before its task goroutine starts running.
   339  	tid := nt.k.tasks.Root.IDOfTask(nt)
   340  	defer nt.Start(tid)
   341  	t.traceCloneEvent(tid)
   342  
   343  	// "If fork/clone and execve are allowed by @prog, any child processes will
   344  	// be constrained to the same filters and system call ABI as the parent." -
   345  	// Documentation/prctl/seccomp_filter.txt
   346  	if f := t.syscallFilters.Load(); f != nil {
   347  		copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
   348  		nt.syscallFilters.Store(copiedFilters)
   349  	}
   350  	if opts.Vfork {
   351  		nt.vforkParent = t
   352  	}
   353  
   354  	if opts.ChildClearTID {
   355  		nt.SetClearTID(opts.ChildTID)
   356  	}
   357  	if opts.ChildSetTID {
   358  		ctid := nt.ThreadID()
   359  		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
   360  	}
   361  	ntid := t.tg.pidns.IDOfTask(nt)
   362  	if opts.ParentSetTID {
   363  		ntid.CopyOut(t, opts.ParentTID)
   364  	}
   365  
   366  	kind := ptraceCloneKindClone
   367  	if opts.Vfork {
   368  		kind = ptraceCloneKindVfork
   369  	} else if opts.TerminationSignal == linux.SIGCHLD {
   370  		kind = ptraceCloneKindFork
   371  	}
   372  	if t.ptraceClone(kind, nt, opts) {
   373  		if opts.Vfork {
   374  			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
   375  		}
   376  		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
   377  	}
   378  	if opts.Vfork {
   379  		t.maybeBeginVforkStop(nt)
   380  		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
   381  	}
   382  	return ntid, nil, nil
   383  }
   384  
   385  // maybeBeginVforkStop checks if a previously-started vfork child is still
   386  // running and has not yet released its MM, such that its parent t should enter
   387  // a vforkStop.
   388  //
   389  // Preconditions: The caller must be running on t's task goroutine.
   390  func (t *Task) maybeBeginVforkStop(child *Task) {
   391  	t.tg.pidns.owner.mu.RLock()
   392  	defer t.tg.pidns.owner.mu.RUnlock()
   393  	t.tg.signalHandlers.mu.Lock()
   394  	defer t.tg.signalHandlers.mu.Unlock()
   395  	if t.killedLocked() {
   396  		child.vforkParent = nil
   397  		return
   398  	}
   399  	if child.vforkParent == t {
   400  		t.beginInternalStopLocked((*vforkStop)(nil))
   401  	}
   402  }
   403  
   404  func (t *Task) unstopVforkParent() {
   405  	t.tg.pidns.owner.mu.RLock()
   406  	defer t.tg.pidns.owner.mu.RUnlock()
   407  	if p := t.vforkParent; p != nil {
   408  		p.tg.signalHandlers.mu.Lock()
   409  		defer p.tg.signalHandlers.mu.Unlock()
   410  		if _, ok := p.stop.(*vforkStop); ok {
   411  			p.endInternalStopLocked()
   412  		}
   413  		// Parent no longer needs to be unstopped.
   414  		t.vforkParent = nil
   415  	}
   416  }
   417  
// runSyscallAfterPtraceEventClone is the run state Task.Clone schedules for
// the calling task when a ptrace clone event was reported for the new task.
// For a vfork it carries the child so that the vfork stop can be started
// afterwards.
//
// +stateify savable
type runSyscallAfterPtraceEventClone struct {
	// vforkChild is the newly created task if the clone was a vfork, and nil
	// otherwise.
	vforkChild *Task

	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
	// PID namespace. vforkChildTID must be stored since the child may exit and
	// release its TID before the PTRACE_EVENT stop ends.
	vforkChildTID ThreadID
}
   427  
   428  func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
   429  	if r.vforkChild != nil {
   430  		t.maybeBeginVforkStop(r.vforkChild)
   431  		return &runSyscallAfterVforkStop{r.vforkChildTID}
   432  	}
   433  	return (*runSyscallExit)(nil)
   434  }
   435  
// runSyscallAfterVforkStop is the run state a vfork parent is given after
// maybeBeginVforkStop has been called for its child, either directly by
// Task.Clone or by runSyscallAfterPtraceEventClone.execute.
//
// +stateify savable
type runSyscallAfterVforkStop struct {
	// childTID has the same meaning as
	// runSyscallAfterPtraceEventClone.vforkChildTID.
	childTID ThreadID
}
   442  
   443  func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
   444  	t.ptraceVforkDone(r.childTID)
   445  	return (*runSyscallExit)(nil)
   446  }
   447  
   448  // Unshare changes the set of resources t shares with other tasks, as specified
   449  // by opts.
   450  //
   451  // Preconditions: The caller must be running on the task goroutine.
   452  func (t *Task) Unshare(opts *SharingOptions) error {
   453  	// In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and
   454  	// NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if
   455  	// t is the only task using its MM, which due to clone(2)'s rules imply
   456  	// that it is also the only task using its signal handlers / in its thread
   457  	// group, and cause EINVAL to be returned otherwise.
   458  	//
   459  	// Since we don't count the number of tasks using each address space or set
   460  	// of signal handlers, we reject NewSignalHandlers and NewAddressSpace
   461  	// altogether, and interpret NewThreadGroup as requiring that t be the only
   462  	// member of its thread group. This seems to be logically coherent, in the
   463  	// sense that clone(2) allows a task to share signal handlers and address
   464  	// spaces with tasks in other thread groups.
   465  	if opts.NewAddressSpace || opts.NewSignalHandlers {
   466  		return linuxerr.EINVAL
   467  	}
   468  	creds := t.Credentials()
   469  	if opts.NewThreadGroup {
   470  		t.tg.signalHandlers.mu.Lock()
   471  		if t.tg.tasksCount != 1 {
   472  			t.tg.signalHandlers.mu.Unlock()
   473  			return linuxerr.EINVAL
   474  		}
   475  		t.tg.signalHandlers.mu.Unlock()
   476  		// This isn't racy because we're the only living task, and therefore
   477  		// the only task capable of creating new ones, in our thread group.
   478  	}
   479  	if opts.NewUserNamespace {
   480  		if t.IsChrooted() {
   481  			return linuxerr.EPERM
   482  		}
   483  		newUserNS, err := creds.NewChildUserNamespace()
   484  		if err != nil {
   485  			return err
   486  		}
   487  		err = t.SetUserNamespace(newUserNS)
   488  		if err != nil {
   489  			return err
   490  		}
   491  		// Need to reload creds, becaue t.SetUserNamespace() changed task credentials.
   492  		creds = t.Credentials()
   493  	}
   494  	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
   495  	if opts.NewPIDNamespace {
   496  		if !haveCapSysAdmin {
   497  			return linuxerr.EPERM
   498  		}
   499  		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
   500  	}
   501  	t.mu.Lock()
   502  	// Can't defer unlock: DecRefs must occur without holding t.mu.
   503  	if opts.NewNetworkNamespace {
   504  		if !haveCapSysAdmin {
   505  			t.mu.Unlock()
   506  			return linuxerr.EPERM
   507  		}
   508  		t.netns = inet.NewNamespace(t.netns)
   509  	}
   510  	if opts.NewUTSNamespace {
   511  		if !haveCapSysAdmin {
   512  			t.mu.Unlock()
   513  			return linuxerr.EPERM
   514  		}
   515  		// Note that this must happen after NewUserNamespace, so the
   516  		// new user namespace is used if there is one.
   517  		t.utsns = t.utsns.Clone(creds.UserNamespace)
   518  	}
   519  	if opts.NewIPCNamespace {
   520  		if !haveCapSysAdmin {
   521  			t.mu.Unlock()
   522  			return linuxerr.EPERM
   523  		}
   524  		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
   525  		// namespace"
   526  		t.ipcns.DecRef(t)
   527  		t.ipcns = NewIPCNamespace(creds.UserNamespace)
   528  	}
   529  	var oldFDTable *FDTable
   530  	if opts.NewFiles {
   531  		oldFDTable = t.fdTable
   532  		t.fdTable = oldFDTable.Fork(t)
   533  	}
   534  	var oldFSContext *FSContext
   535  	if opts.NewFSContext {
   536  		oldFSContext = t.fsContext
   537  		t.fsContext = oldFSContext.Fork()
   538  	}
   539  	t.mu.Unlock()
   540  	if oldFDTable != nil {
   541  		oldFDTable.DecRef(t)
   542  	}
   543  	if oldFSContext != nil {
   544  		oldFSContext.DecRef(t)
   545  	}
   546  	return nil
   547  }
   548  
// vforkStop is a TaskStop imposed on a task that creates a child with
// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
// that the child and parent share mappings until the child execve()s into a
// new process image or exits.)
//
// The stop is begun by Task.maybeBeginVforkStop and ended by
// Task.unstopVforkParent.
//
// +stateify savable
type vforkStop struct{}
   557  
// Killable implements TaskStop.Killable. It always reports true.
func (*vforkStop) Killable() bool { return true }