gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_clone.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_clone.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"gvisor.dev/gvisor/pkg/abi/linux"
    19  	"gvisor.dev/gvisor/pkg/atomicbitops"
    20  	"gvisor.dev/gvisor/pkg/cleanup"
    21  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    22  	"gvisor.dev/gvisor/pkg/hostarch"
    23  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
    24  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
    25  	"gvisor.dev/gvisor/pkg/sentry/inet"
    26  	"gvisor.dev/gvisor/pkg/sentry/seccheck"
    27  	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    28  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    29  	"gvisor.dev/gvisor/pkg/usermem"
    30  )
    31  
    32  // SupportedCloneFlags is the bitwise OR of all the supported flags for clone.
    33  // TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is
    34  // implemented.
    35  const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM |
    36  	linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID |
    37  	linux.CLONE_CHILD_CLEARTID | linux.CLONE_CHILD_SETTID | linux.CLONE_PARENT |
    38  	linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS |
    39  	linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED |
    40  	linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS
    41  
    42  // Clone implements the clone(2) syscall and returns the thread ID of the new
    43  // task in t's PID namespace. Clone may return both a non-zero thread ID and a
    44  // non-nil error.
    45  //
    46  // Preconditions: The caller must be running Task.doSyscallInvoke on the task
    47  // goroutine.
    48  func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
    49  	if args.Flags&^SupportedCloneFlags != 0 {
    50  		return 0, nil, linuxerr.EINVAL
    51  	}
    52  	// Since signal actions may refer to application signal handlers by virtual
    53  	// address, any set of signal handlers must refer to the same address
    54  	// space.
    55  	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
    56  		return 0, nil, linuxerr.EINVAL
    57  	}
    58  	if args.SetTID != 0 {
    59  		return 0, nil, linuxerr.ENOTSUP
    60  	}
    61  	// In order for the behavior of thread-group-directed signals to be sane,
    62  	// all tasks in a thread group must share signal handlers.
    63  	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
    64  		return 0, nil, linuxerr.EINVAL
    65  	}
    66  	// All tasks in a thread group must be in the same PID namespace.
    67  	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
    68  		return 0, nil, linuxerr.EINVAL
    69  	}
    70  	// The two different ways of specifying a new PID namespace are
    71  	// incompatible.
    72  	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
    73  		return 0, nil, linuxerr.EINVAL
    74  	}
    75  	// Thread groups and FS contexts cannot span user namespaces.
    76  	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
    77  		return 0, nil, linuxerr.EINVAL
    78  	}
    79  	// args.ExitSignal must be a valid signal.
    80  	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
    81  		return 0, nil, linuxerr.EINVAL
    82  	}
    83  	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
    84  		return 0, nil, linuxerr.EINVAL
    85  	}
    86  
    87  	// Pull task registers and FPU state, a cloned task will inherit the
    88  	// state of the current task.
    89  	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
    90  		t.Warningf("Unable to pull a full state: %v", err)
    91  		t.forceSignal(linux.SIGILL, true /* unconditional */)
    92  		t.SendSignal(SignalInfoPriv(linux.SIGILL))
    93  		return 0, nil, linuxerr.EFAULT
    94  	}
    95  
    96  	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
    97  	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
    98  	// be created first, giving the child (clone(2)) or caller (unshare(2))
    99  	// privileges over the remaining namespaces created by the call." -
   100  	// user_namespaces(7)
   101  	creds := t.Credentials()
   102  	userns := creds.UserNamespace
   103  	if args.Flags&linux.CLONE_NEWUSER != 0 {
   104  		var err error
   105  		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
   106  		// the caller is in a chroot environment (i.e., the caller's root
   107  		// directory does not match the root directory of the mount namespace
   108  		// in which it resides)." - clone(2). Neither chroot(2) nor
   109  		// user_namespaces(7) document this.
   110  		if t.IsChrooted() {
   111  			return 0, nil, linuxerr.EPERM
   112  		}
   113  		userns, err = creds.NewChildUserNamespace()
   114  		if err != nil {
   115  			return 0, nil, err
   116  		}
   117  	}
   118  	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
   119  		return 0, nil, linuxerr.EPERM
   120  	}
   121  
   122  	cu := cleanup.Make(func() {})
   123  	defer cu.Clean()
   124  
   125  	utsns := t.utsns
   126  	if args.Flags&linux.CLONE_NEWUTS != 0 {
   127  		// Note that this must happen after NewUserNamespace so we get
   128  		// the new userns if there is one.
   129  		utsns = utsns.Clone(userns)
   130  		utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns))
   131  	} else {
   132  		utsns.IncRef()
   133  	}
   134  	cu.Add(func() {
   135  		utsns.DecRef(t)
   136  	})
   137  
   138  	ipcns := t.ipcns
   139  	if args.Flags&linux.CLONE_NEWIPC != 0 {
   140  		ipcns = NewIPCNamespace(userns)
   141  		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   142  		ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns))
   143  	} else {
   144  		ipcns.IncRef()
   145  	}
   146  	cu.Add(func() {
   147  		ipcns.DecRef(t)
   148  	})
   149  
   150  	netns := t.netns
   151  	if args.Flags&linux.CLONE_NEWNET != 0 {
   152  		netns = inet.NewNamespace(netns, userns)
   153  		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   154  		netns.SetInode(inode)
   155  	} else {
   156  		netns.IncRef()
   157  	}
   158  	cu.Add(func() {
   159  		netns.DecRef(t)
   160  	})
   161  
   162  	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
   163  	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
   164  	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
   165  	t.mu.Lock()
   166  	curImage := t.image
   167  	sessionKeyring := t.sessionKeyring
   168  	t.mu.Unlock()
   169  	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
   170  	if err != nil {
   171  		return 0, nil, err
   172  	}
   173  	cu.Add(func() {
   174  		image.release(t)
   175  	})
   176  
   177  	if args.Flags&linux.CLONE_NEWUSER != 0 {
   178  		// If the task is in a new user namespace, it cannot share keys.
   179  		sessionKeyring = nil
   180  	}
   181  
   182  	// clone() returns 0 in the child.
   183  	image.Arch.SetReturn(0)
   184  	if args.Stack != 0 {
   185  		image.Arch.SetStack(uintptr(args.Stack + args.StackSize))
   186  	}
   187  	if args.Flags&linux.CLONE_SETTLS != 0 {
   188  		if !image.Arch.SetTLS(uintptr(args.TLS)) {
   189  			return 0, nil, linuxerr.EPERM
   190  		}
   191  	}
   192  
   193  	var fsContext *FSContext
   194  	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
   195  		fsContext = t.fsContext.Fork()
   196  	} else {
   197  		fsContext = t.fsContext
   198  		fsContext.IncRef()
   199  	}
   200  
   201  	mntns := t.mountNamespace
   202  	if args.Flags&linux.CLONE_NEWNS != 0 {
   203  		var err error
   204  		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k)
   205  		if err != nil {
   206  			return 0, nil, err
   207  		}
   208  	} else {
   209  		mntns.IncRef()
   210  	}
   211  	cu.Add(func() {
   212  		mntns.DecRef(t)
   213  	})
   214  
   215  	var fdTable *FDTable
   216  	if args.Flags&linux.CLONE_FILES == 0 {
   217  		fdTable = t.fdTable.Fork(t, MaxFdLimit)
   218  	} else {
   219  		fdTable = t.fdTable
   220  		fdTable.IncRef()
   221  	}
   222  
   223  	pidns := t.tg.pidns
   224  	if t.childPIDNamespace != nil {
   225  		pidns = t.childPIDNamespace
   226  	} else if args.Flags&linux.CLONE_NEWPID != 0 {
   227  		pidns = pidns.NewChild(userns)
   228  	}
   229  
   230  	tg := t.tg
   231  	rseqAddr := hostarch.Addr(0)
   232  	rseqSignature := uint32(0)
   233  	if args.Flags&linux.CLONE_THREAD == 0 {
   234  		sh := t.tg.signalHandlers
   235  		if args.Flags&linux.CLONE_SIGHAND == 0 {
   236  			sh = sh.Fork()
   237  		}
   238  		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
   239  		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
   240  		rseqAddr = t.rseqAddr
   241  		rseqSignature = t.rseqSignature
   242  	}
   243  
   244  	uc := t.userCounters
   245  	if uc.uid != creds.RealKUID {
   246  		uc = t.k.GetUserCounters(creds.RealKUID)
   247  	}
   248  
   249  	cfg := &TaskConfig{
   250  		Kernel:           t.k,
   251  		ThreadGroup:      tg,
   252  		SignalMask:       t.SignalMask(),
   253  		TaskImage:        image,
   254  		FSContext:        fsContext,
   255  		FDTable:          fdTable,
   256  		Credentials:      creds,
   257  		Niceness:         t.Niceness(),
   258  		NetworkNamespace: netns,
   259  		AllowedCPUMask:   t.CPUMask(),
   260  		UTSNamespace:     utsns,
   261  		IPCNamespace:     ipcns,
   262  		MountNamespace:   mntns,
   263  		RSeqAddr:         rseqAddr,
   264  		RSeqSignature:    rseqSignature,
   265  		ContainerID:      t.ContainerID(),
   266  		UserCounters:     uc,
   267  		SessionKeyring:   sessionKeyring,
   268  		Origin:           t.Origin,
   269  	}
   270  	if args.Flags&linux.CLONE_THREAD == 0 {
   271  		cfg.Parent = t
   272  	} else {
   273  		cfg.InheritParent = t
   274  	}
   275  	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
   276  	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
   277  	// the cleanup for us.
   278  	cu.Release()
   279  	if err != nil {
   280  		return 0, nil, err
   281  	}
   282  
   283  	// "A child process created via fork(2) inherits a copy of its parent's
   284  	// alternate signal stack settings" - sigaltstack(2).
   285  	//
   286  	// However kernel/fork.c:copy_process() adds a limitation to this:
   287  	// "sigaltstack should be cleared when sharing the same VM".
   288  	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
   289  		nt.SetSignalStack(t.SignalStack())
   290  	}
   291  
   292  	if userns != creds.UserNamespace {
   293  		if err := nt.SetUserNamespace(userns); err != nil {
   294  			// This shouldn't be possible: userns was created from nt.creds, so
   295  			// nt should have CAP_SYS_ADMIN in userns.
   296  			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
   297  		}
   298  	}
   299  
   300  	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
   301  	// nt that it must receive before its task goroutine starts running.
   302  	tid := nt.k.tasks.Root.IDOfTask(nt)
   303  	defer nt.Start(tid)
   304  
   305  	if seccheck.Global.Enabled(seccheck.PointClone) {
   306  		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
   307  		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   308  			return c.Clone(t, mask, info)
   309  		}); err != nil {
   310  			// nt has been visible to the rest of the system since NewTask, so
   311  			// it may be blocking execve or a group stop, have been notified
   312  			// for group signal delivery, had children reparented to it, etc.
   313  			// Thus we can't just drop it on the floor. Instead, instruct the
   314  			// task goroutine to exit immediately, as quietly as possible.
   315  			nt.exitTracerNotified = true
   316  			nt.exitTracerAcked = true
   317  			nt.exitParentNotified = true
   318  			nt.exitParentAcked = true
   319  			nt.runState = (*runExitMain)(nil)
   320  			return 0, nil, err
   321  		}
   322  	}
   323  
   324  	// "If fork/clone and execve are allowed by @prog, any child processes will
   325  	// be constrained to the same filters and system call ABI as the parent." -
   326  	// Documentation/prctl/seccomp_filter.txt
   327  	if ts := t.seccomp.Load(); ts != nil {
   328  		seccompCopy := ts.copy()
   329  		seccompCopy.populateCache(nt)
   330  		nt.seccomp.Store(seccompCopy)
   331  	} else {
   332  		nt.seccomp.Store(nil)
   333  	}
   334  	if args.Flags&linux.CLONE_VFORK != 0 {
   335  		nt.vforkParent = t
   336  	}
   337  
   338  	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
   339  		nt.SetClearTID(hostarch.Addr(args.ChildTID))
   340  	}
   341  	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
   342  		ctid := nt.ThreadID()
   343  		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
   344  	}
   345  	ntid := t.tg.pidns.IDOfTask(nt)
   346  	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
   347  		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
   348  	}
   349  
   350  	t.traceCloneEvent(tid)
   351  	kind := ptraceCloneKindClone
   352  	if args.Flags&linux.CLONE_VFORK != 0 {
   353  		kind = ptraceCloneKindVfork
   354  	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
   355  		kind = ptraceCloneKindFork
   356  	}
   357  	if t.ptraceClone(kind, nt, args) {
   358  		if args.Flags&linux.CLONE_VFORK != 0 {
   359  			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
   360  		}
   361  		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
   362  	}
   363  	if args.Flags&linux.CLONE_VFORK != 0 {
   364  		t.maybeBeginVforkStop(nt)
   365  		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
   366  	}
   367  	return ntid, nil, nil
   368  }
   369  
   370  func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
   371  	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
   372  	var cwd string
   373  	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
   374  		cwd = getTaskCurrentWorkingDirectory(t)
   375  	}
   376  	t.k.tasks.mu.RLock()
   377  	defer t.k.tasks.mu.RUnlock()
   378  	info := &pb.CloneInfo{
   379  		CreatedThreadId:          int32(nt.k.tasks.Root.tids[nt]),
   380  		CreatedThreadGroupId:     int32(nt.k.tasks.Root.tgids[nt.tg]),
   381  		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
   382  		Flags:                    flags,
   383  	}
   384  
   385  	if !fields.Context.Empty() {
   386  		info.ContextData = &pb.ContextData{}
   387  		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
   388  	}
   389  
   390  	return fields, info
   391  }
   392  
   393  // maybeBeginVforkStop checks if a previously-started vfork child is still
   394  // running and has not yet released its MM, such that its parent t should enter
   395  // a vforkStop.
   396  //
   397  // Preconditions: The caller must be running on t's task goroutine.
   398  func (t *Task) maybeBeginVforkStop(child *Task) {
   399  	t.tg.pidns.owner.mu.RLock()
   400  	defer t.tg.pidns.owner.mu.RUnlock()
   401  	t.tg.signalHandlers.mu.Lock()
   402  	defer t.tg.signalHandlers.mu.Unlock()
   403  	if t.killedLocked() {
   404  		child.vforkParent = nil
   405  		return
   406  	}
   407  	if child.vforkParent == t {
   408  		t.beginInternalStopLocked((*vforkStop)(nil))
   409  	}
   410  }
   411  
   412  func (t *Task) unstopVforkParent() {
   413  	t.tg.pidns.owner.mu.RLock()
   414  	defer t.tg.pidns.owner.mu.RUnlock()
   415  	if p := t.vforkParent; p != nil {
   416  		p.tg.signalHandlers.mu.Lock()
   417  		defer p.tg.signalHandlers.mu.Unlock()
   418  		if _, ok := p.stop.(*vforkStop); ok {
   419  			p.endInternalStopLocked()
   420  		}
   421  		// Parent no longer needs to be unstopped.
   422  		t.vforkParent = nil
   423  	}
   424  }
   425  
   426  // +stateify savable
   427  type runSyscallAfterPtraceEventClone struct {
   428  	vforkChild *Task
   429  
   430  	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
   431  	// PID namespace. vforkChildTID must be stored since the child may exit and
   432  	// release its TID before the PTRACE_EVENT stop ends.
   433  	vforkChildTID ThreadID
   434  }
   435  
   436  func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
   437  	if r.vforkChild != nil {
   438  		t.maybeBeginVforkStop(r.vforkChild)
   439  		return &runSyscallAfterVforkStop{r.vforkChildTID}
   440  	}
   441  	return (*runSyscallExit)(nil)
   442  }
   443  
   444  // +stateify savable
   445  type runSyscallAfterVforkStop struct {
   446  	// childTID has the same meaning as
   447  	// runSyscallAfterPtraceEventClone.vforkChildTID.
   448  	childTID ThreadID
   449  }
   450  
   451  func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
   452  	t.ptraceVforkDone(r.childTID)
   453  	return (*runSyscallExit)(nil)
   454  }
   455  
   456  // Setns reassociates thread with the specified namespace.
   457  func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
   458  	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
   459  	if !ok {
   460  		return linuxerr.EINVAL
   461  	}
   462  	i, ok := d.Inode().(*nsfs.Inode)
   463  	if !ok {
   464  		return linuxerr.EINVAL
   465  	}
   466  
   467  	switch ns := i.Namespace().(type) {
   468  	case *inet.Namespace:
   469  		if flags != 0 && flags != linux.CLONE_NEWNET {
   470  			return linuxerr.EINVAL
   471  		}
   472  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   473  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   474  			return linuxerr.EPERM
   475  		}
   476  		oldNS := t.NetworkNamespace()
   477  		ns.IncRef()
   478  		t.mu.Lock()
   479  		t.netns = ns
   480  		t.mu.Unlock()
   481  		oldNS.DecRef(t)
   482  		return nil
   483  	case *IPCNamespace:
   484  		if flags != 0 && flags != linux.CLONE_NEWIPC {
   485  			return linuxerr.EINVAL
   486  		}
   487  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   488  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   489  			return linuxerr.EPERM
   490  		}
   491  		oldNS := t.IPCNamespace()
   492  		ns.IncRef()
   493  		t.mu.Lock()
   494  		t.ipcns = ns
   495  		t.mu.Unlock()
   496  		oldNS.DecRef(t)
   497  		return nil
   498  	case *vfs.MountNamespace:
   499  		if flags != 0 && flags != linux.CLONE_NEWNS {
   500  			return linuxerr.EINVAL
   501  		}
   502  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) ||
   503  			!t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) ||
   504  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   505  			return linuxerr.EPERM
   506  		}
   507  		oldFSContext := t.fsContext
   508  		// The current task has to be an exclusive owner of its fs context.
   509  		if oldFSContext.ReadRefs() != 1 {
   510  			return linuxerr.EINVAL
   511  		}
   512  		fsContext := oldFSContext.Fork()
   513  		fsContext.root.DecRef(t)
   514  		fsContext.cwd.DecRef(t)
   515  		vd := ns.Root(t)
   516  		fsContext.root = vd
   517  		vd.IncRef()
   518  		fsContext.cwd = vd
   519  
   520  		oldNS := t.mountNamespace
   521  		ns.IncRef()
   522  		t.mu.Lock()
   523  		t.mountNamespace = ns
   524  		t.fsContext = fsContext
   525  		t.mu.Unlock()
   526  		oldNS.DecRef(t)
   527  		oldFSContext.DecRef(t)
   528  		return nil
   529  	case *UTSNamespace:
   530  		if flags != 0 && flags != linux.CLONE_NEWUTS {
   531  			return linuxerr.EINVAL
   532  		}
   533  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   534  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   535  			return linuxerr.EPERM
   536  		}
   537  		oldNS := t.UTSNamespace()
   538  		ns.IncRef()
   539  		t.mu.Lock()
   540  		t.utsns = ns
   541  		t.mu.Unlock()
   542  		oldNS.DecRef(t)
   543  		return nil
   544  	default:
   545  		return linuxerr.EINVAL
   546  	}
   547  }
   548  
   549  // Unshare changes the set of resources t shares with other tasks, as specified
   550  // by flags.
   551  //
   552  // Preconditions: The caller must be running on the task goroutine.
   553  func (t *Task) Unshare(flags int32) error {
   554  	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
   555  	// the caller is single threaded (i.e., it is not sharing its address space
   556  	// with another process or thread). In this case, these flags have no
   557  	// effect. (Note also that specifying CLONE_THREAD automatically implies
   558  	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
   559  	// If the process is multithreaded, then the use of these flags results in
   560  	// an error." - unshare(2). This is incorrect (cf.
   561  	// kernel/fork.c:ksys_unshare()):
   562  	//
   563  	//	- CLONE_THREAD does not imply CLONE_VM.
   564  	//
   565  	//	- CLONE_SIGHAND implies CLONE_THREAD.
   566  	//
   567  	//	- Only CLONE_VM requires that the caller is not sharing its address
   568  	//		space with another thread. CLONE_SIGHAND requires that the caller is not
   569  	//		sharing its signal handlers, and CLONE_THREAD requires that the caller
   570  	//		is the only thread in its thread group.
   571  	//
   572  	// Since we don't count the number of tasks using each address space or set
   573  	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
   574  	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
   575  		return linuxerr.EINVAL
   576  	}
   577  	creds := t.Credentials()
   578  	if flags&linux.CLONE_THREAD != 0 {
   579  		t.tg.signalHandlers.mu.Lock()
   580  		if t.tg.tasksCount != 1 {
   581  			t.tg.signalHandlers.mu.Unlock()
   582  			return linuxerr.EINVAL
   583  		}
   584  		t.tg.signalHandlers.mu.Unlock()
   585  		// This isn't racy because we're the only living task, and therefore
   586  		// the only task capable of creating new ones, in our thread group.
   587  	}
   588  	if flags&linux.CLONE_NEWUSER != 0 {
   589  		if t.IsChrooted() {
   590  			return linuxerr.EPERM
   591  		}
   592  		newUserNS, err := creds.NewChildUserNamespace()
   593  		if err != nil {
   594  			return err
   595  		}
   596  		err = t.SetUserNamespace(newUserNS)
   597  		if err != nil {
   598  			return err
   599  		}
   600  		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
   601  		creds = t.Credentials()
   602  	}
   603  	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
   604  	if flags&linux.CLONE_NEWPID != 0 {
   605  		if !haveCapSysAdmin {
   606  			return linuxerr.EPERM
   607  		}
   608  		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
   609  	}
   610  	if flags&linux.CLONE_NEWNET != 0 {
   611  		if !haveCapSysAdmin {
   612  			return linuxerr.EPERM
   613  		}
   614  		netns := t.NetworkNamespace()
   615  		netns = inet.NewNamespace(netns, t.UserNamespace())
   616  		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   617  		netns.SetInode(netnsInode)
   618  		t.mu.Lock()
   619  		oldNetns := t.netns
   620  		t.netns = netns
   621  		t.mu.Unlock()
   622  		oldNetns.DecRef(t)
   623  	}
   624  
   625  	cu := cleanup.Cleanup{}
   626  	// All cu actions has to be executed after releasing t.mu.
   627  	defer cu.Clean()
   628  	t.mu.Lock()
   629  	defer t.mu.Unlock()
   630  	// Can't defer unlock: DecRefs must occur without holding t.mu.
   631  	if flags&linux.CLONE_NEWUTS != 0 {
   632  		if !haveCapSysAdmin {
   633  			return linuxerr.EPERM
   634  		}
   635  		// Note that this must happen after NewUserNamespace, so the
   636  		// new user namespace is used if there is one.
   637  		oldUTSNS := t.utsns
   638  		t.utsns = t.utsns.Clone(creds.UserNamespace)
   639  		t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
   640  		cu.Add(func() { oldUTSNS.DecRef(t) })
   641  	}
   642  	if flags&linux.CLONE_NEWIPC != 0 {
   643  		if !haveCapSysAdmin {
   644  			return linuxerr.EPERM
   645  		}
   646  		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
   647  		// namespace"
   648  		oldIPCNS := t.ipcns
   649  		t.ipcns = NewIPCNamespace(creds.UserNamespace)
   650  		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   651  		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
   652  		cu.Add(func() { oldIPCNS.DecRef(t) })
   653  	}
   654  	if flags&linux.CLONE_FILES != 0 {
   655  		oldFDTable := t.fdTable
   656  		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
   657  		cu.Add(func() { oldFDTable.DecRef(t) })
   658  	}
   659  	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
   660  		oldFSContext := t.fsContext
   661  		t.fsContext = oldFSContext.Fork()
   662  		cu.Add(func() { oldFSContext.DecRef(t) })
   663  	}
   664  	if flags&linux.CLONE_NEWNS != 0 {
   665  		if !haveCapSysAdmin {
   666  			return linuxerr.EPERM
   667  		}
   668  		oldMountNS := t.mountNamespace
   669  		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
   670  		if err != nil {
   671  			return err
   672  		}
   673  		t.mountNamespace = mntns
   674  		cu.Add(func() { oldMountNS.DecRef(t) })
   675  	}
   676  	return nil
   677  }
   678  
   679  // UnshareFdTable unshares the FdTable that task t shares with other tasks, upto
   680  // the maxFd.
   681  //
   682  // Preconditions: The caller must be running on the task goroutine.
   683  func (t *Task) UnshareFdTable(maxFd int32) {
   684  	t.mu.Lock()
   685  	oldFDTable := t.fdTable
   686  	t.fdTable = oldFDTable.Fork(t, maxFd)
   687  	t.mu.Unlock()
   688  
   689  	oldFDTable.DecRef(t)
   690  }
   691  
   692  // vforkStop is a TaskStop imposed on a task that creates a child with
   693  // CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
   694  // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
   695  // that the child and parent share mappings until the child execve()s into a
   696  // new process image or exits.)
   697  //
   698  // +stateify savable
   699  type vforkStop struct{}
   700  
   701  // StopIgnoresKill implements TaskStop.Killable.
   702  func (*vforkStop) Killable() bool { return true }