github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task_clone.go

github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/task_clone.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/bpf"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/nsfs"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck"
    28  	pb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck/points/points_go_proto"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    31  )
    32  
    33  // Clone implements the clone(2) syscall and returns the thread ID of the new
    34  // task in t's PID namespace. Clone may return both a non-zero thread ID and a
    35  // non-nil error.
        //
        // NOTE(review): every error return visible in this function carries a zero
        // thread ID, so the sentence above may be stale; confirm against callers.
        //
        // args supplies the clone flags, the exit signal, the child's stack and TLS
        // values, and the user addresses used for CLONE_CHILD_SETTID,
        // CLONE_CHILD_CLEARTID and CLONE_PARENT_SETTID.
        //
        // The returned *SyscallControl is nil unless the syscall must continue in
        // another run state: if t.ptraceClone reports true the next run state is
        // runSyscallAfterPtraceEventClone; otherwise, when CLONE_VFORK is set, it is
        // runSyscallAfterVforkStop.
        //
        // Errors: EINVAL for the flag combinations and exit signals rejected by the
        // checks at the top of the function; EFAULT if the full register state
        // cannot be pulled; EPERM for CLONE_NEWUSER from a chrooted task, for new
        // PID/net/UTS/IPC namespaces without CAP_SYS_ADMIN in the target user
        // namespace, and for a TLS value rejected by SetTLS. Errors from
        // NewChildUserNamespace, TaskImage.Fork, NewTask and the seccheck sinks are
        // returned unchanged.
    36  //
    37  // Preconditions: The caller must be running Task.doSyscallInvoke on the task
    38  // goroutine.
    39  func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
    40  	// Since signal actions may refer to application signal handlers by virtual
    41  	// address, any set of signal handlers must refer to the same address
    42  	// space.
    43  	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
    44  		return 0, nil, linuxerr.EINVAL
    45  	}
    46  	// In order for the behavior of thread-group-directed signals to be sane,
    47  	// all tasks in a thread group must share signal handlers.
    48  	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
    49  		return 0, nil, linuxerr.EINVAL
    50  	}
    51  	// All tasks in a thread group must be in the same PID namespace.
    52  	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
    53  		return 0, nil, linuxerr.EINVAL
    54  	}
    55  	// The two different ways of specifying a new PID namespace are
    56  	// incompatible.
    57  	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
    58  		return 0, nil, linuxerr.EINVAL
    59  	}
    60  	// Thread groups and FS contexts cannot span user namespaces.
    61  	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
    62  		return 0, nil, linuxerr.EINVAL
    63  	}
    64  	// args.ExitSignal must be a valid signal.
    65  	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
    66  		return 0, nil, linuxerr.EINVAL
    67  	}
    68  
    69  	// Pull task registers and FPU state, a cloned task will inherit the
    70  	// state of the current task.
    71  	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
    72  		t.Warningf("Unable to pull a full state: %v", err)
    73  		t.forceSignal(linux.SIGILL, true /* unconditional */)
    74  		t.SendSignal(SignalInfoPriv(linux.SIGILL))
    75  		return 0, nil, linuxerr.EFAULT
    76  	}
    77  
    78  	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
    79  	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
    80  	// be created first, giving the child (clone(2)) or caller (unshare(2))
    81  	// privileges over the remaining namespaces created by the call." -
    82  	// user_namespaces(7)
    83  	creds := t.Credentials()
    84  	userns := creds.UserNamespace
    85  	if args.Flags&linux.CLONE_NEWUSER != 0 {
    86  		var err error
    87  		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
    88  		// the caller is in a chroot environment (i.e., the caller's root
    89  		// directory does not match the root directory of the mount namespace
    90  		// in which it resides)." - clone(2). Neither chroot(2) nor
    91  		// user_namespaces(7) document this.
    92  		if t.IsChrooted() {
    93  			return 0, nil, linuxerr.EPERM
    94  		}
    95  		userns, err = creds.NewChildUserNamespace()
    96  		if err != nil {
    97  			return 0, nil, err
    98  		}
    99  	}
        	// The capability check is made against userns, which is the freshly
        	// created user namespace when CLONE_NEWUSER was requested.
   100  	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
   101  		return 0, nil, linuxerr.EPERM
   102  	}
   103  
   104  	utsns := t.UTSNamespace()
   105  	if args.Flags&linux.CLONE_NEWUTS != 0 {
   106  		// Note that this must happen after NewUserNamespace so we get
   107  		// the new userns if there is one.
   108  		utsns = t.UTSNamespace().Clone(userns)
   109  	}
   110  
        	// With CLONE_NEWIPC the child gets a fresh IPC namespace; otherwise it
        	// shares t's and takes an extra reference on it.
   111  	ipcns := t.IPCNamespace()
   112  	if args.Flags&linux.CLONE_NEWIPC != 0 {
   113  		ipcns = NewIPCNamespace(userns)
   114  		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   115  	} else {
   116  		ipcns.IncRef()
   117  	}
        	// cu collects the reference drops that must run if Clone returns before
        	// the references are handed over to NewTask (see cu.Release below).
   118  	cu := cleanup.Make(func() {
   119  		ipcns.DecRef(t)
   120  	})
   121  	defer cu.Clean()
   122  
        	// Same pattern for the network namespace: create a new one (with its
        	// nsfs inode) for CLONE_NEWNET, otherwise share t's with a new reference.
   123  	netns := t.netns.Load()
   124  	if args.Flags&linux.CLONE_NEWNET != 0 {
   125  		netns = inet.NewNamespace(netns, userns)
   126  		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   127  		netns.SetInode(inode)
   128  	} else {
   129  		netns.IncRef()
   130  	}
   131  	cu.Add(func() {
   132  		netns.DecRef(t)
   133  	})
   134  
   135  	// TODO(b/63601033): Implement CLONE_NEWNS.
   136  	mntns := t.mountNamespace
   137  	if mntns != nil {
   138  		mntns.IncRef()
   139  		cu.Add(func() {
   140  			mntns.DecRef(t)
   141  		})
   142  	}
   143  
   144  	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
   145  	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
   146  	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
   147  	t.mu.Lock()
   148  	curImage := t.image
   149  	t.mu.Unlock()
   150  	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
   151  	if err != nil {
   152  		return 0, nil, err
   153  	}
   154  	cu.Add(func() {
   155  		image.release(t)
   156  	})
   157  	// clone() returns 0 in the child.
   158  	image.Arch.SetReturn(0)
        	// A zero args.Stack leaves the stack pointer copied from t untouched.
   159  	if args.Stack != 0 {
   160  		image.Arch.SetStack(uintptr(args.Stack))
   161  	}
   162  	if args.Flags&linux.CLONE_SETTLS != 0 {
   163  		if !image.Arch.SetTLS(uintptr(args.TLS)) {
   164  			return 0, nil, linuxerr.EPERM
   165  		}
   166  	}
   167  
        	// Without CLONE_FS the child gets a forked FSContext; with it, the child
        	// shares t's FSContext and holds its own reference.
   168  	var fsContext *FSContext
   169  	if args.Flags&linux.CLONE_FS == 0 {
   170  		fsContext = t.fsContext.Fork()
   171  	} else {
   172  		fsContext = t.fsContext
   173  		fsContext.IncRef()
   174  	}
   175  
        	// Likewise for the FD table and CLONE_FILES.
   176  	var fdTable *FDTable
   177  	if args.Flags&linux.CLONE_FILES == 0 {
   178  		fdTable = t.fdTable.Fork(t, MaxFdLimit)
   179  	} else {
   180  		fdTable = t.fdTable
   181  		fdTable.IncRef()
   182  	}
   183  
        	// A PID namespace previously recorded in t.childPIDNamespace (set by
        	// Unshare with CLONE_NEWPID) takes precedence; otherwise CLONE_NEWPID
        	// creates a new child of t's PID namespace.
   184  	pidns := t.tg.pidns
   185  	if t.childPIDNamespace != nil {
   186  		pidns = t.childPIDNamespace
   187  	} else if args.Flags&linux.CLONE_NEWPID != 0 {
   188  		pidns = pidns.NewChild(userns)
   189  	}
   190  
        	// Without CLONE_THREAD the child becomes the first task of a new thread
        	// group. Signal handlers are shared only under CLONE_SIGHAND, resource
        	// limits and the OOM score adjustment are copied from t's group, and t's
        	// rseq address and signature are carried over. A new thread in t's group
        	// starts with zero rseq values.
   191  	tg := t.tg
   192  	rseqAddr := hostarch.Addr(0)
   193  	rseqSignature := uint32(0)
   194  	if args.Flags&linux.CLONE_THREAD == 0 {
   195  		sh := t.tg.signalHandlers
   196  		if args.Flags&linux.CLONE_SIGHAND == 0 {
   197  			sh = sh.Fork()
   198  		}
   199  		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
   200  		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
   201  		rseqAddr = t.rseqAddr
   202  		rseqSignature = t.rseqSignature
   203  	}
   204  
        	// Reuse t's user counters unless they belong to a different user than
        	// creds.RealKUID, in which case look up the counters for that user.
   205  	uc := t.userCounters
   206  	if uc.uid != creds.RealKUID {
   207  		uc = t.k.GetUserCounters(creds.RealKUID)
   208  	}
   209  
   210  	cfg := &TaskConfig{
   211  		Kernel:                  t.k,
   212  		ThreadGroup:             tg,
   213  		SignalMask:              t.SignalMask(),
   214  		TaskImage:               image,
   215  		FSContext:               fsContext,
   216  		FDTable:                 fdTable,
   217  		Credentials:             creds,
   218  		Niceness:                t.Niceness(),
   219  		NetworkNamespace:        netns,
   220  		AllowedCPUMask:          t.CPUMask(),
   221  		UTSNamespace:            utsns,
   222  		IPCNamespace:            ipcns,
   223  		AbstractSocketNamespace: t.abstractSockets,
   224  		MountNamespace:          mntns,
   225  		RSeqAddr:                rseqAddr,
   226  		RSeqSignature:           rseqSignature,
   227  		ContainerID:             t.ContainerID(),
   228  		UserCounters:            uc,
   229  	}
        	// A task starting a new thread group is parented to t. A new thread in
        	// t's group instead sets InheritParent (presumably so that it shares t's
        	// parent; confirm in NewTask).
   230  	if args.Flags&linux.CLONE_THREAD == 0 {
   231  		cfg.Parent = t
   232  	} else {
   233  		cfg.InheritParent = t
   234  	}
   235  	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
   236  	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
   237  	// the cleanup for us.
   238  	cu.Release()
   239  	if err != nil {
   240  		return 0, nil, err
   241  	}
   242  
   243  	// "A child process created via fork(2) inherits a copy of its parent's
   244  	// alternate signal stack settings" - sigaltstack(2).
   245  	//
   246  	// However kernel/fork.c:copy_process() adds a limitation to this:
   247  	// "sigaltstack should be cleared when sharing the same VM".
   248  	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
   249  		nt.SetSignalStack(t.SignalStack())
   250  	}
   251  
        	// userns differs from creds.UserNamespace only when CLONE_NEWUSER created
        	// a new user namespace above.
   252  	if userns != creds.UserNamespace {
   253  		if err := nt.SetUserNamespace(userns); err != nil {
   254  			// This shouldn't be possible: userns was created from nt.creds, so
   255  			// nt should have CAP_SYS_ADMIN in userns.
   256  			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
   257  		}
   258  	}
   259  
   260  	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
   261  	// nt that it must receive before its task goroutine starts running.
        	// The deferred call therefore runs on every return path below, including
        	// the seccheck failure path.
   262  	tid := nt.k.tasks.Root.IDOfTask(nt)
   263  	defer nt.Start(tid)
   264  
   265  	if seccheck.Global.Enabled(seccheck.PointClone) {
   266  		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
   267  		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   268  			return c.Clone(t, mask, info)
   269  		}); err != nil {
   270  			// nt has been visible to the rest of the system since NewTask, so
   271  			// it may be blocking execve or a group stop, have been notified
   272  			// for group signal delivery, had children reparented to it, etc.
   273  			// Thus we can't just drop it on the floor. Instead, instruct the
   274  			// task goroutine to exit immediately, as quietly as possible.
   275  			nt.exitTracerNotified = true
   276  			nt.exitTracerAcked = true
   277  			nt.exitParentNotified = true
   278  			nt.exitParentAcked = true
   279  			nt.runState = (*runExitMain)(nil)
   280  			return 0, nil, err
   281  		}
   282  	}
   283  
   284  	// "If fork/clone and execve are allowed by @prog, any child processes will
   285  	// be constrained to the same filters and system call ABI as the parent." -
   286  	// Documentation/prctl/seccomp_filter.txt
        	// The filter slice is copied so that nt stores its own slice rather than
        	// aliasing t's.
   287  	if f := t.syscallFilters.Load(); f != nil {
   288  		copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
   289  		nt.syscallFilters.Store(copiedFilters)
   290  	}
   291  	if args.Flags&linux.CLONE_VFORK != 0 {
   292  		nt.vforkParent = t
   293  	}
   294  
   295  	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
   296  		nt.SetClearTID(hostarch.Addr(args.ChildTID))
   297  	}
        	// The results of the two CopyOut calls below are discarded, so a failed
        	// write of the TID to user memory does not fail the clone.
   298  	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
   299  		ctid := nt.ThreadID()
   300  		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
   301  	}
        	// ntid is nt's ID as seen from t's PID namespace; it is the value
        	// returned to the caller. tid (above) is nt's ID in the root namespace.
   302  	ntid := t.tg.pidns.IDOfTask(nt)
   303  	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
   304  		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
   305  	}
   306  
   307  	t.traceCloneEvent(tid)
        	// Classify the clone for ptrace: vfork if CLONE_VFORK is set, fork if the
        	// exit signal is SIGCHLD, plain clone otherwise.
   308  	kind := ptraceCloneKindClone
   309  	if args.Flags&linux.CLONE_VFORK != 0 {
   310  		kind = ptraceCloneKindVfork
   311  	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
   312  		kind = ptraceCloneKindFork
   313  	}
   314  	if t.ptraceClone(kind, nt, args) {
   315  		if args.Flags&linux.CLONE_VFORK != 0 {
   316  			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
   317  		}
   318  		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
   319  	}
   320  	if args.Flags&linux.CLONE_VFORK != 0 {
   321  		t.maybeBeginVforkStop(nt)
   322  		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
   323  	}
   324  	return ntid, nil, nil
   325  }
   326  
   327  func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
   328  	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
   329  	var cwd string
   330  	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
   331  		cwd = getTaskCurrentWorkingDirectory(t)
   332  	}
   333  	t.k.tasks.mu.RLock()
   334  	defer t.k.tasks.mu.RUnlock()
   335  	info := &pb.CloneInfo{
   336  		CreatedThreadId:          int32(nt.k.tasks.Root.tids[nt]),
   337  		CreatedThreadGroupId:     int32(nt.k.tasks.Root.tgids[nt.tg]),
   338  		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
   339  		Flags:                    flags,
   340  	}
   341  
   342  	if !fields.Context.Empty() {
   343  		info.ContextData = &pb.ContextData{}
   344  		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
   345  	}
   346  
   347  	return fields, info
   348  }
   349  
   350  // maybeBeginVforkStop checks if a previously-started vfork child is still
   351  // running and has not yet released its MM, such that its parent t should enter
   352  // a vforkStop.
   353  //
   354  // Preconditions: The caller must be running on t's task goroutine.
   355  func (t *Task) maybeBeginVforkStop(child *Task) {
   356  	t.tg.pidns.owner.mu.RLock()
   357  	defer t.tg.pidns.owner.mu.RUnlock()
   358  	t.tg.signalHandlers.mu.Lock()
   359  	defer t.tg.signalHandlers.mu.Unlock()
   360  	if t.killedLocked() {
   361  		child.vforkParent = nil
   362  		return
   363  	}
   364  	if child.vforkParent == t {
   365  		t.beginInternalStopLocked((*vforkStop)(nil))
   366  	}
   367  }
   368  
   369  func (t *Task) unstopVforkParent() {
   370  	t.tg.pidns.owner.mu.RLock()
   371  	defer t.tg.pidns.owner.mu.RUnlock()
   372  	if p := t.vforkParent; p != nil {
   373  		p.tg.signalHandlers.mu.Lock()
   374  		defer p.tg.signalHandlers.mu.Unlock()
   375  		if _, ok := p.stop.(*vforkStop); ok {
   376  			p.endInternalStopLocked()
   377  		}
   378  		// Parent no longer needs to be unstopped.
   379  		t.vforkParent = nil
   380  	}
   381  }
   382  
   383  // +stateify savable
   384  type runSyscallAfterPtraceEventClone struct {
   385  	vforkChild *Task
   386  
   387  	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
   388  	// PID namespace. vforkChildTID must be stored since the child may exit and
   389  	// release its TID before the PTRACE_EVENT stop ends.
   390  	vforkChildTID ThreadID
   391  }
   392  
   393  func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
   394  	if r.vforkChild != nil {
   395  		t.maybeBeginVforkStop(r.vforkChild)
   396  		return &runSyscallAfterVforkStop{r.vforkChildTID}
   397  	}
   398  	return (*runSyscallExit)(nil)
   399  }
   400  
   401  // +stateify savable
   402  type runSyscallAfterVforkStop struct {
   403  	// childTID has the same meaning as
   404  	// runSyscallAfterPtraceEventClone.vforkChildTID.
   405  	childTID ThreadID
   406  }
   407  
   408  func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
   409  	t.ptraceVforkDone(r.childTID)
   410  	return (*runSyscallExit)(nil)
   411  }
   412  
   413  // Setns reassociates thread with the specified namespace.
   414  func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
   415  	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
   416  	if !ok {
   417  		return linuxerr.EINVAL
   418  	}
   419  	i, ok := d.Inode().(*nsfs.Inode)
   420  	if !ok {
   421  		return linuxerr.EINVAL
   422  	}
   423  
   424  	switch ns := i.Namespace().(type) {
   425  	case *inet.Namespace:
   426  		if flags != 0 && flags != linux.CLONE_NEWNET {
   427  			return linuxerr.EINVAL
   428  		}
   429  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   430  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   431  			return linuxerr.EPERM
   432  		}
   433  		oldNS := t.NetworkNamespace()
   434  		ns.IncRef()
   435  		t.mu.Lock()
   436  		t.netns.Store(ns)
   437  		t.mu.Unlock()
   438  		oldNS.DecRef(t)
   439  		return nil
   440  	default:
   441  		return linuxerr.EINVAL
   442  	}
   443  }
   444  
   445  // Unshare changes the set of resources t shares with other tasks, as specified
   446  // by flags.
        //
        // flags is a mask of CLONE_* values. CLONE_NEWUSER, CLONE_NEWPID,
        // CLONE_NEWNET, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_FILES and CLONE_FS are
        // acted upon; CLONE_THREAD is accepted only when t is the sole task in its
        // thread group.
        //
        // Returns EINVAL for CLONE_VM or CLONE_SIGHAND and for CLONE_THREAD in a
        // thread group with more than one task, EPERM for CLONE_NEWUSER from a
        // chrooted task or for a new PID/net/UTS/IPC namespace without
        // CAP_SYS_ADMIN, and any error from creating or entering the new user
        // namespace.
   447  //
   448  // Preconditions: The caller must be running on the task goroutine.
   449  func (t *Task) Unshare(flags int32) error {
   450  	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
   451  	// the caller is single threaded (i.e., it is not sharing its address space
   452  	// with another process or thread). In this case, these flags have no
   453  	// effect. (Note also that specifying CLONE_THREAD automatically implies
   454  	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
   455  	// If the process is multithreaded, then the use of these flags results in
   456  	// an error." - unshare(2). This is incorrect (cf.
   457  	// kernel/fork.c:ksys_unshare()):
   458  	//
   459  	//	- CLONE_THREAD does not imply CLONE_VM.
   460  	//
   461  	//	- CLONE_SIGHAND implies CLONE_THREAD.
   462  	//
   463  	//	- Only CLONE_VM requires that the caller is not sharing its address
   464  	//		space with another thread. CLONE_SIGHAND requires that the caller is not
   465  	//		sharing its signal handlers, and CLONE_THREAD requires that the caller
   466  	//		is the only thread in its thread group.
   467  	//
   468  	// Since we don't count the number of tasks using each address space or set
   469  	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
   470  	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
   471  		return linuxerr.EINVAL
   472  	}
   473  	creds := t.Credentials()
   474  	if flags&linux.CLONE_THREAD != 0 {
        		// tasksCount is read under the signal handlers mutex.
   475  		t.tg.signalHandlers.mu.Lock()
   476  		if t.tg.tasksCount != 1 {
   477  			t.tg.signalHandlers.mu.Unlock()
   478  			return linuxerr.EINVAL
   479  		}
   480  		t.tg.signalHandlers.mu.Unlock()
   481  		// This isn't racy because we're the only living task, and therefore
   482  		// the only task capable of creating new ones, in our thread group.
   483  	}
        	// The user namespace is handled first so that the capability check and
        	// the namespaces created below use the new credentials.
   484  	if flags&linux.CLONE_NEWUSER != 0 {
   485  		if t.IsChrooted() {
   486  			return linuxerr.EPERM
   487  		}
   488  		newUserNS, err := creds.NewChildUserNamespace()
   489  		if err != nil {
   490  			return err
   491  		}
   492  		err = t.SetUserNamespace(newUserNS)
   493  		if err != nil {
   494  			return err
   495  		}
   496  		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
   497  		creds = t.Credentials()
   498  	}
        	// Evaluated once, after any user namespace change above, and used for
        	// every CLONE_NEW* check that follows.
   499  	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
   500  	if flags&linux.CLONE_NEWPID != 0 {
   501  		if !haveCapSysAdmin {
   502  			return linuxerr.EPERM
   503  		}
        		// The new PID namespace is only recorded here, not entered by t. Clone
        		// consults t.childPIDNamespace when choosing a child's PID namespace.
   504  		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
   505  	}
   506  	if flags&linux.CLONE_NEWNET != 0 {
   507  		if !haveCapSysAdmin {
   508  			return linuxerr.EPERM
   509  		}
   510  		netns := t.NetworkNamespace()
   511  		netns = inet.NewNamespace(netns, t.UserNamespace())
   512  		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   513  		netns.SetInode(netnsInode)
        		// Swap installs the new namespace and hands back the previous one,
        		// whose reference is dropped after t.mu is released.
   514  		t.mu.Lock()
   515  		netns = t.netns.Swap(netns)
   516  		t.mu.Unlock()
   517  		netns.DecRef(t)
   518  	}
        	// The remaining fields are replaced under t.mu. The objects they replace
        	// are remembered in old* variables and released after unlocking.
   519  	t.mu.Lock()
   520  	// Can't defer unlock: DecRefs must occur without holding t.mu.
   521  	if flags&linux.CLONE_NEWUTS != 0 {
   522  		if !haveCapSysAdmin {
   523  			t.mu.Unlock()
   524  			return linuxerr.EPERM
   525  		}
   526  		// Note that this must happen after NewUserNamespace, so the
   527  		// new user namespace is used if there is one.
   528  		t.utsns = t.utsns.Clone(creds.UserNamespace)
   529  	}
   530  	var oldIPCNS *IPCNamespace
   531  	if flags&linux.CLONE_NEWIPC != 0 {
   532  		if !haveCapSysAdmin {
   533  			t.mu.Unlock()
   534  			return linuxerr.EPERM
   535  		}
   536  		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
   537  		// namespace"
   538  		oldIPCNS = t.ipcns
   539  		t.ipcns = NewIPCNamespace(creds.UserNamespace)
   540  		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   541  	}
        	// CLONE_FILES: replace t's FD table with a fork of it.
   542  	var oldFDTable *FDTable
   543  	if flags&linux.CLONE_FILES != 0 {
   544  		oldFDTable = t.fdTable
   545  		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
   546  	}
        	// CLONE_FS: replace t's FSContext with a fork of it.
   547  	var oldFSContext *FSContext
   548  	if flags&linux.CLONE_FS != 0 {
   549  		oldFSContext = t.fsContext
   550  		t.fsContext = oldFSContext.Fork()
   551  	}
   552  	t.mu.Unlock()
        	// Drop t's references on whatever was replaced, now that t.mu is no
        	// longer held.
   553  	if oldIPCNS != nil {
   554  		oldIPCNS.DecRef(t)
   555  	}
   556  	if oldFDTable != nil {
   557  		oldFDTable.DecRef(t)
   558  	}
   559  	if oldFSContext != nil {
   560  		oldFSContext.DecRef(t)
   561  	}
   562  	return nil
   563  }
   564  
   565  // UnshareFdTable unshares the FdTable that task t shares with other tasks, upto
   566  // the maxFd.
   567  //
   568  // Preconditions: The caller must be running on the task goroutine.
   569  func (t *Task) UnshareFdTable(maxFd int32) {
   570  	t.mu.Lock()
   571  	oldFDTable := t.fdTable
   572  	t.fdTable = oldFDTable.Fork(t, maxFd)
   573  	t.mu.Unlock()
   574  
   575  	oldFDTable.DecRef(t)
   576  }
   577  
   578  // vforkStop is a TaskStop imposed on a task that creates a child with
   579  // CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
   580  // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
   581  // that the child and parent share mappings until the child execve()s into a
   582  // new process image or exits.)
   583  //
   584  // +stateify savable
   585  type vforkStop struct{}
   586  
   587  // StopIgnoresKill implements TaskStop.Killable.
   588  func (*vforkStop) Killable() bool { return true }