github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/task_clone.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"github.com/metacubex/gvisor/pkg/abi/linux"
    19  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    20  	"github.com/metacubex/gvisor/pkg/cleanup"
    21  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    22  	"github.com/metacubex/gvisor/pkg/hostarch"
    23  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
    24  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/nsfs"
    25  	"github.com/metacubex/gvisor/pkg/sentry/inet"
    26  	"github.com/metacubex/gvisor/pkg/sentry/seccheck"
    27  	pb "github.com/metacubex/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    28  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    29  	"github.com/metacubex/gvisor/pkg/usermem"
    30  )
    31  
    32  // SupportedCloneFlags is the bitwise OR of all the supported flags for clone.
    33  // TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is
    34  // implemented.
    35  const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM |
    36  	linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID |
    37  	linux.CLONE_CHILD_CLEARTID | linux.CLONE_CHILD_SETTID | linux.CLONE_PARENT |
    38  	linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS |
    39  	linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED |
    40  	linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS
    41  
    42  // Clone implements the clone(2) syscall and returns the thread ID of the new
    43  // task in t's PID namespace. Clone may return both a non-zero thread ID and a
    44  // non-nil error.
    45  //
    46  // Preconditions: The caller must be running Task.doSyscallInvoke on the task
    47  // goroutine.
    48  func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
    49  	if args.Flags&^SupportedCloneFlags != 0 {
    50  		return 0, nil, linuxerr.EINVAL
    51  	}
    52  	// Since signal actions may refer to application signal handlers by virtual
    53  	// address, any set of signal handlers must refer to the same address
    54  	// space.
    55  	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
    56  		return 0, nil, linuxerr.EINVAL
    57  	}
    58  	if args.SetTID != 0 {
    59  		return 0, nil, linuxerr.ENOTSUP
    60  	}
    61  	// In order for the behavior of thread-group-directed signals to be sane,
    62  	// all tasks in a thread group must share signal handlers.
    63  	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
    64  		return 0, nil, linuxerr.EINVAL
    65  	}
    66  	// All tasks in a thread group must be in the same PID namespace.
    67  	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
    68  		return 0, nil, linuxerr.EINVAL
    69  	}
    70  	// The two different ways of specifying a new PID namespace are
    71  	// incompatible.
    72  	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
    73  		return 0, nil, linuxerr.EINVAL
    74  	}
    75  	// Thread groups and FS contexts cannot span user namespaces.
    76  	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
    77  		return 0, nil, linuxerr.EINVAL
    78  	}
    79  	// args.ExitSignal must be a valid signal.
    80  	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
    81  		return 0, nil, linuxerr.EINVAL
    82  	}
    83  	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
    84  		return 0, nil, linuxerr.EINVAL
    85  	}
    86  
    87  	// Pull task registers and FPU state, a cloned task will inherit the
    88  	// state of the current task.
    89  	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
    90  		t.Warningf("Unable to pull a full state: %v", err)
    91  		t.forceSignal(linux.SIGILL, true /* unconditional */)
    92  		t.SendSignal(SignalInfoPriv(linux.SIGILL))
    93  		return 0, nil, linuxerr.EFAULT
    94  	}
    95  
    96  	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
    97  	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
    98  	// be created first, giving the child (clone(2)) or caller (unshare(2))
    99  	// privileges over the remaining namespaces created by the call." -
   100  	// user_namespaces(7)
   101  	creds := t.Credentials()
   102  	userns := creds.UserNamespace
   103  	if args.Flags&linux.CLONE_NEWUSER != 0 {
   104  		var err error
   105  		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
   106  		// the caller is in a chroot environment (i.e., the caller's root
   107  		// directory does not match the root directory of the mount namespace
   108  		// in which it resides)." - clone(2). Neither chroot(2) nor
   109  		// user_namespaces(7) document this.
   110  		if t.IsChrooted() {
   111  			return 0, nil, linuxerr.EPERM
   112  		}
   113  		userns, err = creds.NewChildUserNamespace()
   114  		if err != nil {
   115  			return 0, nil, err
   116  		}
   117  	}
   118  	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
   119  		return 0, nil, linuxerr.EPERM
   120  	}
   121  
   122  	cu := cleanup.Make(func() {})
   123  	defer cu.Clean()
   124  
   125  	utsns := t.utsns
   126  	if args.Flags&linux.CLONE_NEWUTS != 0 {
   127  		// Note that this must happen after NewUserNamespace so we get
   128  		// the new userns if there is one.
   129  		utsns = utsns.Clone(userns)
   130  		utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns))
   131  	} else {
   132  		utsns.IncRef()
   133  	}
   134  	cu.Add(func() {
   135  		utsns.DecRef(t)
   136  	})
   137  
   138  	ipcns := t.ipcns
   139  	if args.Flags&linux.CLONE_NEWIPC != 0 {
   140  		ipcns = NewIPCNamespace(userns)
   141  		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   142  		ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns))
   143  	} else {
   144  		ipcns.IncRef()
   145  	}
   146  	cu.Add(func() {
   147  		ipcns.DecRef(t)
   148  	})
   149  
   150  	netns := t.netns
   151  	if args.Flags&linux.CLONE_NEWNET != 0 {
   152  		netns = inet.NewNamespace(netns, userns)
   153  		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   154  		netns.SetInode(inode)
   155  	} else {
   156  		netns.IncRef()
   157  	}
   158  	cu.Add(func() {
   159  		netns.DecRef(t)
   160  	})
   161  
   162  	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
   163  	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
   164  	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
   165  	t.mu.Lock()
   166  	curImage := t.image
   167  	sessionKeyring := t.sessionKeyring
   168  	t.mu.Unlock()
   169  	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
   170  	if err != nil {
   171  		return 0, nil, err
   172  	}
   173  	cu.Add(func() {
   174  		image.release(t)
   175  	})
   176  
   177  	if args.Flags&linux.CLONE_NEWUSER != 0 {
   178  		// If the task is in a new user namespace, it cannot share keys.
   179  		sessionKeyring = nil
   180  	}
   181  
   182  	// clone() returns 0 in the child.
   183  	image.Arch.SetReturn(0)
   184  	if args.Stack != 0 {
   185  		image.Arch.SetStack(uintptr(args.Stack + args.StackSize))
   186  	}
   187  	if args.Flags&linux.CLONE_SETTLS != 0 {
   188  		if !image.Arch.SetTLS(uintptr(args.TLS)) {
   189  			return 0, nil, linuxerr.EPERM
   190  		}
   191  	}
   192  
   193  	var fsContext *FSContext
   194  	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
   195  		fsContext = t.fsContext.Fork()
   196  	} else {
   197  		fsContext = t.fsContext
   198  		fsContext.IncRef()
   199  	}
   200  
   201  	mntns := t.mountNamespace
   202  	if args.Flags&linux.CLONE_NEWNS != 0 {
   203  		var err error
   204  		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k)
   205  		if err != nil {
   206  			return 0, nil, err
   207  		}
   208  	} else {
   209  		mntns.IncRef()
   210  	}
   211  	cu.Add(func() {
   212  		mntns.DecRef(t)
   213  	})
   214  
   215  	var fdTable *FDTable
   216  	if args.Flags&linux.CLONE_FILES == 0 {
   217  		fdTable = t.fdTable.Fork(t, MaxFdLimit)
   218  	} else {
   219  		fdTable = t.fdTable
   220  		fdTable.IncRef()
   221  	}
   222  
   223  	pidns := t.tg.pidns
   224  	if t.childPIDNamespace != nil {
   225  		pidns = t.childPIDNamespace
   226  	} else if args.Flags&linux.CLONE_NEWPID != 0 {
   227  		pidns = pidns.NewChild(userns)
   228  	}
   229  
   230  	tg := t.tg
   231  	rseqAddr := hostarch.Addr(0)
   232  	rseqSignature := uint32(0)
   233  	if args.Flags&linux.CLONE_THREAD == 0 {
   234  		sh := t.tg.signalHandlers
   235  		if args.Flags&linux.CLONE_SIGHAND == 0 {
   236  			sh = sh.Fork()
   237  		}
   238  		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
   239  		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
   240  		rseqAddr = t.rseqAddr
   241  		rseqSignature = t.rseqSignature
   242  	}
   243  
   244  	uc := t.userCounters
   245  	if uc.uid != creds.RealKUID {
   246  		uc = t.k.GetUserCounters(creds.RealKUID)
   247  	}
   248  
   249  	cfg := &TaskConfig{
   250  		Kernel:           t.k,
   251  		ThreadGroup:      tg,
   252  		SignalMask:       t.SignalMask(),
   253  		TaskImage:        image,
   254  		FSContext:        fsContext,
   255  		FDTable:          fdTable,
   256  		Credentials:      creds,
   257  		Niceness:         t.Niceness(),
   258  		NetworkNamespace: netns,
   259  		AllowedCPUMask:   t.CPUMask(),
   260  		UTSNamespace:     utsns,
   261  		IPCNamespace:     ipcns,
   262  		MountNamespace:   mntns,
   263  		RSeqAddr:         rseqAddr,
   264  		RSeqSignature:    rseqSignature,
   265  		ContainerID:      t.ContainerID(),
   266  		UserCounters:     uc,
   267  		SessionKeyring:   sessionKeyring,
   268  	}
   269  	if args.Flags&linux.CLONE_THREAD == 0 {
   270  		cfg.Parent = t
   271  	} else {
   272  		cfg.InheritParent = t
   273  	}
   274  	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
   275  	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
   276  	// the cleanup for us.
   277  	cu.Release()
   278  	if err != nil {
   279  		return 0, nil, err
   280  	}
   281  
   282  	// "A child process created via fork(2) inherits a copy of its parent's
   283  	// alternate signal stack settings" - sigaltstack(2).
   284  	//
   285  	// However kernel/fork.c:copy_process() adds a limitation to this:
   286  	// "sigaltstack should be cleared when sharing the same VM".
   287  	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
   288  		nt.SetSignalStack(t.SignalStack())
   289  	}
   290  
   291  	if userns != creds.UserNamespace {
   292  		if err := nt.SetUserNamespace(userns); err != nil {
   293  			// This shouldn't be possible: userns was created from nt.creds, so
   294  			// nt should have CAP_SYS_ADMIN in userns.
   295  			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
   296  		}
   297  	}
   298  
   299  	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
   300  	// nt that it must receive before its task goroutine starts running.
   301  	tid := nt.k.tasks.Root.IDOfTask(nt)
   302  	defer nt.Start(tid)
   303  
   304  	if seccheck.Global.Enabled(seccheck.PointClone) {
   305  		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
   306  		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   307  			return c.Clone(t, mask, info)
   308  		}); err != nil {
   309  			// nt has been visible to the rest of the system since NewTask, so
   310  			// it may be blocking execve or a group stop, have been notified
   311  			// for group signal delivery, had children reparented to it, etc.
   312  			// Thus we can't just drop it on the floor. Instead, instruct the
   313  			// task goroutine to exit immediately, as quietly as possible.
   314  			nt.exitTracerNotified = true
   315  			nt.exitTracerAcked = true
   316  			nt.exitParentNotified = true
   317  			nt.exitParentAcked = true
   318  			nt.runState = (*runExitMain)(nil)
   319  			return 0, nil, err
   320  		}
   321  	}
   322  
   323  	// "If fork/clone and execve are allowed by @prog, any child processes will
   324  	// be constrained to the same filters and system call ABI as the parent." -
   325  	// Documentation/prctl/seccomp_filter.txt
   326  	if ts := t.seccomp.Load(); ts != nil {
   327  		seccompCopy := ts.copy()
   328  		seccompCopy.populateCache(nt)
   329  		nt.seccomp.Store(seccompCopy)
   330  	} else {
   331  		nt.seccomp.Store(nil)
   332  	}
   333  	if args.Flags&linux.CLONE_VFORK != 0 {
   334  		nt.vforkParent = t
   335  	}
   336  
   337  	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
   338  		nt.SetClearTID(hostarch.Addr(args.ChildTID))
   339  	}
   340  	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
   341  		ctid := nt.ThreadID()
   342  		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
   343  	}
   344  	ntid := t.tg.pidns.IDOfTask(nt)
   345  	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
   346  		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
   347  	}
   348  
   349  	t.traceCloneEvent(tid)
   350  	kind := ptraceCloneKindClone
   351  	if args.Flags&linux.CLONE_VFORK != 0 {
   352  		kind = ptraceCloneKindVfork
   353  	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
   354  		kind = ptraceCloneKindFork
   355  	}
   356  	if t.ptraceClone(kind, nt, args) {
   357  		if args.Flags&linux.CLONE_VFORK != 0 {
   358  			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
   359  		}
   360  		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
   361  	}
   362  	if args.Flags&linux.CLONE_VFORK != 0 {
   363  		t.maybeBeginVforkStop(nt)
   364  		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
   365  	}
   366  	return ntid, nil, nil
   367  }
   368  
   369  func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
   370  	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
   371  	var cwd string
   372  	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
   373  		cwd = getTaskCurrentWorkingDirectory(t)
   374  	}
   375  	t.k.tasks.mu.RLock()
   376  	defer t.k.tasks.mu.RUnlock()
   377  	info := &pb.CloneInfo{
   378  		CreatedThreadId:          int32(nt.k.tasks.Root.tids[nt]),
   379  		CreatedThreadGroupId:     int32(nt.k.tasks.Root.tgids[nt.tg]),
   380  		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
   381  		Flags:                    flags,
   382  	}
   383  
   384  	if !fields.Context.Empty() {
   385  		info.ContextData = &pb.ContextData{}
   386  		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
   387  	}
   388  
   389  	return fields, info
   390  }
   391  
   392  // maybeBeginVforkStop checks if a previously-started vfork child is still
   393  // running and has not yet released its MM, such that its parent t should enter
   394  // a vforkStop.
   395  //
   396  // Preconditions: The caller must be running on t's task goroutine.
   397  func (t *Task) maybeBeginVforkStop(child *Task) {
   398  	t.tg.pidns.owner.mu.RLock()
   399  	defer t.tg.pidns.owner.mu.RUnlock()
   400  	t.tg.signalHandlers.mu.Lock()
   401  	defer t.tg.signalHandlers.mu.Unlock()
   402  	if t.killedLocked() {
   403  		child.vforkParent = nil
   404  		return
   405  	}
   406  	if child.vforkParent == t {
   407  		t.beginInternalStopLocked((*vforkStop)(nil))
   408  	}
   409  }
   410  
   411  func (t *Task) unstopVforkParent() {
   412  	t.tg.pidns.owner.mu.RLock()
   413  	defer t.tg.pidns.owner.mu.RUnlock()
   414  	if p := t.vforkParent; p != nil {
   415  		p.tg.signalHandlers.mu.Lock()
   416  		defer p.tg.signalHandlers.mu.Unlock()
   417  		if _, ok := p.stop.(*vforkStop); ok {
   418  			p.endInternalStopLocked()
   419  		}
   420  		// Parent no longer needs to be unstopped.
   421  		t.vforkParent = nil
   422  	}
   423  }
   424  
   425  // +stateify savable
   426  type runSyscallAfterPtraceEventClone struct {
   427  	vforkChild *Task
   428  
   429  	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
   430  	// PID namespace. vforkChildTID must be stored since the child may exit and
   431  	// release its TID before the PTRACE_EVENT stop ends.
   432  	vforkChildTID ThreadID
   433  }
   434  
   435  func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
   436  	if r.vforkChild != nil {
   437  		t.maybeBeginVforkStop(r.vforkChild)
   438  		return &runSyscallAfterVforkStop{r.vforkChildTID}
   439  	}
   440  	return (*runSyscallExit)(nil)
   441  }
   442  
// runSyscallAfterVforkStop is the run state scheduled for a task that created
// a child with CLONE_VFORK, following the call to maybeBeginVforkStop.
//
// +stateify savable
type runSyscallAfterVforkStop struct {
	// childTID has the same meaning as
	// runSyscallAfterPtraceEventClone.vforkChildTID.
	childTID ThreadID
}

// execute calls t.ptraceVforkDone with the child's TID and then proceeds to
// syscall exit.
func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
	t.ptraceVforkDone(r.childTID)
	return (*runSyscallExit)(nil)
}
   454  
   455  // Setns reassociates thread with the specified namespace.
   456  func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
   457  	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
   458  	if !ok {
   459  		return linuxerr.EINVAL
   460  	}
   461  	i, ok := d.Inode().(*nsfs.Inode)
   462  	if !ok {
   463  		return linuxerr.EINVAL
   464  	}
   465  
   466  	switch ns := i.Namespace().(type) {
   467  	case *inet.Namespace:
   468  		if flags != 0 && flags != linux.CLONE_NEWNET {
   469  			return linuxerr.EINVAL
   470  		}
   471  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   472  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   473  			return linuxerr.EPERM
   474  		}
   475  		oldNS := t.NetworkNamespace()
   476  		ns.IncRef()
   477  		t.mu.Lock()
   478  		t.netns = ns
   479  		t.mu.Unlock()
   480  		oldNS.DecRef(t)
   481  		return nil
   482  	case *IPCNamespace:
   483  		if flags != 0 && flags != linux.CLONE_NEWIPC {
   484  			return linuxerr.EINVAL
   485  		}
   486  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   487  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   488  			return linuxerr.EPERM
   489  		}
   490  		oldNS := t.IPCNamespace()
   491  		ns.IncRef()
   492  		t.mu.Lock()
   493  		t.ipcns = ns
   494  		t.mu.Unlock()
   495  		oldNS.DecRef(t)
   496  		return nil
   497  	case *vfs.MountNamespace:
   498  		if flags != 0 && flags != linux.CLONE_NEWNS {
   499  			return linuxerr.EINVAL
   500  		}
   501  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) ||
   502  			!t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) ||
   503  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   504  			return linuxerr.EPERM
   505  		}
   506  		oldFSContext := t.fsContext
   507  		// The current task has to be an exclusive owner of its fs context.
   508  		if oldFSContext.ReadRefs() != 1 {
   509  			return linuxerr.EINVAL
   510  		}
   511  		fsContext := oldFSContext.Fork()
   512  		fsContext.root.DecRef(t)
   513  		fsContext.cwd.DecRef(t)
   514  		vd := ns.Root(t)
   515  		fsContext.root = vd
   516  		vd.IncRef()
   517  		fsContext.cwd = vd
   518  
   519  		oldNS := t.mountNamespace
   520  		ns.IncRef()
   521  		t.mu.Lock()
   522  		t.mountNamespace = ns
   523  		t.fsContext = fsContext
   524  		t.mu.Unlock()
   525  		oldNS.DecRef(t)
   526  		oldFSContext.DecRef(t)
   527  		return nil
   528  	case *UTSNamespace:
   529  		if flags != 0 && flags != linux.CLONE_NEWUTS {
   530  			return linuxerr.EINVAL
   531  		}
   532  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   533  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   534  			return linuxerr.EPERM
   535  		}
   536  		oldNS := t.UTSNamespace()
   537  		ns.IncRef()
   538  		t.mu.Lock()
   539  		t.utsns = ns
   540  		t.mu.Unlock()
   541  		oldNS.DecRef(t)
   542  		return nil
   543  	default:
   544  		return linuxerr.EINVAL
   545  	}
   546  }
   547  
   548  // Unshare changes the set of resources t shares with other tasks, as specified
   549  // by flags.
   550  //
   551  // Preconditions: The caller must be running on the task goroutine.
   552  func (t *Task) Unshare(flags int32) error {
   553  	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
   554  	// the caller is single threaded (i.e., it is not sharing its address space
   555  	// with another process or thread). In this case, these flags have no
   556  	// effect. (Note also that specifying CLONE_THREAD automatically implies
   557  	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
   558  	// If the process is multithreaded, then the use of these flags results in
   559  	// an error." - unshare(2). This is incorrect (cf.
   560  	// kernel/fork.c:ksys_unshare()):
   561  	//
   562  	//	- CLONE_THREAD does not imply CLONE_VM.
   563  	//
   564  	//	- CLONE_SIGHAND implies CLONE_THREAD.
   565  	//
   566  	//	- Only CLONE_VM requires that the caller is not sharing its address
   567  	//		space with another thread. CLONE_SIGHAND requires that the caller is not
   568  	//		sharing its signal handlers, and CLONE_THREAD requires that the caller
   569  	//		is the only thread in its thread group.
   570  	//
   571  	// Since we don't count the number of tasks using each address space or set
   572  	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
   573  	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
   574  		return linuxerr.EINVAL
   575  	}
   576  	creds := t.Credentials()
   577  	if flags&linux.CLONE_THREAD != 0 {
   578  		t.tg.signalHandlers.mu.Lock()
   579  		if t.tg.tasksCount != 1 {
   580  			t.tg.signalHandlers.mu.Unlock()
   581  			return linuxerr.EINVAL
   582  		}
   583  		t.tg.signalHandlers.mu.Unlock()
   584  		// This isn't racy because we're the only living task, and therefore
   585  		// the only task capable of creating new ones, in our thread group.
   586  	}
   587  	if flags&linux.CLONE_NEWUSER != 0 {
   588  		if t.IsChrooted() {
   589  			return linuxerr.EPERM
   590  		}
   591  		newUserNS, err := creds.NewChildUserNamespace()
   592  		if err != nil {
   593  			return err
   594  		}
   595  		err = t.SetUserNamespace(newUserNS)
   596  		if err != nil {
   597  			return err
   598  		}
   599  		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
   600  		creds = t.Credentials()
   601  	}
   602  	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
   603  	if flags&linux.CLONE_NEWPID != 0 {
   604  		if !haveCapSysAdmin {
   605  			return linuxerr.EPERM
   606  		}
   607  		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
   608  	}
   609  	if flags&linux.CLONE_NEWNET != 0 {
   610  		if !haveCapSysAdmin {
   611  			return linuxerr.EPERM
   612  		}
   613  		netns := t.NetworkNamespace()
   614  		netns = inet.NewNamespace(netns, t.UserNamespace())
   615  		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   616  		netns.SetInode(netnsInode)
   617  		t.mu.Lock()
   618  		oldNetns := t.netns
   619  		t.netns = netns
   620  		t.mu.Unlock()
   621  		oldNetns.DecRef(t)
   622  	}
   623  
   624  	cu := cleanup.Cleanup{}
   625  	// All cu actions has to be executed after releasing t.mu.
   626  	defer cu.Clean()
   627  	t.mu.Lock()
   628  	defer t.mu.Unlock()
   629  	// Can't defer unlock: DecRefs must occur without holding t.mu.
   630  	if flags&linux.CLONE_NEWUTS != 0 {
   631  		if !haveCapSysAdmin {
   632  			return linuxerr.EPERM
   633  		}
   634  		// Note that this must happen after NewUserNamespace, so the
   635  		// new user namespace is used if there is one.
   636  		oldUTSNS := t.utsns
   637  		t.utsns = t.utsns.Clone(creds.UserNamespace)
   638  		t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
   639  		cu.Add(func() { oldUTSNS.DecRef(t) })
   640  	}
   641  	if flags&linux.CLONE_NEWIPC != 0 {
   642  		if !haveCapSysAdmin {
   643  			return linuxerr.EPERM
   644  		}
   645  		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
   646  		// namespace"
   647  		oldIPCNS := t.ipcns
   648  		t.ipcns = NewIPCNamespace(creds.UserNamespace)
   649  		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   650  		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
   651  		cu.Add(func() { oldIPCNS.DecRef(t) })
   652  	}
   653  	if flags&linux.CLONE_FILES != 0 {
   654  		oldFDTable := t.fdTable
   655  		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
   656  		cu.Add(func() { oldFDTable.DecRef(t) })
   657  	}
   658  	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
   659  		oldFSContext := t.fsContext
   660  		t.fsContext = oldFSContext.Fork()
   661  		cu.Add(func() { oldFSContext.DecRef(t) })
   662  	}
   663  	if flags&linux.CLONE_NEWNS != 0 {
   664  		if !haveCapSysAdmin {
   665  			return linuxerr.EPERM
   666  		}
   667  		oldMountNS := t.mountNamespace
   668  		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
   669  		if err != nil {
   670  			return err
   671  		}
   672  		t.mountNamespace = mntns
   673  		cu.Add(func() { oldMountNS.DecRef(t) })
   674  	}
   675  	return nil
   676  }
   677  
   678  // UnshareFdTable unshares the FdTable that task t shares with other tasks, upto
   679  // the maxFd.
   680  //
   681  // Preconditions: The caller must be running on the task goroutine.
   682  func (t *Task) UnshareFdTable(maxFd int32) {
   683  	t.mu.Lock()
   684  	oldFDTable := t.fdTable
   685  	t.fdTable = oldFDTable.Fork(t, maxFd)
   686  	t.mu.Unlock()
   687  
   688  	oldFDTable.DecRef(t)
   689  }
   690  
// vforkStop is a TaskStop imposed on a task that creates a child with
// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
// that the child and parent share mappings until the child execve()s into a
// new process image or exits.)
//
// +stateify savable
type vforkStop struct{}

// Killable implements TaskStop.Killable. It always reports true.
func (*vforkStop) Killable() bool { return true }