github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_clone.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    19  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    20  	"github.com/MerlinKodo/gvisor/pkg/bpf"
    21  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    22  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    23  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    24  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs"
    25  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/nsfs"
    26  	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
    28  	pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    30  	"github.com/MerlinKodo/gvisor/pkg/usermem"
    31  )
    32  
    33  // SupportedCloneFlags is the bitwise OR of all the supported flags for clone.
    34  // TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is
    35  // implemented.
    36  const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM |
    37  	linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID |
    38  	linux.CLONE_CHILD_CLEARTID | linux.CLONE_CHILD_SETTID | linux.CLONE_PARENT |
    39  	linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS |
    40  	linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED |
    41  	linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS
    42  
    43  // Clone implements the clone(2) syscall and returns the thread ID of the new
    44  // task in t's PID namespace. Clone may return both a non-zero thread ID and a
    45  // non-nil error.
    46  //
    47  // Preconditions: The caller must be running Task.doSyscallInvoke on the task
    48  // goroutine.
    49  func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
    50  	if args.Flags&^SupportedCloneFlags != 0 {
    51  		return 0, nil, linuxerr.EINVAL
    52  	}
    53  	// Since signal actions may refer to application signal handlers by virtual
    54  	// address, any set of signal handlers must refer to the same address
    55  	// space.
    56  	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
    57  		return 0, nil, linuxerr.EINVAL
    58  	}
    59  	if args.SetTID != 0 {
    60  		return 0, nil, linuxerr.ENOTSUP
    61  	}
    62  	// In order for the behavior of thread-group-directed signals to be sane,
    63  	// all tasks in a thread group must share signal handlers.
    64  	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
    65  		return 0, nil, linuxerr.EINVAL
    66  	}
    67  	// All tasks in a thread group must be in the same PID namespace.
    68  	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
    69  		return 0, nil, linuxerr.EINVAL
    70  	}
    71  	// The two different ways of specifying a new PID namespace are
    72  	// incompatible.
    73  	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
    74  		return 0, nil, linuxerr.EINVAL
    75  	}
    76  	// Thread groups and FS contexts cannot span user namespaces.
    77  	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
    78  		return 0, nil, linuxerr.EINVAL
    79  	}
    80  	// args.ExitSignal must be a valid signal.
    81  	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
    82  		return 0, nil, linuxerr.EINVAL
    83  	}
    84  	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
    85  		return 0, nil, linuxerr.EINVAL
    86  	}
    87  
    88  	// Pull task registers and FPU state, a cloned task will inherit the
    89  	// state of the current task.
    90  	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
    91  		t.Warningf("Unable to pull a full state: %v", err)
    92  		t.forceSignal(linux.SIGILL, true /* unconditional */)
    93  		t.SendSignal(SignalInfoPriv(linux.SIGILL))
    94  		return 0, nil, linuxerr.EFAULT
    95  	}
    96  
    97  	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
    98  	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
    99  	// be created first, giving the child (clone(2)) or caller (unshare(2))
   100  	// privileges over the remaining namespaces created by the call." -
   101  	// user_namespaces(7)
   102  	creds := t.Credentials()
   103  	userns := creds.UserNamespace
   104  	if args.Flags&linux.CLONE_NEWUSER != 0 {
   105  		var err error
   106  		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
   107  		// the caller is in a chroot environment (i.e., the caller's root
   108  		// directory does not match the root directory of the mount namespace
   109  		// in which it resides)." - clone(2). Neither chroot(2) nor
   110  		// user_namespaces(7) document this.
   111  		if t.IsChrooted() {
   112  			return 0, nil, linuxerr.EPERM
   113  		}
   114  		userns, err = creds.NewChildUserNamespace()
   115  		if err != nil {
   116  			return 0, nil, err
   117  		}
   118  	}
   119  	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
   120  		return 0, nil, linuxerr.EPERM
   121  	}
   122  
   123  	cu := cleanup.Make(func() {})
   124  	defer cu.Clean()
   125  
   126  	utsns := t.utsns
   127  	if args.Flags&linux.CLONE_NEWUTS != 0 {
   128  		// Note that this must happen after NewUserNamespace so we get
   129  		// the new userns if there is one.
   130  		utsns = utsns.Clone(userns)
   131  		utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns))
   132  	} else {
   133  		utsns.IncRef()
   134  	}
   135  	cu.Add(func() {
   136  		utsns.DecRef(t)
   137  	})
   138  
   139  	ipcns := t.ipcns
   140  	if args.Flags&linux.CLONE_NEWIPC != 0 {
   141  		ipcns = NewIPCNamespace(userns)
   142  		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   143  		ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns))
   144  	} else {
   145  		ipcns.IncRef()
   146  	}
   147  	cu.Add(func() {
   148  		ipcns.DecRef(t)
   149  	})
   150  
   151  	netns := t.netns
   152  	if args.Flags&linux.CLONE_NEWNET != 0 {
   153  		netns = inet.NewNamespace(netns, userns)
   154  		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   155  		netns.SetInode(inode)
   156  	} else {
   157  		netns.IncRef()
   158  	}
   159  	cu.Add(func() {
   160  		netns.DecRef(t)
   161  	})
   162  
   163  	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
   164  	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
   165  	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
   166  	t.mu.Lock()
   167  	curImage := t.image
   168  	sessionKeyring := t.sessionKeyring
   169  	t.mu.Unlock()
   170  	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
   171  	if err != nil {
   172  		return 0, nil, err
   173  	}
   174  	cu.Add(func() {
   175  		image.release(t)
   176  	})
   177  
   178  	if args.Flags&linux.CLONE_NEWUSER != 0 {
   179  		// If the task is in a new user namespace, it cannot share keys.
   180  		sessionKeyring = nil
   181  	}
   182  
   183  	// clone() returns 0 in the child.
   184  	image.Arch.SetReturn(0)
   185  	if args.Stack != 0 {
   186  		image.Arch.SetStack(uintptr(args.Stack + args.StackSize))
   187  	}
   188  	if args.Flags&linux.CLONE_SETTLS != 0 {
   189  		if !image.Arch.SetTLS(uintptr(args.TLS)) {
   190  			return 0, nil, linuxerr.EPERM
   191  		}
   192  	}
   193  
   194  	var fsContext *FSContext
   195  	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
   196  		fsContext = t.fsContext.Fork()
   197  	} else {
   198  		fsContext = t.fsContext
   199  		fsContext.IncRef()
   200  	}
   201  
   202  	mntns := t.mountNamespace
   203  	if args.Flags&linux.CLONE_NEWNS != 0 {
   204  		var err error
   205  		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k)
   206  		if err != nil {
   207  			return 0, nil, err
   208  		}
   209  	} else {
   210  		mntns.IncRef()
   211  	}
   212  	cu.Add(func() {
   213  		mntns.DecRef(t)
   214  	})
   215  
   216  	var fdTable *FDTable
   217  	if args.Flags&linux.CLONE_FILES == 0 {
   218  		fdTable = t.fdTable.Fork(t, MaxFdLimit)
   219  	} else {
   220  		fdTable = t.fdTable
   221  		fdTable.IncRef()
   222  	}
   223  
   224  	pidns := t.tg.pidns
   225  	if t.childPIDNamespace != nil {
   226  		pidns = t.childPIDNamespace
   227  	} else if args.Flags&linux.CLONE_NEWPID != 0 {
   228  		pidns = pidns.NewChild(userns)
   229  	}
   230  
   231  	tg := t.tg
   232  	rseqAddr := hostarch.Addr(0)
   233  	rseqSignature := uint32(0)
   234  	if args.Flags&linux.CLONE_THREAD == 0 {
   235  		sh := t.tg.signalHandlers
   236  		if args.Flags&linux.CLONE_SIGHAND == 0 {
   237  			sh = sh.Fork()
   238  		}
   239  		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
   240  		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
   241  		rseqAddr = t.rseqAddr
   242  		rseqSignature = t.rseqSignature
   243  	}
   244  
   245  	uc := t.userCounters
   246  	if uc.uid != creds.RealKUID {
   247  		uc = t.k.GetUserCounters(creds.RealKUID)
   248  	}
   249  
   250  	cfg := &TaskConfig{
   251  		Kernel:                  t.k,
   252  		ThreadGroup:             tg,
   253  		SignalMask:              t.SignalMask(),
   254  		TaskImage:               image,
   255  		FSContext:               fsContext,
   256  		FDTable:                 fdTable,
   257  		Credentials:             creds,
   258  		Niceness:                t.Niceness(),
   259  		NetworkNamespace:        netns,
   260  		AllowedCPUMask:          t.CPUMask(),
   261  		UTSNamespace:            utsns,
   262  		IPCNamespace:            ipcns,
   263  		AbstractSocketNamespace: t.abstractSockets,
   264  		MountNamespace:          mntns,
   265  		RSeqAddr:                rseqAddr,
   266  		RSeqSignature:           rseqSignature,
   267  		ContainerID:             t.ContainerID(),
   268  		UserCounters:            uc,
   269  		SessionKeyring:          sessionKeyring,
   270  	}
   271  	if args.Flags&linux.CLONE_THREAD == 0 {
   272  		cfg.Parent = t
   273  	} else {
   274  		cfg.InheritParent = t
   275  	}
   276  	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
   277  	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
   278  	// the cleanup for us.
   279  	cu.Release()
   280  	if err != nil {
   281  		return 0, nil, err
   282  	}
   283  
   284  	// "A child process created via fork(2) inherits a copy of its parent's
   285  	// alternate signal stack settings" - sigaltstack(2).
   286  	//
   287  	// However kernel/fork.c:copy_process() adds a limitation to this:
   288  	// "sigaltstack should be cleared when sharing the same VM".
   289  	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
   290  		nt.SetSignalStack(t.SignalStack())
   291  	}
   292  
   293  	if userns != creds.UserNamespace {
   294  		if err := nt.SetUserNamespace(userns); err != nil {
   295  			// This shouldn't be possible: userns was created from nt.creds, so
   296  			// nt should have CAP_SYS_ADMIN in userns.
   297  			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
   298  		}
   299  	}
   300  
   301  	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
   302  	// nt that it must receive before its task goroutine starts running.
   303  	tid := nt.k.tasks.Root.IDOfTask(nt)
   304  	defer nt.Start(tid)
   305  
   306  	if seccheck.Global.Enabled(seccheck.PointClone) {
   307  		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
   308  		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
   309  			return c.Clone(t, mask, info)
   310  		}); err != nil {
   311  			// nt has been visible to the rest of the system since NewTask, so
   312  			// it may be blocking execve or a group stop, have been notified
   313  			// for group signal delivery, had children reparented to it, etc.
   314  			// Thus we can't just drop it on the floor. Instead, instruct the
   315  			// task goroutine to exit immediately, as quietly as possible.
   316  			nt.exitTracerNotified = true
   317  			nt.exitTracerAcked = true
   318  			nt.exitParentNotified = true
   319  			nt.exitParentAcked = true
   320  			nt.runState = (*runExitMain)(nil)
   321  			return 0, nil, err
   322  		}
   323  	}
   324  
   325  	// "If fork/clone and execve are allowed by @prog, any child processes will
   326  	// be constrained to the same filters and system call ABI as the parent." -
   327  	// Documentation/prctl/seccomp_filter.txt
   328  	if f := t.syscallFilters.Load(); f != nil {
   329  		copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
   330  		nt.syscallFilters.Store(copiedFilters)
   331  	}
   332  	if args.Flags&linux.CLONE_VFORK != 0 {
   333  		nt.vforkParent = t
   334  	}
   335  
   336  	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
   337  		nt.SetClearTID(hostarch.Addr(args.ChildTID))
   338  	}
   339  	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
   340  		ctid := nt.ThreadID()
   341  		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
   342  	}
   343  	ntid := t.tg.pidns.IDOfTask(nt)
   344  	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
   345  		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
   346  	}
   347  
   348  	t.traceCloneEvent(tid)
   349  	kind := ptraceCloneKindClone
   350  	if args.Flags&linux.CLONE_VFORK != 0 {
   351  		kind = ptraceCloneKindVfork
   352  	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
   353  		kind = ptraceCloneKindFork
   354  	}
   355  	if t.ptraceClone(kind, nt, args) {
   356  		if args.Flags&linux.CLONE_VFORK != 0 {
   357  			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
   358  		}
   359  		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
   360  	}
   361  	if args.Flags&linux.CLONE_VFORK != 0 {
   362  		t.maybeBeginVforkStop(nt)
   363  		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
   364  	}
   365  	return ntid, nil, nil
   366  }
   367  
   368  func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
   369  	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
   370  	var cwd string
   371  	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
   372  		cwd = getTaskCurrentWorkingDirectory(t)
   373  	}
   374  	t.k.tasks.mu.RLock()
   375  	defer t.k.tasks.mu.RUnlock()
   376  	info := &pb.CloneInfo{
   377  		CreatedThreadId:          int32(nt.k.tasks.Root.tids[nt]),
   378  		CreatedThreadGroupId:     int32(nt.k.tasks.Root.tgids[nt.tg]),
   379  		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
   380  		Flags:                    flags,
   381  	}
   382  
   383  	if !fields.Context.Empty() {
   384  		info.ContextData = &pb.ContextData{}
   385  		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
   386  	}
   387  
   388  	return fields, info
   389  }
   390  
   391  // maybeBeginVforkStop checks if a previously-started vfork child is still
   392  // running and has not yet released its MM, such that its parent t should enter
   393  // a vforkStop.
   394  //
   395  // Preconditions: The caller must be running on t's task goroutine.
   396  func (t *Task) maybeBeginVforkStop(child *Task) {
   397  	t.tg.pidns.owner.mu.RLock()
   398  	defer t.tg.pidns.owner.mu.RUnlock()
   399  	t.tg.signalHandlers.mu.Lock()
   400  	defer t.tg.signalHandlers.mu.Unlock()
   401  	if t.killedLocked() {
   402  		child.vforkParent = nil
   403  		return
   404  	}
   405  	if child.vforkParent == t {
   406  		t.beginInternalStopLocked((*vforkStop)(nil))
   407  	}
   408  }
   409  
   410  func (t *Task) unstopVforkParent() {
   411  	t.tg.pidns.owner.mu.RLock()
   412  	defer t.tg.pidns.owner.mu.RUnlock()
   413  	if p := t.vforkParent; p != nil {
   414  		p.tg.signalHandlers.mu.Lock()
   415  		defer p.tg.signalHandlers.mu.Unlock()
   416  		if _, ok := p.stop.(*vforkStop); ok {
   417  			p.endInternalStopLocked()
   418  		}
   419  		// Parent no longer needs to be unstopped.
   420  		t.vforkParent = nil
   421  	}
   422  }
   423  
// runSyscallAfterPtraceEventClone is the run state that Clone schedules as
// the next state when ptraceClone reports true for the new task.
//
// +stateify savable
type runSyscallAfterPtraceEventClone struct {
	// vforkChild is the newly created task when the clone used CLONE_VFORK,
	// and nil otherwise.
	vforkChild *Task

	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
	// PID namespace. vforkChildTID must be stored since the child may exit and
	// release its TID before the PTRACE_EVENT stop ends.
	vforkChildTID ThreadID
}
   433  
   434  func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
   435  	if r.vforkChild != nil {
   436  		t.maybeBeginVforkStop(r.vforkChild)
   437  		return &runSyscallAfterVforkStop{r.vforkChildTID}
   438  	}
   439  	return (*runSyscallExit)(nil)
   440  }
   441  
// runSyscallAfterVforkStop is the run state entered by a vfork parent after
// maybeBeginVforkStop has been called for its child.
//
// +stateify savable
type runSyscallAfterVforkStop struct {
	// childTID has the same meaning as
	// runSyscallAfterPtraceEventClone.vforkChildTID.
	childTID ThreadID
}
   448  
   449  func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
   450  	t.ptraceVforkDone(r.childTID)
   451  	return (*runSyscallExit)(nil)
   452  }
   453  
   454  // Setns reassociates thread with the specified namespace.
   455  func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
   456  	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
   457  	if !ok {
   458  		return linuxerr.EINVAL
   459  	}
   460  	i, ok := d.Inode().(*nsfs.Inode)
   461  	if !ok {
   462  		return linuxerr.EINVAL
   463  	}
   464  
   465  	switch ns := i.Namespace().(type) {
   466  	case *inet.Namespace:
   467  		if flags != 0 && flags != linux.CLONE_NEWNET {
   468  			return linuxerr.EINVAL
   469  		}
   470  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   471  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   472  			return linuxerr.EPERM
   473  		}
   474  		oldNS := t.NetworkNamespace()
   475  		ns.IncRef()
   476  		t.mu.Lock()
   477  		t.netns = ns
   478  		t.mu.Unlock()
   479  		oldNS.DecRef(t)
   480  		return nil
   481  	case *IPCNamespace:
   482  		if flags != 0 && flags != linux.CLONE_NEWIPC {
   483  			return linuxerr.EINVAL
   484  		}
   485  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   486  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   487  			return linuxerr.EPERM
   488  		}
   489  		oldNS := t.IPCNamespace()
   490  		ns.IncRef()
   491  		t.mu.Lock()
   492  		t.ipcns = ns
   493  		t.mu.Unlock()
   494  		oldNS.DecRef(t)
   495  		return nil
   496  	case *vfs.MountNamespace:
   497  		if flags != 0 && flags != linux.CLONE_NEWNS {
   498  			return linuxerr.EINVAL
   499  		}
   500  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) ||
   501  			!t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) ||
   502  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   503  			return linuxerr.EPERM
   504  		}
   505  		oldFSContext := t.fsContext
   506  		// The current task has to be an exclusive owner of its fs context.
   507  		if oldFSContext.ReadRefs() != 1 {
   508  			return linuxerr.EINVAL
   509  		}
   510  		fsContext := oldFSContext.Fork()
   511  		fsContext.root.DecRef(t)
   512  		fsContext.cwd.DecRef(t)
   513  		vd := ns.Root(t)
   514  		fsContext.root = vd
   515  		vd.IncRef()
   516  		fsContext.cwd = vd
   517  
   518  		oldNS := t.mountNamespace
   519  		ns.IncRef()
   520  		t.mu.Lock()
   521  		t.mountNamespace = ns
   522  		t.fsContext = fsContext
   523  		t.mu.Unlock()
   524  		oldNS.DecRef(t)
   525  		oldFSContext.DecRef(t)
   526  		return nil
   527  	case *UTSNamespace:
   528  		if flags != 0 && flags != linux.CLONE_NEWUTS {
   529  			return linuxerr.EINVAL
   530  		}
   531  		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
   532  			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
   533  			return linuxerr.EPERM
   534  		}
   535  		oldNS := t.UTSNamespace()
   536  		ns.IncRef()
   537  		t.mu.Lock()
   538  		t.utsns = ns
   539  		t.mu.Unlock()
   540  		oldNS.DecRef(t)
   541  		return nil
   542  	default:
   543  		return linuxerr.EINVAL
   544  	}
   545  }
   546  
   547  // Unshare changes the set of resources t shares with other tasks, as specified
   548  // by flags.
   549  //
   550  // Preconditions: The caller must be running on the task goroutine.
   551  func (t *Task) Unshare(flags int32) error {
   552  	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
   553  	// the caller is single threaded (i.e., it is not sharing its address space
   554  	// with another process or thread). In this case, these flags have no
   555  	// effect. (Note also that specifying CLONE_THREAD automatically implies
   556  	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
   557  	// If the process is multithreaded, then the use of these flags results in
   558  	// an error." - unshare(2). This is incorrect (cf.
   559  	// kernel/fork.c:ksys_unshare()):
   560  	//
   561  	//	- CLONE_THREAD does not imply CLONE_VM.
   562  	//
   563  	//	- CLONE_SIGHAND implies CLONE_THREAD.
   564  	//
   565  	//	- Only CLONE_VM requires that the caller is not sharing its address
   566  	//		space with another thread. CLONE_SIGHAND requires that the caller is not
   567  	//		sharing its signal handlers, and CLONE_THREAD requires that the caller
   568  	//		is the only thread in its thread group.
   569  	//
   570  	// Since we don't count the number of tasks using each address space or set
   571  	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
   572  	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
   573  		return linuxerr.EINVAL
   574  	}
   575  	creds := t.Credentials()
   576  	if flags&linux.CLONE_THREAD != 0 {
   577  		t.tg.signalHandlers.mu.Lock()
   578  		if t.tg.tasksCount != 1 {
   579  			t.tg.signalHandlers.mu.Unlock()
   580  			return linuxerr.EINVAL
   581  		}
   582  		t.tg.signalHandlers.mu.Unlock()
   583  		// This isn't racy because we're the only living task, and therefore
   584  		// the only task capable of creating new ones, in our thread group.
   585  	}
   586  	if flags&linux.CLONE_NEWUSER != 0 {
   587  		if t.IsChrooted() {
   588  			return linuxerr.EPERM
   589  		}
   590  		newUserNS, err := creds.NewChildUserNamespace()
   591  		if err != nil {
   592  			return err
   593  		}
   594  		err = t.SetUserNamespace(newUserNS)
   595  		if err != nil {
   596  			return err
   597  		}
   598  		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
   599  		creds = t.Credentials()
   600  	}
   601  	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
   602  	if flags&linux.CLONE_NEWPID != 0 {
   603  		if !haveCapSysAdmin {
   604  			return linuxerr.EPERM
   605  		}
   606  		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
   607  	}
   608  	if flags&linux.CLONE_NEWNET != 0 {
   609  		if !haveCapSysAdmin {
   610  			return linuxerr.EPERM
   611  		}
   612  		netns := t.NetworkNamespace()
   613  		netns = inet.NewNamespace(netns, t.UserNamespace())
   614  		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
   615  		netns.SetInode(netnsInode)
   616  		t.mu.Lock()
   617  		oldNetns := t.netns
   618  		t.netns = netns
   619  		t.mu.Unlock()
   620  		oldNetns.DecRef(t)
   621  	}
   622  
   623  	cu := cleanup.Cleanup{}
   624  	// All cu actions has to be executed after releasing t.mu.
   625  	defer cu.Clean()
   626  	t.mu.Lock()
   627  	defer t.mu.Unlock()
   628  	// Can't defer unlock: DecRefs must occur without holding t.mu.
   629  	if flags&linux.CLONE_NEWUTS != 0 {
   630  		if !haveCapSysAdmin {
   631  			return linuxerr.EPERM
   632  		}
   633  		// Note that this must happen after NewUserNamespace, so the
   634  		// new user namespace is used if there is one.
   635  		oldUTSNS := t.utsns
   636  		t.utsns = t.utsns.Clone(creds.UserNamespace)
   637  		t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
   638  		cu.Add(func() { oldUTSNS.DecRef(t) })
   639  	}
   640  	if flags&linux.CLONE_NEWIPC != 0 {
   641  		if !haveCapSysAdmin {
   642  			return linuxerr.EPERM
   643  		}
   644  		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
   645  		// namespace"
   646  		oldIPCNS := t.ipcns
   647  		t.ipcns = NewIPCNamespace(creds.UserNamespace)
   648  		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
   649  		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
   650  		cu.Add(func() { oldIPCNS.DecRef(t) })
   651  	}
   652  	if flags&linux.CLONE_FILES != 0 {
   653  		oldFDTable := t.fdTable
   654  		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
   655  		cu.Add(func() { oldFDTable.DecRef(t) })
   656  	}
   657  	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
   658  		oldFSContext := t.fsContext
   659  		t.fsContext = oldFSContext.Fork()
   660  		cu.Add(func() { oldFSContext.DecRef(t) })
   661  	}
   662  	if flags&linux.CLONE_NEWNS != 0 {
   663  		if !haveCapSysAdmin {
   664  			return linuxerr.EPERM
   665  		}
   666  		oldMountNS := t.mountNamespace
   667  		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
   668  		if err != nil {
   669  			return err
   670  		}
   671  		t.mountNamespace = mntns
   672  		cu.Add(func() { oldMountNS.DecRef(t) })
   673  	}
   674  	return nil
   675  }
   676  
   677  // UnshareFdTable unshares the FdTable that task t shares with other tasks, upto
   678  // the maxFd.
   679  //
   680  // Preconditions: The caller must be running on the task goroutine.
   681  func (t *Task) UnshareFdTable(maxFd int32) {
   682  	t.mu.Lock()
   683  	oldFDTable := t.fdTable
   684  	t.fdTable = oldFDTable.Fork(t, maxFd)
   685  	t.mu.Unlock()
   686  
   687  	oldFDTable.DecRef(t)
   688  }
   689  
   690  // vforkStop is a TaskStop imposed on a task that creates a child with
   691  // CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
   692  // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
   693  // that the child and parent share mappings until the child execve()s into a
   694  // new process image or exits.)
   695  //
   696  // +stateify savable
   697  type vforkStop struct{}
   698  
   699  // StopIgnoresKill implements TaskStop.Killable.
   700  func (*vforkStop) Killable() bool { return true }