gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/task_clone.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/seccheck"
	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// SupportedCloneFlags is the bitwise OR of all the supported flags for clone.
// TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is
// implemented.
const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM |
	linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID |
	linux.CLONE_CHILD_CLEARTID | linux.CLONE_CHILD_SETTID | linux.CLONE_PARENT |
	linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS |
	linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED |
	linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS

// Clone implements the clone(2) syscall and returns the thread ID of the new
// task in t's PID namespace. Clone may return both a non-zero thread ID and a
// non-nil error.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
	if args.Flags&^SupportedCloneFlags != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	// Since signal actions may refer to application signal handlers by virtual
	// address, any set of signal handlers must refer to the same address
	// space.
	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
		return 0, nil, linuxerr.EINVAL
	}
	if args.SetTID != 0 {
		return 0, nil, linuxerr.ENOTSUP
	}
	// In order for the behavior of thread-group-directed signals to be sane,
	// all tasks in a thread group must share signal handlers.
	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
		return 0, nil, linuxerr.EINVAL
	}
	// All tasks in a thread group must be in the same PID namespace.
	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
		return 0, nil, linuxerr.EINVAL
	}
	// The two different ways of specifying a new PID namespace are
	// incompatible.
	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
		return 0, nil, linuxerr.EINVAL
	}
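	// The checks above mirror the flag-compatibility checks that Linux
	// performs in kernel/fork.c:copy_process(): a new thread must share
	// signal handlers, and shared signal handlers require a shared address
	// space.
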
	// Thread groups and FS contexts cannot span user namespaces.
	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	// args.ExitSignal must be a valid signal.
	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
		return 0, nil, linuxerr.EINVAL
	}
	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
		return 0, nil, linuxerr.EINVAL
	}

	// Pull task registers and FPU state; a cloned task will inherit the
	// state of the current task.
	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
		t.Warningf("Unable to pull a full state: %v", err)
		t.forceSignal(linux.SIGILL, true /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGILL))
		return 0, nil, linuxerr.EFAULT
	}

	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
	// be created first, giving the child (clone(2)) or caller (unshare(2))
	// privileges over the remaining namespaces created by the call." -
	// user_namespaces(7)
	creds := t.Credentials()
	userns := creds.UserNamespace
	if args.Flags&linux.CLONE_NEWUSER != 0 {
		var err error
		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
		// the caller is in a chroot environment (i.e., the caller's root
		// directory does not match the root directory of the mount namespace
		// in which it resides)." - clone(2). Neither chroot(2) nor
		// user_namespaces(7) document this.
		if t.IsChrooted() {
			return 0, nil, linuxerr.EPERM
		}
		userns, err = creds.NewChildUserNamespace()
		if err != nil {
			return 0, nil, err
		}
	}
	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
		return 0, nil, linuxerr.EPERM
	}

	cu := cleanup.Make(func() {})
	defer cu.Clean()

	utsns := t.utsns
	if args.Flags&linux.CLONE_NEWUTS != 0 {
		// Note that this must happen after NewUserNamespace so we get
		// the new userns if there is one.
		utsns = utsns.Clone(userns)
		utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns))
	} else {
		utsns.IncRef()
	}
	cu.Add(func() {
		utsns.DecRef(t)
	})

	ipcns := t.ipcns
	if args.Flags&linux.CLONE_NEWIPC != 0 {
		ipcns = NewIPCNamespace(userns)
		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
		ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns))
	} else {
		ipcns.IncRef()
	}
	cu.Add(func() {
		ipcns.DecRef(t)
	})

	netns := t.netns
	if args.Flags&linux.CLONE_NEWNET != 0 {
		netns = inet.NewNamespace(netns, userns)
		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
		netns.SetInode(inode)
	} else {
		netns.IncRef()
	}
	cu.Add(func() {
		netns.DecRef(t)
	})
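	// cu now owns one reference on each of utsns, ipcns, and netns. On any
	// early error return below, cu.Clean() drops them; on success,
	// cu.Release() transfers them (along with the task image and mount
	// namespace references added below) to the new task.
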
	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is
	// ordered above Task.mu. So we copy t.image with t.mu held and call Fork()
	// on the copy.
	t.mu.Lock()
	curImage := t.image
	sessionKeyring := t.sessionKeyring
	t.mu.Unlock()
	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
	if err != nil {
		return 0, nil, err
	}
	cu.Add(func() {
		image.release(t)
	})

	if args.Flags&linux.CLONE_NEWUSER != 0 {
		// If the task is in a new user namespace, it cannot share keys.
		sessionKeyring = nil
	}

	// clone() returns 0 in the child.
	image.Arch.SetReturn(0)
	if args.Stack != 0 {
		image.Arch.SetStack(uintptr(args.Stack + args.StackSize))
	}
	if args.Flags&linux.CLONE_SETTLS != 0 {
		if !image.Arch.SetTLS(uintptr(args.TLS)) {
			return 0, nil, linuxerr.EPERM
		}
	}

	var fsContext *FSContext
	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
		fsContext = t.fsContext.Fork()
	} else {
		fsContext = t.fsContext
		fsContext.IncRef()
	}

	mntns := t.mountNamespace
	if args.Flags&linux.CLONE_NEWNS != 0 {
		var err error
		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k)
		if err != nil {
			return 0, nil, err
		}
	} else {
		mntns.IncRef()
	}
	cu.Add(func() {
		mntns.DecRef(t)
	})

	var fdTable *FDTable
	if args.Flags&linux.CLONE_FILES == 0 {
		fdTable = t.fdTable.Fork(t, MaxFdLimit)
	} else {
		fdTable = t.fdTable
		fdTable.IncRef()
	}

	pidns := t.tg.pidns
	if t.childPIDNamespace != nil {
		pidns = t.childPIDNamespace
	} else if args.Flags&linux.CLONE_NEWPID != 0 {
		pidns = pidns.NewChild(userns)
	}

	tg := t.tg
	rseqAddr := hostarch.Addr(0)
	rseqSignature := uint32(0)
	if args.Flags&linux.CLONE_THREAD == 0 {
		sh := t.tg.signalHandlers
		if args.Flags&linux.CLONE_SIGHAND == 0 {
			sh = sh.Fork()
		}
		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
		rseqAddr = t.rseqAddr
		rseqSignature = t.rseqSignature
	}

	uc := t.userCounters
	if uc.uid != creds.RealKUID {
		uc = t.k.GetUserCounters(creds.RealKUID)
	}

	cfg := &TaskConfig{
		Kernel:           t.k,
		ThreadGroup:      tg,
		SignalMask:       t.SignalMask(),
		TaskImage:        image,
		FSContext:        fsContext,
		FDTable:          fdTable,
		Credentials:      creds,
		Niceness:         t.Niceness(),
		NetworkNamespace: netns,
		AllowedCPUMask:   t.CPUMask(),
		UTSNamespace:     utsns,
		IPCNamespace:     ipcns,
		MountNamespace:   mntns,
		RSeqAddr:         rseqAddr,
		RSeqSignature:    rseqSignature,
		ContainerID:      t.ContainerID(),
		UserCounters:     uc,
		SessionKeyring:   sessionKeyring,
		Origin:           t.Origin,
	}
	if args.Flags&linux.CLONE_THREAD == 0 {
		cfg.Parent = t
	} else {
		cfg.InheritParent = t
	}
	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it
	// does the cleanup for us.
	cu.Release()
	if err != nil {
		return 0, nil, err
	}
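	// nt now exists and is visible to the rest of the system; any failure
	// from this point on must tear nt down explicitly rather than relying on
	// cu.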

	// "A child process created via fork(2) inherits a copy of its parent's
	// alternate signal stack settings" - sigaltstack(2).
	//
	// However kernel/fork.c:copy_process() adds a limitation to this:
	// "sigaltstack should be cleared when sharing the same VM".
	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
		nt.SetSignalStack(t.SignalStack())
	}

	if userns != creds.UserNamespace {
		if err := nt.SetUserNamespace(userns); err != nil {
			// This shouldn't be possible: userns was created from nt.creds, so
			// nt should have CAP_SYS_ADMIN in userns.
			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
		}
	}

	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
	// nt that it must receive before its task goroutine starts running.
	tid := nt.k.tasks.Root.IDOfTask(nt)
	defer nt.Start(tid)

	if seccheck.Global.Enabled(seccheck.PointClone) {
		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.Clone(t, mask, info)
		}); err != nil {
			// nt has been visible to the rest of the system since NewTask, so
			// it may be blocking execve or a group stop, have been notified
			// for group signal delivery, had children reparented to it, etc.
			// Thus we can't just drop it on the floor. Instead, instruct the
			// task goroutine to exit immediately, as quietly as possible.
			nt.exitTracerNotified = true
			nt.exitTracerAcked = true
			nt.exitParentNotified = true
			nt.exitParentAcked = true
			nt.runState = (*runExitMain)(nil)
			return 0, nil, err
		}
	}

	// "If fork/clone and execve are allowed by @prog, any child processes will
	// be constrained to the same filters and system call ABI as the parent." -
	// Documentation/prctl/seccomp_filter.txt
	if ts := t.seccomp.Load(); ts != nil {
		seccompCopy := ts.copy()
		seccompCopy.populateCache(nt)
		nt.seccomp.Store(seccompCopy)
	} else {
		nt.seccomp.Store(nil)
	}
	if args.Flags&linux.CLONE_VFORK != 0 {
		nt.vforkParent = t
	}

	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
		nt.SetClearTID(hostarch.Addr(args.ChildTID))
	}
	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
		ctid := nt.ThreadID()
		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
	}
	ntid := t.tg.pidns.IDOfTask(nt)
	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
	}

	t.traceCloneEvent(tid)
	kind := ptraceCloneKindClone
	if args.Flags&linux.CLONE_VFORK != 0 {
		kind = ptraceCloneKindVfork
	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
		kind = ptraceCloneKindFork
	}
	if t.ptraceClone(kind, nt, args) {
		if args.Flags&linux.CLONE_VFORK != 0 {
			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
		}
		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
	}
	if args.Flags&linux.CLONE_VFORK != 0 {
		t.maybeBeginVforkStop(nt)
		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
	}
	return ntid, nil, nil
}
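
// A minimal caller sketch for Clone (hypothetical; the real entry points are
// the fork/vfork/clone handlers in the syscalls package). Assuming a
// fork-style invocation:
//
//	args := linux.CloneArgs{ExitSignal: uint64(linux.SIGCHLD)}
//	ntid, ctrl, err := t.Clone(&args)
//	// ntid is the child's ThreadID in t's PID namespace; ctrl, if non-nil,
//	// tells the task goroutine how to continue (e.g. a vfork stop).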

func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
	var cwd string
	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
		cwd = getTaskCurrentWorkingDirectory(t)
	}
	t.k.tasks.mu.RLock()
	defer t.k.tasks.mu.RUnlock()
	info := &pb.CloneInfo{
		CreatedThreadId:          int32(nt.k.tasks.Root.tids[nt]),
		CreatedThreadGroupId:     int32(nt.k.tasks.Root.tgids[nt.tg]),
		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
		Flags:                    flags,
	}

	if !fields.Context.Empty() {
		info.ContextData = &pb.ContextData{}
		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
	}

	return fields, info
}

// maybeBeginVforkStop checks if a previously-started vfork child is still
// running and has not yet released its MM, such that its parent t should enter
// a vforkStop.
//
// Preconditions: The caller must be running on t's task goroutine.
func (t *Task) maybeBeginVforkStop(child *Task) {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.killedLocked() {
		child.vforkParent = nil
		return
	}
	if child.vforkParent == t {
		t.beginInternalStopLocked((*vforkStop)(nil))
	}
}

func (t *Task) unstopVforkParent() {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if p := t.vforkParent; p != nil {
		p.tg.signalHandlers.mu.Lock()
		defer p.tg.signalHandlers.mu.Unlock()
		if _, ok := p.stop.(*vforkStop); ok {
			p.endInternalStopLocked()
		}
		// Parent no longer needs to be unstopped.
		t.vforkParent = nil
	}
}

// +stateify savable
type runSyscallAfterPtraceEventClone struct {
	vforkChild *Task

	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
	// PID namespace. vforkChildTID must be stored since the child may exit and
	// release its TID before the PTRACE_EVENT stop ends.
	vforkChildTID ThreadID
}

func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
	if r.vforkChild != nil {
		t.maybeBeginVforkStop(r.vforkChild)
		return &runSyscallAfterVforkStop{r.vforkChildTID}
	}
	return (*runSyscallExit)(nil)
}

// +stateify savable
type runSyscallAfterVforkStop struct {
	// childTID has the same meaning as
	// runSyscallAfterPtraceEventClone.vforkChildTID.
	childTID ThreadID
}

func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
	t.ptraceVforkDone(r.childTID)
	return (*runSyscallExit)(nil)
}
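
// A minimal caller sketch for Setns below (hypothetical; the real setns(2)
// handler lives in the syscalls package):
//
//	file := t.GetFile(fd) // fd refers to e.g. an open /proc/<pid>/ns/net
//	if file == nil {
//		return linuxerr.EBADF
//	}
//	defer file.DecRef(t)
//	return t.Setns(file, linux.CLONE_NEWNET)
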
// Setns reassociates thread with the specified namespace.
func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
	if !ok {
		return linuxerr.EINVAL
	}
	i, ok := d.Inode().(*nsfs.Inode)
	if !ok {
		return linuxerr.EINVAL
	}

	switch ns := i.Namespace().(type) {
	case *inet.Namespace:
		if flags != 0 && flags != linux.CLONE_NEWNET {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.NetworkNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.netns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	case *IPCNamespace:
		if flags != 0 && flags != linux.CLONE_NEWIPC {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.IPCNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.ipcns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	case *vfs.MountNamespace:
		if flags != 0 && flags != linux.CLONE_NEWNS {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldFSContext := t.fsContext
		// The current task has to be an exclusive owner of its fs context.
		if oldFSContext.ReadRefs() != 1 {
			return linuxerr.EINVAL
		}
		fsContext := oldFSContext.Fork()
		fsContext.root.DecRef(t)
		fsContext.cwd.DecRef(t)
		vd := ns.Root(t)
		fsContext.root = vd
		vd.IncRef()
		fsContext.cwd = vd

		oldNS := t.mountNamespace
		ns.IncRef()
		t.mu.Lock()
		t.mountNamespace = ns
		t.fsContext = fsContext
		t.mu.Unlock()
		oldNS.DecRef(t)
		oldFSContext.DecRef(t)
		return nil
	case *UTSNamespace:
		if flags != 0 && flags != linux.CLONE_NEWUTS {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.UTSNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.utsns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	default:
		return linuxerr.EINVAL
	}
}
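
// Every case in Setns above follows the same shape: validate that flags
// matches the namespace type, require CAP_SYS_ADMIN both in the namespace's
// owning user namespace and in the caller's own credentials, then swap the
// task's namespace pointer under t.mu and drop the old reference outside the
// lock. (The mount namespace case additionally requires CAP_SYS_CHROOT and an
// unshared FS context.)
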
// Unshare changes the set of resources t shares with other tasks, as specified
// by flags.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) Unshare(flags int32) error {
	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
	// the caller is single threaded (i.e., it is not sharing its address space
	// with another process or thread). In this case, these flags have no
	// effect. (Note also that specifying CLONE_THREAD automatically implies
	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
	// If the process is multithreaded, then the use of these flags results in
	// an error." - unshare(2). This is incorrect (cf.
	// kernel/fork.c:ksys_unshare()):
	//
	// - CLONE_THREAD does not imply CLONE_VM.
	//
	// - CLONE_SIGHAND implies CLONE_THREAD.
	//
	// - Only CLONE_VM requires that the caller is not sharing its address
	//   space with another thread. CLONE_SIGHAND requires that the caller is
	//   not sharing its signal handlers, and CLONE_THREAD requires that the
	//   caller is the only thread in its thread group.
	//
	// Since we don't count the number of tasks using each address space or set
	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
		return linuxerr.EINVAL
	}
	creds := t.Credentials()
	if flags&linux.CLONE_THREAD != 0 {
		t.tg.signalHandlers.mu.Lock()
		if t.tg.tasksCount != 1 {
			t.tg.signalHandlers.mu.Unlock()
			return linuxerr.EINVAL
		}
		t.tg.signalHandlers.mu.Unlock()
		// This isn't racy because we're the only living task, and therefore
		// the only task capable of creating new ones, in our thread group.
	}
	if flags&linux.CLONE_NEWUSER != 0 {
		if t.IsChrooted() {
			return linuxerr.EPERM
		}
		newUserNS, err := creds.NewChildUserNamespace()
		if err != nil {
			return err
		}
		err = t.SetUserNamespace(newUserNS)
		if err != nil {
			return err
		}
		// Need to reload creds, because t.SetUserNamespace() changed task
		// credentials.
		creds = t.Credentials()
	}
	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
	if flags&linux.CLONE_NEWPID != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
	}
	if flags&linux.CLONE_NEWNET != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		netns := t.NetworkNamespace()
		netns = inet.NewNamespace(netns, t.UserNamespace())
		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
		netns.SetInode(netnsInode)
		t.mu.Lock()
		oldNetns := t.netns
		t.netns = netns
		t.mu.Unlock()
		oldNetns.DecRef(t)
	}

	cu := cleanup.Cleanup{}
	// All cu actions have to be executed after releasing t.mu.
	defer cu.Clean()
	t.mu.Lock()
	defer t.mu.Unlock()
	// DecRefs must occur without holding t.mu; since deferred calls run in
	// LIFO order, cu.Clean() runs after the deferred unlock above.
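	// Each unshare below follows the same pattern: swap the relevant pointer
	// on t while holding t.mu, and queue the DecRef of the old object on cu
	// so that the reference is dropped only after t.mu is released.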
	if flags&linux.CLONE_NEWUTS != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		// Note that this must happen after NewUserNamespace, so the
		// new user namespace is used if there is one.
		oldUTSNS := t.utsns
		t.utsns = t.utsns.Clone(creds.UserNamespace)
		t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
		cu.Add(func() { oldUTSNS.DecRef(t) })
	}
	if flags&linux.CLONE_NEWIPC != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		// "If CLONE_NEWIPC is set, then create the process in a new IPC
		// namespace." - clone(2)
		oldIPCNS := t.ipcns
		t.ipcns = NewIPCNamespace(creds.UserNamespace)
		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
		cu.Add(func() { oldIPCNS.DecRef(t) })
	}
	if flags&linux.CLONE_FILES != 0 {
		oldFDTable := t.fdTable
		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
		cu.Add(func() { oldFDTable.DecRef(t) })
	}
	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
		oldFSContext := t.fsContext
		t.fsContext = oldFSContext.Fork()
		cu.Add(func() { oldFSContext.DecRef(t) })
	}
	if flags&linux.CLONE_NEWNS != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		oldMountNS := t.mountNamespace
		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
		if err != nil {
			return err
		}
		t.mountNamespace = mntns
		cu.Add(func() { oldMountNS.DecRef(t) })
	}
	return nil
}

// UnshareFdTable unshares the FDTable that task t shares with other tasks, up
// to maxFd.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) UnshareFdTable(maxFd int32) {
	t.mu.Lock()
	oldFDTable := t.fdTable
	t.fdTable = oldFDTable.Fork(t, maxFd)
	t.mu.Unlock()

	oldFDTable.DecRef(t)
}

// vforkStop is a TaskStop imposed on a task that creates a child with
// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
// that the child and parent share mappings until the child execve()s into a
// new process image or exits.)
//
// +stateify savable
type vforkStop struct{}

// StopIgnoresKill implements TaskStop.Killable.
func (*vforkStop) Killable() bool { return true }
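
// Note: a vforkStop is ended by unstopVforkParent, which the child calls when
// it stops using its current MM (on execve(2) or exit), or is never begun at
// all if maybeBeginVforkStop observes that the parent has already been killed.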