github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/task_clone.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/bpf"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/nsfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
	pb "github.com/MerlinKodo/gvisor/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/usermem"
)

// SupportedCloneFlags is the bitwise OR of all the supported flags for clone.
// TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is
// implemented.
const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM |
	linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID |
	linux.CLONE_CHILD_CLEARTID | linux.CLONE_CHILD_SETTID | linux.CLONE_PARENT |
	linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS |
	linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED |
	linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS

// Clone implements the clone(2) syscall and returns the thread ID of the new
// task in t's PID namespace. Clone may return both a non-zero thread ID and a
// non-nil error.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
	if args.Flags&^SupportedCloneFlags != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	// Since signal actions may refer to application signal handlers by virtual
	// address, any set of signal handlers must refer to the same address
	// space.
	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
		return 0, nil, linuxerr.EINVAL
	}
	if args.SetTID != 0 {
		return 0, nil, linuxerr.ENOTSUP
	}
	// In order for the behavior of thread-group-directed signals to be sane,
	// all tasks in a thread group must share signal handlers.
	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
		return 0, nil, linuxerr.EINVAL
	}
	// All tasks in a thread group must be in the same PID namespace.
	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
		return 0, nil, linuxerr.EINVAL
	}
	// The two different ways of specifying a new PID namespace are
	// incompatible.
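	// (Those two ways are passing CLONE_NEWPID to this clone(2) call and
	// having previously called unshare(CLONE_NEWPID), which sets
	// t.childPIDNamespace; see Unshare below.)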
	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
		return 0, nil, linuxerr.EINVAL
	}
	// Thread groups and FS contexts cannot span user namespaces.
	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	// args.ExitSignal must be a valid signal.
	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
		return 0, nil, linuxerr.EINVAL
	}
	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
		return 0, nil, linuxerr.EINVAL
	}

	// Pull task registers and FPU state; a cloned task will inherit the
	// state of the current task.
	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
		t.Warningf("Unable to pull a full state: %v", err)
		t.forceSignal(linux.SIGILL, true /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGILL))
		return 0, nil, linuxerr.EFAULT
	}

	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
	// be created first, giving the child (clone(2)) or caller (unshare(2))
	// privileges over the remaining namespaces created by the call." -
	// user_namespaces(7)
	creds := t.Credentials()
	userns := creds.UserNamespace
	if args.Flags&linux.CLONE_NEWUSER != 0 {
		var err error
		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
		// the caller is in a chroot environment (i.e., the caller's root
		// directory does not match the root directory of the mount namespace
		// in which it resides)." - clone(2). Neither chroot(2) nor
		// user_namespaces(7) document this.
		if t.IsChrooted() {
			return 0, nil, linuxerr.EPERM
		}
		userns, err = creds.NewChildUserNamespace()
		if err != nil {
			return 0, nil, err
		}
	}
	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
		return 0, nil, linuxerr.EPERM
	}

	cu := cleanup.Make(func() {})
	defer cu.Clean()

	utsns := t.utsns
	if args.Flags&linux.CLONE_NEWUTS != 0 {
		// Note that this must happen after NewUserNamespace so we get
		// the new userns if there is one.
		utsns = utsns.Clone(userns)
		utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns))
	} else {
		utsns.IncRef()
	}
	cu.Add(func() {
		utsns.DecRef(t)
	})

	ipcns := t.ipcns
	if args.Flags&linux.CLONE_NEWIPC != 0 {
		ipcns = NewIPCNamespace(userns)
		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
		ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns))
	} else {
		ipcns.IncRef()
	}
	cu.Add(func() {
		ipcns.DecRef(t)
	})

	netns := t.netns
	if args.Flags&linux.CLONE_NEWNET != 0 {
		netns = inet.NewNamespace(netns, userns)
		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
		netns.SetInode(inode)
	} else {
		netns.IncRef()
	}
	cu.Add(func() {
		netns.DecRef(t)
	})

	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
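	// t.sessionKeyring is read in the same critical section, since it is
	// also protected by t.mu.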
	t.mu.Lock()
	curImage := t.image
	sessionKeyring := t.sessionKeyring
	t.mu.Unlock()
	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
	if err != nil {
		return 0, nil, err
	}
	cu.Add(func() {
		image.release(t)
	})

	if args.Flags&linux.CLONE_NEWUSER != 0 {
		// If the task is in a new user namespace, it cannot share keys.
		sessionKeyring = nil
	}

	// clone() returns 0 in the child.
	image.Arch.SetReturn(0)
	if args.Stack != 0 {
		image.Arch.SetStack(uintptr(args.Stack + args.StackSize))
	}
	if args.Flags&linux.CLONE_SETTLS != 0 {
		if !image.Arch.SetTLS(uintptr(args.TLS)) {
			return 0, nil, linuxerr.EPERM
		}
	}

	var fsContext *FSContext
	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
		fsContext = t.fsContext.Fork()
	} else {
		fsContext = t.fsContext
		fsContext.IncRef()
	}

	mntns := t.mountNamespace
	if args.Flags&linux.CLONE_NEWNS != 0 {
		var err error
		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k)
		if err != nil {
			return 0, nil, err
		}
	} else {
		mntns.IncRef()
	}
	cu.Add(func() {
		mntns.DecRef(t)
	})

	var fdTable *FDTable
	if args.Flags&linux.CLONE_FILES == 0 {
		fdTable = t.fdTable.Fork(t, MaxFdLimit)
	} else {
		fdTable = t.fdTable
		fdTable.IncRef()
	}

	pidns := t.tg.pidns
	if t.childPIDNamespace != nil {
		pidns = t.childPIDNamespace
	} else if args.Flags&linux.CLONE_NEWPID != 0 {
		pidns = pidns.NewChild(userns)
	}

	tg := t.tg
	rseqAddr := hostarch.Addr(0)
	rseqSignature := uint32(0)
	if args.Flags&linux.CLONE_THREAD == 0 {
		sh := t.tg.signalHandlers
		if args.Flags&linux.CLONE_SIGHAND == 0 {
			sh = sh.Fork()
		}
		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
		rseqAddr = t.rseqAddr
		rseqSignature = t.rseqSignature
	}

	uc := t.userCounters
	if uc.uid != creds.RealKUID {
		uc = t.k.GetUserCounters(creds.RealKUID)
	}

	cfg := &TaskConfig{
		Kernel:                  t.k,
		ThreadGroup:             tg,
		SignalMask:              t.SignalMask(),
		TaskImage:               image,
		FSContext:               fsContext,
		FDTable:                 fdTable,
		Credentials:             creds,
		Niceness:                t.Niceness(),
		NetworkNamespace:        netns,
		AllowedCPUMask:          t.CPUMask(),
		UTSNamespace:            utsns,
		IPCNamespace:            ipcns,
		AbstractSocketNamespace: t.abstractSockets,
		MountNamespace:          mntns,
		RSeqAddr:                rseqAddr,
		RSeqSignature:           rseqSignature,
		ContainerID:             t.ContainerID(),
		UserCounters:            uc,
		SessionKeyring:          sessionKeyring,
	}
	if args.Flags&linux.CLONE_THREAD == 0 {
		cfg.Parent = t
	} else {
		cfg.InheritParent = t
	}
	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
	// the cleanup for us.
	cu.Release()
	if err != nil {
		return 0, nil, err
	}

	// "A child process created via fork(2) inherits a copy of its parent's
	// alternate signal stack settings" - sigaltstack(2).
	//
	// However kernel/fork.c:copy_process() adds a limitation to this:
	// "sigaltstack should be cleared when sharing the same VM".
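	// So the child inherits the alternate signal stack only if it gets its
	// own copy of the address space, or if it will relinquish the shared one
	// at execve or exit (CLONE_VFORK).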
	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
		nt.SetSignalStack(t.SignalStack())
	}

	if userns != creds.UserNamespace {
		if err := nt.SetUserNamespace(userns); err != nil {
			// This shouldn't be possible: userns was created from nt.creds, so
			// nt should have CAP_SYS_ADMIN in userns.
			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
		}
	}

	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
	// nt that it must receive before its task goroutine starts running.
	tid := nt.k.tasks.Root.IDOfTask(nt)
	defer nt.Start(tid)

	if seccheck.Global.Enabled(seccheck.PointClone) {
		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.Clone(t, mask, info)
		}); err != nil {
			// nt has been visible to the rest of the system since NewTask, so
			// it may be blocking execve or a group stop, have been notified
			// for group signal delivery, had children reparented to it, etc.
			// Thus we can't just drop it on the floor. Instead, instruct the
			// task goroutine to exit immediately, as quietly as possible.
			nt.exitTracerNotified = true
			nt.exitTracerAcked = true
			nt.exitParentNotified = true
			nt.exitParentAcked = true
			nt.runState = (*runExitMain)(nil)
			return 0, nil, err
		}
	}

	// "If fork/clone and execve are allowed by @prog, any child processes will
	// be constrained to the same filters and system call ABI as the parent." -
	// Documentation/prctl/seccomp_filter.txt
	if f := t.syscallFilters.Load(); f != nil {
		copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...)
		nt.syscallFilters.Store(copiedFilters)
	}
	if args.Flags&linux.CLONE_VFORK != 0 {
		nt.vforkParent = t
	}

	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
		nt.SetClearTID(hostarch.Addr(args.ChildTID))
	}
	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
		ctid := nt.ThreadID()
		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
	}
	ntid := t.tg.pidns.IDOfTask(nt)
	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
	}

	t.traceCloneEvent(tid)
	kind := ptraceCloneKindClone
	if args.Flags&linux.CLONE_VFORK != 0 {
		kind = ptraceCloneKindVfork
	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
		kind = ptraceCloneKindFork
	}
	if t.ptraceClone(kind, nt, args) {
		if args.Flags&linux.CLONE_VFORK != 0 {
			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
		}
		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
	}
	if args.Flags&linux.CLONE_VFORK != 0 {
		t.maybeBeginVforkStop(nt)
		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
	}
	return ntid, nil, nil
}

// getCloneSeccheckInfo returns the seccheck fields and clone point payload
// describing the creation of nt by t with the given clone flags.
func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
	var cwd string
	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
		cwd = getTaskCurrentWorkingDirectory(t)
	}
	t.k.tasks.mu.RLock()
	defer t.k.tasks.mu.RUnlock()
	info := &pb.CloneInfo{
		CreatedThreadId:          int32(nt.k.tasks.Root.tids[nt]),
		CreatedThreadGroupId:     int32(nt.k.tasks.Root.tgids[nt.tg]),
		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
		Flags:                    flags,
	}

	if !fields.Context.Empty() {
		info.ContextData = &pb.ContextData{}
		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
	}

	return fields, info
}

// maybeBeginVforkStop checks if a previously-started vfork child is still
// running and has not yet released its MM, such that its parent t should enter
// a vforkStop.
//
// Preconditions: The caller must be running on t's task goroutine.
func (t *Task) maybeBeginVforkStop(child *Task) {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.killedLocked() {
		child.vforkParent = nil
		return
	}
	if child.vforkParent == t {
		t.beginInternalStopLocked((*vforkStop)(nil))
	}
}

// unstopVforkParent ends the vforkStop of t's vfork parent, if it is still in
// one.
func (t *Task) unstopVforkParent() {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if p := t.vforkParent; p != nil {
		p.tg.signalHandlers.mu.Lock()
		defer p.tg.signalHandlers.mu.Unlock()
		if _, ok := p.stop.(*vforkStop); ok {
			p.endInternalStopLocked()
		}
		// Parent no longer needs to be unstopped.
		t.vforkParent = nil
	}
}

// +stateify savable
type runSyscallAfterPtraceEventClone struct {
	vforkChild *Task

	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
	// PID namespace. vforkChildTID must be stored since the child may exit and
	// release its TID before the PTRACE_EVENT stop ends.
	vforkChildTID ThreadID
}

func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
	if r.vforkChild != nil {
		t.maybeBeginVforkStop(r.vforkChild)
		return &runSyscallAfterVforkStop{r.vforkChildTID}
	}
	return (*runSyscallExit)(nil)
}

// +stateify savable
type runSyscallAfterVforkStop struct {
	// childTID has the same meaning as
	// runSyscallAfterPtraceEventClone.vforkChildTID.
	childTID ThreadID
}

func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
	t.ptraceVforkDone(r.childTID)
	return (*runSyscallExit)(nil)
}

// Setns reassociates t with the specified namespace.
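// Setns supports network, IPC, mount, and UTS namespaces; the provided flags
// value, if non-zero, must match the type of namespace that fd refers to.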
func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
	if !ok {
		return linuxerr.EINVAL
	}
	i, ok := d.Inode().(*nsfs.Inode)
	if !ok {
		return linuxerr.EINVAL
	}

	switch ns := i.Namespace().(type) {
	case *inet.Namespace:
		if flags != 0 && flags != linux.CLONE_NEWNET {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.NetworkNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.netns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	case *IPCNamespace:
		if flags != 0 && flags != linux.CLONE_NEWIPC {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.IPCNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.ipcns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	case *vfs.MountNamespace:
		if flags != 0 && flags != linux.CLONE_NEWNS {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldFSContext := t.fsContext
		// The current task has to be an exclusive owner of its fs context.
		if oldFSContext.ReadRefs() != 1 {
			return linuxerr.EINVAL
		}
		fsContext := oldFSContext.Fork()
		fsContext.root.DecRef(t)
		fsContext.cwd.DecRef(t)
		vd := ns.Root(t)
		fsContext.root = vd
		vd.IncRef()
		fsContext.cwd = vd

		oldNS := t.mountNamespace
		ns.IncRef()
		t.mu.Lock()
		t.mountNamespace = ns
		t.fsContext = fsContext
		t.mu.Unlock()
		oldNS.DecRef(t)
		oldFSContext.DecRef(t)
		return nil
	case *UTSNamespace:
		if flags != 0 && flags != linux.CLONE_NEWUTS {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.UTSNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.utsns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	default:
		return linuxerr.EINVAL
	}
}

// Unshare changes the set of resources t shares with other tasks, as specified
// by flags.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) Unshare(flags int32) error {
	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
	// the caller is single threaded (i.e., it is not sharing its address space
	// with another process or thread). In this case, these flags have no
	// effect. (Note also that specifying CLONE_THREAD automatically implies
	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
	// If the process is multithreaded, then the use of these flags results in
	// an error." - unshare(2). This is incorrect (cf.
	// kernel/fork.c:ksys_unshare()):
	//
	// - CLONE_THREAD does not imply CLONE_VM.
	//
	// - CLONE_SIGHAND implies CLONE_THREAD.
	//
	// - Only CLONE_VM requires that the caller is not sharing its address
	// space with another thread. CLONE_SIGHAND requires that the caller is not
	// sharing its signal handlers, and CLONE_THREAD requires that the caller
	// is the only thread in its thread group.
	//
	// Since we don't count the number of tasks using each address space or set
	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
		return linuxerr.EINVAL
	}
	creds := t.Credentials()
	if flags&linux.CLONE_THREAD != 0 {
		t.tg.signalHandlers.mu.Lock()
		if t.tg.tasksCount != 1 {
			t.tg.signalHandlers.mu.Unlock()
			return linuxerr.EINVAL
		}
		t.tg.signalHandlers.mu.Unlock()
		// This isn't racy because we're the only living task, and therefore
		// the only task capable of creating new ones, in our thread group.
	}
	if flags&linux.CLONE_NEWUSER != 0 {
		if t.IsChrooted() {
			return linuxerr.EPERM
		}
		newUserNS, err := creds.NewChildUserNamespace()
		if err != nil {
			return err
		}
		err = t.SetUserNamespace(newUserNS)
		if err != nil {
			return err
		}
		// Reload creds, since t.SetUserNamespace() changed the task's
		// credentials.
		creds = t.Credentials()
	}
	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
	if flags&linux.CLONE_NEWPID != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
	}
	if flags&linux.CLONE_NEWNET != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		netns := t.NetworkNamespace()
		netns = inet.NewNamespace(netns, t.UserNamespace())
		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
		netns.SetInode(netnsInode)
		t.mu.Lock()
		oldNetns := t.netns
		t.netns = netns
		t.mu.Unlock()
		oldNetns.DecRef(t)
	}

	cu := cleanup.Cleanup{}
	// All cu actions have to be executed after releasing t.mu.
	defer cu.Clean()
	t.mu.Lock()
	defer t.mu.Unlock()
	// DecRef calls must occur without holding t.mu, so they are queued on cu,
	// whose deferred Clean runs after the deferred t.mu.Unlock().
	if flags&linux.CLONE_NEWUTS != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		// Note that this must happen after NewUserNamespace, so the
		// new user namespace is used if there is one.
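		// The reference on the old UTS namespace is dropped by cu after t.mu
		// is released.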
		oldUTSNS := t.utsns
		t.utsns = t.utsns.Clone(creds.UserNamespace)
		t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
		cu.Add(func() { oldUTSNS.DecRef(t) })
	}
	if flags&linux.CLONE_NEWIPC != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		// Note that "If CLONE_NEWIPC is set, then create the process in a new
		// IPC namespace" - clone(2)
		oldIPCNS := t.ipcns
		t.ipcns = NewIPCNamespace(creds.UserNamespace)
		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
		cu.Add(func() { oldIPCNS.DecRef(t) })
	}
	if flags&linux.CLONE_FILES != 0 {
		oldFDTable := t.fdTable
		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
		cu.Add(func() { oldFDTable.DecRef(t) })
	}
	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
		oldFSContext := t.fsContext
		t.fsContext = oldFSContext.Fork()
		cu.Add(func() { oldFSContext.DecRef(t) })
	}
	if flags&linux.CLONE_NEWNS != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		oldMountNS := t.mountNamespace
		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
		if err != nil {
			return err
		}
		t.mountNamespace = mntns
		cu.Add(func() { oldMountNS.DecRef(t) })
	}
	return nil
}

// UnshareFdTable unshares the FdTable that task t shares with other tasks, up
// to maxFd.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) UnshareFdTable(maxFd int32) {
	t.mu.Lock()
	oldFDTable := t.fdTable
	t.fdTable = oldFDTable.Fork(t, maxFd)
	t.mu.Unlock()

	oldFDTable.DecRef(t)
}

// vforkStop is a TaskStop imposed on a task that creates a child with
// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
// that the child and parent share mappings until the child execve()s into a
// new process image or exits.)
//
// +stateify savable
type vforkStop struct{}

// Killable implements TaskStop.Killable.
func (*vforkStop) Killable() bool { return true }