github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/task_clone.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/cleanup"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/nsfs"
	"github.com/metacubex/gvisor/pkg/sentry/inet"
	"github.com/metacubex/gvisor/pkg/sentry/seccheck"
	pb "github.com/metacubex/gvisor/pkg/sentry/seccheck/points/points_go_proto"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/usermem"
)

// SupportedCloneFlags is the bitwise OR of all the supported flags for clone.
// TODO(b/290826530): Implement CLONE_INTO_CGROUP when cgroups v2 is
// implemented.
const SupportedCloneFlags = linux.CLONE_VM | linux.CLONE_FS | linux.CLONE_FILES | linux.CLONE_SYSVSEM |
	linux.CLONE_THREAD | linux.CLONE_SIGHAND | linux.CLONE_CHILD_SETTID | linux.CLONE_NEWPID |
	linux.CLONE_CHILD_CLEARTID | linux.CLONE_PARENT |
	linux.CLONE_PARENT_SETTID | linux.CLONE_SETTLS | linux.CLONE_NEWUSER | linux.CLONE_NEWUTS |
	linux.CLONE_NEWIPC | linux.CLONE_NEWNET | linux.CLONE_PTRACE | linux.CLONE_UNTRACED |
	linux.CLONE_IO | linux.CLONE_VFORK | linux.CLONE_DETACHED | linux.CLONE_NEWNS

// Clone implements the clone(2) syscall and returns the thread ID of the new
// task in t's PID namespace. Clone may return both a non-zero thread ID and a
// non-nil error.
//
// Preconditions: The caller must be running Task.doSyscallInvoke on the task
// goroutine.
func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
	if args.Flags&^SupportedCloneFlags != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	// Since signal actions may refer to application signal handlers by virtual
	// address, any set of signal handlers must refer to the same address
	// space.
	if args.Flags&(linux.CLONE_SIGHAND|linux.CLONE_VM) == linux.CLONE_SIGHAND {
		return 0, nil, linuxerr.EINVAL
	}
	if args.SetTID != 0 {
		return 0, nil, linuxerr.ENOTSUP
	}
	// In order for the behavior of thread-group-directed signals to be sane,
	// all tasks in a thread group must share signal handlers.
	if args.Flags&(linux.CLONE_THREAD|linux.CLONE_SIGHAND) == linux.CLONE_THREAD {
		return 0, nil, linuxerr.EINVAL
	}
	// All tasks in a thread group must be in the same PID namespace.
	if (args.Flags&linux.CLONE_THREAD != 0) && (args.Flags&linux.CLONE_NEWPID != 0 || t.childPIDNamespace != nil) {
		return 0, nil, linuxerr.EINVAL
	}
	// The two different ways of specifying a new PID namespace are
	// incompatible.
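	// (Namely, passing CLONE_NEWPID in the clone flags while t.childPIDNamespace
	// is already set by an earlier unshare(CLONE_NEWPID); see Unshare below.)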
	if args.Flags&linux.CLONE_NEWPID != 0 && t.childPIDNamespace != nil {
		return 0, nil, linuxerr.EINVAL
	}
	// Thread groups and FS contexts cannot span user namespaces.
	if args.Flags&linux.CLONE_NEWUSER != 0 && args.Flags&(linux.CLONE_THREAD|linux.CLONE_FS) != 0 {
		return 0, nil, linuxerr.EINVAL
	}
	// args.ExitSignal must be a valid signal.
	if args.ExitSignal != 0 && !linux.Signal(args.ExitSignal).IsValid() {
		return 0, nil, linuxerr.EINVAL
	}
	if args.Flags&(linux.CLONE_FS|linux.CLONE_NEWNS) == linux.CLONE_FS|linux.CLONE_NEWNS {
		return 0, nil, linuxerr.EINVAL
	}

	// Pull task registers and FPU state; a cloned task will inherit the
	// state of the current task.
	if err := t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()); err != nil {
		t.Warningf("Unable to pull a full state: %v", err)
		t.forceSignal(linux.SIGILL, true /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGILL))
		return 0, nil, linuxerr.EFAULT
	}

	// "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
	// single clone(2) or unshare(2) call, the user namespace is guaranteed to
	// be created first, giving the child (clone(2)) or caller (unshare(2))
	// privileges over the remaining namespaces created by the call." -
	// user_namespaces(7)
	creds := t.Credentials()
	userns := creds.UserNamespace
	if args.Flags&linux.CLONE_NEWUSER != 0 {
		var err error
		// "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and
		// the caller is in a chroot environment (i.e., the caller's root
		// directory does not match the root directory of the mount namespace
		// in which it resides)." - clone(2). Neither chroot(2) nor
		// user_namespaces(7) document this.
		if t.IsChrooted() {
			return 0, nil, linuxerr.EPERM
		}
		userns, err = creds.NewChildUserNamespace()
		if err != nil {
			return 0, nil, err
		}
	}
	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
		return 0, nil, linuxerr.EPERM
	}

	cu := cleanup.Make(func() {})
	defer cu.Clean()

	utsns := t.utsns
	if args.Flags&linux.CLONE_NEWUTS != 0 {
		// Note that this must happen after NewUserNamespace so we get
		// the new userns if there is one.
		utsns = utsns.Clone(userns)
		utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, utsns))
	} else {
		utsns.IncRef()
	}
	cu.Add(func() {
		utsns.DecRef(t)
	})

	ipcns := t.ipcns
	if args.Flags&linux.CLONE_NEWIPC != 0 {
		ipcns = NewIPCNamespace(userns)
		ipcns.InitPosixQueues(t, t.k.VFS(), creds)
		ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, ipcns))
	} else {
		ipcns.IncRef()
	}
	cu.Add(func() {
		ipcns.DecRef(t)
	})

	netns := t.netns
	if args.Flags&linux.CLONE_NEWNET != 0 {
		netns = inet.NewNamespace(netns, userns)
		inode := nsfs.NewInode(t, t.k.nsfsMount, netns)
		netns.SetInode(inode)
	} else {
		netns.IncRef()
	}
	cu.Add(func() {
		netns.DecRef(t)
	})

	// We must hold t.mu to access t.image, but we can't hold it during Fork(),
	// since TaskImage.Fork()=>mm.Fork() takes mm.addressSpaceMu, which is ordered
	// above Task.mu. So we copy t.image with t.mu held and call Fork() on the copy.
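	// (sessionKeyring is read in the same critical section, since it too is
	// protected by t.mu.)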
	t.mu.Lock()
	curImage := t.image
	sessionKeyring := t.sessionKeyring
	t.mu.Unlock()
	image, err := curImage.Fork(t, t.k, args.Flags&linux.CLONE_VM != 0)
	if err != nil {
		return 0, nil, err
	}
	cu.Add(func() {
		image.release(t)
	})

	if args.Flags&linux.CLONE_NEWUSER != 0 {
		// If the task is in a new user namespace, it cannot share keys.
		sessionKeyring = nil
	}

	// clone() returns 0 in the child.
	image.Arch.SetReturn(0)
	if args.Stack != 0 {
		image.Arch.SetStack(uintptr(args.Stack + args.StackSize))
	}
	if args.Flags&linux.CLONE_SETTLS != 0 {
		if !image.Arch.SetTLS(uintptr(args.TLS)) {
			return 0, nil, linuxerr.EPERM
		}
	}

	var fsContext *FSContext
	if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
		fsContext = t.fsContext.Fork()
	} else {
		fsContext = t.fsContext
		fsContext.IncRef()
	}

	mntns := t.mountNamespace
	if args.Flags&linux.CLONE_NEWNS != 0 {
		var err error
		mntns, err = t.k.vfs.CloneMountNamespace(t, creds, mntns, &fsContext.root, &fsContext.cwd, t.k)
		if err != nil {
			return 0, nil, err
		}
	} else {
		mntns.IncRef()
	}
	cu.Add(func() {
		mntns.DecRef(t)
	})

	var fdTable *FDTable
	if args.Flags&linux.CLONE_FILES == 0 {
		fdTable = t.fdTable.Fork(t, MaxFdLimit)
	} else {
		fdTable = t.fdTable
		fdTable.IncRef()
	}

	pidns := t.tg.pidns
	if t.childPIDNamespace != nil {
		pidns = t.childPIDNamespace
	} else if args.Flags&linux.CLONE_NEWPID != 0 {
		pidns = pidns.NewChild(userns)
	}

	tg := t.tg
	rseqAddr := hostarch.Addr(0)
	rseqSignature := uint32(0)
	if args.Flags&linux.CLONE_THREAD == 0 {
		sh := t.tg.signalHandlers
		if args.Flags&linux.CLONE_SIGHAND == 0 {
			sh = sh.Fork()
		}
		tg = t.k.NewThreadGroup(pidns, sh, linux.Signal(args.ExitSignal), tg.limits.GetCopy())
		tg.oomScoreAdj = atomicbitops.FromInt32(t.tg.oomScoreAdj.Load())
		rseqAddr = t.rseqAddr
		rseqSignature = t.rseqSignature
	}

	uc := t.userCounters
	if uc.uid != creds.RealKUID {
		uc = t.k.GetUserCounters(creds.RealKUID)
	}

	cfg := &TaskConfig{
		Kernel:           t.k,
		ThreadGroup:      tg,
		SignalMask:       t.SignalMask(),
		TaskImage:        image,
		FSContext:        fsContext,
		FDTable:          fdTable,
		Credentials:      creds,
		Niceness:         t.Niceness(),
		NetworkNamespace: netns,
		AllowedCPUMask:   t.CPUMask(),
		UTSNamespace:     utsns,
		IPCNamespace:     ipcns,
		MountNamespace:   mntns,
		RSeqAddr:         rseqAddr,
		RSeqSignature:    rseqSignature,
		ContainerID:      t.ContainerID(),
		UserCounters:     uc,
		SessionKeyring:   sessionKeyring,
	}
	if args.Flags&linux.CLONE_THREAD == 0 {
		cfg.Parent = t
	} else {
		cfg.InheritParent = t
	}
	nt, err := t.tg.pidns.owner.NewTask(t, cfg)
	// If NewTask succeeds, we transfer references to nt. If NewTask fails, it does
	// the cleanup for us.
	cu.Release()
	if err != nil {
		return 0, nil, err
	}

	// "A child process created via fork(2) inherits a copy of its parent's
	// alternate signal stack settings" - sigaltstack(2).
	//
	// However kernel/fork.c:copy_process() adds a limitation to this:
	// "sigaltstack should be cleared when sharing the same VM".
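	// That is, the alternate signal stack is inherited only when the child
	// does not share its parent's VM, or when CLONE_VFORK is set (cf. the
	// (CLONE_VM|CLONE_VFORK) == CLONE_VM check in copy_process()).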
	if args.Flags&linux.CLONE_VM == 0 || args.Flags&linux.CLONE_VFORK != 0 {
		nt.SetSignalStack(t.SignalStack())
	}

	if userns != creds.UserNamespace {
		if err := nt.SetUserNamespace(userns); err != nil {
			// This shouldn't be possible: userns was created from nt.creds, so
			// nt should have CAP_SYS_ADMIN in userns.
			panic("Task.Clone: SetUserNamespace failed: " + err.Error())
		}
	}

	// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
	// nt that it must receive before its task goroutine starts running.
	tid := nt.k.tasks.Root.IDOfTask(nt)
	defer nt.Start(tid)

	if seccheck.Global.Enabled(seccheck.PointClone) {
		mask, info := getCloneSeccheckInfo(t, nt, args.Flags)
		if err := seccheck.Global.SentToSinks(func(c seccheck.Sink) error {
			return c.Clone(t, mask, info)
		}); err != nil {
			// nt has been visible to the rest of the system since NewTask, so
			// it may be blocking execve or a group stop, have been notified
			// for group signal delivery, had children reparented to it, etc.
			// Thus we can't just drop it on the floor. Instead, instruct the
			// task goroutine to exit immediately, as quietly as possible.
			nt.exitTracerNotified = true
			nt.exitTracerAcked = true
			nt.exitParentNotified = true
			nt.exitParentAcked = true
			nt.runState = (*runExitMain)(nil)
			return 0, nil, err
		}
	}

	// "If fork/clone and execve are allowed by @prog, any child processes will
	// be constrained to the same filters and system call ABI as the parent." -
	// Documentation/prctl/seccomp_filter.txt
	if ts := t.seccomp.Load(); ts != nil {
		seccompCopy := ts.copy()
		seccompCopy.populateCache(nt)
		nt.seccomp.Store(seccompCopy)
	} else {
		nt.seccomp.Store(nil)
	}
	if args.Flags&linux.CLONE_VFORK != 0 {
		nt.vforkParent = t
	}

	if args.Flags&linux.CLONE_CHILD_CLEARTID != 0 {
		nt.SetClearTID(hostarch.Addr(args.ChildTID))
	}
	if args.Flags&linux.CLONE_CHILD_SETTID != 0 {
		ctid := nt.ThreadID()
		ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(args.ChildTID))
	}
	ntid := t.tg.pidns.IDOfTask(nt)
	if args.Flags&linux.CLONE_PARENT_SETTID != 0 {
		ntid.CopyOut(t, hostarch.Addr(args.ParentTID))
	}

	t.traceCloneEvent(tid)
	kind := ptraceCloneKindClone
	if args.Flags&linux.CLONE_VFORK != 0 {
		kind = ptraceCloneKindVfork
	} else if linux.Signal(args.ExitSignal) == linux.SIGCHLD {
		kind = ptraceCloneKindFork
	}
	if t.ptraceClone(kind, nt, args) {
		if args.Flags&linux.CLONE_VFORK != 0 {
			return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil
		}
		return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil
	}
	if args.Flags&linux.CLONE_VFORK != 0 {
		t.maybeBeginVforkStop(nt)
		return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil
	}
	return ntid, nil, nil
}

func getCloneSeccheckInfo(t, nt *Task, flags uint64) (seccheck.FieldSet, *pb.CloneInfo) {
	fields := seccheck.Global.GetFieldSet(seccheck.PointClone)
	var cwd string
	if fields.Context.Contains(seccheck.FieldCtxtCwd) {
		cwd = getTaskCurrentWorkingDirectory(t)
	}
	t.k.tasks.mu.RLock()
	defer t.k.tasks.mu.RUnlock()
	info := &pb.CloneInfo{
		CreatedThreadId:      int32(nt.k.tasks.Root.tids[nt]),
		CreatedThreadGroupId: int32(nt.k.tasks.Root.tgids[nt.tg]),
		CreatedThreadStartTimeNs: nt.startTime.Nanoseconds(),
		Flags:                    flags,
	}

	if !fields.Context.Empty() {
		info.ContextData = &pb.ContextData{}
		LoadSeccheckDataLocked(t, fields.Context, info.ContextData, cwd)
	}

	return fields, info
}

// maybeBeginVforkStop checks if a previously-started vfork child is still
// running and has not yet released its MM, such that its parent t should enter
// a vforkStop.
//
// Preconditions: The caller must be running on t's task goroutine.
func (t *Task) maybeBeginVforkStop(child *Task) {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.killedLocked() {
		child.vforkParent = nil
		return
	}
	if child.vforkParent == t {
		t.beginInternalStopLocked((*vforkStop)(nil))
	}
}

func (t *Task) unstopVforkParent() {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	if p := t.vforkParent; p != nil {
		p.tg.signalHandlers.mu.Lock()
		defer p.tg.signalHandlers.mu.Unlock()
		if _, ok := p.stop.(*vforkStop); ok {
			p.endInternalStopLocked()
		}
		// Parent no longer needs to be unstopped.
		t.vforkParent = nil
	}
}

// +stateify savable
type runSyscallAfterPtraceEventClone struct {
	vforkChild *Task

	// If vforkChild is not nil, vforkChildTID is its thread ID in the parent's
	// PID namespace. vforkChildTID must be stored since the child may exit and
	// release its TID before the PTRACE_EVENT stop ends.
	vforkChildTID ThreadID
}

func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState {
	if r.vforkChild != nil {
		t.maybeBeginVforkStop(r.vforkChild)
		return &runSyscallAfterVforkStop{r.vforkChildTID}
	}
	return (*runSyscallExit)(nil)
}

// +stateify savable
type runSyscallAfterVforkStop struct {
	// childTID has the same meaning as
	// runSyscallAfterPtraceEventClone.vforkChildTID.
	childTID ThreadID
}

func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState {
	t.ptraceVforkDone(r.childTID)
	return (*runSyscallExit)(nil)
}

// Setns reassociates t with the specified namespace.
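// The fd must refer to a namespace file (an nsfs inode, such as one of
// /proc/[pid]/ns/*); any other file fails with EINVAL.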
func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
	d, ok := fd.Dentry().Impl().(*kernfs.Dentry)
	if !ok {
		return linuxerr.EINVAL
	}
	i, ok := d.Inode().(*nsfs.Inode)
	if !ok {
		return linuxerr.EINVAL
	}

	switch ns := i.Namespace().(type) {
	case *inet.Namespace:
		if flags != 0 && flags != linux.CLONE_NEWNET {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.NetworkNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.netns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	case *IPCNamespace:
		if flags != 0 && flags != linux.CLONE_NEWIPC {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.IPCNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.ipcns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	case *vfs.MountNamespace:
		if flags != 0 && flags != linux.CLONE_NEWNS {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.Owner) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_CHROOT) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldFSContext := t.fsContext
		// The current task has to be the exclusive owner of its fs context.
		if oldFSContext.ReadRefs() != 1 {
			return linuxerr.EINVAL
		}
		fsContext := oldFSContext.Fork()
		fsContext.root.DecRef(t)
		fsContext.cwd.DecRef(t)
		vd := ns.Root(t)
		fsContext.root = vd
		vd.IncRef()
		fsContext.cwd = vd

		oldNS := t.mountNamespace
		ns.IncRef()
		t.mu.Lock()
		t.mountNamespace = ns
		t.fsContext = fsContext
		t.mu.Unlock()
		oldNS.DecRef(t)
		oldFSContext.DecRef(t)
		return nil
	case *UTSNamespace:
		if flags != 0 && flags != linux.CLONE_NEWUTS {
			return linuxerr.EINVAL
		}
		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns.UserNamespace()) ||
			!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
			return linuxerr.EPERM
		}
		oldNS := t.UTSNamespace()
		ns.IncRef()
		t.mu.Lock()
		t.utsns = ns
		t.mu.Unlock()
		oldNS.DecRef(t)
		return nil
	default:
		return linuxerr.EINVAL
	}
}

// Unshare changes the set of resources t shares with other tasks, as specified
// by flags.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) Unshare(flags int32) error {
	// "CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM can be specified in flags if
	// the caller is single threaded (i.e., it is not sharing its address space
	// with another process or thread). In this case, these flags have no
	// effect. (Note also that specifying CLONE_THREAD automatically implies
	// CLONE_VM, and specifying CLONE_VM automatically implies CLONE_SIGHAND.)
	// If the process is multithreaded, then the use of these flags results in
	// an error." - unshare(2). This is incorrect (cf.
	// kernel/fork.c:ksys_unshare()):
	//
	// - CLONE_THREAD does not imply CLONE_VM.
	//
	// - CLONE_SIGHAND implies CLONE_THREAD.
	//
	// - Only CLONE_VM requires that the caller is not sharing its address
	//   space with another thread.
	//   CLONE_SIGHAND requires that the caller is not sharing its signal
	//   handlers, and CLONE_THREAD requires that the caller is the only
	//   thread in its thread group.
	//
	// Since we don't count the number of tasks using each address space or set
	// of signal handlers, we reject CLONE_VM and CLONE_SIGHAND altogether.
	if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
		return linuxerr.EINVAL
	}
	creds := t.Credentials()
	if flags&linux.CLONE_THREAD != 0 {
		t.tg.signalHandlers.mu.Lock()
		if t.tg.tasksCount != 1 {
			t.tg.signalHandlers.mu.Unlock()
			return linuxerr.EINVAL
		}
		t.tg.signalHandlers.mu.Unlock()
		// This isn't racy because we're the only living task, and therefore
		// the only task capable of creating new ones, in our thread group.
	}
	if flags&linux.CLONE_NEWUSER != 0 {
		if t.IsChrooted() {
			return linuxerr.EPERM
		}
		newUserNS, err := creds.NewChildUserNamespace()
		if err != nil {
			return err
		}
		err = t.SetUserNamespace(newUserNS)
		if err != nil {
			return err
		}
		// Need to reload creds, because t.SetUserNamespace() changed task credentials.
		creds = t.Credentials()
	}
	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
	if flags&linux.CLONE_NEWPID != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
	}
	if flags&linux.CLONE_NEWNET != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		netns := t.NetworkNamespace()
		netns = inet.NewNamespace(netns, t.UserNamespace())
		netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
		netns.SetInode(netnsInode)
		t.mu.Lock()
		oldNetns := t.netns
		t.netns = netns
		t.mu.Unlock()
		oldNetns.DecRef(t)
	}

	cu := cleanup.Cleanup{}
	// All cu actions have to be executed after releasing t.mu: the DecRefs
	// must not run while t.mu is held. This holds because deferred calls run
	// LIFO: the deferred t.mu.Unlock() runs before the deferred cu.Clean().
	defer cu.Clean()
	t.mu.Lock()
	defer t.mu.Unlock()
	if flags&linux.CLONE_NEWUTS != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		// Note that this must happen after NewUserNamespace, so the
		// new user namespace is used if there is one.
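		// (creds was reloaded after SetUserNamespace above, so it carries the
		// new user namespace.)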
		oldUTSNS := t.utsns
		t.utsns = t.utsns.Clone(creds.UserNamespace)
		t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
		cu.Add(func() { oldUTSNS.DecRef(t) })
	}
	if flags&linux.CLONE_NEWIPC != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		// "If CLONE_NEWIPC is set, then create the process in a new IPC
		// namespace" - clone(2).
		oldIPCNS := t.ipcns
		t.ipcns = NewIPCNamespace(creds.UserNamespace)
		t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
		t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
		cu.Add(func() { oldIPCNS.DecRef(t) })
	}
	if flags&linux.CLONE_FILES != 0 {
		oldFDTable := t.fdTable
		t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
		cu.Add(func() { oldFDTable.DecRef(t) })
	}
	if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
		oldFSContext := t.fsContext
		t.fsContext = oldFSContext.Fork()
		cu.Add(func() { oldFSContext.DecRef(t) })
	}
	if flags&linux.CLONE_NEWNS != 0 {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		oldMountNS := t.mountNamespace
		mntns, err := t.k.vfs.CloneMountNamespace(t, creds, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
		if err != nil {
			return err
		}
		t.mountNamespace = mntns
		cu.Add(func() { oldMountNS.DecRef(t) })
	}
	return nil
}

// UnshareFdTable unshares the FDTable that task t shares with other tasks, up
// to maxFd.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) UnshareFdTable(maxFd int32) {
	t.mu.Lock()
	oldFDTable := t.fdTable
	t.fdTable = oldFDTable.Fork(t, maxFd)
	t.mu.Unlock()

	oldFDTable.DecRef(t)
}

// vforkStop is a TaskStop imposed on a task that creates a child with
// CLONE_VFORK or vfork(2); it ends when the child task ceases to use its
// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
// that the child and parent share mappings until the child execve()s into a
// new process image or exits.)
//
// +stateify savable
type vforkStop struct{}

// Killable implements TaskStop.Killable.
func (*vforkStop) Killable() bool { return true }
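// For illustration only: a minimal sketch (not part of the upstream file) of
// how a fork-style syscall wrapper might drive Task.Clone. The function name
// and its placement are hypothetical; in gVisor the real entry points live in
// the syscalls package, not here. "A call to fork() is equivalent to a call
// to clone(2) specifying flags as just SIGCHLD." - fork(2)
//
//	func forkSketch(t *Task) (ThreadID, *SyscallControl, error) {
//		return t.Clone(&linux.CloneArgs{
//			// No CLONE_* flags: the child gets copies of its parent's
//			// address space, FS context, FD table, and signal handlers.
//			ExitSignal: uint64(linux.SIGCHLD),
//		})
//	}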