github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/task_clone.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "sync/atomic" 19 20 "github.com/SagerNet/gvisor/pkg/abi/linux" 21 "github.com/SagerNet/gvisor/pkg/bpf" 22 "github.com/SagerNet/gvisor/pkg/cleanup" 23 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 24 "github.com/SagerNet/gvisor/pkg/hostarch" 25 "github.com/SagerNet/gvisor/pkg/sentry/inet" 26 "github.com/SagerNet/gvisor/pkg/usermem" 27 ) 28 29 // SharingOptions controls what resources are shared by a new task created by 30 // Task.Clone, or an existing task affected by Task.Unshare. 31 type SharingOptions struct { 32 // If NewAddressSpace is true, the task should have an independent virtual 33 // address space. 34 NewAddressSpace bool 35 36 // If NewSignalHandlers is true, the task should use an independent set of 37 // signal handlers. 38 NewSignalHandlers bool 39 40 // If NewThreadGroup is true, the task should be the leader of its own 41 // thread group. TerminationSignal is the signal that the thread group 42 // will send to its parent when it exits. If NewThreadGroup is false, 43 // TerminationSignal is ignored. 44 NewThreadGroup bool 45 TerminationSignal linux.Signal 46 47 // If NewPIDNamespace is true: 48 // 49 // - In the context of Task.Clone, the new task should be the init task 50 // (TID 1) in a new PID namespace. 51 // 52 // - In the context of Task.Unshare, the task should create a new PID 53 // namespace, and all subsequent clones of the task should be members of 54 // the new PID namespace. 55 NewPIDNamespace bool 56 57 // If NewUserNamespace is true, the task should have an independent user 58 // namespace. 59 NewUserNamespace bool 60 61 // If NewNetworkNamespace is true, the task should have an independent 62 // network namespace. 63 NewNetworkNamespace bool 64 65 // If NewFiles is true, the task should use an independent file descriptor 66 // table. 67 NewFiles bool 68 69 // If NewFSContext is true, the task should have an independent FSContext. 70 NewFSContext bool 71 72 // If NewUTSNamespace is true, the task should have an independent UTS 73 // namespace. 74 NewUTSNamespace bool 75 76 // If NewIPCNamespace is true, the task should have an independent IPC 77 // namespace. 78 NewIPCNamespace bool 79 } 80 81 // CloneOptions controls the behavior of Task.Clone. 82 type CloneOptions struct { 83 // SharingOptions defines the set of resources that the new task will share 84 // with its parent. 85 SharingOptions 86 87 // Stack is the initial stack pointer of the new task. If Stack is 0, the 88 // new task will start with the same stack pointer as its parent. 89 Stack hostarch.Addr 90 91 // If SetTLS is true, set the new task's TLS (thread-local storage) 92 // descriptor to TLS. If SetTLS is false, TLS is ignored. 93 SetTLS bool 94 TLS hostarch.Addr 95 96 // If ChildClearTID is true, when the child exits, 0 is written to the 97 // address ChildTID in the child's memory, and if the write is successful a 98 // futex wake on the same address is performed. 99 // 100 // If ChildSetTID is true, the child's thread ID (in the child's PID 101 // namespace) is written to address ChildTID in the child's memory. (As in 102 // Linux, failed writes are silently ignored.) 103 ChildClearTID bool 104 ChildSetTID bool 105 ChildTID hostarch.Addr 106 107 // If ParentSetTID is true, the child's thread ID (in the parent's PID 108 // namespace) is written to address ParentTID in the parent's memory. (As 109 // in Linux, failed writes are silently ignored.) 110 // 111 // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID 112 // causes the child's thread ID to be written to ptid in both the parent 113 // and child's memory, but this is a documentation error fixed by 114 // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). 115 ParentSetTID bool 116 ParentTID hostarch.Addr 117 118 // If Vfork is true, place the parent in vforkStop until the cloned task 119 // releases its TaskImage. 120 Vfork bool 121 122 // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for 123 // this clone(), and do not ptrace-attach the caller's tracer to the new 124 // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). 125 Untraced bool 126 127 // If InheritTracer is true, ptrace-attach the caller's tracer to the new 128 // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported 129 // for it. If both Untraced and InheritTracer are true, no event will be 130 // reported, but tracer inheritance will still occur. 131 InheritTracer bool 132 } 133 134 // Clone implements the clone(2) syscall and returns the thread ID of the new 135 // task in t's PID namespace. Clone may return both a non-zero thread ID and a 136 // non-nil error. 137 // 138 // Preconditions: The caller must be running Task.doSyscallInvoke on the task 139 // goroutine. 140 func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { 141 // Since signal actions may refer to application signal handlers by virtual 142 // address, any set of signal handlers must refer to the same address 143 // space. 144 if !opts.NewSignalHandlers && opts.NewAddressSpace { 145 return 0, nil, linuxerr.EINVAL 146 } 147 // In order for the behavior of thread-group-directed signals to be sane, 148 // all tasks in a thread group must share signal handlers. 149 if !opts.NewThreadGroup && opts.NewSignalHandlers { 150 return 0, nil, linuxerr.EINVAL 151 } 152 // All tasks in a thread group must be in the same PID namespace. 153 if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { 154 return 0, nil, linuxerr.EINVAL 155 } 156 // The two different ways of specifying a new PID namespace are 157 // incompatible. 158 if opts.NewPIDNamespace && t.childPIDNamespace != nil { 159 return 0, nil, linuxerr.EINVAL 160 } 161 // Thread groups and FS contexts cannot span user namespaces. 162 if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { 163 return 0, nil, linuxerr.EINVAL 164 } 165 166 // Pull task registers and FPU state, a cloned task will inherit the 167 // state of the current task. 168 t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) 169 170 // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a 171 // single clone(2) or unshare(2) call, the user namespace is guaranteed to 172 // be created first, giving the child (clone(2)) or caller (unshare(2)) 173 // privileges over the remaining namespaces created by the call." - 174 // user_namespaces(7) 175 creds := t.Credentials() 176 userns := creds.UserNamespace 177 if opts.NewUserNamespace { 178 var err error 179 // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and 180 // the caller is in a chroot environment (i.e., the caller's root 181 // directory does not match the root directory of the mount namespace 182 // in which it resides)." - clone(2). Neither chroot(2) nor 183 // user_namespaces(7) document this. 184 if t.IsChrooted() { 185 return 0, nil, linuxerr.EPERM 186 } 187 userns, err = creds.NewChildUserNamespace() 188 if err != nil { 189 return 0, nil, err 190 } 191 } 192 if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { 193 return 0, nil, linuxerr.EPERM 194 } 195 196 utsns := t.UTSNamespace() 197 if opts.NewUTSNamespace { 198 // Note that this must happen after NewUserNamespace so we get 199 // the new userns if there is one. 200 utsns = t.UTSNamespace().Clone(userns) 201 } 202 203 ipcns := t.IPCNamespace() 204 if opts.NewIPCNamespace { 205 // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC 206 // namespace" 207 ipcns = NewIPCNamespace(userns) 208 } else { 209 ipcns.IncRef() 210 } 211 cu := cleanup.Make(func() { 212 ipcns.DecRef(t) 213 }) 214 defer cu.Clean() 215 216 netns := t.NetworkNamespace() 217 if opts.NewNetworkNamespace { 218 netns = inet.NewNamespace(netns) 219 } 220 221 // TODO(b/63601033): Implement CLONE_NEWNS. 222 mntnsVFS2 := t.mountNamespaceVFS2 223 if mntnsVFS2 != nil { 224 mntnsVFS2.IncRef() 225 cu.Add(func() { 226 mntnsVFS2.DecRef(t) 227 }) 228 } 229 230 image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) 231 if err != nil { 232 return 0, nil, err 233 } 234 cu.Add(func() { 235 image.release() 236 }) 237 // clone() returns 0 in the child. 238 image.Arch.SetReturn(0) 239 if opts.Stack != 0 { 240 image.Arch.SetStack(uintptr(opts.Stack)) 241 } 242 if opts.SetTLS { 243 if !image.Arch.SetTLS(uintptr(opts.TLS)) { 244 return 0, nil, linuxerr.EPERM 245 } 246 } 247 248 var fsContext *FSContext 249 if opts.NewFSContext { 250 fsContext = t.fsContext.Fork() 251 } else { 252 fsContext = t.fsContext 253 fsContext.IncRef() 254 } 255 256 var fdTable *FDTable 257 if opts.NewFiles { 258 fdTable = t.fdTable.Fork(t) 259 } else { 260 fdTable = t.fdTable 261 fdTable.IncRef() 262 } 263 264 pidns := t.tg.pidns 265 if t.childPIDNamespace != nil { 266 pidns = t.childPIDNamespace 267 } else if opts.NewPIDNamespace { 268 pidns = pidns.NewChild(userns) 269 } 270 271 tg := t.tg 272 rseqAddr := hostarch.Addr(0) 273 rseqSignature := uint32(0) 274 if opts.NewThreadGroup { 275 if tg.mounts != nil { 276 tg.mounts.IncRef() 277 } 278 sh := t.tg.signalHandlers 279 if opts.NewSignalHandlers { 280 sh = sh.Fork() 281 } 282 tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) 283 tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) 284 rseqAddr = t.rseqAddr 285 rseqSignature = t.rseqSignature 286 } 287 288 cfg := &TaskConfig{ 289 Kernel: t.k, 290 ThreadGroup: tg, 291 SignalMask: t.SignalMask(), 292 TaskImage: image, 293 FSContext: fsContext, 294 FDTable: fdTable, 295 Credentials: creds, 296 Niceness: t.Niceness(), 297 NetworkNamespace: netns, 298 AllowedCPUMask: t.CPUMask(), 299 UTSNamespace: utsns, 300 IPCNamespace: ipcns, 301 AbstractSocketNamespace: t.abstractSockets, 302 MountNamespaceVFS2: mntnsVFS2, 303 RSeqAddr: rseqAddr, 304 RSeqSignature: rseqSignature, 305 ContainerID: t.ContainerID(), 306 } 307 if opts.NewThreadGroup { 308 cfg.Parent = t 309 } else { 310 cfg.InheritParent = t 311 } 312 nt, err := t.tg.pidns.owner.NewTask(t, cfg) 313 // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does 314 // the cleanup for us. 315 cu.Release() 316 if err != nil { 317 return 0, nil, err 318 } 319 320 // "A child process created via fork(2) inherits a copy of its parent's 321 // alternate signal stack settings" - sigaltstack(2). 322 // 323 // However kernel/fork.c:copy_process() adds a limitation to this: 324 // "sigaltstack should be cleared when sharing the same VM". 325 if opts.NewAddressSpace || opts.Vfork { 326 nt.SetSignalStack(t.SignalStack()) 327 } 328 329 if userns != creds.UserNamespace { 330 if err := nt.SetUserNamespace(userns); err != nil { 331 // This shouldn't be possible: userns was created from nt.creds, so 332 // nt should have CAP_SYS_ADMIN in userns. 333 panic("Task.Clone: SetUserNamespace failed: " + err.Error()) 334 } 335 } 336 337 // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to 338 // nt that it must receive before its task goroutine starts running. 339 tid := nt.k.tasks.Root.IDOfTask(nt) 340 defer nt.Start(tid) 341 t.traceCloneEvent(tid) 342 343 // "If fork/clone and execve are allowed by @prog, any child processes will 344 // be constrained to the same filters and system call ABI as the parent." - 345 // Documentation/prctl/seccomp_filter.txt 346 if f := t.syscallFilters.Load(); f != nil { 347 copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) 348 nt.syscallFilters.Store(copiedFilters) 349 } 350 if opts.Vfork { 351 nt.vforkParent = t 352 } 353 354 if opts.ChildClearTID { 355 nt.SetClearTID(opts.ChildTID) 356 } 357 if opts.ChildSetTID { 358 ctid := nt.ThreadID() 359 ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) 360 } 361 ntid := t.tg.pidns.IDOfTask(nt) 362 if opts.ParentSetTID { 363 ntid.CopyOut(t, opts.ParentTID) 364 } 365 366 kind := ptraceCloneKindClone 367 if opts.Vfork { 368 kind = ptraceCloneKindVfork 369 } else if opts.TerminationSignal == linux.SIGCHLD { 370 kind = ptraceCloneKindFork 371 } 372 if t.ptraceClone(kind, nt, opts) { 373 if opts.Vfork { 374 return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil 375 } 376 return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil 377 } 378 if opts.Vfork { 379 t.maybeBeginVforkStop(nt) 380 return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil 381 } 382 return ntid, nil, nil 383 } 384 385 // maybeBeginVforkStop checks if a previously-started vfork child is still 386 // running and has not yet released its MM, such that its parent t should enter 387 // a vforkStop. 388 // 389 // Preconditions: The caller must be running on t's task goroutine. 390 func (t *Task) maybeBeginVforkStop(child *Task) { 391 t.tg.pidns.owner.mu.RLock() 392 defer t.tg.pidns.owner.mu.RUnlock() 393 t.tg.signalHandlers.mu.Lock() 394 defer t.tg.signalHandlers.mu.Unlock() 395 if t.killedLocked() { 396 child.vforkParent = nil 397 return 398 } 399 if child.vforkParent == t { 400 t.beginInternalStopLocked((*vforkStop)(nil)) 401 } 402 } 403 404 func (t *Task) unstopVforkParent() { 405 t.tg.pidns.owner.mu.RLock() 406 defer t.tg.pidns.owner.mu.RUnlock() 407 if p := t.vforkParent; p != nil { 408 p.tg.signalHandlers.mu.Lock() 409 defer p.tg.signalHandlers.mu.Unlock() 410 if _, ok := p.stop.(*vforkStop); ok { 411 p.endInternalStopLocked() 412 } 413 // Parent no longer needs to be unstopped. 414 t.vforkParent = nil 415 } 416 } 417 418 // +stateify savable 419 type runSyscallAfterPtraceEventClone struct { 420 vforkChild *Task 421 422 // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's 423 // PID namespace. vforkChildTID must be stored since the child may exit and 424 // release its TID before the PTRACE_EVENT stop ends. 425 vforkChildTID ThreadID 426 } 427 428 func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { 429 if r.vforkChild != nil { 430 t.maybeBeginVforkStop(r.vforkChild) 431 return &runSyscallAfterVforkStop{r.vforkChildTID} 432 } 433 return (*runSyscallExit)(nil) 434 } 435 436 // +stateify savable 437 type runSyscallAfterVforkStop struct { 438 // childTID has the same meaning as 439 // runSyscallAfterPtraceEventClone.vforkChildTID. 440 childTID ThreadID 441 } 442 443 func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { 444 t.ptraceVforkDone(r.childTID) 445 return (*runSyscallExit)(nil) 446 } 447 448 // Unshare changes the set of resources t shares with other tasks, as specified 449 // by opts. 450 // 451 // Preconditions: The caller must be running on the task goroutine. 452 func (t *Task) Unshare(opts *SharingOptions) error { 453 // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and 454 // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if 455 // t is the only task using its MM, which due to clone(2)'s rules imply 456 // that it is also the only task using its signal handlers / in its thread 457 // group, and cause EINVAL to be returned otherwise. 458 // 459 // Since we don't count the number of tasks using each address space or set 460 // of signal handlers, we reject NewSignalHandlers and NewAddressSpace 461 // altogether, and interpret NewThreadGroup as requiring that t be the only 462 // member of its thread group. This seems to be logically coherent, in the 463 // sense that clone(2) allows a task to share signal handlers and address 464 // spaces with tasks in other thread groups. 465 if opts.NewAddressSpace || opts.NewSignalHandlers { 466 return linuxerr.EINVAL 467 } 468 creds := t.Credentials() 469 if opts.NewThreadGroup { 470 t.tg.signalHandlers.mu.Lock() 471 if t.tg.tasksCount != 1 { 472 t.tg.signalHandlers.mu.Unlock() 473 return linuxerr.EINVAL 474 } 475 t.tg.signalHandlers.mu.Unlock() 476 // This isn't racy because we're the only living task, and therefore 477 // the only task capable of creating new ones, in our thread group. 478 } 479 if opts.NewUserNamespace { 480 if t.IsChrooted() { 481 return linuxerr.EPERM 482 } 483 newUserNS, err := creds.NewChildUserNamespace() 484 if err != nil { 485 return err 486 } 487 err = t.SetUserNamespace(newUserNS) 488 if err != nil { 489 return err 490 } 491 // Need to reload creds, becaue t.SetUserNamespace() changed task credentials. 492 creds = t.Credentials() 493 } 494 haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN) 495 if opts.NewPIDNamespace { 496 if !haveCapSysAdmin { 497 return linuxerr.EPERM 498 } 499 t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace()) 500 } 501 t.mu.Lock() 502 // Can't defer unlock: DecRefs must occur without holding t.mu. 503 if opts.NewNetworkNamespace { 504 if !haveCapSysAdmin { 505 t.mu.Unlock() 506 return linuxerr.EPERM 507 } 508 t.netns = inet.NewNamespace(t.netns) 509 } 510 if opts.NewUTSNamespace { 511 if !haveCapSysAdmin { 512 t.mu.Unlock() 513 return linuxerr.EPERM 514 } 515 // Note that this must happen after NewUserNamespace, so the 516 // new user namespace is used if there is one. 517 t.utsns = t.utsns.Clone(creds.UserNamespace) 518 } 519 if opts.NewIPCNamespace { 520 if !haveCapSysAdmin { 521 t.mu.Unlock() 522 return linuxerr.EPERM 523 } 524 // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC 525 // namespace" 526 t.ipcns.DecRef(t) 527 t.ipcns = NewIPCNamespace(creds.UserNamespace) 528 } 529 var oldFDTable *FDTable 530 if opts.NewFiles { 531 oldFDTable = t.fdTable 532 t.fdTable = oldFDTable.Fork(t) 533 } 534 var oldFSContext *FSContext 535 if opts.NewFSContext { 536 oldFSContext = t.fsContext 537 t.fsContext = oldFSContext.Fork() 538 } 539 t.mu.Unlock() 540 if oldFDTable != nil { 541 oldFDTable.DecRef(t) 542 } 543 if oldFSContext != nil { 544 oldFSContext.DecRef(t) 545 } 546 return nil 547 } 548 549 // vforkStop is a TaskStop imposed on a task that creates a child with 550 // CLONE_VFORK or vfork(2), that ends when the child task ceases to use its 551 // current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so 552 // that the child and parent share mappings until the child execve()s into a 553 // new process image or exits.) 554 // 555 // +stateify savable 556 type vforkStop struct{} 557 558 // StopIgnoresKill implements TaskStop.Killable. 559 func (*vforkStop) Killable() bool { return true }