// Source: github.com/LanceLRQ/deer-common@v0.0.9-0.20210319081233-e8222ac018a8/sandbox/forkexec/exec_linux.go

// +build linux,amd64

package forkexec

import (
	"runtime"
	"syscall"
	"unsafe"
)

// capHeader is the header structure handed to the capget/capset system
// calls issued by forkAndExecInChild1. Its version field is set to
// _LINUX_CAPABILITY_VERSION_3 before use.
type capHeader struct {
	version uint32
	pid     int32
}

// capData holds one 32-bit word of each capability set (effective,
// permitted, inheritable) as exchanged with capget/capset.
type capData struct {
	effective   uint32
	permitted   uint32
	inheritable uint32
}

// caps bundles the capability header with its two data words. The data
// entry for a given capability is selected with capToIndex and the bit
// within it with capToMask.
type caps struct {
	hdr  capHeader
	data [2]capData
}

// SysProcAttr holds the optional, Linux-specific attributes that
// forkAndExecInChild1 applies to the child process between fork and exec.
type SysProcAttr struct {
	Chroot     string              // Chroot.
	Credential *syscall.Credential // Credential.
	// Ptrace tells the child to call ptrace(PTRACE_TRACEME).
	// Call runtime.LockOSThread before starting a process with this set,
	// and don't call UnlockOSThread until done with PtraceSyscall calls.
	Ptrace bool
	Setsid bool // Create session.
	// Setpgid sets the process group ID of the child to Pgid,
	// or, if Pgid == 0, to the new child's process ID.
	Setpgid bool
	// Setctty sets the controlling terminal of the child to
	// file descriptor Ctty. Ctty must be a descriptor number
	// in the child process: an index into ProcAttr.Files.
	// This is only meaningful if Setsid is true.
	Setctty bool
	Noctty  bool // Detach fd 0 from controlling terminal
	Ctty    int  // Controlling TTY fd
	// Foreground places the child process group in the foreground.
	// This implies Setpgid. The Ctty field must be set to
	// the descriptor of the controlling TTY.
	// Unlike Setctty, in this case Ctty must be a descriptor
	// number in the parent process.
	Foreground   bool
	Pgid         int                    // Child's process group ID if Setpgid.
	Pdeathsig    syscall.Signal         // Signal that the process will get when its parent dies (Linux only)
	Cloneflags   uintptr                // Flags for clone calls (Linux only)
	Unshareflags uintptr                // Flags for unshare calls (Linux only)
	UidMappings  []syscall.SysProcIDMap // User ID mappings for user namespaces.
	GidMappings  []syscall.SysProcIDMap // Group ID mappings for user namespaces.
	// GidMappingsEnableSetgroups enables the setgroups syscall.
	// If false, then the setgroups syscall will be disabled for the child process.
	// This parameter is a no-op if GidMappings == nil. Otherwise, for unprivileged
	// users this should be set to false for the mappings to work.
	GidMappingsEnableSetgroups bool
	AmbientCaps                []uintptr  // Ambient capabilities (Linux only)
	Rlimit                     ExecRLimit // Set child's rlimit.
}

// _LINUX_CAPABILITY_VERSION_3 is the capability header version written to
// capHeader.version before calling capget/capset.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522

// none and slash are NUL-terminated byte strings ("none" and "/") passed as
// the source and target of the mount call that marks / as MS_PRIVATE after
// unsharing the mount namespace.
var (
	none  = [...]byte{'n', 'o', 'n', 'e', 0}
	slash = [...]byte{'/', 0}
)

// fcntl64Syscall is usually SYS_FCNTL, but is overridden on 32-bit Linux
// systems by flock_linux_32bit.go to be SYS_FCNTL64.
var fcntl64Syscall uintptr = syscall.SYS_FCNTL

// capToIndex returns the index of the 32-bit capability word that holds
// cap (cap divided by 32).
// See CAP_TO_INDEX in linux/capability.h:
func capToIndex(cap uintptr) uintptr { return cap >> 5 }

// capToMask returns the bit mask of cap within its 32-bit capability word.
// See CAP_TO_MASK in linux/capability.h:
func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) }

// forkExecPipe tries to open a pipe with O_CLOEXEC set on both file
// descriptors, storing them in p. If pipe2 is not implemented (ENOSYS) it
// falls back to a plain pipe and sets FD_CLOEXEC on each end with fcntl.
func forkExecPipe(p []int) (err error) {
	err = Pipe2(p, syscall.O_CLOEXEC)
	// pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it
	// might not be implemented.
	if err == syscall.ENOSYS {
		if err = syscall.Pipe(p); err != nil {
			return
		}
		if _, err = fcntl(p[0], syscall.F_SETFD, syscall.FD_CLOEXEC); err != nil {
			return
		}
		_, err = fcntl(p[1], syscall.F_SETFD, syscall.FD_CLOEXEC)
	}
	return
}

// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
// vfork.
//
// This is go:noinline because the point is to keep the stack frames
// of this and forkAndExecInChild separate.
107 // 108 //go:noinline 109 //go:norace 110 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 syscall.Errno, p [2]int, locked bool) { 111 // Defined in linux/prctl.h starting with Linux 4.3. 112 const ( 113 PR_CAP_AMBIENT = 0x2f 114 PR_CAP_AMBIENT_RAISE = 0x2 115 ) 116 117 // vfork requires that the child not touch any of the parent's 118 // active stack frames. Hence, the child does all post-fork 119 // processing in this stack frame and never returns, while the 120 // parent returns immediately from this frame and does all 121 // post-fork processing in the outer frame. 122 // Declare all variables at top in case any 123 // declarations require heap allocation (e.g., err1). 124 var ( 125 err2 syscall.Errno 126 nextfd int 127 i int 128 caps caps 129 fd1 uintptr 130 puid, psetgroups, pgid []byte 131 uidmap, setgroups, gidmap []byte 132 ) 133 134 // Load rlimit options 135 rlimitOptions := GetRlimitOptions(&sys.Rlimit) 136 137 if sys.UidMappings != nil { 138 puid = []byte("/proc/self/uid_map\000") 139 uidmap = formatIDMappings(sys.UidMappings) 140 } 141 142 if sys.GidMappings != nil { 143 psetgroups = []byte("/proc/self/setgroups\000") 144 pgid = []byte("/proc/self/gid_map\000") 145 146 if sys.GidMappingsEnableSetgroups { 147 setgroups = []byte("allow\000") 148 } else { 149 setgroups = []byte("deny\000") 150 } 151 gidmap = formatIDMappings(sys.GidMappings) 152 } 153 154 // Record parent PID so child can test if it has died. 155 ppid, _ := rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0) 156 157 // Guard against side effects of shuffling fds below. 158 // Make sure that nextfd is beyond any currently open files so 159 // that we can't run the risk of overwriting any of them. 
160 fd := make([]int, len(attr.Files)) 161 nextfd = len(attr.Files) 162 for i, ufd := range attr.Files { 163 if nextfd < int(ufd) { 164 nextfd = int(ufd) 165 } 166 fd[i] = int(ufd) 167 } 168 nextfd++ 169 170 // Allocate another pipe for parent to child communication for 171 // synchronizing writing of User ID/Group ID mappings. 172 if sys.UidMappings != nil || sys.GidMappings != nil { 173 if err := forkExecPipe(p[:]); err != nil { 174 err1 = err.(syscall.Errno) 175 return 176 } 177 } 178 179 var hasRawVforkSyscall bool 180 switch runtime.GOARCH { 181 case "amd64", "arm64", "ppc64", "riscv64", "s390x": 182 hasRawVforkSyscall = true 183 } 184 185 // About to call fork. 186 // No more allocation or calls of non-assembly functions. 187 runtime_BeforeFork() 188 locked = true 189 switch { 190 case hasRawVforkSyscall && (sys.Cloneflags&syscall.CLONE_NEWUSER == 0 && sys.Unshareflags&syscall.CLONE_NEWUSER == 0): 191 r1, err1 = rawVforkSyscall(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD|syscall.CLONE_VFORK|syscall.CLONE_VM)|sys.Cloneflags) 192 case runtime.GOARCH == "s390x": 193 r1, _, err1 = syscall.RawSyscall6(syscall.SYS_CLONE, 0, uintptr(syscall.SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0) 194 default: 195 r1, _, err1 = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) 196 } 197 if err1 != 0 || r1 != 0 { 198 // If we're in the parent, we must return immediately 199 // so we're not in the same stack frame as the child. 200 // This can at most use the return PC, which the child 201 // will not modify, and the results of 202 // rawVforkSyscall, which must have been written after 203 // the child was replaced. 204 return 205 } 206 207 // Fork succeeded, now in child. 208 209 runtime_AfterForkInChild() 210 211 // Enable the "keep capabilities" flag to set ambient capabilities later. 
212 if len(sys.AmbientCaps) > 0 { 213 _, _, err1 = syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_KEEPCAPS, 1, 0, 0, 0, 0) 214 if err1 != 0 { 215 goto childerror 216 } 217 } 218 219 // Wait for User ID/Group ID mappings to be written. 220 if sys.UidMappings != nil || sys.GidMappings != nil { 221 if _, _, err1 = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { 222 goto childerror 223 } 224 r1, _, err1 = syscall.RawSyscall(syscall.SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 225 if err1 != 0 { 226 goto childerror 227 } 228 if r1 != unsafe.Sizeof(err2) { 229 err1 = syscall.EINVAL 230 goto childerror 231 } 232 if err2 != 0 { 233 err1 = err2 234 goto childerror 235 } 236 } 237 238 // Session ID 239 if sys.Setsid { 240 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETSID, 0, 0, 0) 241 if err1 != 0 { 242 goto childerror 243 } 244 } 245 246 // Set process group 247 if sys.Setpgid || sys.Foreground { 248 // Place child in process group. 249 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETPGID, 0, uintptr(sys.Pgid), 0) 250 if err1 != 0 { 251 goto childerror 252 } 253 } 254 255 if sys.Foreground { 256 pgrp := int32(sys.Pgid) 257 if pgrp == 0 { 258 r1, _ = rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0) 259 260 pgrp = int32(r1) 261 } 262 263 // Place process group in foreground. 
264 _, _, err1 = syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(sys.Ctty), uintptr(syscall.TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp))) 265 if err1 != 0 { 266 goto childerror 267 } 268 } 269 270 // Unshare 271 if sys.Unshareflags != 0 { 272 _, _, err1 = syscall.RawSyscall(syscall.SYS_UNSHARE, sys.Unshareflags, 0, 0) 273 if err1 != 0 { 274 goto childerror 275 } 276 277 if sys.Unshareflags&syscall.CLONE_NEWUSER != 0 && sys.GidMappings != nil { 278 dirfd := int(_AT_FDCWD) 279 if fd1, _, err1 = syscall.RawSyscall6(syscall.SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(syscall.O_WRONLY), 0, 0, 0); err1 != 0 { 280 goto childerror 281 } 282 r1, _, err1 = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups))) 283 if err1 != 0 { 284 goto childerror 285 } 286 if _, _, err1 = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 { 287 goto childerror 288 } 289 290 if fd1, _, err1 = syscall.RawSyscall6(syscall.SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(syscall.O_WRONLY), 0, 0, 0); err1 != 0 { 291 goto childerror 292 } 293 r1, _, err1 = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap))) 294 if err1 != 0 { 295 goto childerror 296 } 297 if _, _, err1 = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(fd1), 0, 0); err1 != 0 { 298 goto childerror 299 } 300 } 301 302 if sys.Unshareflags&syscall.CLONE_NEWUSER != 0 && sys.UidMappings != nil { 303 dirfd := int(_AT_FDCWD) 304 if fd1, _, err1 = syscall.RawSyscall6(syscall.SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(syscall.O_WRONLY), 0, 0, 0); err1 != 0 { 305 goto childerror 306 } 307 r1, _, err1 = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd1), uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap))) 308 if err1 != 0 { 309 goto childerror 310 } 311 if _, _, err1 = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(fd1), 0, 
0); err1 != 0 { 312 goto childerror 313 } 314 } 315 316 // The unshare system call in Linux doesn't unshare mount points 317 // mounted with --shared. Systemd mounts / with --shared. For a 318 // long discussion of the pros and cons of this see debian bug 739593. 319 // The Go model of unsharing is more like Plan 9, where you ask 320 // to unshare and the namespaces are unconditionally unshared. 321 // To make this model work we must further mark / as MS_PRIVATE. 322 // This is what the standard unshare command does. 323 if sys.Unshareflags&syscall.CLONE_NEWNS == syscall.CLONE_NEWNS { 324 _, _, err1 = syscall.RawSyscall6(syscall.SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])), uintptr(unsafe.Pointer(&slash[0])), 0, syscall.MS_REC|syscall.MS_PRIVATE, 0, 0) 325 if err1 != 0 { 326 goto childerror 327 } 328 } 329 } 330 331 // Chroot 332 if chroot != nil { 333 _, _, err1 = syscall.RawSyscall(syscall.SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0) 334 if err1 != 0 { 335 goto childerror 336 } 337 } 338 339 // User and groups 340 if cred := sys.Credential; cred != nil { 341 ngroups := uintptr(len(cred.Groups)) 342 groups := uintptr(0) 343 if ngroups > 0 { 344 groups = uintptr(unsafe.Pointer(&cred.Groups[0])) 345 } 346 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups { 347 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETGROUPS, ngroups, groups, 0) 348 if err1 != 0 { 349 goto childerror 350 } 351 } 352 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETGID, uintptr(cred.Gid), 0, 0) 353 if err1 != 0 { 354 goto childerror 355 } 356 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETUID, uintptr(cred.Uid), 0, 0) 357 if err1 != 0 { 358 goto childerror 359 } 360 } 361 362 if len(sys.AmbientCaps) != 0 { 363 // Ambient capabilities were added in the 4.3 kernel, 364 // so it is safe to always use _LINUX_CAPABILITY_VERSION_3. 
365 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3 366 367 if _, _, err1 := syscall.RawSyscall(syscall.SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 { 368 goto childerror 369 } 370 371 for _, c := range sys.AmbientCaps { 372 // Add the c capability to the permitted and inheritable capability mask, 373 // otherwise we will not be able to add it to the ambient capability mask. 374 caps.data[capToIndex(c)].permitted |= capToMask(c) 375 caps.data[capToIndex(c)].inheritable |= capToMask(c) 376 } 377 378 if _, _, err1 := syscall.RawSyscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 { 379 goto childerror 380 } 381 382 for _, c := range sys.AmbientCaps { 383 _, _, err1 = syscall.RawSyscall6(syscall.SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0) 384 if err1 != 0 { 385 goto childerror 386 } 387 } 388 } 389 390 // Chdir 391 if dir != nil { 392 _, _, err1 = syscall.RawSyscall(syscall.SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) 393 if err1 != 0 { 394 goto childerror 395 } 396 } 397 398 // Parent death signal 399 if sys.Pdeathsig != 0 { 400 _, _, err1 = syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) 401 if err1 != 0 { 402 goto childerror 403 } 404 405 // Signal self if parent is already dead. This might cause a 406 // duplicate signal in rare cases, but it won't matter when 407 // using SIGKILL. 408 r1, _ = rawSyscallNoError(syscall.SYS_GETPPID, 0, 0, 0) 409 if r1 != ppid { 410 pid, _ := rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0) 411 _, _, err1 := syscall.RawSyscall(syscall.SYS_KILL, pid, uintptr(sys.Pdeathsig), 0) 412 if err1 != 0 { 413 goto childerror 414 } 415 } 416 } 417 418 // Pass 1: look for fd[i] < i and move those up above len(fd) 419 // so that pass 2 won't stomp on an fd it needs later. 
420 if pipe < nextfd { 421 _, _, err1 = syscall.RawSyscall(syscall.SYS_DUP3, uintptr(pipe), uintptr(nextfd), syscall.O_CLOEXEC) 422 if err1 == syscall.ENOSYS { 423 _, _, err1 = syscall.RawSyscall(syscall.SYS_DUP2, uintptr(pipe), uintptr(nextfd), 0) 424 if err1 != 0 { 425 goto childerror 426 } 427 syscall.RawSyscall(fcntl64Syscall, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC) 428 } else if err1 != 0 { 429 goto childerror 430 } 431 pipe = nextfd 432 nextfd++ 433 } 434 for i = 0; i < len(fd); i++ { 435 if fd[i] >= 0 && fd[i] < int(i) { 436 if nextfd == pipe { // don't stomp on pipe 437 nextfd++ 438 } 439 _, _, err1 = syscall.RawSyscall(syscall.SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), syscall.O_CLOEXEC) 440 if err1 == syscall.ENOSYS { 441 _, _, err1 = syscall.RawSyscall(syscall.SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0) 442 if err1 != 0 { 443 goto childerror 444 } 445 syscall.RawSyscall(fcntl64Syscall, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC) 446 } else if err1 != 0 { 447 goto childerror 448 } 449 fd[i] = nextfd 450 nextfd++ 451 } 452 } 453 454 // Pass 2: dup fd[i] down onto i. 455 for i = 0; i < len(fd); i++ { 456 if fd[i] == -1 { 457 syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(i), 0, 0) 458 continue 459 } 460 if fd[i] == int(i) { 461 // dup2(i, i) won't clear close-on-exec flag on Linux, 462 // probably not elsewhere either. 463 _, _, err1 = syscall.RawSyscall(fcntl64Syscall, uintptr(fd[i]), syscall.F_SETFD, 0) 464 if err1 != 0 { 465 goto childerror 466 } 467 continue 468 } 469 // The new fd is created NOT close-on-exec, 470 // which is exactly what we want. 471 _, _, err1 = syscall.RawSyscall(syscall.SYS_DUP2, uintptr(fd[i]), uintptr(i), 0) 472 if err1 != 0 { 473 goto childerror 474 } 475 } 476 477 // By convention, we don't close-on-exec the fds we are 478 // started with, so if len(fd) < 3, close 0, 1, 2 as needed. 479 // Programs that know they inherit fds >= 3 will need 480 // to set them close-on-exec. 
481 for i = len(fd); i < 3; i++ { 482 syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(i), 0, 0) 483 } 484 485 // Detach fd 0 from tty 486 if sys.Noctty { 487 _, _, err1 = syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCNOTTY), 0) 488 if err1 != 0 { 489 goto childerror 490 } 491 } 492 493 // Set the controlling TTY to Ctty 494 if sys.Setctty { 495 _, _, err1 = syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(sys.Ctty), uintptr(syscall.TIOCSCTTY), 1) 496 if err1 != 0 { 497 goto childerror 498 } 499 } 500 501 // Enable tracing if requested. 502 // Do this right before exec so that we don't unnecessarily trace the runtime 503 // setting up after the fork. See issue #21428. 504 if sys.Ptrace { 505 _, _, err1 = syscall.RawSyscall(syscall.SYS_PTRACE, uintptr(syscall.PTRACE_TRACEME), 0, 0) 506 if err1 != 0 { 507 goto childerror 508 } 509 } 510 511 // Set resource limitations 512 for _, rlimit := range rlimitOptions.Rlimits { 513 if !rlimit.Enable { 514 continue 515 } 516 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETRLIMIT, uintptr(rlimit.Which), uintptr(unsafe.Pointer(&rlimit.RLim)), 0) 517 if err1 != 0 { 518 goto childerror 519 } 520 } 521 522 // Set real time limitation 523 if sys.Rlimit.RealTimeLimit > 0 { 524 _, _, err1 = syscall.RawSyscall(syscall.SYS_SETITIMER, ITIMER_REAL, uintptr(unsafe.Pointer(&rlimitOptions.ITimerValue)), 0) 525 if err1 != 0 { 526 goto childerror 527 } 528 } 529 530 // Time to exec. 531 _, _, err1 = syscall.RawSyscall(syscall.SYS_EXECVE, 532 uintptr(unsafe.Pointer(argv0)), 533 uintptr(unsafe.Pointer(&argv[0])), 534 uintptr(unsafe.Pointer(&envv[0]))) 535 536 childerror: 537 // send error code on pipe 538 syscall.RawSyscall(syscall.SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) 539 for { 540 syscall.RawSyscall(syscall.SYS_EXIT, 253, 0, 0) 541 } 542 } 543 544 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. 545 // If a dup or exec fails, write the errno error to pipe. 
546 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) 547 // In the child, this function must not acquire any locks, because 548 // they might have been locked at the time of the fork. This means 549 // no rescheduling, no malloc calls, and no new stack segments. 550 // For the same reason compiler does not race instrument it. 551 // The calls to RawSyscall are okay because they are assembly 552 // functions that do not grow the stack. 553 //go:norace 554 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err syscall.Errno) { 555 // Set up and fork. This returns immediately in the parent or 556 // if there's an error. 557 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe) 558 if locked { 559 runtime_AfterFork() 560 } 561 if err1 != 0 { 562 return 0, err1 563 } 564 565 // parent; return PID 566 pid = int(r1) 567 568 if sys.UidMappings != nil || sys.GidMappings != nil { 569 syscall.Close(p[0]) 570 var err2 syscall.Errno 571 // uid/gid mappings will be written after fork and unshare(2) for user 572 // namespaces. 573 if sys.Unshareflags&syscall.CLONE_NEWUSER == 0 { 574 if err := writeUidGidMappings(pid, sys); err != nil { 575 err2 = err.(syscall.Errno) 576 } 577 } 578 syscall.RawSyscall(syscall.SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 579 syscall.Close(p[1]) 580 } 581 582 return pid, 0 583 } 584 585 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces 586 // for a process and it is called from the parent process. 
587 func writeUidGidMappings(pid int, sys *SysProcAttr) error { 588 if sys.UidMappings != nil { 589 uidf := "/proc/" + itoa(pid) + "/uid_map" 590 if err := writeIDMappings(uidf, sys.UidMappings); err != nil { 591 return err 592 } 593 } 594 595 if sys.GidMappings != nil { 596 // If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK. 597 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != syscall.ENOENT { 598 return err 599 } 600 gidf := "/proc/" + itoa(pid) + "/gid_map" 601 if err := writeIDMappings(gidf, sys.GidMappings); err != nil { 602 return err 603 } 604 } 605 606 return nil 607 }