github.com/d4l3k/go@v0.0.0-20151015000803-65fc379daeda/src/syscall/exec_linux.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build linux 6 7 package syscall 8 9 import ( 10 "unsafe" 11 ) 12 13 // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux. 14 // See user_namespaces(7). 15 type SysProcIDMap struct { 16 ContainerID int // Container ID. 17 HostID int // Host ID. 18 Size int // Size. 19 } 20 21 type SysProcAttr struct { 22 Chroot string // Chroot. 23 Credential *Credential // Credential. 24 Ptrace bool // Enable tracing. 25 Setsid bool // Create session. 26 Setpgid bool // Set process group ID to Pgid, or, if Pgid == 0, to new pid. 27 Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) 28 Noctty bool // Detach fd 0 from controlling terminal 29 Ctty int // Controlling TTY fd 30 Foreground bool // Place child's process group in foreground. (Implies Setpgid. Uses Ctty as fd of controlling TTY) 31 Pgid int // Child's process group ID if Setpgid. 32 Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) 33 Cloneflags uintptr // Flags for clone calls (Linux only) 34 UidMappings []SysProcIDMap // User ID mappings for user namespaces. 35 GidMappings []SysProcIDMap // Group ID mappings for user namespaces. 36 // GidMappingsEnableSetgroups enabling setgroups syscall. 37 // If false, then setgroups syscall will be disabled for the child process. 38 // This parameter is no-op if GidMappings == nil. Otherwise for unprivileged 39 // users this should be set to false for mappings work. 40 GidMappingsEnableSetgroups bool 41 } 42 43 // Implemented in runtime package. 44 func runtime_BeforeFork() 45 func runtime_AfterFork() 46 47 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. 48 // If a dup or exec fails, write the errno error to pipe. 49 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) 50 // In the child, this function must not acquire any locks, because 51 // they might have been locked at the time of the fork. This means 52 // no rescheduling, no malloc calls, and no new stack segments. 53 // For the same reason compiler does not race instrument it. 54 // The calls to RawSyscall are okay because they are assembly 55 // functions that do not grow the stack. 56 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { 57 // Declare all variables at top in case any 58 // declarations require heap allocation (e.g., err1). 59 var ( 60 r1 uintptr 61 err1 Errno 62 err2 Errno 63 nextfd int 64 i int 65 p [2]int 66 ) 67 68 // Record parent PID so child can test if it has died. 69 ppid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) 70 71 // Guard against side effects of shuffling fds below. 72 // Make sure that nextfd is beyond any currently open files so 73 // that we can't run the risk of overwriting any of them. 74 fd := make([]int, len(attr.Files)) 75 nextfd = len(attr.Files) 76 for i, ufd := range attr.Files { 77 if nextfd < int(ufd) { 78 nextfd = int(ufd) 79 } 80 fd[i] = int(ufd) 81 } 82 nextfd++ 83 84 // Allocate another pipe for parent to child communication for 85 // synchronizing writing of User ID/Group ID mappings. 86 if sys.UidMappings != nil || sys.GidMappings != nil { 87 if err := forkExecPipe(p[:]); err != nil { 88 return 0, err.(Errno) 89 } 90 } 91 92 // About to call fork. 93 // No more allocation or calls of non-assembly functions. 94 runtime_BeforeFork() 95 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) 96 if err1 != 0 { 97 runtime_AfterFork() 98 return 0, err1 99 } 100 101 if r1 != 0 { 102 // parent; return PID 103 runtime_AfterFork() 104 pid = int(r1) 105 106 if sys.UidMappings != nil || sys.GidMappings != nil { 107 Close(p[0]) 108 err := writeUidGidMappings(pid, sys) 109 if err != nil { 110 err2 = err.(Errno) 111 } 112 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 113 Close(p[1]) 114 } 115 116 return pid, 0 117 } 118 119 // Fork succeeded, now in child. 120 121 // Wait for User ID/Group ID mappings to be written. 122 if sys.UidMappings != nil || sys.GidMappings != nil { 123 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { 124 goto childerror 125 } 126 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 127 if err1 != 0 { 128 goto childerror 129 } 130 if r1 != unsafe.Sizeof(err2) { 131 err1 = EINVAL 132 goto childerror 133 } 134 if err2 != 0 { 135 err1 = err2 136 goto childerror 137 } 138 } 139 140 // Enable tracing if requested. 141 if sys.Ptrace { 142 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0) 143 if err1 != 0 { 144 goto childerror 145 } 146 } 147 148 // Session ID 149 if sys.Setsid { 150 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0) 151 if err1 != 0 { 152 goto childerror 153 } 154 } 155 156 // Set process group 157 if sys.Setpgid || sys.Foreground { 158 // Place child in process group. 159 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0) 160 if err1 != 0 { 161 goto childerror 162 } 163 } 164 165 if sys.Foreground { 166 pgrp := int32(sys.Pgid) 167 if pgrp == 0 { 168 r1, _, err1 = RawSyscall(SYS_GETPID, 0, 0, 0) 169 if err1 != 0 { 170 goto childerror 171 } 172 173 pgrp = int32(r1) 174 } 175 176 // Place process group in foreground. 177 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp))) 178 if err1 != 0 { 179 goto childerror 180 } 181 } 182 183 // Chroot 184 if chroot != nil { 185 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0) 186 if err1 != 0 { 187 goto childerror 188 } 189 } 190 191 // User and groups 192 if cred := sys.Credential; cred != nil { 193 ngroups := uintptr(len(cred.Groups)) 194 if ngroups > 0 { 195 groups := unsafe.Pointer(&cred.Groups[0]) 196 _, _, err1 = RawSyscall(SYS_SETGROUPS, ngroups, uintptr(groups), 0) 197 if err1 != 0 { 198 goto childerror 199 } 200 } 201 _, _, err1 = RawSyscall(SYS_SETGID, uintptr(cred.Gid), 0, 0) 202 if err1 != 0 { 203 goto childerror 204 } 205 _, _, err1 = RawSyscall(SYS_SETUID, uintptr(cred.Uid), 0, 0) 206 if err1 != 0 { 207 goto childerror 208 } 209 } 210 211 // Chdir 212 if dir != nil { 213 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) 214 if err1 != 0 { 215 goto childerror 216 } 217 } 218 219 // Parent death signal 220 if sys.Pdeathsig != 0 { 221 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) 222 if err1 != 0 { 223 goto childerror 224 } 225 226 // Signal self if parent is already dead. This might cause a 227 // duplicate signal in rare cases, but it won't matter when 228 // using SIGKILL. 229 r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0) 230 if r1 != ppid { 231 pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) 232 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0) 233 if err1 != 0 { 234 goto childerror 235 } 236 } 237 } 238 239 // Pass 1: look for fd[i] < i and move those up above len(fd) 240 // so that pass 2 won't stomp on an fd it needs later. 241 if pipe < nextfd { 242 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0) 243 if err1 != 0 { 244 goto childerror 245 } 246 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 247 pipe = nextfd 248 nextfd++ 249 } 250 for i = 0; i < len(fd); i++ { 251 if fd[i] >= 0 && fd[i] < int(i) { 252 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0) 253 if err1 != 0 { 254 goto childerror 255 } 256 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 257 fd[i] = nextfd 258 nextfd++ 259 if nextfd == pipe { // don't stomp on pipe 260 nextfd++ 261 } 262 } 263 } 264 265 // Pass 2: dup fd[i] down onto i. 266 for i = 0; i < len(fd); i++ { 267 if fd[i] == -1 { 268 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 269 continue 270 } 271 if fd[i] == int(i) { 272 // dup2(i, i) won't clear close-on-exec flag on Linux, 273 // probably not elsewhere either. 274 _, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0) 275 if err1 != 0 { 276 goto childerror 277 } 278 continue 279 } 280 // The new fd is created NOT close-on-exec, 281 // which is exactly what we want. 282 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0) 283 if err1 != 0 { 284 goto childerror 285 } 286 } 287 288 // By convention, we don't close-on-exec the fds we are 289 // started with, so if len(fd) < 3, close 0, 1, 2 as needed. 290 // Programs that know they inherit fds >= 3 will need 291 // to set them close-on-exec. 292 for i = len(fd); i < 3; i++ { 293 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 294 } 295 296 // Detach fd 0 from tty 297 if sys.Noctty { 298 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0) 299 if err1 != 0 { 300 goto childerror 301 } 302 } 303 304 // Set the controlling TTY to Ctty 305 if sys.Setctty { 306 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 0) 307 if err1 != 0 { 308 goto childerror 309 } 310 } 311 312 // Time to exec. 313 _, _, err1 = RawSyscall(SYS_EXECVE, 314 uintptr(unsafe.Pointer(argv0)), 315 uintptr(unsafe.Pointer(&argv[0])), 316 uintptr(unsafe.Pointer(&envv[0]))) 317 318 childerror: 319 // send error code on pipe 320 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) 321 for { 322 RawSyscall(SYS_EXIT, 253, 0, 0) 323 } 324 } 325 326 // Try to open a pipe with O_CLOEXEC set on both file descriptors. 327 func forkExecPipe(p []int) (err error) { 328 err = Pipe2(p, O_CLOEXEC) 329 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it 330 // might not be implemented. 331 if err == ENOSYS { 332 if err = Pipe(p); err != nil { 333 return 334 } 335 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { 336 return 337 } 338 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) 339 } 340 return 341 } 342 343 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. 344 func writeIDMappings(path string, idMap []SysProcIDMap) error { 345 fd, err := Open(path, O_RDWR, 0) 346 if err != nil { 347 return err 348 } 349 350 data := "" 351 for _, im := range idMap { 352 data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n" 353 } 354 355 bytes, err := ByteSliceFromString(data) 356 if err != nil { 357 Close(fd) 358 return err 359 } 360 361 if _, err := Write(fd, bytes); err != nil { 362 Close(fd) 363 return err 364 } 365 366 if err := Close(fd); err != nil { 367 return err 368 } 369 370 return nil 371 } 372 373 // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false 374 // and "allow" if enable is true. 375 // This is needed since kernel 3.19, because you can't write gid_map without 376 // disabling setgroups() system call. 377 func writeSetgroups(pid int, enable bool) error { 378 sgf := "/proc/" + itoa(pid) + "/setgroups" 379 fd, err := Open(sgf, O_RDWR, 0) 380 if err != nil { 381 return err 382 } 383 384 var data []byte 385 if enable { 386 data = []byte("allow") 387 } else { 388 data = []byte("deny") 389 } 390 391 if _, err := Write(fd, data); err != nil { 392 Close(fd) 393 return err 394 } 395 396 return Close(fd) 397 } 398 399 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces 400 // for a process and it is called from the parent process. 401 func writeUidGidMappings(pid int, sys *SysProcAttr) error { 402 if sys.UidMappings != nil { 403 uidf := "/proc/" + itoa(pid) + "/uid_map" 404 if err := writeIDMappings(uidf, sys.UidMappings); err != nil { 405 return err 406 } 407 } 408 409 if sys.GidMappings != nil { 410 // If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK. 411 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT { 412 return err 413 } 414 gidf := "/proc/" + itoa(pid) + "/gid_map" 415 if err := writeIDMappings(gidf, sys.GidMappings); err != nil { 416 return err 417 } 418 } 419 420 return nil 421 }