github.com/reiver/go@v0.0.0-20150109200633-1d0c7792f172/src/syscall/exec_linux.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build linux 6 7 package syscall 8 9 import ( 10 "unsafe" 11 ) 12 13 // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux. 14 // See user_namespaces(7). 15 type SysProcIDMap struct { 16 ContainerID int // Container ID. 17 HostID int // Host ID. 18 Size int // Size. 19 } 20 21 type SysProcAttr struct { 22 Chroot string // Chroot. 23 Credential *Credential // Credential. 24 Ptrace bool // Enable tracing. 25 Setsid bool // Create session. 26 Setpgid bool // Set process group ID to new pid (SYSV setpgrp) 27 Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) 28 Noctty bool // Detach fd 0 from controlling terminal 29 Ctty int // Controlling TTY fd (Linux only) 30 Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) 31 Cloneflags uintptr // Flags for clone calls (Linux only) 32 UidMappings []SysProcIDMap // User ID mappings for user namespaces. 33 GidMappings []SysProcIDMap // Group ID mappings for user namespaces. 34 } 35 36 // Implemented in runtime package. 37 func runtime_BeforeFork() 38 func runtime_AfterFork() 39 40 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. 41 // If a dup or exec fails, write the errno error to pipe. 42 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) 43 // In the child, this function must not acquire any locks, because 44 // they might have been locked at the time of the fork. This means 45 // no rescheduling, no malloc calls, and no new stack segments. 46 // For the same reason compiler does not race instrument it. 47 // The calls to RawSyscall are okay because they are assembly 48 // functions that do not grow the stack. 49 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { 50 // Declare all variables at top in case any 51 // declarations require heap allocation (e.g., err1). 52 var ( 53 r1 uintptr 54 err1 Errno 55 err2 Errno 56 nextfd int 57 i int 58 p [2]int 59 ) 60 61 // Record parent PID so child can test if it has died. 62 ppid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) 63 64 // Guard against side effects of shuffling fds below. 65 // Make sure that nextfd is beyond any currently open files so 66 // that we can't run the risk of overwriting any of them. 67 fd := make([]int, len(attr.Files)) 68 nextfd = len(attr.Files) 69 for i, ufd := range attr.Files { 70 if nextfd < int(ufd) { 71 nextfd = int(ufd) 72 } 73 fd[i] = int(ufd) 74 } 75 nextfd++ 76 77 // Allocate another pipe for parent to child communication for 78 // synchronizing writing of User ID/Group ID mappings. 79 if sys.UidMappings != nil || sys.GidMappings != nil { 80 if err := forkExecPipe(p[:]); err != nil { 81 return 0, err.(Errno) 82 } 83 } 84 85 // About to call fork. 86 // No more allocation or calls of non-assembly functions. 87 runtime_BeforeFork() 88 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) 89 if err1 != 0 { 90 runtime_AfterFork() 91 return 0, err1 92 } 93 94 if r1 != 0 { 95 // parent; return PID 96 runtime_AfterFork() 97 pid = int(r1) 98 99 if sys.UidMappings != nil || sys.GidMappings != nil { 100 Close(p[0]) 101 err := writeUidGidMappings(pid, sys) 102 if err != nil { 103 err2 = err.(Errno) 104 } 105 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 106 Close(p[1]) 107 } 108 109 return pid, 0 110 } 111 112 // Fork succeeded, now in child. 113 114 // Wait for User ID/Group ID mappings to be written. 115 if sys.UidMappings != nil || sys.GidMappings != nil { 116 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { 117 goto childerror 118 } 119 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 120 if err1 != 0 { 121 goto childerror 122 } 123 if r1 != unsafe.Sizeof(err2) { 124 err1 = EINVAL 125 goto childerror 126 } 127 if err2 != 0 { 128 err1 = err2 129 goto childerror 130 } 131 } 132 133 // Parent death signal 134 if sys.Pdeathsig != 0 { 135 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) 136 if err1 != 0 { 137 goto childerror 138 } 139 140 // Signal self if parent is already dead. This might cause a 141 // duplicate signal in rare cases, but it won't matter when 142 // using SIGKILL. 143 r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0) 144 if r1 != ppid { 145 pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) 146 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0) 147 if err1 != 0 { 148 goto childerror 149 } 150 } 151 } 152 153 // Enable tracing if requested. 154 if sys.Ptrace { 155 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0) 156 if err1 != 0 { 157 goto childerror 158 } 159 } 160 161 // Session ID 162 if sys.Setsid { 163 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0) 164 if err1 != 0 { 165 goto childerror 166 } 167 } 168 169 // Set process group 170 if sys.Setpgid { 171 _, _, err1 = RawSyscall(SYS_SETPGID, 0, 0, 0) 172 if err1 != 0 { 173 goto childerror 174 } 175 } 176 177 // Chroot 178 if chroot != nil { 179 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0) 180 if err1 != 0 { 181 goto childerror 182 } 183 } 184 185 // User and groups 186 if cred := sys.Credential; cred != nil { 187 ngroups := uintptr(len(cred.Groups)) 188 var groups unsafe.Pointer 189 if ngroups > 0 { 190 groups = unsafe.Pointer(&cred.Groups[0]) 191 } 192 _, _, err1 = RawSyscall(SYS_SETGROUPS, ngroups, uintptr(groups), 0) 193 if err1 != 0 { 194 goto childerror 195 } 196 _, _, err1 = RawSyscall(SYS_SETGID, uintptr(cred.Gid), 0, 0) 197 if err1 != 0 { 198 goto childerror 199 } 200 _, _, err1 = RawSyscall(SYS_SETUID, uintptr(cred.Uid), 0, 0) 201 if err1 != 0 { 202 goto childerror 203 } 204 } 205 206 // Chdir 207 if dir != nil { 208 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) 209 if err1 != 0 { 210 goto childerror 211 } 212 } 213 214 // Pass 1: look for fd[i] < i and move those up above len(fd) 215 // so that pass 2 won't stomp on an fd it needs later. 216 if pipe < nextfd { 217 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(pipe), uintptr(nextfd), 0) 218 if err1 != 0 { 219 goto childerror 220 } 221 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 222 pipe = nextfd 223 nextfd++ 224 } 225 for i = 0; i < len(fd); i++ { 226 if fd[i] >= 0 && fd[i] < int(i) { 227 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0) 228 if err1 != 0 { 229 goto childerror 230 } 231 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 232 fd[i] = nextfd 233 nextfd++ 234 if nextfd == pipe { // don't stomp on pipe 235 nextfd++ 236 } 237 } 238 } 239 240 // Pass 2: dup fd[i] down onto i. 241 for i = 0; i < len(fd); i++ { 242 if fd[i] == -1 { 243 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 244 continue 245 } 246 if fd[i] == int(i) { 247 // dup2(i, i) won't clear close-on-exec flag on Linux, 248 // probably not elsewhere either. 249 _, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0) 250 if err1 != 0 { 251 goto childerror 252 } 253 continue 254 } 255 // The new fd is created NOT close-on-exec, 256 // which is exactly what we want. 257 _, _, err1 = RawSyscall(SYS_DUP2, uintptr(fd[i]), uintptr(i), 0) 258 if err1 != 0 { 259 goto childerror 260 } 261 } 262 263 // By convention, we don't close-on-exec the fds we are 264 // started with, so if len(fd) < 3, close 0, 1, 2 as needed. 265 // Programs that know they inherit fds >= 3 will need 266 // to set them close-on-exec. 267 for i = len(fd); i < 3; i++ { 268 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 269 } 270 271 // Detach fd 0 from tty 272 if sys.Noctty { 273 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0) 274 if err1 != 0 { 275 goto childerror 276 } 277 } 278 279 // Set the controlling TTY to Ctty 280 if sys.Setctty && sys.Ctty >= 0 { 281 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 0) 282 if err1 != 0 { 283 goto childerror 284 } 285 } 286 287 // Time to exec. 288 _, _, err1 = RawSyscall(SYS_EXECVE, 289 uintptr(unsafe.Pointer(argv0)), 290 uintptr(unsafe.Pointer(&argv[0])), 291 uintptr(unsafe.Pointer(&envv[0]))) 292 293 childerror: 294 // send error code on pipe 295 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) 296 for { 297 RawSyscall(SYS_EXIT, 253, 0, 0) 298 } 299 } 300 301 // Try to open a pipe with O_CLOEXEC set on both file descriptors. 302 func forkExecPipe(p []int) (err error) { 303 err = Pipe2(p, O_CLOEXEC) 304 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it 305 // might not be implemented. 306 if err == ENOSYS { 307 if err = Pipe(p); err != nil { 308 return 309 } 310 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { 311 return 312 } 313 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) 314 } 315 return 316 } 317 318 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. 319 func writeIDMappings(path string, idMap []SysProcIDMap) error { 320 fd, err := Open(path, O_RDWR, 0) 321 if err != nil { 322 return err 323 } 324 325 data := "" 326 for _, im := range idMap { 327 data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n" 328 } 329 330 bytes, err := ByteSliceFromString(data) 331 if err != nil { 332 Close(fd) 333 return err 334 } 335 336 if _, err := Write(fd, bytes); err != nil { 337 Close(fd) 338 return err 339 } 340 341 if err := Close(fd); err != nil { 342 return err 343 } 344 345 return nil 346 } 347 348 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces 349 // for a process and it is called from the parent process. 350 func writeUidGidMappings(pid int, sys *SysProcAttr) error { 351 if sys.UidMappings != nil { 352 uidf := "/proc/" + itoa(pid) + "/uid_map" 353 if err := writeIDMappings(uidf, sys.UidMappings); err != nil { 354 return err 355 } 356 } 357 358 if sys.GidMappings != nil { 359 gidf := "/proc/" + itoa(pid) + "/gid_map" 360 if err := writeIDMappings(gidf, sys.GidMappings); err != nil { 361 return err 362 } 363 } 364 365 return nil 366 }