github.com/zxy12/golang151_with_comment@v0.0.0-20190507085033-721809559d3c/syscall/exec_linux.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // +build linux 6 7 package syscall 8 9 import ( 10 "unsafe" 11 ) 12 13 // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux. 14 // See user_namespaces(7). 15 type SysProcIDMap struct { 16 ContainerID int // Container ID. 17 HostID int // Host ID. 18 Size int // Size. 19 } 20 21 type SysProcAttr struct { 22 Chroot string // Chroot. 23 Credential *Credential // Credential. 24 Ptrace bool // Enable tracing. 25 Setsid bool // Create session. 26 Setpgid bool // Set process group ID to Pgid, or, if Pgid == 0, to new pid. 27 Setctty bool // Set controlling terminal to fd Ctty (only meaningful if Setsid is set) 28 Noctty bool // Detach fd 0 from controlling terminal 29 Ctty int // Controlling TTY fd 30 Foreground bool // Place child's process group in foreground. (Implies Setpgid. Uses Ctty as fd of controlling TTY) 31 Pgid int // Child's process group ID if Setpgid. 32 Pdeathsig Signal // Signal that the process will get when its parent dies (Linux only) 33 Cloneflags uintptr // Flags for clone calls (Linux only) 34 UidMappings []SysProcIDMap // User ID mappings for user namespaces. 35 GidMappings []SysProcIDMap // Group ID mappings for user namespaces. 36 // GidMappingsEnableSetgroups enabling setgroups syscall. 37 // If false, then setgroups syscall will be disabled for the child process. 38 // This parameter is no-op if GidMappings == nil. Otherwise for unprivileged 39 // users this should be set to false for mappings work. 40 GidMappingsEnableSetgroups bool 41 } 42 43 // Implemented in runtime package. 44 func runtime_BeforeFork() 45 func runtime_AfterFork() 46 47 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child. 48 // If a dup or exec fails, write the errno error to pipe. 49 // (Pipe is close-on-exec so if exec succeeds, it will be closed.) 50 // In the child, this function must not acquire any locks, because 51 // they might have been locked at the time of the fork. This means 52 // no rescheduling, no malloc calls, and no new stack segments. 53 // For the same reason compiler does not race instrument it. 54 // The calls to RawSyscall are okay because they are assembly 55 // functions that do not grow the stack. 56 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) { 57 // Declare all variables at top in case any 58 // declarations require heap allocation (e.g., err1). 59 var ( 60 r1 uintptr 61 err1 Errno 62 err2 Errno 63 nextfd int 64 i int 65 p [2]int 66 ) 67 68 // Record parent PID so child can test if it has died. 69 ppid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) 70 71 // Guard against side effects of shuffling fds below. 72 // Make sure that nextfd is beyond any currently open files so 73 // that we can't run the risk of overwriting any of them. 74 fd := make([]int, len(attr.Files)) 75 nextfd = len(attr.Files) 76 for i, ufd := range attr.Files { 77 if nextfd < int(ufd) { 78 nextfd = int(ufd) 79 } 80 fd[i] = int(ufd) 81 } 82 nextfd++ 83 84 // Allocate another pipe for parent to child communication for 85 // synchronizing writing of User ID/Group ID mappings. 86 if sys.UidMappings != nil || sys.GidMappings != nil { 87 if err := forkExecPipe(p[:]); err != nil { 88 return 0, err.(Errno) 89 } 90 } 91 92 // About to call fork. 93 // No more allocation or calls of non-assembly functions. 94 runtime_BeforeFork() 95 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) 96 if err1 != 0 { 97 runtime_AfterFork() 98 return 0, err1 99 } 100 101 if r1 != 0 { 102 // parent; return PID 103 runtime_AfterFork() 104 pid = int(r1) 105 106 if sys.UidMappings != nil || sys.GidMappings != nil { 107 Close(p[0]) 108 err := writeUidGidMappings(pid, sys) 109 if err != nil { 110 err2 = err.(Errno) 111 } 112 RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 113 Close(p[1]) 114 } 115 116 return pid, 0 117 } 118 119 // Fork succeeded, now in child. 120 121 // Wait for User ID/Group ID mappings to be written. 122 if sys.UidMappings != nil || sys.GidMappings != nil { 123 if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { 124 goto childerror 125 } 126 r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) 127 if err1 != 0 { 128 goto childerror 129 } 130 if r1 != unsafe.Sizeof(err2) { 131 err1 = EINVAL 132 goto childerror 133 } 134 if err2 != 0 { 135 err1 = err2 136 goto childerror 137 } 138 } 139 140 // Enable tracing if requested. 141 if sys.Ptrace { 142 _, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0) 143 if err1 != 0 { 144 goto childerror 145 } 146 } 147 148 // Session ID 149 if sys.Setsid { 150 _, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0) 151 if err1 != 0 { 152 goto childerror 153 } 154 } 155 156 // Set process group 157 if sys.Setpgid || sys.Foreground { 158 // Place child in process group. 159 _, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0) 160 if err1 != 0 { 161 goto childerror 162 } 163 } 164 165 if sys.Foreground { 166 pgrp := int32(sys.Pgid) 167 if pgrp == 0 { 168 r1, _, err1 = RawSyscall(SYS_GETPID, 0, 0, 0) 169 if err1 != 0 { 170 goto childerror 171 } 172 173 pgrp = int32(r1) 174 } 175 176 // Place process group in foreground. 177 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp))) 178 if err1 != 0 { 179 goto childerror 180 } 181 } 182 183 // Chroot 184 if chroot != nil { 185 _, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0) 186 if err1 != 0 { 187 goto childerror 188 } 189 } 190 191 // User and groups 192 if cred := sys.Credential; cred != nil { 193 ngroups := uintptr(len(cred.Groups)) 194 var groups unsafe.Pointer 195 if ngroups > 0 { 196 groups = unsafe.Pointer(&cred.Groups[0]) 197 } 198 _, _, err1 = RawSyscall(SYS_SETGROUPS, ngroups, uintptr(groups), 0) 199 if err1 != 0 { 200 goto childerror 201 } 202 _, _, err1 = RawSyscall(SYS_SETGID, uintptr(cred.Gid), 0, 0) 203 if err1 != 0 { 204 goto childerror 205 } 206 _, _, err1 = RawSyscall(SYS_SETUID, uintptr(cred.Uid), 0, 0) 207 if err1 != 0 { 208 goto childerror 209 } 210 } 211 212 // Chdir 213 if dir != nil { 214 _, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) 215 if err1 != 0 { 216 goto childerror 217 } 218 } 219 220 // Parent death signal 221 if sys.Pdeathsig != 0 { 222 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) 223 if err1 != 0 { 224 goto childerror 225 } 226 227 // Signal self if parent is already dead. This might cause a 228 // duplicate signal in rare cases, but it won't matter when 229 // using SIGKILL. 230 r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0) 231 if r1 != ppid { 232 pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) 233 _, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0) 234 if err1 != 0 { 235 goto childerror 236 } 237 } 238 } 239 240 // Pass 1: look for fd[i] < i and move those up above len(fd) 241 // so that pass 2 won't stomp on an fd it needs later. 242 if pipe < nextfd { 243 _, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0) 244 if err1 != 0 { 245 goto childerror 246 } 247 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 248 pipe = nextfd 249 nextfd++ 250 } 251 for i = 0; i < len(fd); i++ { 252 if fd[i] >= 0 && fd[i] < int(i) { 253 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0) 254 if err1 != 0 { 255 goto childerror 256 } 257 RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC) 258 fd[i] = nextfd 259 nextfd++ 260 if nextfd == pipe { // don't stomp on pipe 261 nextfd++ 262 } 263 } 264 } 265 266 // Pass 2: dup fd[i] down onto i. 267 for i = 0; i < len(fd); i++ { 268 if fd[i] == -1 { 269 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 270 continue 271 } 272 if fd[i] == int(i) { 273 // dup2(i, i) won't clear close-on-exec flag on Linux, 274 // probably not elsewhere either. 275 _, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0) 276 if err1 != 0 { 277 goto childerror 278 } 279 continue 280 } 281 // The new fd is created NOT close-on-exec, 282 // which is exactly what we want. 283 _, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0) 284 if err1 != 0 { 285 goto childerror 286 } 287 } 288 289 // By convention, we don't close-on-exec the fds we are 290 // started with, so if len(fd) < 3, close 0, 1, 2 as needed. 291 // Programs that know they inherit fds >= 3 will need 292 // to set them close-on-exec. 293 for i = len(fd); i < 3; i++ { 294 RawSyscall(SYS_CLOSE, uintptr(i), 0, 0) 295 } 296 297 // Detach fd 0 from tty 298 if sys.Noctty { 299 _, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0) 300 if err1 != 0 { 301 goto childerror 302 } 303 } 304 305 // Set the controlling TTY to Ctty 306 if sys.Setctty { 307 _, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 0) 308 if err1 != 0 { 309 goto childerror 310 } 311 } 312 313 // Time to exec. 314 _, _, err1 = RawSyscall(SYS_EXECVE, 315 uintptr(unsafe.Pointer(argv0)), 316 uintptr(unsafe.Pointer(&argv[0])), 317 uintptr(unsafe.Pointer(&envv[0]))) 318 319 childerror: 320 // send error code on pipe 321 RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) 322 for { 323 RawSyscall(SYS_EXIT, 253, 0, 0) 324 } 325 } 326 327 // Try to open a pipe with O_CLOEXEC set on both file descriptors. 328 func forkExecPipe(p []int) (err error) { 329 err = Pipe2(p, O_CLOEXEC) 330 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it 331 // might not be implemented. 332 if err == ENOSYS { 333 if err = Pipe(p); err != nil { 334 return 335 } 336 if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil { 337 return 338 } 339 _, err = fcntl(p[1], F_SETFD, FD_CLOEXEC) 340 } 341 return 342 } 343 344 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. 345 func writeIDMappings(path string, idMap []SysProcIDMap) error { 346 fd, err := Open(path, O_RDWR, 0) 347 if err != nil { 348 return err 349 } 350 351 data := "" 352 for _, im := range idMap { 353 data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n" 354 } 355 356 bytes, err := ByteSliceFromString(data) 357 if err != nil { 358 Close(fd) 359 return err 360 } 361 362 if _, err := Write(fd, bytes); err != nil { 363 Close(fd) 364 return err 365 } 366 367 if err := Close(fd); err != nil { 368 return err 369 } 370 371 return nil 372 } 373 374 // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false 375 // and "allow" if enable is true. 376 // This is needed since kernel 3.19, because you can't write gid_map without 377 // disabling setgroups() system call. 378 func writeSetgroups(pid int, enable bool) error { 379 sgf := "/proc/" + itoa(pid) + "/setgroups" 380 fd, err := Open(sgf, O_RDWR, 0) 381 if err != nil { 382 return err 383 } 384 385 var data []byte 386 if enable { 387 data = []byte("allow") 388 } else { 389 data = []byte("deny") 390 } 391 392 if _, err := Write(fd, data); err != nil { 393 Close(fd) 394 return err 395 } 396 397 return Close(fd) 398 } 399 400 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces 401 // for a process and it is called from the parent process. 402 func writeUidGidMappings(pid int, sys *SysProcAttr) error { 403 if sys.UidMappings != nil { 404 uidf := "/proc/" + itoa(pid) + "/uid_map" 405 if err := writeIDMappings(uidf, sys.UidMappings); err != nil { 406 return err 407 } 408 } 409 410 if sys.GidMappings != nil { 411 // If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK. 412 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT { 413 return err 414 } 415 gidf := "/proc/" + itoa(pid) + "/gid_map" 416 if err := writeIDMappings(gidf, sys.GidMappings); err != nil { 417 return err 418 } 419 } 420 421 return nil 422 }