github.com/miolini/go@v0.0.0-20160405192216-fca68c8cb408/src/syscall/exec_linux.go (about)

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build linux
     6  
     7  package syscall
     8  
     9  import (
    10  	"runtime"
    11  	"unsafe"
    12  )
    13  
    14  // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
    15  // See user_namespaces(7).
    16  type SysProcIDMap struct {
    17  	ContainerID int // Container ID.
    18  	HostID      int // Host ID.
    19  	Size        int // Size.
    20  }
    21  
    22  type SysProcAttr struct {
    23  	Chroot      string         // Chroot.
    24  	Credential  *Credential    // Credential.
    25  	Ptrace      bool           // Enable tracing.
    26  	Setsid      bool           // Create session.
    27  	Setpgid     bool           // Set process group ID to Pgid, or, if Pgid == 0, to new pid.
    28  	Setctty     bool           // Set controlling terminal to fd Ctty (only meaningful if Setsid is set)
    29  	Noctty      bool           // Detach fd 0 from controlling terminal
    30  	Ctty        int            // Controlling TTY fd
    31  	Foreground  bool           // Place child's process group in foreground. (Implies Setpgid. Uses Ctty as fd of controlling TTY)
    32  	Pgid        int            // Child's process group ID if Setpgid.
    33  	Pdeathsig   Signal         // Signal that the process will get when its parent dies (Linux only)
    34  	Cloneflags  uintptr        // Flags for clone calls (Linux only)
    35  	UidMappings []SysProcIDMap // User ID mappings for user namespaces.
    36  	GidMappings []SysProcIDMap // Group ID mappings for user namespaces.
    37  	// GidMappingsEnableSetgroups enabling setgroups syscall.
    38  	// If false, then setgroups syscall will be disabled for the child process.
    39  	// This parameter is no-op if GidMappings == nil. Otherwise for unprivileged
    40  	// users this should be set to false for mappings work.
    41  	GidMappingsEnableSetgroups bool
    42  }
    43  
// runtime_BeforeFork and runtime_AfterFork have no bodies here: they are
// implemented in the runtime package. In this file runtime_BeforeFork is
// called immediately before the clone system call, and runtime_AfterFork is
// called after it on the clone-failure path and in the parent.
func runtime_BeforeFork()
func runtime_AfterFork()
    47  
    48  // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
    49  // If a dup or exec fails, write the errno error to pipe.
    50  // (Pipe is close-on-exec so if exec succeeds, it will be closed.)
    51  // In the child, this function must not acquire any locks, because
    52  // they might have been locked at the time of the fork. This means
    53  // no rescheduling, no malloc calls, and no new stack segments.
    54  // For the same reason compiler does not race instrument it.
    55  // The calls to RawSyscall are okay because they are assembly
    56  // functions that do not grow the stack.
    57  //go:norace
    58  func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
    59  	// Declare all variables at top in case any
    60  	// declarations require heap allocation (e.g., err1).
    61  	var (
    62  		r1     uintptr
    63  		err1   Errno
    64  		err2   Errno
    65  		nextfd int
    66  		i      int
    67  		p      [2]int
    68  	)
    69  
    70  	// Record parent PID so child can test if it has died.
    71  	ppid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0)
    72  
    73  	// Guard against side effects of shuffling fds below.
    74  	// Make sure that nextfd is beyond any currently open files so
    75  	// that we can't run the risk of overwriting any of them.
    76  	fd := make([]int, len(attr.Files))
    77  	nextfd = len(attr.Files)
    78  	for i, ufd := range attr.Files {
    79  		if nextfd < int(ufd) {
    80  			nextfd = int(ufd)
    81  		}
    82  		fd[i] = int(ufd)
    83  	}
    84  	nextfd++
    85  
    86  	// Allocate another pipe for parent to child communication for
    87  	// synchronizing writing of User ID/Group ID mappings.
    88  	if sys.UidMappings != nil || sys.GidMappings != nil {
    89  		if err := forkExecPipe(p[:]); err != nil {
    90  			return 0, err.(Errno)
    91  		}
    92  	}
    93  
    94  	// About to call fork.
    95  	// No more allocation or calls of non-assembly functions.
    96  	runtime_BeforeFork()
    97  	if runtime.GOARCH == "s390x" {
    98  		r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
    99  	} else {
   100  		r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
   101  	}
   102  	if err1 != 0 {
   103  		runtime_AfterFork()
   104  		return 0, err1
   105  	}
   106  
   107  	if r1 != 0 {
   108  		// parent; return PID
   109  		runtime_AfterFork()
   110  		pid = int(r1)
   111  
   112  		if sys.UidMappings != nil || sys.GidMappings != nil {
   113  			Close(p[0])
   114  			err := writeUidGidMappings(pid, sys)
   115  			if err != nil {
   116  				err2 = err.(Errno)
   117  			}
   118  			RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
   119  			Close(p[1])
   120  		}
   121  
   122  		return pid, 0
   123  	}
   124  
   125  	// Fork succeeded, now in child.
   126  
   127  	// Wait for User ID/Group ID mappings to be written.
   128  	if sys.UidMappings != nil || sys.GidMappings != nil {
   129  		if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 {
   130  			goto childerror
   131  		}
   132  		r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
   133  		if err1 != 0 {
   134  			goto childerror
   135  		}
   136  		if r1 != unsafe.Sizeof(err2) {
   137  			err1 = EINVAL
   138  			goto childerror
   139  		}
   140  		if err2 != 0 {
   141  			err1 = err2
   142  			goto childerror
   143  		}
   144  	}
   145  
   146  	// Enable tracing if requested.
   147  	if sys.Ptrace {
   148  		_, _, err1 = RawSyscall(SYS_PTRACE, uintptr(PTRACE_TRACEME), 0, 0)
   149  		if err1 != 0 {
   150  			goto childerror
   151  		}
   152  	}
   153  
   154  	// Session ID
   155  	if sys.Setsid {
   156  		_, _, err1 = RawSyscall(SYS_SETSID, 0, 0, 0)
   157  		if err1 != 0 {
   158  			goto childerror
   159  		}
   160  	}
   161  
   162  	// Set process group
   163  	if sys.Setpgid || sys.Foreground {
   164  		// Place child in process group.
   165  		_, _, err1 = RawSyscall(SYS_SETPGID, 0, uintptr(sys.Pgid), 0)
   166  		if err1 != 0 {
   167  			goto childerror
   168  		}
   169  	}
   170  
   171  	if sys.Foreground {
   172  		pgrp := int32(sys.Pgid)
   173  		if pgrp == 0 {
   174  			r1, _, err1 = RawSyscall(SYS_GETPID, 0, 0, 0)
   175  			if err1 != 0 {
   176  				goto childerror
   177  			}
   178  
   179  			pgrp = int32(r1)
   180  		}
   181  
   182  		// Place process group in foreground.
   183  		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSPGRP), uintptr(unsafe.Pointer(&pgrp)))
   184  		if err1 != 0 {
   185  			goto childerror
   186  		}
   187  	}
   188  
   189  	// Chroot
   190  	if chroot != nil {
   191  		_, _, err1 = RawSyscall(SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
   192  		if err1 != 0 {
   193  			goto childerror
   194  		}
   195  	}
   196  
   197  	// User and groups
   198  	if cred := sys.Credential; cred != nil {
   199  		ngroups := uintptr(len(cred.Groups))
   200  		if ngroups > 0 {
   201  			groups := unsafe.Pointer(&cred.Groups[0])
   202  			_, _, err1 = RawSyscall(SYS_SETGROUPS, ngroups, uintptr(groups), 0)
   203  			if err1 != 0 {
   204  				goto childerror
   205  			}
   206  		}
   207  		_, _, err1 = RawSyscall(SYS_SETGID, uintptr(cred.Gid), 0, 0)
   208  		if err1 != 0 {
   209  			goto childerror
   210  		}
   211  		_, _, err1 = RawSyscall(SYS_SETUID, uintptr(cred.Uid), 0, 0)
   212  		if err1 != 0 {
   213  			goto childerror
   214  		}
   215  	}
   216  
   217  	// Chdir
   218  	if dir != nil {
   219  		_, _, err1 = RawSyscall(SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
   220  		if err1 != 0 {
   221  			goto childerror
   222  		}
   223  	}
   224  
   225  	// Parent death signal
   226  	if sys.Pdeathsig != 0 {
   227  		_, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
   228  		if err1 != 0 {
   229  			goto childerror
   230  		}
   231  
   232  		// Signal self if parent is already dead. This might cause a
   233  		// duplicate signal in rare cases, but it won't matter when
   234  		// using SIGKILL.
   235  		r1, _, _ = RawSyscall(SYS_GETPPID, 0, 0, 0)
   236  		if r1 != ppid {
   237  			pid, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0)
   238  			_, _, err1 := RawSyscall(SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
   239  			if err1 != 0 {
   240  				goto childerror
   241  			}
   242  		}
   243  	}
   244  
   245  	// Pass 1: look for fd[i] < i and move those up above len(fd)
   246  	// so that pass 2 won't stomp on an fd it needs later.
   247  	if pipe < nextfd {
   248  		_, _, err1 = RawSyscall(_SYS_dup, uintptr(pipe), uintptr(nextfd), 0)
   249  		if err1 != 0 {
   250  			goto childerror
   251  		}
   252  		RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
   253  		pipe = nextfd
   254  		nextfd++
   255  	}
   256  	for i = 0; i < len(fd); i++ {
   257  		if fd[i] >= 0 && fd[i] < int(i) {
   258  			if nextfd == pipe { // don't stomp on pipe
   259  				nextfd++
   260  			}
   261  			_, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(nextfd), 0)
   262  			if err1 != 0 {
   263  				goto childerror
   264  			}
   265  			RawSyscall(SYS_FCNTL, uintptr(nextfd), F_SETFD, FD_CLOEXEC)
   266  			fd[i] = nextfd
   267  			nextfd++
   268  		}
   269  	}
   270  
   271  	// Pass 2: dup fd[i] down onto i.
   272  	for i = 0; i < len(fd); i++ {
   273  		if fd[i] == -1 {
   274  			RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
   275  			continue
   276  		}
   277  		if fd[i] == int(i) {
   278  			// dup2(i, i) won't clear close-on-exec flag on Linux,
   279  			// probably not elsewhere either.
   280  			_, _, err1 = RawSyscall(SYS_FCNTL, uintptr(fd[i]), F_SETFD, 0)
   281  			if err1 != 0 {
   282  				goto childerror
   283  			}
   284  			continue
   285  		}
   286  		// The new fd is created NOT close-on-exec,
   287  		// which is exactly what we want.
   288  		_, _, err1 = RawSyscall(_SYS_dup, uintptr(fd[i]), uintptr(i), 0)
   289  		if err1 != 0 {
   290  			goto childerror
   291  		}
   292  	}
   293  
   294  	// By convention, we don't close-on-exec the fds we are
   295  	// started with, so if len(fd) < 3, close 0, 1, 2 as needed.
   296  	// Programs that know they inherit fds >= 3 will need
   297  	// to set them close-on-exec.
   298  	for i = len(fd); i < 3; i++ {
   299  		RawSyscall(SYS_CLOSE, uintptr(i), 0, 0)
   300  	}
   301  
   302  	// Detach fd 0 from tty
   303  	if sys.Noctty {
   304  		_, _, err1 = RawSyscall(SYS_IOCTL, 0, uintptr(TIOCNOTTY), 0)
   305  		if err1 != 0 {
   306  			goto childerror
   307  		}
   308  	}
   309  
   310  	// Set the controlling TTY to Ctty
   311  	if sys.Setctty {
   312  		_, _, err1 = RawSyscall(SYS_IOCTL, uintptr(sys.Ctty), uintptr(TIOCSCTTY), 0)
   313  		if err1 != 0 {
   314  			goto childerror
   315  		}
   316  	}
   317  
   318  	// Time to exec.
   319  	_, _, err1 = RawSyscall(SYS_EXECVE,
   320  		uintptr(unsafe.Pointer(argv0)),
   321  		uintptr(unsafe.Pointer(&argv[0])),
   322  		uintptr(unsafe.Pointer(&envv[0])))
   323  
   324  childerror:
   325  	// send error code on pipe
   326  	RawSyscall(SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))
   327  	for {
   328  		RawSyscall(SYS_EXIT, 253, 0, 0)
   329  	}
   330  }
   331  
   332  // Try to open a pipe with O_CLOEXEC set on both file descriptors.
   333  func forkExecPipe(p []int) (err error) {
   334  	err = Pipe2(p, O_CLOEXEC)
   335  	// pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it
   336  	// might not be implemented.
   337  	if err == ENOSYS {
   338  		if err = Pipe(p); err != nil {
   339  			return
   340  		}
   341  		if _, err = fcntl(p[0], F_SETFD, FD_CLOEXEC); err != nil {
   342  			return
   343  		}
   344  		_, err = fcntl(p[1], F_SETFD, FD_CLOEXEC)
   345  	}
   346  	return
   347  }
   348  
   349  // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
   350  func writeIDMappings(path string, idMap []SysProcIDMap) error {
   351  	fd, err := Open(path, O_RDWR, 0)
   352  	if err != nil {
   353  		return err
   354  	}
   355  
   356  	data := ""
   357  	for _, im := range idMap {
   358  		data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n"
   359  	}
   360  
   361  	bytes, err := ByteSliceFromString(data)
   362  	if err != nil {
   363  		Close(fd)
   364  		return err
   365  	}
   366  
   367  	if _, err := Write(fd, bytes); err != nil {
   368  		Close(fd)
   369  		return err
   370  	}
   371  
   372  	if err := Close(fd); err != nil {
   373  		return err
   374  	}
   375  
   376  	return nil
   377  }
   378  
   379  // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
   380  // and "allow" if enable is true.
   381  // This is needed since kernel 3.19, because you can't write gid_map without
   382  // disabling setgroups() system call.
   383  func writeSetgroups(pid int, enable bool) error {
   384  	sgf := "/proc/" + itoa(pid) + "/setgroups"
   385  	fd, err := Open(sgf, O_RDWR, 0)
   386  	if err != nil {
   387  		return err
   388  	}
   389  
   390  	var data []byte
   391  	if enable {
   392  		data = []byte("allow")
   393  	} else {
   394  		data = []byte("deny")
   395  	}
   396  
   397  	if _, err := Write(fd, data); err != nil {
   398  		Close(fd)
   399  		return err
   400  	}
   401  
   402  	return Close(fd)
   403  }
   404  
   405  // writeUidGidMappings writes User ID and Group ID mappings for user namespaces
   406  // for a process and it is called from the parent process.
   407  func writeUidGidMappings(pid int, sys *SysProcAttr) error {
   408  	if sys.UidMappings != nil {
   409  		uidf := "/proc/" + itoa(pid) + "/uid_map"
   410  		if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
   411  			return err
   412  		}
   413  	}
   414  
   415  	if sys.GidMappings != nil {
   416  		// If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
   417  		if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
   418  			return err
   419  		}
   420  		gidf := "/proc/" + itoa(pid) + "/gid_map"
   421  		if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
   422  			return err
   423  		}
   424  	}
   425  
   426  	return nil
   427  }