github.com/criyle/go-sandbox@v0.10.3/pkg/forkexec/fork_linux.go (about)

     1  package forkexec
     2  
     3  import (
     4  	"syscall"
     5  	"unsafe" // required for go:linkname.
     6  
     7  	"golang.org/x/sys/unix"
     8  )
     9  
    10  // Start will fork, load seccomp and execve and being traced by ptrace
    11  // Return pid and potential error
    12  // The runtime OS thread must be locked before calling this function
    13  // if ptrace is set to true
    14  func (r *Runner) Start() (int, error) {
    15  	argv0, argv, env, err := prepareExec(r.Args, r.Env)
    16  	if err != nil {
    17  		return 0, err
    18  	}
    19  
    20  	// prepare work dir
    21  	workdir, err := syscallStringFromString(r.WorkDir)
    22  	if err != nil {
    23  		return 0, err
    24  	}
    25  
    26  	// prepare hostname
    27  	hostname, err := syscallStringFromString(r.HostName)
    28  	if err != nil {
    29  		return 0, err
    30  	}
    31  
    32  	// prepare domainname
    33  	domainname, err := syscallStringFromString(r.DomainName)
    34  	if err != nil {
    35  		return 0, err
    36  	}
    37  
    38  	// prepare pivot_root param
    39  	pivotRoot, err := syscallStringFromString(r.PivotRoot)
    40  	if err != nil {
    41  		return 0, err
    42  	}
    43  
    44  	// socketpair p used to notify child the uid / gid mapping have been setup
    45  	// socketpair p is also used to sync with parent before final execve
    46  	// p[0] is used by parent and p[1] is used by child
    47  	p, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
    48  	if err != nil {
    49  		return 0, err
    50  	}
    51  
    52  	// fork in child
    53  	pid, err1 := forkAndExecInChild(r, argv0, argv, env, workdir, hostname, domainname, pivotRoot, p)
    54  
    55  	// restore all signals
    56  	afterFork()
    57  	syscall.ForkLock.Unlock()
    58  
    59  	return syncWithChild(r, p, int(pid), err1)
    60  }
    61  
    62  func syncWithChild(r *Runner, p [2]int, pid int, err1 syscall.Errno) (int, error) {
    63  	var (
    64  		err2        syscall.Errno
    65  		err         error
    66  		unshareUser = r.CloneFlags&unix.CLONE_NEWUSER == unix.CLONE_NEWUSER
    67  		childErr    ChildError
    68  	)
    69  
    70  	// sync with child
    71  	unix.Close(p[1])
    72  
    73  	// clone syscall failed
    74  	if err1 != 0 {
    75  		unix.Close(p[0])
    76  		childErr.Location = LocClone
    77  		childErr.Err = err1
    78  		return 0, childErr
    79  	}
    80  
    81  	// synchronize with child for uid / gid map
    82  	if unshareUser {
    83  		if err = writeIDMaps(r, int(pid)); err != nil {
    84  			err2 = err.(syscall.Errno)
    85  		}
    86  		syscall.RawSyscall(syscall.SYS_WRITE, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2)))
    87  	}
    88  
    89  	n, err := readChildErr(p[0], &childErr)
    90  	// child returned error code
    91  	if (n != int(unsafe.Sizeof(err2)) && n != int(unsafe.Sizeof(childErr))) || childErr.Err != 0 || err != nil {
    92  		childErr.Err = handlePipeError(n, childErr.Err)
    93  		goto fail
    94  	}
    95  
    96  	// if syncfunc return error, then fail child immediately
    97  	if r.SyncFunc != nil {
    98  		if err = r.SyncFunc(int(pid)); err != nil {
    99  			goto fail
   100  		}
   101  	}
   102  	// otherwise, ack child (err1 == 0)
   103  	syscall.RawSyscall(syscall.SYS_WRITE, uintptr(p[0]), uintptr(unsafe.Pointer(&err1)), uintptr(unsafe.Sizeof(err1)))
   104  
   105  	// if stopped before execve by signal SIGSTOP or PTRACE_ME, then do not wait until execve
   106  	if r.Ptrace || r.StopBeforeSeccomp {
   107  		// let's wait it in another goroutine to avoid SIGPIPE
   108  		go func() {
   109  			readChildErr(p[0], &childErr)
   110  			unix.Close(p[0])
   111  		}()
   112  		return int(pid), nil
   113  	}
   114  
   115  	// if read anything mean child failed after sync (close_on_exec so it should not block)
   116  	n, err = readChildErr(p[0], &childErr)
   117  	unix.Close(p[0])
   118  	if n != 0 || err != nil {
   119  		childErr.Err = handlePipeError(n, childErr.Err)
   120  		goto failAfterClose
   121  	}
   122  	return int(pid), nil
   123  
   124  fail:
   125  	unix.Close(p[0])
   126  
   127  failAfterClose:
   128  	handleChildFailed(int(pid))
   129  	if childErr.Err == 0 {
   130  		return 0, err
   131  	}
   132  	return 0, childErr
   133  }
   134  
   135  func readChildErr(fd int, childErr *ChildError) (n int, err error) {
   136  	for {
   137  		n, err = readlen(fd, (*byte)(unsafe.Pointer(childErr)), int(unsafe.Sizeof(*childErr)))
   138  		if err != syscall.EINTR {
   139  			break
   140  		}
   141  	}
   142  	return
   143  }
   144  
   145  // https://cs.opensource.google/go/go/+/refs/tags/go1.18.1:src/syscall/zsyscall_linux_amd64.go;l=944
   146  func readlen(fd int, p *byte, np int) (n int, err error) {
   147  	r0, _, e1 := syscall.Syscall(syscall.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(p)), uintptr(np))
   148  	n = int(r0)
   149  	if e1 != 0 {
   150  		err = syscall.Errno(e1)
   151  	}
   152  	return
   153  }
   154  
   155  // check pipe error
   156  func handlePipeError(r1 int, errno syscall.Errno) syscall.Errno {
   157  	if uintptr(r1) >= unsafe.Sizeof(errno) {
   158  		return syscall.Errno(errno)
   159  	}
   160  	return syscall.EPIPE
   161  }
   162  
   163  func handleChildFailed(pid int) {
   164  	var wstatus syscall.WaitStatus
   165  	// make sure not blocked
   166  	syscall.Kill(pid, syscall.SIGKILL)
   167  	// child failed; wait for it to exit, to make sure the zombies don't accumulate
   168  	_, err := syscall.Wait4(pid, &wstatus, 0, nil)
   169  	for err == syscall.EINTR {
   170  		_, err = syscall.Wait4(pid, &wstatus, 0, nil)
   171  	}
   172  }