github.com/sdibtacm/sandbox@v0.0.0-20200320120712-60470cf803dc/exec/child.go (about)

     1  //+build linux
     2  
     3  package exec
     4  
     5  import (
     6  	"errors"
     7  	"runtime"
     8  	"sync"
     9  	"syscall"
    10  	"unsafe"
    11  )
    12  
// ForkLock serializes child creation. forkExec holds it exclusively from the
// moment it creates its status pipes until the clone call has returned, and
// releases it on every exit path.
var ForkLock sync.RWMutex
    14  
// SysAttr describes how the child process is set up between clone and exec.
type SysAttr struct {
	// Ptrace makes the child issue PTRACE_TRACEME before exec.
	Ptrace bool
	// Setsid makes the child call setsid right after the clone.
	Setsid bool
	// RlimitList is indexed by resource number. Every entry that differs from
	// RLIMIT_UNRESOURCE is applied in the child with setrlimit, using the
	// value as both the soft and the hard limit.
	RlimitList [20]uint64
	// SetNoNewPrivs is not consulted by the child setup code in this file.
	// NOTE(review): presumably meant to request PR_SET_NO_NEW_PRIVS — confirm.
	SetNoNewPrivs bool
	// Cloneflags is ORed with SIGCHLD to form the flags of the clone call.
	Cloneflags uintptr
	// Files lists the descriptors the child inherits: Files[i] is duplicated
	// onto descriptor i, and an entry that converts to -1 closes descriptor i.
	Files []uintptr
	// Pdeathsig, when non-zero, is installed as the parent-death signal.
	Pdeathsig uint
	// Credential, when non-nil, supplies the IDs and umask set in the child.
	Credential *Credential
	// Bpf, when non-nil, is installed with prctl(PR_SET_SECCOMP, 2, ...)
	// just before exec.
	Bpf *syscall.SockFprog
}
    26  
// Credential holds the identity and file-mode mask applied in the child
// before exec.
type Credential struct {
	// Uid is the user ID handed to setuid in the child.
	Uid int
	// Gid is the group ID handed to setgid in the child.
	Gid int
	// Umask is handed to umask in the child; a zero value is skipped.
	Umask uint
}
    32  
// ExecError reports a failure while creating or setting up the child process.
type ExecError struct {
	// Step identifies the setup step that failed; Error uses it as an index
	// into SANDBOX_STEP_STR.
	Step int
	// Err is the underlying error for that step.
	Err error
}
    37  
    38  func (e *ExecError) Error() string {
    39  	return "exec: step[" + SANDBOX_STEP_STR[e.Step] + "] with error: [" + e.Err.Error() + "]"
    40  }
    41  
// zeroSysAttr is a SysAttr left at its zero value. It is not referenced in
// this part of the file.
// NOTE(review): presumably the default used when a caller passes no
// attributes — confirm against the callers of forkExec.
var zeroSysAttr SysAttr
    43  
    44  func forkExec(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *SysAttr) (pid int, err error) {
    45  
    46  	var (
    47  		stepPipe [2]int
    48  		errPipe  [2]int
    49  		stepN    int
    50  		errN     int
    51  		err1     syscall.Errno
    52  		err2     error
    53  		err3     error
    54  		wstatus  syscall.WaitStatus
    55  		step     int
    56  	)
    57  
    58  	ForkLock.Lock()
    59  	if err = forkExecPipe(errPipe[:]); err != nil {
    60  		goto error
    61  	}
    62  	if err = forkExecPipe(stepPipe[:]); err != nil {
    63  		goto error
    64  	}
    65  
    66  	pid, err1 = cloneAndExecInChild(argv0, argv, envv, chroot, dir, attr, errPipe[1], stepPipe[1])
    67  	if err1 != 0 {
    68  		err = &ExecError{Step: SANDBOX_READY_FOR_CLONE, Err: errors.New(err1.Error())}
    69  		goto error
    70  	}
    71  	ForkLock.Unlock()
    72  
    73  	// Read child error status from pipe.
    74  	_ = syscall.Close(errPipe[1])
    75  	errN, err2 = readlen(errPipe[0], (*byte)(unsafe.Pointer(&err1)), int(unsafe.Sizeof(err1)))
    76  	_ = syscall.Close(errPipe[0])
    77  	_ = syscall.Close(stepPipe[1])
    78  	stepN, err3 = readlen(stepPipe[0], (*byte)(unsafe.Pointer(&step)), int(unsafe.Sizeof(step)))
    79  	_ = syscall.Close(stepPipe[0])
    80  	if err2 != nil || err3 != nil || errN != 0 {
    81  		if errN == int(unsafe.Sizeof(err1)) && stepN == int(unsafe.Sizeof(step)) {
    82  			err = &ExecError{Step: step, Err: errors.New(err1.Error())}
    83  		}
    84  		if err == nil && err2 == nil {
    85  			err = &ExecError{Step: SANDBOX_READ_PIPE, Err: syscall.EPIPE}
    86  		}
    87  		if err == nil && err3 == nil {
    88  			err = &ExecError{Step: SANDBOX_READ_PIPE, Err: syscall.EPIPE}
    89  		}
    90  
    91  		// Child failed; wait for it to exit, to make sure
    92  		// the zombies don't accumulate.
    93  		_, err1 := syscall.Wait4(pid, &wstatus, 0, nil)
    94  		for err1 == syscall.EINTR {
    95  			_, err1 = syscall.Wait4(pid, &wstatus, 0, nil)
    96  		}
    97  		return 0, err
    98  	}
    99  	return
   100  
   101  error:
   102  	if stepPipe[0] >= 0 {
   103  		_ = syscall.Close(stepPipe[0])
   104  		_ = syscall.Close(stepPipe[1])
   105  	}
   106  	if errPipe[0] >= 0 {
   107  		_ = syscall.Close(errPipe[0])
   108  		_ = syscall.Close(errPipe[1])
   109  	}
   110  	ForkLock.Unlock()
   111  	return 0, &ExecError{Step: SANDBOX_PREPARE_PIPE, Err: err2}
   112  }
   113  
   114  func cloneAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *SysAttr, errPipe, stepPipe int) (pid int, err syscall.Errno) {
   115  
   116  	r1, err1, locked := cloneAndExecInChild1(argv0, argv, envv, chroot, dir, attr, errPipe, stepPipe)
   117  	if locked {
   118  		runtimeAfterFork()
   119  	}
   120  	if err1 != 0 {
   121  		return 0, err1
   122  	}
   123  
   124  	// parent; return PID
   125  	pid = int(r1)
   126  	return pid, 0
   127  
   128  }
   129  
// step records the setup step cloneAndExecInChild1 is currently performing.
// It starts at SANDBOX_NO_START; on a failure the child writes its value to
// the step pipe so the parent can tell which step went wrong.
var step int = SANDBOX_NO_START
   131  
   132  //go:noinline
   133  //go:norace
   134  func cloneAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, sys *SysAttr, errPipe, stepPipe int) (r1 uintptr, err1 syscall.Errno, locked bool) {
   135  	// The function will do clone, load limit, exec function
   136  	// because will no use normal function after clone,
   137  	// to let the parent know which step is happen error,
   138  	// will use pipe to sent step num and errno.
   139  
   140  	var (
   141  		//err2                      syscall.Errno
   142  		nextfd int
   143  		i      int
   144  		//fd1                       uintptr
   145  	)
   146  
   147  	ppid, _ := rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0)
   148  
   149  	// Guard against side effects of shuffling fds below.
   150  	// Make sure that nextfd is beyond any currently open files so
   151  	// that we can't run the risk of overwriting any of them.
   152  	fd := make([]int, len(sys.Files))
   153  	nextfd = len(sys.Files)
   154  	for i, ufd := range sys.Files {
   155  		if nextfd < int(ufd) {
   156  			nextfd = int(ufd)
   157  		}
   158  		fd[i] = int(ufd)
   159  	}
   160  	nextfd++
   161  
   162  	runtimeBeforeFork()
   163  	locked = true
   164  
   165  	step = SANDBOX_READY_FOR_CLONE
   166  	switch {
   167  	case runtime.GOARCH == "s390x":
   168  		r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0)
   169  	default:
   170  		r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0)
   171  	}
   172  	if err1 != 0 || r1 != 0 {
   173  		// If we're in the parent, we must return immediately
   174  		// so we're not in the same stack frame as the child.
   175  		// This can at most use the return PC, which the child
   176  		// will not modify, and the results of
   177  		// rawVforkSyscall, which must have been written after
   178  		// the child was replaced.
   179  		return
   180  	}
   181  
   182  	// Fork succeeded, now in child.
   183  
   184  	runtimeAfterForkInChild()
   185  
   186  	// Session ID
   187  	if sys.Setsid {
   188  		_, _, err1 = RawSyscall(syscall.SYS_SETSID, 0, 0, 0)
   189  		if err1 != 0 {
   190  			goto childerror
   191  		}
   192  	}
   193  
   194  	// Chroot
   195  	if chroot != nil {
   196  		step = SANDBOX_READY_FOR_CHROOT
   197  		_, _, err1 = RawSyscall(syscall.SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0)
   198  		if err1 != 0 {
   199  			goto childerror
   200  		}
   201  	}
   202  
   203  	if cred := sys.Credential; cred != nil {
   204  		if cred.Uid != 0 {
   205  			step = SANDBOX_READY_FOR_SETUID
   206  			_, _, err1 = RawSyscall(syscall.SYS_SETGID, uintptr(cred.Gid), 0, 0)
   207  			if err1 != 0 {
   208  				goto childerror
   209  			}
   210  		}
   211  		if cred.Gid != 0 {
   212  			step = SANDBOX_READY_FOR_SETGID
   213  			_, _, err1 = RawSyscall(syscall.SYS_SETUID, uintptr(cred.Uid), 0, 0)
   214  			if err1 != 0 {
   215  				goto childerror
   216  			}
   217  		}
   218  		if cred.Umask != 0 {
   219  			step = SANDBOX_READY_FOR_SETUMASK
   220  			_, _, err1 = RawSyscall(syscall.SYS_UMASK, uintptr(cred.Umask), 0, 0)
   221  			if err1 != 0 {
   222  				goto childerror
   223  			}
   224  		}
   225  	}
   226  
   227  	// Chdir
   228  	if dir != nil {
   229  		step = SANDBOX_READY_FOR_CHDIR
   230  		_, _, err1 = RawSyscall(syscall.SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0)
   231  		if err1 != 0 {
   232  			goto childerror
   233  		}
   234  	}
   235  
   236  	// Parent death signal
   237  	if sys.Pdeathsig != 0 {
   238  		step = SANDBOX_READY_FOR_SET_PDEATHSIG
   239  		_, _, err1 = RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0)
   240  		if err1 != 0 {
   241  			goto childerror
   242  		}
   243  
   244  		// Signal self if parent is already dead. This might cause a
   245  		// duplicate signal in rare cases, but it won't matter when
   246  		// using SIGKILL.
   247  		r1, _ = rawSyscallNoError(syscall.SYS_GETPPID, 0, 0, 0)
   248  		if r1 != ppid {
   249  			pid, _ := rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0)
   250  			step = SANDBOX_READY_FOR_PDEATHSIG_KILL_MYSELF
   251  			_, _, err1 := RawSyscall(syscall.SYS_KILL, pid, uintptr(sys.Pdeathsig), 0)
   252  			if err1 != 0 {
   253  				goto childerror
   254  			}
   255  		}
   256  	}
   257  
   258  	step = SANDBOX_READY_FRO_DUP_FILE
   259  	// Pass 1: look for fd[i] < i and move those up above len(fd)
   260  	// so that pass 2 won't stomp on an fd it needs later.
   261  	if errPipe < nextfd {
   262  		_, _, err1 = RawSyscall(syscall.SYS_DUP2, uintptr(errPipe), uintptr(nextfd), 0)
   263  		if err1 != 0 {
   264  			goto childerror
   265  		}
   266  		RawSyscall(syscall.SYS_FCNTL, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC)
   267  		errPipe = nextfd
   268  		nextfd++
   269  	}
   270  	for i = 0; i < len(fd); i++ {
   271  		if fd[i] >= 0 && fd[i] < int(i) {
   272  			if nextfd == errPipe { // don't stomp on pipe
   273  				nextfd++
   274  			}
   275  			_, _, err1 = RawSyscall(syscall.SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0)
   276  			if err1 != 0 {
   277  				goto childerror
   278  			}
   279  			RawSyscall(syscall.SYS_FCNTL, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC)
   280  			fd[i] = nextfd
   281  			nextfd++
   282  		}
   283  	}
   284  
   285  	// Pass 2: dup fd[i] down onto i.
   286  	for i = 0; i < len(fd); i++ {
   287  		if fd[i] == -1 {
   288  			RawSyscall(syscall.SYS_CLOSE, uintptr(i), 0, 0)
   289  			continue
   290  		}
   291  		if fd[i] == int(i) {
   292  			// dup2(i, i) won't clear close-on-exec flag on Linux,
   293  			// probably not elsewhere either.
   294  			_, _, err1 = RawSyscall(syscall.SYS_FCNTL, uintptr(fd[i]), syscall.F_SETFD, 0)
   295  			if err1 != 0 {
   296  				goto childerror
   297  			}
   298  			continue
   299  		}
   300  		// The new fd is created NOT close-on-exec,
   301  		// which is exactly what we want.
   302  		_, _, err1 = RawSyscall(syscall.SYS_DUP2, uintptr(fd[i]), uintptr(i), 0)
   303  		if err1 != 0 {
   304  			goto childerror
   305  		}
   306  	}
   307  
   308  	step = SANDBOX_READY_FOR_SET_RLIMIT
   309  	for i = 0; i <= RLIMIT_NLIMITS; i++ {
   310  		if sys.RlimitList[i] != RLIMIT_UNRESOURCE {
   311  			_, _, err1 := RawSyscall(syscall.SYS_SETRLIMIT, uintptr(i),
   312  				uintptr(unsafe.Pointer(&syscall.Rlimit{Cur: sys.RlimitList[i], Max: sys.RlimitList[i]})), 0)
   313  			if err1 != 0 {
   314  				goto childerror
   315  			}
   316  		}
   317  	}
   318  
   319  	if sys.Ptrace {
   320  		step = SANDBOX_READY_FOR_SET_PTRACE
   321  		_, _, err1 = RawSyscall(syscall.SYS_PTRACE, uintptr(syscall.PTRACE_TRACEME), 0, 0)
   322  		if err1 != 0 {
   323  			goto childerror
   324  		}
   325  	}
   326  
   327  	if sys.Bpf != nil {
   328  		step = SANDBOX_READY_FOR_SET_BPF
   329  		_, _, err1 = RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, 2, uintptr(unsafe.Pointer(sys.Bpf)))
   330  		if err1 != 0 {
   331  			goto childerror
   332  		}
   333  	}
   334  
   335  	// Time to exec.
   336  	step = SANDBOX_READY_FOR_EXEC
   337  	_, _, err1 = RawSyscall(syscall.SYS_EXECVE,
   338  		uintptr(unsafe.Pointer(argv0)),
   339  		uintptr(unsafe.Pointer(&argv[0])),
   340  		uintptr(unsafe.Pointer(&envv[0])))
   341  
   342  childerror:
   343  	RawSyscall(syscall.SYS_WRITE, uintptr(errPipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1))  // what error
   344  	RawSyscall(syscall.SYS_WRITE, uintptr(stepPipe), uintptr(unsafe.Pointer(&step)), unsafe.Sizeof(step)) // which step
   345  	for {
   346  		_, _, _ = RawSyscall(syscall.SYS_EXIT, 253, 0, 0)
   347  	}
   348  }
   349  
   350  func forkExecPipe(p []int) (err error) {
   351  	err = syscall.Pipe2(p, syscall.O_CLOEXEC)
   352  	// pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it
   353  	// might not be implemented.
   354  	if err == syscall.ENOSYS {
   355  		if err = syscall.Pipe(p); err != nil {
   356  			return
   357  		}
   358  		if _, err = fcntl(p[0], syscall.F_SETFD, syscall.FD_CLOEXEC); err != nil {
   359  			return
   360  		}
   361  		_, err = fcntl(p[1], syscall.F_SETFD, syscall.FD_CLOEXEC)
   362  	}
   363  	return
   364  }