github.com/sdibtacm/sandbox@v0.0.0-20200320120712-60470cf803dc/exec/child.go (about) 1 //+build linux 2 3 package exec 4 5 import ( 6 "errors" 7 "runtime" 8 "sync" 9 "syscall" 10 "unsafe" 11 ) 12 13 var ForkLock sync.RWMutex 14 15 type SysAttr struct { 16 Ptrace bool 17 Setsid bool 18 RlimitList [20]uint64 19 SetNoNewPrivs bool 20 Cloneflags uintptr 21 Files []uintptr 22 Pdeathsig uint 23 Credential *Credential 24 Bpf *syscall.SockFprog 25 } 26 27 type Credential struct { 28 Uid int 29 Gid int 30 Umask uint 31 } 32 33 type ExecError struct { 34 Step int 35 Err error 36 } 37 38 func (e *ExecError) Error() string { 39 return "exec: step[" + SANDBOX_STEP_STR[e.Step] + "] with error: [" + e.Err.Error() + "]" 40 } 41 42 var zeroSysAttr SysAttr 43 44 func forkExec(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *SysAttr) (pid int, err error) { 45 46 var ( 47 stepPipe [2]int 48 errPipe [2]int 49 stepN int 50 errN int 51 err1 syscall.Errno 52 err2 error 53 err3 error 54 wstatus syscall.WaitStatus 55 step int 56 ) 57 58 ForkLock.Lock() 59 if err = forkExecPipe(errPipe[:]); err != nil { 60 goto error 61 } 62 if err = forkExecPipe(stepPipe[:]); err != nil { 63 goto error 64 } 65 66 pid, err1 = cloneAndExecInChild(argv0, argv, envv, chroot, dir, attr, errPipe[1], stepPipe[1]) 67 if err1 != 0 { 68 err = &ExecError{Step: SANDBOX_READY_FOR_CLONE, Err: errors.New(err1.Error())} 69 goto error 70 } 71 ForkLock.Unlock() 72 73 // Read child error status from pipe. 74 _ = syscall.Close(errPipe[1]) 75 errN, err2 = readlen(errPipe[0], (*byte)(unsafe.Pointer(&err1)), int(unsafe.Sizeof(err1))) 76 _ = syscall.Close(errPipe[0]) 77 _ = syscall.Close(stepPipe[1]) 78 stepN, err3 = readlen(stepPipe[0], (*byte)(unsafe.Pointer(&step)), int(unsafe.Sizeof(step))) 79 _ = syscall.Close(stepPipe[0]) 80 if err2 != nil || err3 != nil || errN != 0 { 81 if errN == int(unsafe.Sizeof(err1)) && stepN == int(unsafe.Sizeof(step)) { 82 err = &ExecError{Step: step, Err: errors.New(err1.Error())} 83 } 84 if err == nil && err2 == nil { 85 err = &ExecError{Step: SANDBOX_READ_PIPE, Err: syscall.EPIPE} 86 } 87 if err == nil && err3 == nil { 88 err = &ExecError{Step: SANDBOX_READ_PIPE, Err: syscall.EPIPE} 89 } 90 91 // Child failed; wait for it to exit, to make sure 92 // the zombies don't accumulate. 93 _, err1 := syscall.Wait4(pid, &wstatus, 0, nil) 94 for err1 == syscall.EINTR { 95 _, err1 = syscall.Wait4(pid, &wstatus, 0, nil) 96 } 97 return 0, err 98 } 99 return 100 101 error: 102 if stepPipe[0] >= 0 { 103 _ = syscall.Close(stepPipe[0]) 104 _ = syscall.Close(stepPipe[1]) 105 } 106 if errPipe[0] >= 0 { 107 _ = syscall.Close(errPipe[0]) 108 _ = syscall.Close(errPipe[1]) 109 } 110 ForkLock.Unlock() 111 return 0, &ExecError{Step: SANDBOX_PREPARE_PIPE, Err: err2} 112 } 113 114 func cloneAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *SysAttr, errPipe, stepPipe int) (pid int, err syscall.Errno) { 115 116 r1, err1, locked := cloneAndExecInChild1(argv0, argv, envv, chroot, dir, attr, errPipe, stepPipe) 117 if locked { 118 runtimeAfterFork() 119 } 120 if err1 != 0 { 121 return 0, err1 122 } 123 124 // parent; return PID 125 pid = int(r1) 126 return pid, 0 127 128 } 129 130 var step int = SANDBOX_NO_START 131 132 //go:noinline 133 //go:norace 134 func cloneAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, sys *SysAttr, errPipe, stepPipe int) (r1 uintptr, err1 syscall.Errno, locked bool) { 135 // The function will do clone, load limit, exec function 136 // because will no use normal function after clone, 137 // to let the parent know which step is happen error, 138 // will use pipe to sent step num and errno. 139 140 var ( 141 //err2 syscall.Errno 142 nextfd int 143 i int 144 //fd1 uintptr 145 ) 146 147 ppid, _ := rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0) 148 149 // Guard against side effects of shuffling fds below. 150 // Make sure that nextfd is beyond any currently open files so 151 // that we can't run the risk of overwriting any of them. 152 fd := make([]int, len(sys.Files)) 153 nextfd = len(sys.Files) 154 for i, ufd := range sys.Files { 155 if nextfd < int(ufd) { 156 nextfd = int(ufd) 157 } 158 fd[i] = int(ufd) 159 } 160 nextfd++ 161 162 runtimeBeforeFork() 163 locked = true 164 165 step = SANDBOX_READY_FOR_CLONE 166 switch { 167 case runtime.GOARCH == "s390x": 168 r1, _, err1 = RawSyscall6(SYS_CLONE, 0, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0) 169 default: 170 r1, _, err1 = RawSyscall6(SYS_CLONE, uintptr(SIGCHLD)|sys.Cloneflags, 0, 0, 0, 0, 0) 171 } 172 if err1 != 0 || r1 != 0 { 173 // If we're in the parent, we must return immediately 174 // so we're not in the same stack frame as the child. 175 // This can at most use the return PC, which the child 176 // will not modify, and the results of 177 // rawVforkSyscall, which must have been written after 178 // the child was replaced. 179 return 180 } 181 182 // Fork succeeded, now in child. 183 184 runtimeAfterForkInChild() 185 186 // Session ID 187 if sys.Setsid { 188 _, _, err1 = RawSyscall(syscall.SYS_SETSID, 0, 0, 0) 189 if err1 != 0 { 190 goto childerror 191 } 192 } 193 194 // Chroot 195 if chroot != nil { 196 step = SANDBOX_READY_FOR_CHROOT 197 _, _, err1 = RawSyscall(syscall.SYS_CHROOT, uintptr(unsafe.Pointer(chroot)), 0, 0) 198 if err1 != 0 { 199 goto childerror 200 } 201 } 202 203 if cred := sys.Credential; cred != nil { 204 if cred.Uid != 0 { 205 step = SANDBOX_READY_FOR_SETUID 206 _, _, err1 = RawSyscall(syscall.SYS_SETGID, uintptr(cred.Gid), 0, 0) 207 if err1 != 0 { 208 goto childerror 209 } 210 } 211 if cred.Gid != 0 { 212 step = SANDBOX_READY_FOR_SETGID 213 _, _, err1 = RawSyscall(syscall.SYS_SETUID, uintptr(cred.Uid), 0, 0) 214 if err1 != 0 { 215 goto childerror 216 } 217 } 218 if cred.Umask != 0 { 219 step = SANDBOX_READY_FOR_SETUMASK 220 _, _, err1 = RawSyscall(syscall.SYS_UMASK, uintptr(cred.Umask), 0, 0) 221 if err1 != 0 { 222 goto childerror 223 } 224 } 225 } 226 227 // Chdir 228 if dir != nil { 229 step = SANDBOX_READY_FOR_CHDIR 230 _, _, err1 = RawSyscall(syscall.SYS_CHDIR, uintptr(unsafe.Pointer(dir)), 0, 0) 231 if err1 != 0 { 232 goto childerror 233 } 234 } 235 236 // Parent death signal 237 if sys.Pdeathsig != 0 { 238 step = SANDBOX_READY_FOR_SET_PDEATHSIG 239 _, _, err1 = RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(sys.Pdeathsig), 0, 0, 0, 0) 240 if err1 != 0 { 241 goto childerror 242 } 243 244 // Signal self if parent is already dead. This might cause a 245 // duplicate signal in rare cases, but it won't matter when 246 // using SIGKILL. 247 r1, _ = rawSyscallNoError(syscall.SYS_GETPPID, 0, 0, 0) 248 if r1 != ppid { 249 pid, _ := rawSyscallNoError(syscall.SYS_GETPID, 0, 0, 0) 250 step = SANDBOX_READY_FOR_PDEATHSIG_KILL_MYSELF 251 _, _, err1 := RawSyscall(syscall.SYS_KILL, pid, uintptr(sys.Pdeathsig), 0) 252 if err1 != 0 { 253 goto childerror 254 } 255 } 256 } 257 258 step = SANDBOX_READY_FRO_DUP_FILE 259 // Pass 1: look for fd[i] < i and move those up above len(fd) 260 // so that pass 2 won't stomp on an fd it needs later. 261 if errPipe < nextfd { 262 _, _, err1 = RawSyscall(syscall.SYS_DUP2, uintptr(errPipe), uintptr(nextfd), 0) 263 if err1 != 0 { 264 goto childerror 265 } 266 RawSyscall(syscall.SYS_FCNTL, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC) 267 errPipe = nextfd 268 nextfd++ 269 } 270 for i = 0; i < len(fd); i++ { 271 if fd[i] >= 0 && fd[i] < int(i) { 272 if nextfd == errPipe { // don't stomp on pipe 273 nextfd++ 274 } 275 _, _, err1 = RawSyscall(syscall.SYS_DUP2, uintptr(fd[i]), uintptr(nextfd), 0) 276 if err1 != 0 { 277 goto childerror 278 } 279 RawSyscall(syscall.SYS_FCNTL, uintptr(nextfd), syscall.F_SETFD, syscall.FD_CLOEXEC) 280 fd[i] = nextfd 281 nextfd++ 282 } 283 } 284 285 // Pass 2: dup fd[i] down onto i. 286 for i = 0; i < len(fd); i++ { 287 if fd[i] == -1 { 288 RawSyscall(syscall.SYS_CLOSE, uintptr(i), 0, 0) 289 continue 290 } 291 if fd[i] == int(i) { 292 // dup2(i, i) won't clear close-on-exec flag on Linux, 293 // probably not elsewhere either. 294 _, _, err1 = RawSyscall(syscall.SYS_FCNTL, uintptr(fd[i]), syscall.F_SETFD, 0) 295 if err1 != 0 { 296 goto childerror 297 } 298 continue 299 } 300 // The new fd is created NOT close-on-exec, 301 // which is exactly what we want. 302 _, _, err1 = RawSyscall(syscall.SYS_DUP2, uintptr(fd[i]), uintptr(i), 0) 303 if err1 != 0 { 304 goto childerror 305 } 306 } 307 308 step = SANDBOX_READY_FOR_SET_RLIMIT 309 for i = 0; i <= RLIMIT_NLIMITS; i++ { 310 if sys.RlimitList[i] != RLIMIT_UNRESOURCE { 311 _, _, err1 := RawSyscall(syscall.SYS_SETRLIMIT, uintptr(i), 312 uintptr(unsafe.Pointer(&syscall.Rlimit{Cur: sys.RlimitList[i], Max: sys.RlimitList[i]})), 0) 313 if err1 != 0 { 314 goto childerror 315 } 316 } 317 } 318 319 if sys.Ptrace { 320 step = SANDBOX_READY_FOR_SET_PTRACE 321 _, _, err1 = RawSyscall(syscall.SYS_PTRACE, uintptr(syscall.PTRACE_TRACEME), 0, 0) 322 if err1 != 0 { 323 goto childerror 324 } 325 } 326 327 if sys.Bpf != nil { 328 step = SANDBOX_READY_FOR_SET_BPF 329 _, _, err1 = RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, 2, uintptr(unsafe.Pointer(sys.Bpf))) 330 if err1 != 0 { 331 goto childerror 332 } 333 } 334 335 // Time to exec. 336 step = SANDBOX_READY_FOR_EXEC 337 _, _, err1 = RawSyscall(syscall.SYS_EXECVE, 338 uintptr(unsafe.Pointer(argv0)), 339 uintptr(unsafe.Pointer(&argv[0])), 340 uintptr(unsafe.Pointer(&envv[0]))) 341 342 childerror: 343 RawSyscall(syscall.SYS_WRITE, uintptr(errPipe), uintptr(unsafe.Pointer(&err1)), unsafe.Sizeof(err1)) // what error 344 RawSyscall(syscall.SYS_WRITE, uintptr(stepPipe), uintptr(unsafe.Pointer(&step)), unsafe.Sizeof(step)) // which step 345 for { 346 _, _, _ = RawSyscall(syscall.SYS_EXIT, 253, 0, 0) 347 } 348 } 349 350 func forkExecPipe(p []int) (err error) { 351 err = syscall.Pipe2(p, syscall.O_CLOEXEC) 352 // pipe2 was added in 2.6.27 and our minimum requirement is 2.6.23, so it 353 // might not be implemented. 354 if err == syscall.ENOSYS { 355 if err = syscall.Pipe(p); err != nil { 356 return 357 } 358 if _, err = fcntl(p[0], syscall.F_SETFD, syscall.FD_CLOEXEC); err != nil { 359 return 360 } 361 _, err = fcntl(p[1], syscall.F_SETFD, syscall.FD_CLOEXEC) 362 } 363 return 364 }