github.com/criyle/go-sandbox@v0.10.3/pkg/forkexec/fork_child_linux.go (about)

     1  package forkexec
     2  
     3  import (
     4  	"syscall"
     5  	"unsafe"
     6  
     7  	"golang.org/x/sys/unix"
     8  )
     9  
    10  // Reference to src/syscall/exec_linux.go
    11  //
    12  //go:norace
    13  func forkAndExecInChild(r *Runner, argv0 *byte, argv, env []*byte, workdir, hostname, domainname, pivotRoot *byte, p [2]int) (r1 uintptr, err1 syscall.Errno) {
    14  	// similar to exec_linux, avoid side effect by shuffling around
    15  	fd, nextfd := prepareFds(r.Files)
    16  
    17  	// Acquire the fork lock so that no other threads
    18  	// create new fds that are not yet close-on-exec
    19  	// before we fork.
    20  	syscall.ForkLock.Lock()
    21  
    22  	// About to call fork.
    23  	// No more allocation or calls of non-assembly functions.
    24  	beforeFork()
    25  
    26  	// UnshareFlags (new namespaces) is activated by clone syscall
    27  	r1, _, err1 = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGCHLD)|(r.CloneFlags&UnshareFlags), 0, 0, 0, 0, 0)
    28  	if err1 != 0 || r1 != 0 {
    29  		// in parent process, immediate return
    30  		return
    31  	}
    32  
    33  	// In child process
    34  	afterForkInChild()
    35  	// Notice: cannot call any GO functions beyond this point
    36  
    37  	pipe := p[1]
    38  	var (
    39  		pid         uintptr
    40  		err2        syscall.Errno
    41  		unshareUser = r.CloneFlags&unix.CLONE_NEWUSER == unix.CLONE_NEWUSER
    42  	)
    43  
    44  	// Close write end of pipe
    45  	if _, _, err1 = syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(p[0]), 0, 0); err1 != 0 {
    46  		childExitError(pipe, LocCloseWrite, err1)
    47  	}
    48  
    49  	// If usernamespace is unshared, uid map and gid map is required to create folders
    50  	// and files
    51  	// We need parent to setup uid_map / gid_map for us since we do not have capabilities
    52  	// in the original namespace
    53  	// At the same time, socket pair / pipe synchronization is required as well
    54  	if unshareUser {
    55  		r1, _, err1 = syscall.RawSyscall(syscall.SYS_READ, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
    56  		if err1 != 0 {
    57  			childExitError(pipe, LocUnshareUserRead, err1)
    58  		}
    59  		if r1 != unsafe.Sizeof(err2) {
    60  			err1 = syscall.EINVAL
    61  			childExitError(pipe, LocUnshareUserRead, err1)
    62  		}
    63  		if err2 != 0 {
    64  			err1 = err2
    65  			childExitError(pipe, LocUnshareUserRead, err1)
    66  		}
    67  	}
    68  
    69  	// Get pid of child
    70  	pid, _, err1 = syscall.RawSyscall(syscall.SYS_GETPID, 0, 0, 0)
    71  	if err1 != 0 {
    72  		childExitError(pipe, LocGetPid, err1)
    73  	}
    74  
    75  	// keep capabilities through set_uid / set_gid calls (make sure we can use unshare cgroup), later dropped
    76  	if r.Credential != nil || r.UnshareCgroupAfterSync {
    77  		_, _, err1 = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECUREBITS,
    78  			_SECURE_KEEP_CAPS_LOCKED|_SECURE_NO_SETUID_FIXUP|_SECURE_NO_SETUID_FIXUP_LOCKED, 0)
    79  		if err1 != 0 {
    80  			childExitError(pipe, LocKeepCapability, err1)
    81  		}
    82  	}
    83  
    84  	// set the credential for the child process(exec_linux.go)
    85  	if cred := r.Credential; cred != nil {
    86  		ngroups := uintptr(len(cred.Groups))
    87  		groups := uintptr(0)
    88  		if ngroups > 0 {
    89  			groups = uintptr(unsafe.Pointer(&cred.Groups[0]))
    90  		}
    91  		if !(r.GIDMappings != nil && !r.GIDMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
    92  			_, _, err1 = syscall.RawSyscall(unix.SYS_SETGROUPS, ngroups, groups, 0)
    93  			if err1 != 0 {
    94  				childExitError(pipe, LocSetGroups, err1)
    95  			}
    96  		}
    97  		_, _, err1 = syscall.RawSyscall(unix.SYS_SETGID, uintptr(cred.Gid), 0, 0)
    98  		if err1 != 0 {
    99  			childExitError(pipe, LocSetGid, err1)
   100  		}
   101  		_, _, err1 = syscall.RawSyscall(unix.SYS_SETUID, uintptr(cred.Uid), 0, 0)
   102  		if err1 != 0 {
   103  			childExitError(pipe, LocSetUid, err1)
   104  		}
   105  	}
   106  
   107  	// Pass 1 & pass 2 assigns fds for child process
   108  	// Pass 1: fd[i] < i => nextfd
   109  	if pipe < nextfd {
   110  		_, _, err1 = syscall.RawSyscall(syscall.SYS_DUP3, uintptr(pipe), uintptr(nextfd), syscall.O_CLOEXEC)
   111  		if err1 != 0 {
   112  			childExitError(pipe, LocDup3, err1)
   113  		}
   114  		pipe = nextfd
   115  		nextfd++
   116  	}
   117  	if r.ExecFile > 0 && int(r.ExecFile) < nextfd {
   118  		// Avoid fd rewrite
   119  		for nextfd == pipe {
   120  			nextfd++
   121  		}
   122  		_, _, err1 = syscall.RawSyscall(syscall.SYS_DUP3, r.ExecFile, uintptr(nextfd), syscall.O_CLOEXEC)
   123  		if err1 != 0 {
   124  			childExitError(pipe, LocDup3, err1)
   125  		}
   126  		r.ExecFile = uintptr(nextfd)
   127  		nextfd++
   128  	}
   129  	for i := 0; i < len(fd); i++ {
   130  		if fd[i] >= 0 && fd[i] < int(i) {
   131  			// Avoid fd rewrite
   132  			for nextfd == pipe || (r.ExecFile > 0 && nextfd == int(r.ExecFile)) {
   133  				nextfd++
   134  			}
   135  			_, _, err1 = syscall.RawSyscall(syscall.SYS_DUP3, uintptr(fd[i]), uintptr(nextfd), syscall.O_CLOEXEC)
   136  			if err1 != 0 {
   137  				childExitError(pipe, LocDup3, err1)
   138  			}
   139  			// Set up close on exec
   140  			fd[i] = nextfd
   141  			nextfd++
   142  		}
   143  	}
   144  	// Pass 2: fd[i] => i
   145  	for i := 0; i < len(fd); i++ {
   146  		if fd[i] == -1 {
   147  			syscall.RawSyscall(syscall.SYS_CLOSE, uintptr(i), 0, 0)
   148  			continue
   149  		}
   150  		if fd[i] == int(i) {
   151  			// dup2(i, i) will not clear close on exec flag, need to reset the flag
   152  			_, _, err1 = syscall.RawSyscall(syscall.SYS_FCNTL, uintptr(fd[i]), syscall.F_SETFD, 0)
   153  			if err1 != 0 {
   154  				childExitError(pipe, LocFcntl, err1)
   155  			}
   156  			continue
   157  		}
   158  		_, _, err1 = syscall.RawSyscall(syscall.SYS_DUP3, uintptr(fd[i]), uintptr(i), 0)
   159  		if err1 != 0 {
   160  			childExitError(pipe, LocDup3, err1)
   161  		}
   162  	}
   163  
   164  	// Set the session ID
   165  	_, _, err1 = syscall.RawSyscall(syscall.SYS_SETSID, 0, 0, 0)
   166  	if err1 != 0 {
   167  		childExitError(pipe, LocSetSid, err1)
   168  	}
   169  
   170  	// Set the controlling TTY
   171  	if r.CTTY {
   172  		_, _, err1 = syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(0), uintptr(syscall.TIOCSCTTY), 1)
   173  		if err1 != 0 {
   174  			childExitError(pipe, LocIoctl, err1)
   175  		}
   176  	}
   177  
   178  	// Mount file system
   179  	{
   180  		// If mount point is unshared, mark root as private to avoid propagate
   181  		// outside to the original mount namespace
   182  		if r.CloneFlags&syscall.CLONE_NEWNS == syscall.CLONE_NEWNS {
   183  			_, _, err1 = syscall.RawSyscall6(syscall.SYS_MOUNT, uintptr(unsafe.Pointer(&none[0])),
   184  				uintptr(unsafe.Pointer(&slash[0])), 0, syscall.MS_REC|syscall.MS_PRIVATE, 0, 0)
   185  			if err1 != 0 {
   186  				childExitError(pipe, LocMountRoot, err1)
   187  			}
   188  		}
   189  
   190  		// mount tmpfs & chdir to new root before performing mounts
   191  		if pivotRoot != nil {
   192  			// mount("tmpfs", root, "tmpfs", 0, "")
   193  			_, _, err1 = syscall.RawSyscall6(syscall.SYS_MOUNT, uintptr(unsafe.Pointer(&tmpfs[0])),
   194  				uintptr(unsafe.Pointer(pivotRoot)), uintptr(unsafe.Pointer(&tmpfs[0])), 0,
   195  				uintptr(unsafe.Pointer(&empty[0])), 0)
   196  			if err1 != 0 {
   197  				childExitError(pipe, LocMountTmpfs, err1)
   198  			}
   199  
   200  			_, _, err1 = syscall.RawSyscall(syscall.SYS_CHDIR, uintptr(unsafe.Pointer(pivotRoot)), 0, 0)
   201  			if err1 != 0 {
   202  				childExitError(pipe, LocMountChdir, err1)
   203  			}
   204  		}
   205  
   206  		// performing mounts
   207  		for i, m := range r.Mounts {
   208  			// mkdirs(target)
   209  			for j, p := range m.Prefixes {
   210  				// if target mount point is a file, mknod(target)
   211  				if j == len(m.Prefixes)-1 && m.MakeNod {
   212  					_, _, err1 = syscall.RawSyscall(syscall.SYS_MKNODAT, uintptr(_AT_FDCWD), uintptr(unsafe.Pointer(p)), 0755)
   213  					if err1 != 0 && err1 != syscall.EEXIST {
   214  						childExitErrorWithIndex(pipe, LocMountMkdir, i, err1)
   215  					}
   216  					break
   217  				}
   218  				_, _, err1 = syscall.RawSyscall(syscall.SYS_MKDIRAT, uintptr(_AT_FDCWD), uintptr(unsafe.Pointer(p)), 0755)
   219  				if err1 != 0 && err1 != syscall.EEXIST {
   220  					childExitErrorWithIndex(pipe, LocMountMkdir, i, err1)
   221  				}
   222  			}
   223  			// mount(source, target, fsType, flags, data)
   224  			_, _, err1 = syscall.RawSyscall6(syscall.SYS_MOUNT, uintptr(unsafe.Pointer(m.Source)),
   225  				uintptr(unsafe.Pointer(m.Target)), uintptr(unsafe.Pointer(m.FsType)), uintptr(m.Flags),
   226  				uintptr(unsafe.Pointer(m.Data)), 0)
   227  			if err1 != 0 {
   228  				childExitErrorWithIndex(pipe, LocMount, i, err1)
   229  			}
   230  			// bind mount is not respect ro flag so that read-only bind mount needs remount
   231  			if m.Flags&bindRo == bindRo {
   232  				_, _, err1 = syscall.RawSyscall6(syscall.SYS_MOUNT, uintptr(unsafe.Pointer(&empty[0])),
   233  					uintptr(unsafe.Pointer(m.Target)), uintptr(unsafe.Pointer(m.FsType)),
   234  					uintptr(m.Flags|syscall.MS_REMOUNT), uintptr(unsafe.Pointer(m.Data)), 0)
   235  				if err1 != 0 {
   236  					childExitErrorWithIndex(pipe, LocMount, i, err1)
   237  				}
   238  			}
   239  		}
   240  
   241  		// pivot_root
   242  		if pivotRoot != nil {
   243  			// mkdir("old_root")
   244  			_, _, err1 = syscall.RawSyscall(syscall.SYS_MKDIRAT, uintptr(_AT_FDCWD), uintptr(unsafe.Pointer(&oldRoot[0])), 0755)
   245  			if err1 != 0 {
   246  				childExitError(pipe, LocPivotRoot, err1)
   247  			}
   248  
   249  			// pivot_root(root, "old_root")
   250  			_, _, err1 = syscall.RawSyscall(syscall.SYS_PIVOT_ROOT, uintptr(unsafe.Pointer(pivotRoot)), uintptr(unsafe.Pointer(&oldRoot[0])), 0)
   251  			if err1 != 0 {
   252  				childExitError(pipe, LocPivotRoot, err1)
   253  			}
   254  
   255  			// umount("old_root", MNT_DETACH)
   256  			_, _, err1 = syscall.RawSyscall(syscall.SYS_UMOUNT2, uintptr(unsafe.Pointer(&oldRoot[0])), syscall.MNT_DETACH, 0)
   257  			if err1 != 0 {
   258  				childExitError(pipe, LocPivotRoot, err1)
   259  			}
   260  
   261  			// rmdir("old_root")
   262  			_, _, err1 = syscall.RawSyscall(syscall.SYS_UNLINKAT, uintptr(_AT_FDCWD), uintptr(unsafe.Pointer(&oldRoot[0])), uintptr(unix.AT_REMOVEDIR))
   263  			if err1 != 0 {
   264  				childExitError(pipe, LocPivotRoot, err1)
   265  			}
   266  
   267  			// mount("tmpfs", "/", "tmpfs", MS_BIND | MS_REMOUNT | MS_RDONLY | MS_NOATIME | MS_NOSUID, nil)
   268  			_, _, err1 = syscall.RawSyscall6(syscall.SYS_MOUNT, uintptr(unsafe.Pointer(&tmpfs[0])),
   269  				uintptr(unsafe.Pointer(&slash[0])), uintptr(unsafe.Pointer(&tmpfs[0])),
   270  				uintptr(syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_NOATIME|syscall.MS_NOSUID),
   271  				uintptr(unsafe.Pointer(&empty[0])), 0)
   272  			if err1 != 0 {
   273  				childExitError(pipe, LocPivotRoot, err1)
   274  			}
   275  		}
   276  	}
   277  
   278  	// SetHostName
   279  	if hostname != nil {
   280  		syscall.RawSyscall(syscall.SYS_SETHOSTNAME,
   281  			uintptr(unsafe.Pointer(hostname)), uintptr(len(r.HostName)), 0)
   282  	}
   283  
   284  	// SetDomainName
   285  	if domainname != nil {
   286  		syscall.RawSyscall(syscall.SYS_SETDOMAINNAME,
   287  			uintptr(unsafe.Pointer(domainname)), uintptr(len(r.DomainName)), 0)
   288  	}
   289  
   290  	// chdir for child
   291  	if workdir != nil {
   292  		_, _, err1 = syscall.RawSyscall(syscall.SYS_CHDIR, uintptr(unsafe.Pointer(workdir)), 0, 0)
   293  		if err1 != 0 {
   294  			childExitError(pipe, LocChdir, err1)
   295  		}
   296  	}
   297  
   298  	// Set limit
   299  	for i, rlim := range r.RLimits {
   300  		// prlimit instead of setrlimit to avoid 32-bit limitation (linux > 3.2)
   301  		_, _, err1 = syscall.RawSyscall6(syscall.SYS_PRLIMIT64, 0, uintptr(rlim.Res), uintptr(unsafe.Pointer(&rlim.Rlim)), 0, 0, 0)
   302  		if err1 != 0 {
   303  			childExitErrorWithIndex(pipe, LocSetRlimit, i, err1)
   304  		}
   305  	}
   306  
   307  	// No new privs
   308  	if r.NoNewPrivs || r.Seccomp != nil {
   309  		_, _, err1 = syscall.RawSyscall6(syscall.SYS_PRCTL, unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0, 0)
   310  		if err1 != 0 {
   311  			childExitError(pipe, LocSetNoNewPrivs, err1)
   312  		}
   313  	}
   314  
   315  	// Drop all capabilities
   316  	if (r.Credential != nil || r.DropCaps) && !r.UnshareCgroupAfterSync {
   317  		// make sure the children have no privilege at all
   318  		_, _, err1 = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECUREBITS,
   319  			_SECURE_KEEP_CAPS_LOCKED|_SECURE_NO_SETUID_FIXUP|_SECURE_NO_SETUID_FIXUP_LOCKED|_SECURE_NOROOT|_SECURE_NOROOT_LOCKED, 0)
   320  		if err1 != 0 {
   321  			childExitError(pipe, LocDropCapability, err1)
   322  		}
   323  		_, _, err1 = syscall.RawSyscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(&dropCapHeader)), uintptr(unsafe.Pointer(&dropCapData)), 0)
   324  		if err1 != 0 {
   325  			childExitError(pipe, LocSetCap, err1)
   326  		}
   327  	}
   328  
   329  	// Enable Ptrace & sync with parent (since ptrace_me is a blocking operation)
   330  	if r.Ptrace && r.Seccomp != nil {
   331  		{
   332  			r1, _, err1 = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2)))
   333  			if r1 == 0 || err1 != 0 {
   334  				childExitError(pipe, LocSyncWrite, err1)
   335  			}
   336  
   337  			r1, _, err1 = syscall.RawSyscall(syscall.SYS_READ, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2)))
   338  			if r1 == 0 || err1 != 0 {
   339  				childExitError(pipe, LocSyncRead, err1)
   340  			}
   341  
   342  			// unshare cgroup namespace
   343  			if r.UnshareCgroupAfterSync {
   344  				// do not error if unshare fails, it is not critical
   345  				syscall.RawSyscall(syscall.SYS_UNSHARE, uintptr(unix.CLONE_NEWCGROUP), 0, 0)
   346  
   347  				if r.DropCaps || r.Credential != nil {
   348  					// make sure the children have no privilege at all
   349  					_, _, err1 = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECUREBITS,
   350  						_SECURE_KEEP_CAPS_LOCKED|_SECURE_NO_SETUID_FIXUP|_SECURE_NO_SETUID_FIXUP_LOCKED|_SECURE_NOROOT|_SECURE_NOROOT_LOCKED, 0)
   351  					if err1 != 0 {
   352  						childExitError(pipe, LocKeepCapability, err1)
   353  					}
   354  					_, _, err1 = syscall.RawSyscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(&dropCapHeader)), uintptr(unsafe.Pointer(&dropCapData)), 0)
   355  					if err1 != 0 {
   356  						childExitError(pipe, LocSetCap, err1)
   357  					}
   358  				}
   359  
   360  				if r.Seccomp != nil {
   361  					// Load seccomp filter
   362  					_, _, err1 = syscall.RawSyscall(unix.SYS_SECCOMP, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(r.Seccomp)))
   363  					if err1 != 0 {
   364  						childExitError(pipe, LocSeccomp, err1)
   365  					}
   366  				}
   367  			}
   368  		}
   369  		_, _, err1 = syscall.RawSyscall(syscall.SYS_PTRACE, uintptr(syscall.PTRACE_TRACEME), 0, 0)
   370  		if err1 != 0 {
   371  			childExitError(pipe, LocPtraceMe, err1)
   372  		}
   373  	}
   374  
   375  	// if both seccomp and ptrace is defined, then seccomp filter should have
   376  	// traced execve, thus child need parent attached to it first
   377  	// actually, this is not effective if pid namespace is unshared
   378  	if r.StopBeforeSeccomp || (r.Seccomp != nil && r.Ptrace) {
   379  		// Stop to wait for ptrace tracer
   380  		_, _, err1 = syscall.RawSyscall(syscall.SYS_KILL, pid, uintptr(syscall.SIGSTOP), 0)
   381  		if err1 != 0 {
   382  			childExitError(pipe, LocStop, err1)
   383  		}
   384  	}
   385  
   386  	// Load seccomp, stop and wait for tracer
   387  	if r.Seccomp != nil && (!r.UnshareCgroupAfterSync || r.Ptrace) {
   388  		// If execve is seccomp trapped, then tracee stop is necessary
   389  		// otherwise execve will fail due to ENOSYS
   390  		// Do getpid and kill to send SYS_KILL to self
   391  		// need to do before seccomp as these might be traced
   392  
   393  		// Load seccomp filter
   394  		_, _, err1 = syscall.RawSyscall(unix.SYS_SECCOMP, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(r.Seccomp)))
   395  		if err1 != 0 {
   396  			childExitError(pipe, LocSeccomp, err1)
   397  		}
   398  	}
   399  
   400  	// Before exec, sync with parent through pipe (configured as close_on_exec)
   401  	if !r.Ptrace || r.Seccomp == nil {
   402  		{
   403  			r1, _, err1 = syscall.RawSyscall(syscall.SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2)))
   404  			if r1 == 0 || err1 != 0 {
   405  				childExitError(pipe, LocSyncWrite, err1)
   406  			}
   407  
   408  			r1, _, err1 = syscall.RawSyscall(syscall.SYS_READ, uintptr(pipe), uintptr(unsafe.Pointer(&err2)), uintptr(unsafe.Sizeof(err2)))
   409  			if r1 == 0 || err1 != 0 {
   410  				childExitError(pipe, LocSyncRead, err1)
   411  			}
   412  
   413  			// unshare cgroup namespace
   414  			if r.UnshareCgroupAfterSync {
   415  				// do not error if unshare fails, it is not critical
   416  				syscall.RawSyscall(syscall.SYS_UNSHARE, uintptr(unix.CLONE_NEWCGROUP), 0, 0)
   417  
   418  				if r.DropCaps || r.Credential != nil {
   419  					// make sure the children have no privilege at all
   420  					_, _, err1 = syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECUREBITS,
   421  						_SECURE_KEEP_CAPS_LOCKED|_SECURE_NO_SETUID_FIXUP|_SECURE_NO_SETUID_FIXUP_LOCKED|_SECURE_NOROOT|_SECURE_NOROOT_LOCKED, 0)
   422  					if err1 != 0 {
   423  						childExitError(pipe, LocKeepCapability, err1)
   424  					}
   425  					_, _, err1 = syscall.RawSyscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(&dropCapHeader)), uintptr(unsafe.Pointer(&dropCapData)), 0)
   426  					if err1 != 0 {
   427  						childExitError(pipe, LocSetCap, err1)
   428  					}
   429  				}
   430  
   431  				if r.Seccomp != nil {
   432  					// Load seccomp filter
   433  					_, _, err1 = syscall.RawSyscall(unix.SYS_SECCOMP, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, uintptr(unsafe.Pointer(r.Seccomp)))
   434  					if err1 != 0 {
   435  						childExitError(pipe, LocSeccomp, err1)
   436  					}
   437  				}
   438  			}
   439  		}
   440  	}
   441  
   442  	// Enable ptrace if no seccomp is needed
   443  	if r.Ptrace && r.Seccomp == nil {
   444  		_, _, err1 = syscall.RawSyscall(syscall.SYS_PTRACE, uintptr(syscall.PTRACE_TRACEME), 0, 0)
   445  		if err1 != 0 {
   446  			childExitError(pipe, LocPtraceMe, err1)
   447  		}
   448  	}
   449  
   450  	// at this point, runner is successfully attached for seccomp trap filter
   451  	// or execve trapped without seccomp filter
   452  	// time to exec
   453  	// if execfile fd is specified, call fexecve
   454  	if r.ExecFile > 0 {
   455  		_, _, err1 = syscall.RawSyscall6(unix.SYS_EXECVEAT, r.ExecFile,
   456  			uintptr(unsafe.Pointer(&empty[0])), uintptr(unsafe.Pointer(&argv[0])),
   457  			uintptr(unsafe.Pointer(&env[0])), unix.AT_EMPTY_PATH, 0)
   458  	} else {
   459  		_, _, err1 = syscall.RawSyscall(unix.SYS_EXECVE, uintptr(unsafe.Pointer(argv0)),
   460  			uintptr(unsafe.Pointer(&argv[0])), uintptr(unsafe.Pointer(&env[0])))
   461  	}
   462  	// Fix potential ETXTBSY but with caution (max 50 attempt)
   463  	// The ETXTBSY happens when we copy the executable into container, another goroutine
   464  	// forks but not execve yet (time consuming for setting up mounting points), the forked
   465  	// process is still holding the fd of the copied executable fd. However, we don't
   466  	// want to have different logic to lock the container creation
   467  	for range [50]struct{}{} {
   468  		if err1 != syscall.ETXTBSY {
   469  			break
   470  		}
   471  		// wait instead of busy wait
   472  		syscall.RawSyscall(unix.SYS_NANOSLEEP, uintptr(unsafe.Pointer(&etxtbsyRetryInterval)), 0, 0)
   473  		if r.ExecFile > 0 {
   474  			_, _, err1 = syscall.RawSyscall6(unix.SYS_EXECVEAT, r.ExecFile,
   475  				uintptr(unsafe.Pointer(&empty[0])), uintptr(unsafe.Pointer(&argv[0])),
   476  				uintptr(unsafe.Pointer(&env[0])), unix.AT_EMPTY_PATH, 0)
   477  		} else {
   478  			_, _, err1 = syscall.RawSyscall(unix.SYS_EXECVE, uintptr(unsafe.Pointer(argv0)),
   479  				uintptr(unsafe.Pointer(&argv[0])), uintptr(unsafe.Pointer(&env[0])))
   480  		}
   481  	}
   482  	childExitError(pipe, LocExecve, err1)
   483  	return
   484  }
   485  
   486  //go:nosplit
   487  func childExitError(pipe int, loc ErrorLocation, err syscall.Errno) {
   488  	// send error code on pipe
   489  	childError := ChildError{
   490  		Err:      err,
   491  		Location: loc,
   492  	}
   493  
   494  	// send error code on pipe
   495  	syscall.RawSyscall(unix.SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&childError)), unsafe.Sizeof(childError))
   496  	for {
   497  		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
   498  	}
   499  }
   500  
   501  //go:nosplit
   502  func childExitErrorWithIndex(pipe int, loc ErrorLocation, idx int, err syscall.Errno) {
   503  	// send error code on pipe
   504  	childError := ChildError{
   505  		Err:      err,
   506  		Location: loc,
   507  		Index:    idx,
   508  	}
   509  
   510  	// send error code on pipe
   511  	syscall.RawSyscall(unix.SYS_WRITE, uintptr(pipe), uintptr(unsafe.Pointer(&childError)), unsafe.Sizeof(childError))
   512  	for {
   513  		syscall.RawSyscall(syscall.SYS_EXIT, uintptr(err), 0, 0)
   514  	}
   515  }