github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/nsenter/nsexec.c

github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/nsenter/nsexec.c (about)

     1  
     2  #define _GNU_SOURCE
     3  #include <endian.h>
     4  #include <errno.h>
     5  #include <fcntl.h>
     6  #include <grp.h>
     7  #include <sched.h>
     8  #include <setjmp.h>
     9  #include <signal.h>
    10  #include <stdarg.h>
    11  #include <stdbool.h>
    12  #include <stdint.h>
    13  #include <stdio.h>
    14  #include <stdlib.h>
    15  #include <stdbool.h>
    16  #include <string.h>
    17  #include <unistd.h>
    18  
    19  #include <sys/ioctl.h>
    20  #include <sys/prctl.h>
    21  #include <sys/socket.h>
    22  #include <sys/types.h>
    23  #include <sys/wait.h>
    24  
    25  #include <linux/limits.h>
    26  #include <linux/netlink.h>
    27  #include <linux/types.h>
    28  
    29  #include "getenv.h"
    30  #include "log.h"
    31  /* Get all of the CLONE_NEW* flags. */
    32  #include "namespace.h"
    33  
    34  /* Synchronisation values. */
    35  enum sync_t {
    36  	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
    37  	SYNC_USERMAP_ACK = 0x41,	/* Mapping finished by the parent. */
    38  	SYNC_RECVPID_PLS = 0x42,	/* Tell parent we're sending the PID. */
    39  	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
    40  	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
    41  	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
    42  	SYNC_TIMEOFFSETS_PLS = 0x46,	/* Request parent to write timens offsets. */
    43  	SYNC_TIMEOFFSETS_ACK = 0x47,	/* Timens offsets were written. */
    44  };
    45  
    46  #define STAGE_SETUP  -1
    47  /* longjmp() arguments. */
    48  #define STAGE_PARENT  0
    49  #define STAGE_CHILD   1
    50  #define STAGE_INIT    2
    51  
    52  /* Stores the current stage of nsexec. */
    53  int current_stage = STAGE_SETUP;
    54  
/*
 * Argument block and child stack used by clone_parent()/child_func().
 *
 * Assume the stack grows down, so arguments should be above it.
 */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__((aligned(16)));
	/*
	 * Zero-length marker placed directly after stack[]; its address is what
	 * clone_parent() passes to clone() as the child's stack pointer.
	 */
	char stack_ptr[0];

	/* There's two children. This is used to execute the different code. */
	/* Jump buffer that child_func() longjmp()s to. */
	jmp_buf *env;
	/* Value handed to longjmp(), i.e. one of the STAGE_* constants. */
	int jmpval;
};
    68  
/*
 * Bootstrap configuration decoded by nl_parse().
 *
 * Every char pointer below (other than data itself) points into the buffer
 * referenced by data; each *_len field is the payload length of the netlink
 * attribute the pointer was taken from.
 */
struct nlconfig_t {
	/* Raw netlink payload, malloc()ed by nl_parse() and freed by nl_free(). */
	char *data;

	/* Process settings. */
	uint32_t cloneflags;
	char *oom_score_adj;
	size_t oom_score_adj_len;

	/* User namespace settings. */
	char *uidmap;
	size_t uidmap_len;
	char *gidmap;
	size_t gidmap_len;
	/* Comma-separated "type:path" list consumed by join_namespaces(). */
	char *namespaces;
	size_t namespaces_len;
	uint8_t is_setgroup;

	/* Rootless container settings. */
	uint8_t is_rootless_euid;	/* boolean */
	/* Paths of the helper binaries passed to try_mapping_tool(). */
	char *uidmappath;
	size_t uidmappath_len;
	char *gidmappath;
	size_t gidmappath_len;

	/* Time NS offsets. */
	char *timensoffset;
	size_t timensoffset_len;
};
    97  
/*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go.
 *
 * INIT_MSG is the expected nlmsg_type of the message; the *_ATTR values are
 * the nla_type of each attribute and are dispatched on in nl_parse().
 */
#define INIT_MSG		62000
#define CLONE_FLAGS_ATTR	27281	/* -> nlconfig_t.cloneflags */
#define NS_PATHS_ATTR		27282	/* -> nlconfig_t.namespaces */
#define UIDMAP_ATTR		27283	/* -> nlconfig_t.uidmap */
#define GIDMAP_ATTR		27284	/* -> nlconfig_t.gidmap */
#define SETGROUP_ATTR		27285	/* -> nlconfig_t.is_setgroup */
#define OOM_SCORE_ADJ_ATTR	27286	/* -> nlconfig_t.oom_score_adj */
#define ROOTLESS_EUID_ATTR	27287	/* -> nlconfig_t.is_rootless_euid */
#define UIDMAPPATH_ATTR		27288	/* -> nlconfig_t.uidmappath */
#define GIDMAPPATH_ATTR		27289	/* -> nlconfig_t.gidmappath */
#define TIMENSOFFSET_ATTR	27290	/* -> nlconfig_t.timensoffset */
   113  
/*
 * Use the raw syscall for versions of glibc which don't include a function
 * for it (glibc 2.12, for example). The shim is only compiled for glibc 2.x
 * with a minor version below 14.
 */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
#  define _GNU_SOURCE
#  include "syscall.h"
/* Fall back to the kernel's __NR_setns number if SYS_setns is missing. */
#  if !defined(SYS_setns) && defined(__NR_setns)
#    define SYS_setns __NR_setns
#  endif

/* Without a syscall number there is no way to provide setns() at all. */
#  ifndef SYS_setns
#    error "setns(2) syscall not supported by glibc version"
#  endif

/* Thin wrapper that forwards fd and nstype to the setns syscall. */
int setns(int fd, int nstype)
{
	return syscall(SYS_setns, fd, nstype);
}
#endif
   134  
/*
 * Sync socket of the stage that is currently running; -1 until nsexec()
 * assigns one end of a sync socketpair to it.
 *
 * XXX: This is ugly.
 */
static int syncfd = -1;
   137  
   138  static int write_file(char *data, size_t data_len, char *pathfmt, ...)
   139  {
   140  	int fd, len, ret = 0;
   141  	char path[PATH_MAX];
   142  
   143  	va_list ap;
   144  	va_start(ap, pathfmt);
   145  	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
   146  	va_end(ap);
   147  	if (len < 0)
   148  		return -1;
   149  
   150  	fd = open(path, O_RDWR);
   151  	if (fd < 0) {
   152  		return -1;
   153  	}
   154  
   155  	len = write(fd, data, data_len);
   156  	if (len != data_len) {
   157  		ret = -1;
   158  		goto out;
   159  	}
   160  
   161  out:
   162  	close(fd);
   163  	return ret;
   164  }
   165  
   166  enum policy_t {
   167  	SETGROUPS_DEFAULT = 0,
   168  	SETGROUPS_ALLOW,
   169  	SETGROUPS_DENY,
   170  };
   171  
   172  /* This *must* be called before we touch gid_map. */
   173  static void update_setgroups(int pid, enum policy_t setgroup)
   174  {
   175  	char *policy;
   176  
   177  	switch (setgroup) {
   178  	case SETGROUPS_ALLOW:
   179  		policy = "allow";
   180  		break;
   181  	case SETGROUPS_DENY:
   182  		policy = "deny";
   183  		break;
   184  	case SETGROUPS_DEFAULT:
   185  	default:
   186  		/* Nothing to do. */
   187  		return;
   188  	}
   189  
   190  	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
   191  		/*
   192  		 * If the kernel is too old to support /proc/pid/setgroups,
   193  		 * open(2) or write(2) will return ENOENT. This is fine.
   194  		 */
   195  		if (errno != ENOENT)
   196  			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
   197  	}
   198  }
   199  
   200  static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
   201  {
   202  	int child;
   203  
   204  	/*
   205  	 * If @app is NULL, execve will segfault. Just check it here and bail (if
   206  	 * we're in this path, the caller is already getting desperate and there
   207  	 * isn't a backup to this failing). This usually would be a configuration
   208  	 * or programming issue.
   209  	 */
   210  	if (!app)
   211  		bail("mapping tool not present");
   212  
   213  	child = fork();
   214  	if (child < 0)
   215  		bail("failed to fork");
   216  
   217  	if (!child) {
   218  #define MAX_ARGV 20
   219  		char *argv[MAX_ARGV];
   220  		char *envp[] = { NULL };
   221  		char pid_fmt[16];
   222  		int argc = 0;
   223  		char *next;
   224  
   225  		snprintf(pid_fmt, 16, "%d", pid);
   226  
   227  		argv[argc++] = (char *)app;
   228  		argv[argc++] = pid_fmt;
   229  		/*
   230  		 * Convert the map string into a list of argument that
   231  		 * newuidmap/newgidmap can understand.
   232  		 */
   233  
   234  		while (argc < MAX_ARGV) {
   235  			if (*map == '\0') {
   236  				argv[argc++] = NULL;
   237  				break;
   238  			}
   239  			argv[argc++] = map;
   240  			next = strpbrk(map, "\n ");
   241  			if (next == NULL)
   242  				break;
   243  			*next++ = '\0';
   244  			map = next + strspn(next, "\n ");
   245  		}
   246  
   247  		execve(app, argv, envp);
   248  		bail("failed to execv");
   249  	} else {
   250  		int status;
   251  
   252  		while (true) {
   253  			if (waitpid(child, &status, 0) < 0) {
   254  				if (errno == EINTR)
   255  					continue;
   256  				bail("failed to waitpid");
   257  			}
   258  			if (WIFEXITED(status) || WIFSIGNALED(status))
   259  				return WEXITSTATUS(status);
   260  		}
   261  	}
   262  
   263  	return -1;
   264  }
   265  
   266  static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
   267  {
   268  	if (map == NULL || map_len == 0)
   269  		return;
   270  
   271  	write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
   272  	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
   273  		if (errno != EPERM)
   274  			bail("failed to update /proc/%d/uid_map", pid);
   275  		write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
   276  		if (try_mapping_tool(path, pid, map, map_len))
   277  			bail("failed to use newuid map on %d", pid);
   278  	}
   279  }
   280  
   281  static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
   282  {
   283  	if (map == NULL || map_len == 0)
   284  		return;
   285  
   286  	write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
   287  	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
   288  		if (errno != EPERM)
   289  			bail("failed to update /proc/%d/gid_map", pid);
   290  		write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
   291  		if (try_mapping_tool(path, pid, map, map_len))
   292  			bail("failed to use newgid map on %d", pid);
   293  	}
   294  }
   295  
   296  static void update_oom_score_adj(char *data, size_t len)
   297  {
   298  	if (data == NULL || len == 0)
   299  		return;
   300  
   301  	write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
   302  	if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
   303  		bail("failed to update /proc/self/oom_score_adj");
   304  }
   305  
/*
 * A dummy function that just jumps to the given jumpval.
 *
 * This is the entry point clone_parent() hands to clone(). @arg is the
 * struct clone_t built by clone_parent(); the function never returns
 * normally, it longjmp()s to ca->env with ca->jmpval instead.
 */
static int child_func(void *arg) __attribute__((noinline));
static int child_func(void *arg)
{
	struct clone_t *ca = (struct clone_t *)arg;
	longjmp(*ca->env, ca->jmpval);
}
   313  
/*
 * Spawn a new process with clone(CLONE_PARENT | SIGCHLD) that resumes
 * execution at the setjmp() point recorded in @env, with @jmpval as the
 * setjmp() return value (via child_func()).
 *
 * Returns whatever clone() returns; the caller treats a negative value as
 * failure.
 */
static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
	/* The child's stack and its arguments live in this frame. */
	struct clone_t ca = {
		.env = env,
		.jmpval = jmpval,
	};

	/* ca.stack_ptr marks the end of ca.stack and is used as the stack pointer. */
	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}
   324  
   325  /* Returns the clone(2) flag for a namespace, given the name of a namespace. */
   326  static int nsflag(char *name)
   327  {
   328  	if (!strcmp(name, "cgroup"))
   329  		return CLONE_NEWCGROUP;
   330  	else if (!strcmp(name, "ipc"))
   331  		return CLONE_NEWIPC;
   332  	else if (!strcmp(name, "mnt"))
   333  		return CLONE_NEWNS;
   334  	else if (!strcmp(name, "net"))
   335  		return CLONE_NEWNET;
   336  	else if (!strcmp(name, "pid"))
   337  		return CLONE_NEWPID;
   338  	else if (!strcmp(name, "user"))
   339  		return CLONE_NEWUSER;
   340  	else if (!strcmp(name, "uts"))
   341  		return CLONE_NEWUTS;
   342  	else if (!strcmp(name, "time"))
   343  		return CLONE_NEWTIME;
   344  
   345  	/* If we don't recognise a name, fallback to 0. */
   346  	return 0;
   347  }
   348  
   349  static uint32_t readint32(char *buf)
   350  {
   351  	return *(uint32_t *) buf;
   352  }
   353  
   354  static uint8_t readint8(char *buf)
   355  {
   356  	return *(uint8_t *) buf;
   357  }
   358  
   359  static void nl_parse(int fd, struct nlconfig_t *config)
   360  {
   361  	size_t len, size;
   362  	struct nlmsghdr hdr;
   363  	char *data, *current;
   364  
   365  	/* Retrieve the netlink header. */
   366  	len = read(fd, &hdr, NLMSG_HDRLEN);
   367  	if (len != NLMSG_HDRLEN)
   368  		bail("invalid netlink header length %zu", len);
   369  
   370  	if (hdr.nlmsg_type == NLMSG_ERROR)
   371  		bail("failed to read netlink message");
   372  
   373  	if (hdr.nlmsg_type != INIT_MSG)
   374  		bail("unexpected msg type %d", hdr.nlmsg_type);
   375  
   376  	/* Retrieve data. */
   377  	size = NLMSG_PAYLOAD(&hdr, 0);
   378  	current = data = malloc(size);
   379  	if (!data)
   380  		bail("failed to allocate %zu bytes of memory for nl_payload", size);
   381  
   382  	len = read(fd, data, size);
   383  	if (len != size)
   384  		bail("failed to read netlink payload, %zu != %zu", len, size);
   385  
   386  	/* Parse the netlink payload. */
   387  	config->data = data;
   388  	while (current < data + size) {
   389  		struct nlattr *nlattr = (struct nlattr *)current;
   390  		size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
   391  
   392  		/* Advance to payload. */
   393  		current += NLA_HDRLEN;
   394  
   395  		/* Handle payload. */
   396  		switch (nlattr->nla_type) {
   397  		case CLONE_FLAGS_ATTR:
   398  			config->cloneflags = readint32(current);
   399  			break;
   400  		case ROOTLESS_EUID_ATTR:
   401  			config->is_rootless_euid = readint8(current);	/* boolean */
   402  			break;
   403  		case OOM_SCORE_ADJ_ATTR:
   404  			config->oom_score_adj = current;
   405  			config->oom_score_adj_len = payload_len;
   406  			break;
   407  		case NS_PATHS_ATTR:
   408  			config->namespaces = current;
   409  			config->namespaces_len = payload_len;
   410  			break;
   411  		case UIDMAP_ATTR:
   412  			config->uidmap = current;
   413  			config->uidmap_len = payload_len;
   414  			break;
   415  		case GIDMAP_ATTR:
   416  			config->gidmap = current;
   417  			config->gidmap_len = payload_len;
   418  			break;
   419  		case UIDMAPPATH_ATTR:
   420  			config->uidmappath = current;
   421  			config->uidmappath_len = payload_len;
   422  			break;
   423  		case GIDMAPPATH_ATTR:
   424  			config->gidmappath = current;
   425  			config->gidmappath_len = payload_len;
   426  			break;
   427  		case SETGROUP_ATTR:
   428  			config->is_setgroup = readint8(current);
   429  			break;
   430  		case TIMENSOFFSET_ATTR:
   431  			config->timensoffset = current;
   432  			config->timensoffset_len = payload_len;
   433  			break;
   434  		default:
   435  			bail("unknown netlink message type %d", nlattr->nla_type);
   436  		}
   437  
   438  		current += NLA_ALIGN(payload_len);
   439  	}
   440  }
   441  
   442  void nl_free(struct nlconfig_t *config)
   443  {
   444  	free(config->data);
   445  }
   446  
   447  void join_namespaces(char *nslist)
   448  {
   449  	int num = 0, i;
   450  	char *saveptr = NULL;
   451  	char *namespace = strtok_r(nslist, ",", &saveptr);
   452  	struct namespace_t {
   453  		int fd;
   454  		char type[PATH_MAX];
   455  		char path[PATH_MAX];
   456  	} *namespaces = NULL;
   457  
   458  	if (!namespace || !strlen(namespace) || !strlen(nslist))
   459  		bail("ns paths are empty");
   460  
   461  	/*
   462  	 * We have to open the file descriptors first, since after
   463  	 * we join the mnt namespace we might no longer be able to
   464  	 * access the paths.
   465  	 */
   466  	do {
   467  		int fd;
   468  		char *path;
   469  		struct namespace_t *ns;
   470  
   471  		/* Resize the namespace array. */
   472  		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
   473  		if (!namespaces)
   474  			bail("failed to reallocate namespace array");
   475  		ns = &namespaces[num - 1];
   476  
   477  		/* Split 'ns:path'. */
   478  		path = strstr(namespace, ":");
   479  		if (!path)
   480  			bail("failed to parse %s", namespace);
   481  		*path++ = '\0';
   482  
   483  		fd = open(path, O_RDONLY);
   484  		if (fd < 0)
   485  			bail("failed to open %s", path);
   486  
   487  		ns->fd = fd;
   488  		strncpy(ns->type, namespace, PATH_MAX - 1);
   489  		strncpy(ns->path, path, PATH_MAX - 1);
   490  		ns->path[PATH_MAX - 1] = '\0';
   491  	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
   492  
   493  	/*
   494  	 * The ordering in which we join namespaces is important. We should
   495  	 * always join the user namespace *first*. This is all guaranteed
   496  	 * from the container_linux.go side of this, so we're just going to
   497  	 * follow the order given to us.
   498  	 */
   499  
   500  	for (i = 0; i < num; i++) {
   501  		struct namespace_t *ns = &namespaces[i];
   502  		int flag = nsflag(ns->type);
   503  
   504  		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
   505  		if (setns(ns->fd, flag) < 0)
   506  			bail("failed to setns into %s namespace", ns->type);
   507  
   508  		close(ns->fd);
   509  	}
   510  
   511  	free(namespaces);
   512  }
   513  
   514  static inline int sane_kill(pid_t pid, int signum)
   515  {
   516  	if (pid > 0)
   517  		return kill(pid, signum);
   518  	else
   519  		return 0;
   520  }
   521  
   522  void try_unshare(int flags, const char *msg)
   523  {
   524  	write_log(DEBUG, "unshare %s", msg);
   525  	/*
   526  	 * Kernels prior to v4.3 may return EINVAL on unshare when another process
   527  	 * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this,
   528  	 * retry on EINVAL a few times.
   529  	 */
   530  	int retries = 5;
   531  	for (; retries > 0; retries--) {
   532  		if (unshare(flags) == 0) {
   533  			return;
   534  		}
   535  		if (errno != EINVAL)
   536  			break;
   537  	}
   538  	bail("failed to unshare %s", msg);
   539  }
   540  
   541  static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
   542  {
   543  	if (map == NULL || map_len == 0)
   544  		return;
   545  	write_log(DEBUG, "update /proc/%d/timens_offsets to '%s'", pid, map);
   546  	if (write_file(map, map_len, "/proc/%d/timens_offsets", pid) < 0)
   547  		bail("failed to update /proc/%d/timens_offsets", pid);
   548  }
   549  
   550  void nsexec(void)
   551  {
   552  	int pipenum;
   553  	jmp_buf env;
   554  	int sync_child_pipe[2], sync_grandchild_pipe[2];
   555  	struct nlconfig_t config = { 0 };
   556  
   557  	/*
   558  	 * Setup a pipe to send logs to the parent. This should happen
   559  	 * first, because bail will use that pipe.
   560  	 */
   561  	setup_logpipe();
   562  
   563  	/*
   564  	 * Get the init pipe fd from the environment. The init pipe is used to
   565  	 * read the bootstrap data and tell the parent what the new pids are
   566  	 * after the setup is done.
   567  	 */
   568  	pipenum = getenv_int("_LIBCONTAINER_INITPIPE");
   569  	if (pipenum < 0) {
   570  		/* We are not a runc init. Just return to go runtime. */
   571  		return;
   572  	}
   573  
   574  	/*
   575  	 * Inform the parent we're past initial setup.
   576  	 * For the other side of this, see initWaiter.
   577  	 */
   578  	if (write(pipenum, "", 1) != 1)
   579  		bail("could not inform the parent we are past initial setup");
   580  
   581  	write_log(DEBUG, "=> nsexec container setup");
   582  
   583  	/* Parse all of the netlink configuration. */
   584  	nl_parse(pipenum, &config);
   585  
   586  	/* Set oom_score_adj. This has to be done before !dumpable because
   587  	 * /proc/self/oom_score_adj is not writeable unless you're an privileged
   588  	 * user (if !dumpable is set). All children inherit their parent's
   589  	 * oom_score_adj value on fork(2) so this will always be propagated
   590  	 * properly.
   591  	 */
   592  	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
   593  
   594  	/*
   595  	 * Make the process non-dumpable, to avoid various race conditions that
   596  	 * could cause processes in namespaces we're joining to access host
   597  	 * resources (or potentially execute code).
   598  	 *
   599  	 * However, if the number of namespaces we are joining is 0, we are not
   600  	 * going to be switching to a different security context. Thus setting
   601  	 * ourselves to be non-dumpable only breaks things (like rootless
   602  	 * containers), which is the recommendation from the kernel folks.
   603  	 */
   604  	if (config.namespaces) {
   605  		write_log(DEBUG, "set process as non-dumpable");
   606  		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
   607  			bail("failed to set process as non-dumpable");
   608  	}
   609  
   610  	/* Pipe so we can tell the child when we've finished setting up. */
   611  	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
   612  		bail("failed to setup sync pipe between parent and child");
   613  
   614  	/*
   615  	 * We need a new socketpair to sync with grandchild so we don't have
   616  	 * race condition with child.
   617  	 */
   618  	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
   619  		bail("failed to setup sync pipe between parent and grandchild");
   620  
   621  	/* TODO: Currently we aren't dealing with child deaths properly. */
   622  
   623  	/*
   624  	 * Okay, so this is quite annoying.
   625  	 *
   626  	 * In order for this unsharing code to be more extensible we need to split
   627  	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
   628  	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
   629  	 * separately, but because of SELinux issues we cannot really do that. But
   630  	 * we cannot just dump the namespace flags into clone(...) because several
   631  	 * usecases (such as rootless containers) require more granularity around
   632  	 * the namespace setup. In addition, some older kernels had issues where
   633  	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
   634  	 * handle this while also dealing with SELinux so we choose SELinux support
   635  	 * over broken kernel support).
   636  	 *
   637  	 * However, if we unshare(2) the user namespace *before* we clone(2), then
   638  	 * all hell breaks loose.
   639  	 *
   640  	 * The parent no longer has permissions to do many things (unshare(2) drops
   641  	 * all capabilities in your old namespace), and the container cannot be set
   642  	 * up to have more than one {uid,gid} mapping. This is obviously less than
   643  	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
   644  	 *
   645  	 * Unfortunately, it's not as simple as that. We have to fork to enter the
   646  	 * PID namespace (the PID namespace only applies to children). Since we'll
   647  	 * have to double-fork, this clone_parent() call won't be able to get the
   648  	 * PID of the _actual_ init process (without doing more synchronisation than
   649  	 * I can deal with at the moment). So we'll just get the parent to send it
   650  	 * for us, the only job of this process is to update
   651  	 * /proc/pid/{setgroups,uid_map,gid_map}.
   652  	 *
   653  	 * And as a result of the above, we also need to setns(2) in the first child
   654  	 * because if we join a PID namespace in the topmost parent then our child
   655  	 * will be in that namespace (and it will not be able to give us a PID value
   656  	 * that makes sense without resorting to sending things with cmsg).
   657  	 *
   658  	 * This also deals with an older issue caused by dumping cloneflags into
   659  	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
   660  	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
   661  	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
   662  	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
   663  	 * aware, the last mainline kernel which had this bug was Linux 3.12.
   664  	 * However, we cannot comment on which kernels the broken patch was
   665  	 * backported to.
   666  	 *
   667  	 * -- Aleksa "what has my life come to?" Sarai
   668  	 */
   669  
   670  	switch (setjmp(env)) {
   671  		/*
   672  		 * Stage 0: We're in the parent. Our job is just to create a new child
   673  		 *          (stage 1: STAGE_CHILD) process and write its uid_map and
   674  		 *          gid_map. That process will go on to create a new process, then
   675  		 *          it will send us its PID which we will send to the bootstrap
   676  		 *          process.
   677  		 */
   678  	case STAGE_PARENT:{
   679  			int len;
   680  			pid_t stage1_pid = -1, stage2_pid = -1;
   681  			bool stage1_complete, stage2_complete;
   682  
   683  			/* For debugging. */
   684  			current_stage = STAGE_PARENT;
   685  			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
   686  			write_log(DEBUG, "~> nsexec stage-0");
   687  
   688  			/* Start the process of getting a container. */
   689  			write_log(DEBUG, "spawn stage-1");
   690  			stage1_pid = clone_parent(&env, STAGE_CHILD);
   691  			if (stage1_pid < 0)
   692  				bail("unable to spawn stage-1");
   693  
   694  			syncfd = sync_child_pipe[1];
   695  			if (close(sync_child_pipe[0]) < 0)
   696  				bail("failed to close sync_child_pipe[0] fd");
   697  
   698  			/*
   699  			 * State machine for synchronisation with the children. We only
   700  			 * return once both the child and grandchild are ready.
   701  			 */
   702  			write_log(DEBUG, "-> stage-1 synchronisation loop");
   703  			stage1_complete = false;
   704  			while (!stage1_complete) {
   705  				enum sync_t s;
   706  
   707  				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
   708  					bail("failed to sync with stage-1: next state");
   709  
   710  				switch (s) {
   711  				case SYNC_USERMAP_PLS:
   712  					write_log(DEBUG, "stage-1 requested userns mappings");
   713  
   714  					/*
   715  					 * Enable setgroups(2) if we've been asked to. But we also
   716  					 * have to explicitly disable setgroups(2) if we're
   717  					 * creating a rootless container for single-entry mapping.
   718  					 * i.e. config.is_setgroup == false.
   719  					 * (this is required since Linux 3.19).
   720  					 *
   721  					 * For rootless multi-entry mapping, config.is_setgroup shall be true and
   722  					 * newuidmap/newgidmap shall be used.
   723  					 */
   724  					if (config.is_rootless_euid && !config.is_setgroup)
   725  						update_setgroups(stage1_pid, SETGROUPS_DENY);
   726  
   727  					/* Set up mappings. */
   728  					update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
   729  					update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
   730  
   731  					s = SYNC_USERMAP_ACK;
   732  					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
   733  						sane_kill(stage1_pid, SIGKILL);
   734  						sane_kill(stage2_pid, SIGKILL);
   735  						bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
   736  					}
   737  					break;
   738  				case SYNC_RECVPID_PLS:
   739  					write_log(DEBUG, "stage-1 requested pid to be forwarded");
   740  
   741  					/* Get the stage-2 pid. */
   742  					if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
   743  						sane_kill(stage1_pid, SIGKILL);
   744  						bail("failed to sync with stage-1: read(stage2_pid)");
   745  					}
   746  
   747  					/* Send ACK. */
   748  					s = SYNC_RECVPID_ACK;
   749  					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
   750  						sane_kill(stage1_pid, SIGKILL);
   751  						sane_kill(stage2_pid, SIGKILL);
   752  						bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
   753  					}
   754  
   755  					/*
   756  					 * Send both the stage-1 and stage-2 pids back to runc.
   757  					 * runc needs the stage-2 to continue process management,
   758  					 * but because stage-1 was spawned with CLONE_PARENT we
   759  					 * cannot reap it within stage-0 and thus we need to ask
   760  					 * runc to reap the zombie for us.
   761  					 */
   762  					write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
   763  						  stage1_pid, stage2_pid);
   764  					len =
   765  					    dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
   766  						    stage2_pid);
   767  					if (len < 0) {
   768  						sane_kill(stage1_pid, SIGKILL);
   769  						sane_kill(stage2_pid, SIGKILL);
   770  						bail("failed to sync with runc: write(pid-JSON)");
   771  					}
   772  					break;
   773  				case SYNC_TIMEOFFSETS_PLS:
   774  					write_log(DEBUG, "stage-1 requested timens offsets to be configured");
   775  					update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len);
   776  					s = SYNC_TIMEOFFSETS_ACK;
   777  					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
   778  						sane_kill(stage1_pid, SIGKILL);
   779  						bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)");
   780  					}
   781  					break;
   782  				case SYNC_CHILD_FINISH:
   783  					write_log(DEBUG, "stage-1 complete");
   784  					stage1_complete = true;
   785  					break;
   786  				default:
   787  					bail("unexpected sync value: %u", s);
   788  				}
   789  			}
   790  			write_log(DEBUG, "<- stage-1 synchronisation loop");
   791  
   792  			/* Now sync with grandchild. */
   793  			syncfd = sync_grandchild_pipe[1];
   794  			if (close(sync_grandchild_pipe[0]) < 0)
   795  				bail("failed to close sync_grandchild_pipe[0] fd");
   796  
   797  			write_log(DEBUG, "-> stage-2 synchronisation loop");
   798  			stage2_complete = false;
   799  			while (!stage2_complete) {
   800  				enum sync_t s;
   801  
   802  				write_log(DEBUG, "signalling stage-2 to run");
   803  				s = SYNC_GRANDCHILD;
   804  				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
   805  					sane_kill(stage2_pid, SIGKILL);
   806  					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
   807  				}
   808  
   809  				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
   810  					bail("failed to sync with child: next state");
   811  
   812  				switch (s) {
   813  				case SYNC_CHILD_FINISH:
   814  					write_log(DEBUG, "stage-2 complete");
   815  					stage2_complete = true;
   816  					break;
   817  				default:
   818  					bail("unexpected sync value: %u", s);
   819  				}
   820  			}
   821  			write_log(DEBUG, "<- stage-2 synchronisation loop");
   822  			write_log(DEBUG, "<~ nsexec stage-0");
   823  			exit(0);
   824  		}
   825  		break;
   826  
   827  		/*
   828  		 * Stage 1: We're in the first child process. Our job is to join any
   829  		 *          provided namespaces in the netlink payload and unshare all of
   830  		 *          the requested namespaces. If we've been asked to CLONE_NEWUSER,
   831  		 *          we will ask our parent (stage 0) to set up our user mappings
   832  		 *          for us. Then, we create a new child (stage 2: STAGE_INIT) for
   833  		 *          PID namespace. We then send the child's PID to our parent
   834  		 *          (stage 0).
   835  		 */
   836  	case STAGE_CHILD:{
   837  			pid_t stage2_pid = -1;
   838  			enum sync_t s;
   839  
   840  			/* For debugging. */
   841  			current_stage = STAGE_CHILD;
   842  
   843  			/* We're in a child and thus need to tell the parent if we die. */
   844  			syncfd = sync_child_pipe[0];
   845  			if (close(sync_child_pipe[1]) < 0)
   846  				bail("failed to close sync_child_pipe[1] fd");
   847  
   848  			/* For debugging. */
   849  			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
   850  			write_log(DEBUG, "~> nsexec stage-1");
   851  
   852  			/*
   853  			 * We need to setns first. We cannot do this earlier (in stage 0)
   854  			 * because of the fact that we forked to get here (the PID of
   855  			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
   856  			 * using cmsg(3) but that's just annoying.
   857  			 */
   858  			if (config.namespaces)
   859  				join_namespaces(config.namespaces);
   860  
   861  			/*
   862  			 * Deal with user namespaces first. They are quite special, as they
   863  			 * affect our ability to unshare other namespaces and are used as
   864  			 * context for privilege checks.
   865  			 *
   866  			 * We don't unshare all namespaces in one go. The reason for this
   867  			 * is that, while the kernel documentation may claim otherwise,
   868  			 * there are certain cases where unsharing all namespaces at once
   869  			 * will result in namespace objects being owned incorrectly.
   870  			 * Ideally we should just fix these kernel bugs, but it's better to
   871  			 * be safe than sorry, and fix them separately.
   872  			 *
   873  			 * A specific case of this is that the SELinux label of the
   874  			 * internal kern-mount that mqueue uses will be incorrect if the
   875  			 * UTS namespace is cloned before the USER namespace is mapped.
   876  			 * I've also heard of similar problems with the network namespace
   877  			 * in some scenarios. This also mirrors how LXC deals with this
   878  			 * problem.
   879  			 */
   880  			if (config.cloneflags & CLONE_NEWUSER) {
   881  				try_unshare(CLONE_NEWUSER, "user namespace");
   882  				config.cloneflags &= ~CLONE_NEWUSER;
   883  
   884  				/*
   885  				 * We need to set ourselves as dumpable temporarily so that the
   886  				 * parent process can write to our procfs files.
   887  				 */
   888  				if (config.namespaces) {
   889  					write_log(DEBUG, "temporarily set process as dumpable");
   890  					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
   891  						bail("failed to temporarily set process as dumpable");
   892  				}
   893  
   894  				/*
   895  				 * We don't have the privileges to do any mapping here (see the
   896  				 * clone_parent rant). So signal stage-0 to do the mapping for
   897  				 * us.
   898  				 */
   899  				write_log(DEBUG, "request stage-0 to map user namespace");
   900  				s = SYNC_USERMAP_PLS;
   901  				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
   902  					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
   903  
   904  				/* ... wait for mapping ... */
   905  				write_log(DEBUG, "request stage-0 to map user namespace");
   906  				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
   907  					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
   908  				if (s != SYNC_USERMAP_ACK)
   909  					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
   910  
   911  				/* Revert temporary re-dumpable setting. */
   912  				if (config.namespaces) {
   913  					write_log(DEBUG, "re-set process as non-dumpable");
   914  					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
   915  						bail("failed to re-set process as non-dumpable");
   916  				}
   917  
   918  				/* Become root in the namespace proper. */
   919  				if (setresuid(0, 0, 0) < 0)
   920  					bail("failed to become root in user namespace");
   921  			}
   922  
   923  			/*
   924  			 * Unshare all of the namespaces. Now, it should be noted that this
   925  			 * ordering might break in the future (especially with rootless
   926  			 * containers). But for now, it's not possible to split this into
   927  			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
   928  			 *
   929  			 * Note that we don't merge this with clone() because there were
   930  			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
   931  			 * was broken, so we'll just do it the long way anyway.
   932  			 */
   933  			try_unshare(config.cloneflags, "remaining namespaces");
   934  
   935  			if (config.timensoffset) {
   936  				write_log(DEBUG, "request stage-0 to write timens offsets");
   937  
   938  				s = SYNC_TIMEOFFSETS_PLS;
   939  				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
   940  					bail("failed to sync with parent: write(SYNC_TIMEOFFSETS_PLS)");
   941  
   942  				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
   943  					bail("failed to sync with parent: read(SYNC_TIMEOFFSETS_ACK)");
   944  				if (s != SYNC_TIMEOFFSETS_ACK)
   945  					bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
   946  			}
   947  
   948  			/*
   949  			 * TODO: What about non-namespace clone flags that we're dropping here?
   950  			 *
   951  			 * We fork again because of the PID namespace: setns(2) and unshare(2) don't
   952  			 * change the PID namespace of the calling process, because doing so
   953  			 * would change the caller's idea of its own PID (as reported by getpid()),
   954  			 * which would break many applications and libraries, so we must fork
   955  			 * to actually enter the new PID namespace.
   956  			 */
   957  			write_log(DEBUG, "spawn stage-2");
   958  			stage2_pid = clone_parent(&env, STAGE_INIT);
   959  			if (stage2_pid < 0)
   960  				bail("unable to spawn stage-2");
   961  
   962  			/* Send the child's PID to our parent, which knows what it's doing. */
   963  			write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
   964  			s = SYNC_RECVPID_PLS;
   965  			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
   966  				sane_kill(stage2_pid, SIGKILL);
   967  				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
   968  			}
   969  			if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
   970  				sane_kill(stage2_pid, SIGKILL);
   971  				bail("failed to sync with parent: write(stage2_pid)");
   972  			}
   973  
   974  			/* ... wait for parent to get the pid ... */
   975  			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
   976  				sane_kill(stage2_pid, SIGKILL);
   977  				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
   978  			}
   979  			if (s != SYNC_RECVPID_ACK) {
   980  				sane_kill(stage2_pid, SIGKILL);
   981  				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
   982  			}
   983  
   984  			write_log(DEBUG, "signal completion to stage-0");
   985  			s = SYNC_CHILD_FINISH;
   986  			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
   987  				sane_kill(stage2_pid, SIGKILL);
   988  				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
   989  			}
   990  
   991  			/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
   992  			write_log(DEBUG, "<~ nsexec stage-1");
   993  			exit(0);
   994  		}
   995  		break;
   996  
   997  		/*
   998  		 * Stage 2: We're the final child process, and the only process that will
   999  		 *          actually return to the Go runtime. Our job is to just do the
  1000  		 *          final cleanup steps and then return to the Go runtime to allow
  1001  		 *          init_linux.go to run.
  1002  		 */
  1003  	case STAGE_INIT:{
  1004  			/*
  1005  			 * We're inside the child now, having jumped from the
  1006  			 * start_child() code after forking in the parent.
  1007  			 */
  1008  			enum sync_t s;
  1009  
  1010  			/* For debugging. */
  1011  			current_stage = STAGE_INIT;
  1012  
  1013  			/* We're in a child and thus need to tell the parent if we die. */
  1014  			syncfd = sync_grandchild_pipe[0];
  1015  			if (close(sync_grandchild_pipe[1]) < 0)
  1016  				bail("failed to close sync_grandchild_pipe[1] fd");
  1017  
  1018  			if (close(sync_child_pipe[0]) < 0)
  1019  				bail("failed to close sync_child_pipe[0] fd");
  1020  
  1021  			/* For debugging. */
  1022  			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
  1023  			write_log(DEBUG, "~> nsexec stage-2");
  1024  
  1025  			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
  1026  				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
  1027  			if (s != SYNC_GRANDCHILD)
  1028  				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
  1029  
  1030  			if (setsid() < 0)
  1031  				bail("setsid failed");
  1032  
  1033  			if (setuid(0) < 0)
  1034  				bail("setuid failed");
  1035  
  1036  			if (setgid(0) < 0)
  1037  				bail("setgid failed");
  1038  
  1039  			if (!config.is_rootless_euid && config.is_setgroup) {
  1040  				if (setgroups(0, NULL) < 0)
  1041  					bail("setgroups failed");
  1042  			}
  1043  
  1044  			write_log(DEBUG, "signal completion to stage-0");
  1045  			s = SYNC_CHILD_FINISH;
  1046  			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
  1047  				bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
  1048  
  1049  			/* Close sync pipes. */
  1050  			if (close(sync_grandchild_pipe[0]) < 0)
  1051  				bail("failed to close sync_grandchild_pipe[0] fd");
  1052  
  1053  			/* Free netlink data. */
  1054  			nl_free(&config);
  1055  
  1056  			/* Finish executing, let the Go runtime take over. */
  1057  			write_log(DEBUG, "<= nsexec container setup");
  1058  			write_log(DEBUG, "booting up go runtime ...");
  1059  			return;
  1060  		}
  1061  		break;
  1062  	default:
  1063  		bail("unexpected jump value");
  1064  	}
  1065  
  1066  	/* Should never be reached. */
  1067  	bail("should never be reached");
  1068  }