github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/runc/libcontainer/nsenter/nsexec.c (about)

     1  #define _GNU_SOURCE
     2  #include <endian.h>
     3  #include <errno.h>
     4  #include <fcntl.h>
     5  #include <grp.h>
     6  #include <sched.h>
     7  #include <setjmp.h>
     8  #include <signal.h>
     9  #include <stdarg.h>
    10  #include <stdbool.h>
    11  #include <stdint.h>
    12  #include <stdio.h>
    13  #include <stdlib.h>
    14  #include <stdbool.h>
    15  #include <string.h>
    16  #include <unistd.h>
    17  
    18  #include <sys/ioctl.h>
    19  #include <sys/prctl.h>
    20  #include <sys/socket.h>
    21  #include <sys/types.h>
    22  
    23  #include <linux/limits.h>
    24  #include <linux/netlink.h>
    25  #include <linux/types.h>
    26  
    27  /* Get all of the CLONE_NEW* flags. */
    28  #include "namespace.h"
    29  
/*
 * Synchronisation values exchanged over the sync socketpair between the
 * stages of nsexec(). A value is transferred as a raw `enum sync_t` object
 * (sizeof(enum sync_t) bytes), so both ends must be built from this same
 * definition.
 */
enum sync_t {
	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */

	/*
	 * XXX: This doesn't help with segfaults and other such issues, since it
	 *      is only sent by bail().
	 */
	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code (an int) follows. */
};
    40  
/*
 * Values returned by setjmp() in nsexec(), selecting which stage to run.
 * JUMP_PARENT must be 0: it is what the initial, direct setjmp() call returns
 * and is never passed to longjmp(). JUMP_CHILD and JUMP_INIT are the values
 * handed to longjmp() by child_func() in the cloned processes.
 */
#define JUMP_PARENT 0x00
#define JUMP_CHILD  0xA0
#define JUMP_INIT   0xA1

/* Size of the buffer used to format the JSON pid message in nsexec(). */
#define JSON_MAX 4096
    48  
/*
 * Argument block (and stack) handed to clone() by clone_parent().
 *
 * Assume the stack grows down, so arguments should be above it: `stack_ptr`
 * is a zero-length member placed directly after `stack`, so its address is
 * the top of the 4096-byte stack area. The member order is therefore
 * significant and must not be changed.
 */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	char stack_ptr[0];

	/* There's two children. This is used to execute the different code. */
	jmp_buf *env;	/* Jump buffer the new process longjmp()s to. */
	int jmpval;	/* Value setjmp() will appear to return (JUMP_CHILD or JUMP_INIT). */
};
    62  
/*
 * Bootstrap configuration decoded from the netlink message by nl_parse().
 * The uidmap, gidmap and namespaces pointers refer to memory inside `data`
 * and are only valid until nl_free() is called.
 */
struct nlconfig_t {
	char *data;		/* malloc()ed netlink payload; released by nl_free(). */
	uint32_t cloneflags;	/* Flags passed to unshare(2) in stage 1. */
	char *uidmap;		/* Contents written to /proc/<pid>/uid_map. */
	size_t uidmap_len;	/* Length of uidmap in bytes. */
	char *gidmap;		/* Contents written to /proc/<pid>/gid_map. */
	size_t gidmap_len;	/* Length of gidmap in bytes. */
	char *namespaces;	/* Comma-separated "type:path" list for join_namespaces(). */
	size_t namespaces_len;	/* Length of the namespaces payload in bytes. */
	uint8_t is_setgroup;	/* Non-zero: write "allow" to /proc/<pid>/setgroups. */
	int consolefd;		/* Console fd opened by nl_parse(), or -1 if none. */
};
    75  
/*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go and must be
 * kept in sync with it. INIT_MSG is the expected nlmsg_type of the message;
 * the *_ATTR values are the nla_type of each attribute handled by nl_parse().
 */
#define INIT_MSG		62000
#define CLONE_FLAGS_ATTR	27281
#define CONSOLE_PATH_ATTR	27282
#define NS_PATHS_ATTR		27283
#define UIDMAP_ATTR		27284
#define GIDMAP_ATTR		27285
#define SETGROUP_ATTR		27286
    87  
/*
 * Use the raw syscall for versions of glibc which don't include a function for
 * it, namely (glibc 2.12).
 *
 * This whole block, including the setns() definition, is only compiled when
 * building against glibc 2.x with x < 14. If neither SYS_setns nor __NR_setns
 * is available the build is aborted with an #error.
 */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
#	define _GNU_SOURCE
#	include "syscall.h"
#	if !defined(SYS_setns) && defined(__NR_setns)
#		define SYS_setns __NR_setns
#	endif

#ifndef SYS_setns
#	error "setns(2) syscall not supported by glibc version"
#endif

/* Thin wrapper around the setns syscall; returns whatever syscall() returns. */
int setns(int fd, int nstype)
{
	return syscall(SYS_setns, fd, nstype);
}
#endif
   108  
/*
 * XXX: This is ugly.
 *
 * File descriptor bail() reports fatal errors on. While it is negative,
 * bail() only prints to stderr and exits.
 */
static int syncfd = -1;

/*
 * bail(fmt, ...): print "nsenter: <message>: <strerror(errno)>" to stderr
 * (via the glibc "%m" conversion), report the failure over syncfd if it is
 * set, and exit().
 *
 * The exit code is `__COUNTER__ + 1`, so every expansion site of the macro
 * gets its own non-zero code. When syncfd >= 0, SYNC_ERR followed by that
 * code (as an int) is written to it so the peer can mirror the exit status.
 *
 * Because this is a macro, `syncfd` is resolved at the expansion site: a
 * local variable named `syncfd` in the calling scope takes precedence over
 * the file-scope one above.
 *
 * TODO(cyphar): Fix this so it correctly deals with syncT.
 */
#define bail(fmt, ...)								\
	do {									\
		int ret = __COUNTER__ + 1;					\
		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
		if (syncfd >= 0) {						\
			enum sync_t s = SYNC_ERR;				\
			if (write(syncfd, &s, sizeof(s)) != sizeof(s))		\
				fprintf(stderr, "nsenter: failed: write(s)");	\
			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret))	\
				fprintf(stderr, "nsenter: failed: write(ret)");	\
		}								\
		exit(ret);							\
	} while(0)
   126  
   127  static int write_file(char *data, size_t data_len, char *pathfmt, ...)
   128  {
   129  	int fd, len, ret = 0;
   130  	char path[PATH_MAX];
   131  
   132  	va_list ap;
   133  	va_start(ap, pathfmt);
   134  	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
   135  	va_end(ap);
   136  	if (len < 0)
   137  		return -1;
   138  
   139  	fd = open(path, O_RDWR);
   140  	if (fd < 0) {
   141  		ret = -1;
   142  		goto out;
   143  	}
   144  
   145  	len = write(fd, data, data_len);
   146  	if (len != data_len) {
   147  		ret = -1;
   148  		goto out;
   149  	}
   150  
   151  out:
   152  	close(fd);
   153  	return ret;
   154  }
   155  
   156  enum policy_t {
   157  	SETGROUPS_DEFAULT = 0,
   158  	SETGROUPS_ALLOW,
   159  	SETGROUPS_DENY,
   160  };
   161  
   162  /* This *must* be called before we touch gid_map. */
   163  static void update_setgroups(int pid, enum policy_t setgroup)
   164  {
   165  	char *policy;
   166  
   167  	switch (setgroup) {
   168  		case SETGROUPS_ALLOW:
   169  			policy = "allow";
   170  			break;
   171  		case SETGROUPS_DENY:
   172  			policy = "deny";
   173  			break;
   174  		case SETGROUPS_DEFAULT:
   175  			/* Nothing to do. */
   176  			return;
   177  	}
   178  
   179  	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
   180  		/*
   181  		 * If the kernel is too old to support /proc/pid/setgroups,
   182  		 * open(2) or write(2) will return ENOENT. This is fine.
   183  		 */
   184  		if (errno != ENOENT)
   185  			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
   186  	}
   187  }
   188  
   189  static void update_uidmap(int pid, char *map, int map_len)
   190  {
   191  	if (map == NULL || map_len <= 0)
   192  		return;
   193  
   194  	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
   195  		bail("failed to update /proc/%d/uid_map", pid);
   196  }
   197  
   198  static void update_gidmap(int pid, char *map, int map_len)
   199  {
   200  	if (map == NULL || map_len <= 0)
   201  		return;
   202  
   203  	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
   204  		bail("failed to update /proc/%d/gid_map", pid);
   205  }
   206  
/*
 * A dummy function that just jumps to the given jumpval.
 *
 * Entry point of the process created by clone_parent(). `arg` is the
 * struct clone_t passed to clone(); the function immediately longjmp()s to
 * ca->env with ca->jmpval and therefore never returns.
 */
static int child_func(void *arg) __attribute__ ((noinline));
static int child_func(void *arg)
{
	struct clone_t *ca = (struct clone_t *)arg;
	longjmp(*ca->env, ca->jmpval);
}
   214  
/*
 * Creates a new process with clone(CLONE_PARENT | SIGCHLD) that starts in
 * child_func() and from there longjmp()s to `env` with value `jmpval`.
 *
 * The new process runs on the small stack embedded in the local `ca`
 * (ca.stack_ptr is the address just past ca.stack). CLONE_VM is not passed.
 *
 * Returns the result of clone(): the new pid in the caller, or a negative
 * value on failure.
 */
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
	struct clone_t ca = {
		.env    = env,
		.jmpval = jmpval,
	};

	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}
   225  
   226  /*
   227   * Gets the init pipe fd from the environment, which is used to read the
   228   * bootstrap data and tell the parent what the new pid is after we finish
   229   * setting up the environment.
   230   */
   231  static int initpipe(void)
   232  {
   233  	int pipenum;
   234  	char *initpipe, *endptr;
   235  
   236  	initpipe = getenv("_LIBCONTAINER_INITPIPE");
   237  	if (initpipe == NULL || *initpipe == '\0')
   238  		return -1;
   239  
   240  	pipenum = strtol(initpipe, &endptr, 10);
   241  	if (*endptr != '\0')
   242  		bail("unable to parse _LIBCONTAINER_INITPIPE");
   243  
   244  	return pipenum;
   245  }
   246  
   247  /* Returns the clone(2) flag for a namespace, given the name of a namespace. */
   248  static int nsflag(char *name)
   249  {
   250  	if (!strcmp(name, "cgroup"))
   251  		return CLONE_NEWCGROUP;
   252  	else if (!strcmp(name, "ipc"))
   253  		return CLONE_NEWIPC;
   254  	else if (!strcmp(name, "mnt"))
   255  		return CLONE_NEWNS;
   256  	else if (!strcmp(name, "net"))
   257  		return CLONE_NEWNET;
   258  	else if (!strcmp(name, "pid"))
   259  		return CLONE_NEWPID;
   260  	else if (!strcmp(name, "user"))
   261  		return CLONE_NEWUSER;
   262  	else if (!strcmp(name, "uts"))
   263  		return CLONE_NEWUTS;
   264  
   265  	/* If we don't recognise a name, fallback to 0. */
   266  	return 0;
   267  }
   268  
   269  static uint32_t readint32(char *buf)
   270  {
   271  	return *(uint32_t *) buf;
   272  }
   273  
   274  static uint8_t readint8(char *buf)
   275  {
   276  	return *(uint8_t *) buf;
   277  }
   278  
   279  static void nl_parse(int fd, struct nlconfig_t *config)
   280  {
   281  	size_t len, size;
   282  	struct nlmsghdr hdr;
   283  	char *data, *current;
   284  
   285  	/* Retrieve the netlink header. */
   286  	len = read(fd, &hdr, NLMSG_HDRLEN);
   287  	if (len != NLMSG_HDRLEN)
   288  		bail("invalid netlink header length %lu", len);
   289  
   290  	if (hdr.nlmsg_type == NLMSG_ERROR)
   291  		bail("failed to read netlink message");
   292  
   293  	if (hdr.nlmsg_type != INIT_MSG)
   294  		bail("unexpected msg type %d", hdr.nlmsg_type);
   295  
   296  	/* Retrieve data. */
   297  	size = NLMSG_PAYLOAD(&hdr, 0);
   298  	current = data = malloc(size);
   299  	if (!data)
   300  		bail("failed to allocate %zu bytes of memory for nl_payload", size);
   301  
   302  	len = read(fd, data, size);
   303  	if (len != size)
   304  		bail("failed to read netlink payload, %lu != %lu", len, size);
   305  
   306  	/* Parse the netlink payload. */
   307  	config->data = data;
   308  	config->consolefd = -1;
   309  	while (current < data + size) {
   310  		struct nlattr *nlattr = (struct nlattr *)current;
   311  		size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
   312  
   313  		/* Advance to payload. */
   314  		current += NLA_HDRLEN;
   315  
   316  		/* Handle payload. */
   317  		switch (nlattr->nla_type) {
   318  		case CLONE_FLAGS_ATTR:
   319  			config->cloneflags = readint32(current);
   320  			break;
   321  		case CONSOLE_PATH_ATTR:
   322  			/*
   323  			 * We open the console here because we currently evaluate console
   324  			 * paths from the *host* namespaces.
   325  			 */
   326  			config->consolefd = open(current, O_RDWR);
   327  			if (config->consolefd < 0)
   328  				bail("failed to open console %s", current);
   329  			break;
   330  		case NS_PATHS_ATTR:
   331  			config->namespaces = current;
   332  			config->namespaces_len = payload_len;
   333  			break;
   334  		case UIDMAP_ATTR:
   335  			config->uidmap = current;
   336  			config->uidmap_len = payload_len;
   337  			break;
   338  		case GIDMAP_ATTR:
   339  			config->gidmap = current;
   340  			config->gidmap_len = payload_len;
   341  			break;
   342  		case SETGROUP_ATTR:
   343  			config->is_setgroup = readint8(current);
   344  			break;
   345  		default:
   346  			bail("unknown netlink message type %d", nlattr->nla_type);
   347  		}
   348  
   349  		current += NLA_ALIGN(payload_len);
   350  	}
   351  }
   352  
   353  void nl_free(struct nlconfig_t *config)
   354  {
   355  	free(config->data);
   356  }
   357  
   358  void join_namespaces(char *nslist)
   359  {
   360  	int num = 0, i;
   361  	char *saveptr = NULL;
   362  	char *namespace = strtok_r(nslist, ",", &saveptr);
   363  	struct namespace_t {
   364  		int fd;
   365  		int ns;
   366  		char type[PATH_MAX];
   367  		char path[PATH_MAX];
   368  	} *namespaces = NULL;
   369  
   370  	if (!namespace || !strlen(namespace) || !strlen(nslist))
   371  		bail("ns paths are empty");
   372  
   373  	/*
   374  	 * We have to open the file descriptors first, since after
   375  	 * we join the mnt namespace we might no longer be able to
   376  	 * access the paths.
   377  	 */
   378  	do {
   379  		int fd;
   380  		char *path;
   381  		struct namespace_t *ns;
   382  
   383  		/* Resize the namespace array. */
   384  		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
   385  		if (!namespaces)
   386  			bail("failed to reallocate namespace array");
   387  		ns = &namespaces[num - 1];
   388  
   389  		/* Split 'ns:path'. */
   390  		path = strstr(namespace, ":");
   391  		if (!path)
   392  			bail("failed to parse %s", namespace);
   393  		*path++ = '\0';
   394  
   395  		fd = open(path, O_RDONLY);
   396  		if (fd < 0)
   397  			bail("failed to open %s", namespace);
   398  
   399  		ns->fd = fd;
   400  		ns->ns = nsflag(namespace);
   401  		strncpy(ns->path, path, PATH_MAX);
   402  	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
   403  
   404  	/*
   405  	 * The ordering in which we join namespaces is important. We should
   406  	 * always join the user namespace *first*. This is all guaranteed
   407  	 * from the container_linux.go side of this, so we're just going to
   408  	 * follow the order given to us.
   409  	 */
   410  
   411  	for (i = 0; i < num; i++) {
   412  		struct namespace_t ns = namespaces[i];
   413  
   414  		if (setns(ns.fd, ns.ns) < 0)
   415  			bail("failed to setns to %s", ns.path);
   416  
   417  		close(ns.fd);
   418  	}
   419  
   420  	free(namespaces);
   421  }
   422  
/*
 * Bootstraps the container init process before the Go runtime takes over.
 *
 * If _LIBCONTAINER_INITPIPE is not set this returns immediately. Otherwise
 * the bootstrap configuration is read from the init pipe and the calling
 * process splits into three stages, selected by the value setjmp() returns:
 *
 *   stage 0 (JUMP_PARENT): clones stage 1, writes its setgroups/uid/gid maps
 *                          on request, receives the stage 2 pid, reports it
 *                          as JSON on the init pipe and exits.
 *   stage 1 (JUMP_CHILD):  joins/unshares namespaces, clones stage 2, sends
 *                          its pid to stage 0 and exits.
 *   stage 2 (JUMP_INIT):   finishes setup and is the only process that
 *                          returns from this function.
 *
 * Stages synchronise over a socketpair using enum sync_t values; stage 0 uses
 * syncpipe[1], stages 1 and 2 use syncpipe[0].
 */
void nsexec(void)
{
	int pipenum;
	jmp_buf env;
	int syncpipe[2];
	struct nlconfig_t config = {0};

	/*
	 * If we don't have an init pipe, just return to the go routine.
	 * We'll only get an init pipe for start or exec.
	 */
	pipenum = initpipe();
	if (pipenum == -1)
		return;

	/* Parse all of the netlink configuration. */
	nl_parse(pipenum, &config);

	/* Pipe so we can tell the child when we've finished setting up. */
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
		bail("failed to setup sync pipe between parent and child");

	/* TODO: Currently we aren't dealing with child deaths properly. */

	/*
	 * Okay, so this is quite annoying.
	 *
	 * In order for this unsharing code to be more extensible we need to split
	 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
	 * would be if we did clone(CLONE_NEWUSER) and the other namespaces
	 * separately, but because of SELinux issues we cannot really do that. But
	 * we cannot just dump the namespace flags into clone(...) because several
	 * usecases (such as rootless containers) require more granularity around
	 * the namespace setup. In addition, some older kernels had issues where
	 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
	 * handle this while also dealing with SELinux so we choose SELinux support
	 * over broken kernel support).
	 *
	 * However, if we unshare(2) the user namespace *before* we clone(2), then
	 * all hell breaks loose.
	 *
	 * The parent no longer has permissions to do many things (unshare(2) drops
	 * all capabilities in your old namespace), and the container cannot be set
	 * up to have more than one {uid,gid} mapping. This is obviously less than
	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
	 *
	 * Unfortunately, it's not as simple as that. We have to fork to enter the
	 * PID namespace (the PID namespace only applies to children). Since we'll
	 * have to double-fork, this clone_parent() call won't be able to get the
	 * PID of the _actual_ init process (without doing more synchronisation than
	 * I can deal with at the moment). So we'll just get the parent to send it
	 * for us, the only job of this process is to update
	 * /proc/pid/{setgroups,uid_map,gid_map}.
	 *
	 * And as a result of the above, we also need to setns(2) in the first child
	 * because if we join a PID namespace in the topmost parent then our child
	 * will be in that namespace (and it will not be able to give us a PID value
	 * that makes sense without resorting to sending things with cmsg).
	 *
	 * This also deals with an older issue caused by dumping cloneflags into
	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
	 * aware, the last mainline kernel which had this bug was Linux 3.12.
	 * However, we cannot comment on which kernels the broken patch was
	 * backported to.
	 *
	 * -- Aleksa "what has my life come to?" Sarai
	 */

	/*
	 * The direct call returns 0 (JUMP_PARENT). The cloned processes re-enter
	 * this switch through child_func()'s longjmp() with JUMP_CHILD/JUMP_INIT.
	 */
	switch (setjmp(env)) {
	/*
	 * Stage 0: We're in the parent. Our job is just to create a new child
	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
	 *          gid_map. That process will go on to create a new process, then
	 *          it will send us its PID which we will send to the bootstrap
	 *          process.
	 */
	case JUMP_PARENT: {
			int len;
			pid_t child;
			char buf[JSON_MAX];

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);

			/* Start the process of getting a container. */
			child = clone_parent(&env, JUMP_CHILD);
			if (child < 0)
				bail("unable to fork: child_func");

			/* State machine for synchronisation with the children. */
			while (true) {
				enum sync_t s;

				/*
				 * This doesn't need to be global, we're in the parent.
				 * Note that bail() is a macro, so inside this loop it picks
				 * up this local and reports SYNC_ERR over syncpipe[1] too.
				 */
				int syncfd = syncpipe[1];

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");

				switch (s) {
				case SYNC_ERR: {
						/* We have to mirror the error code of the child. */
						int ret;

						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
							bail("failed to sync with child: read(error code)");

						exit(ret);
					}
					break;
				case SYNC_USERMAP_PLS:
					/* Enable setgroups(2) if we've been asked to. */
					if (config.is_setgroup)
						update_setgroups(child, SETGROUPS_ALLOW);

					/* Set up mappings. */
					update_uidmap(child, config.uidmap, config.uidmap_len);
					update_gidmap(child, config.gidmap, config.gidmap_len);

					s = SYNC_USERMAP_ACK;
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
						kill(child, SIGKILL);
						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
					}
					break;
				case SYNC_USERMAP_ACK:
					/* We should _never_ receive acks. */
					kill(child, SIGKILL);
					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
					break;
				case SYNC_RECVPID_PLS: {
						/* Remember stage 1's pid; `child` is overwritten below. */
						pid_t old = child;

						/* Get the init_func pid. */
						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
							kill(old, SIGKILL);
							bail("failed to sync with child: read(childpid)");
						}

						/* Send ACK. */
						s = SYNC_RECVPID_ACK;
						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
							kill(old, SIGKILL);
							kill(child, SIGKILL);
							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
						}
					}

					/* Leave the loop. */
					goto out;
				case SYNC_RECVPID_ACK:
					/* We should _never_ receive acks. */
					kill(child, SIGKILL);
					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
					break;
				}
			}

		out:
			/* Send the init_func pid back to our parent. */
			len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
			if (len < 0) {
				kill(child, SIGKILL);
				bail("unable to generate JSON for child pid");
			}
			if (write(pipenum, buf, len) != len) {
				kill(child, SIGKILL);
				bail("unable to send child pid to bootstrapper");
			}

			exit(0);
		}

	/*
	 * Stage 1: We're in the first child process. Our job is to join any
	 *          provided namespaces in the netlink payload and unshare all
	 *          of the requested namespaces. If we've been asked to
	 *          CLONE_NEWUSER, we will ask our parent (stage 0) to set up
	 *          our user mappings for us. Then, we create a new child
	 *          (stage 2: JUMP_INIT) for PID namespace. We then send the
	 *          child's PID to our parent (stage 0).
	 */
	case JUMP_CHILD: {
			pid_t child;
			enum sync_t s;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = syncpipe[0];

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);

			/*
			 * We need to setns first. We cannot do this earlier (in stage 0)
			 * because of the fact that we forked to get here (the PID of
			 * [stage 2: JUMP_INIT]) would be meaningless). We could send it
			 * using cmsg(3) but that's just annoying.
			 */
			if (config.namespaces)
				join_namespaces(config.namespaces);

			/*
			 * Unshare all of the namespaces. Now, it should be noted that this
			 * ordering might break in the future (especially with rootless
			 * containers). But for now, it's not possible to split this into
			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
			 *
			 * Note that we don't merge this with clone() because there were
			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
			 * was broken, so we'll just do it the long way anyway.
			 */
			if (unshare(config.cloneflags) < 0)
				bail("failed to unshare namespaces");

			/*
			 * Deal with user namespaces first. They are quite special, as they
			 * affect our ability to unshare other namespaces and are used as
			 * context for privilege checks.
			 */
			if (config.cloneflags & CLONE_NEWUSER) {
				/*
				 * We don't have the privileges to do any mapping here (see the
				 * clone_parent rant). So signal our parent to hook us up.
				 */

				s = SYNC_USERMAP_PLS;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

				/* ... wait for mapping ... */

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
				if (s != SYNC_USERMAP_ACK)
					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
			}

			/*
			 * TODO: What about non-namespace clone flags that we're dropping here?
			 *
			 * We fork again because of PID namespace, setns(2) or unshare(2) don't
			 * change the PID namespace of the calling process, because doing so
			 * would change the caller's idea of its own PID (as reported by getpid()),
			 * which would break many applications and libraries, so we must fork
			 * to actually enter the new PID namespace.
			 */
			child = clone_parent(&env, JUMP_INIT);
			if (child < 0)
				bail("unable to fork: init_func");

			/* Send the child to our parent, which knows what it's doing. */
			s = SYNC_RECVPID_PLS;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
			}
			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(childpid)");
			}

			/* ... wait for parent to get the pid ... */

			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
			}
			if (s != SYNC_RECVPID_ACK) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
			}

			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
			exit(0);
		}

	/*
	 * Stage 2: We're the final child process, and the only process that will
	 *          actually return to the Go runtime. Our job is to just do the
	 *          final cleanup steps and then return to the Go runtime to allow
	 *          init_linux.go to run.
	 */
	case JUMP_INIT: {
			/*
			 * We're inside the child now, having jumped here from
			 * child_func() after stage 1 called clone_parent().
			 */
			int consolefd = config.consolefd;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = syncpipe[0];

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);

			if (setsid() < 0)
				bail("setsid failed");

			if (setuid(0) < 0)
				bail("setuid failed");

			if (setgid(0) < 0)
				bail("setgid failed");

			if (setgroups(0, NULL) < 0)
				bail("setgroups failed");

			/* Make the console our controlling terminal and stdio, if we got one. */
			if (consolefd != -1) {
				if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
					bail("ioctl TIOCSCTTY failed");
				if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
					bail("failed to dup stdin");
				if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
					bail("failed to dup stdout");
				if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
					bail("failed to dup stderr");
			}

			/* Close sync pipes. */
			close(syncpipe[0]);
			close(syncpipe[1]);

			/* Free netlink data. */
			nl_free(&config);

			/* Finish executing, let the Go runtime take over. */
			return;
		}
	default:
		bail("unexpected jump value");
		break;
	}

	/* Should never be reached. */
	bail("should never be reached");
}