github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/prepare-app/prepare-app.c (about)

     1  // Copyright 2015 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #define _GNU_SOURCE
    16  #include <errno.h>
    17  #include <string.h>
    18  #include <stdio.h>
    19  #include <stdlib.h>
    20  #include <sys/mount.h>
    21  #include <sys/stat.h>
    22  #include <sys/sysmacros.h>
    23  #include <sys/types.h>
    24  #include <unistd.h>
    25  #include <fcntl.h>
    26  #include <sys/vfs.h>
    27  #include <dirent.h>
    28  #include <inttypes.h>
    29  #include <stdbool.h>
    30  
    31  #define err_out(_fmt, _args...)						\
    32  		fprintf(stderr, "Error: " _fmt "\n", ##_args);
    33  static int exit_err;
    34  #define exit_if(_cond, _fmt, _args...)					\
    35  	exit_err++;							\
    36  	if(_cond) {							\
    37  		err_out(_fmt, ##_args);					\
    38  		exit(exit_err);						\
    39  	}
    40  #define pexit_if(_cond, _fmt, _args...)					\
    41  	exit_if(_cond, _fmt ": %s", ##_args, strerror(errno))
    42  
    43  #define goto_if(_cond, _lbl, _fmt, _args...)				\
    44  	if(_cond) {							\
    45  		err_out(_fmt, ##_args);					\
    46  		goto _lbl;						\
    47  	}
    48  #define pgoto_if(_cond, _lbl, _fmt, _args...)				\
    49  	goto_if(_cond, _lbl, _fmt ": %s", ##_args, strerror(errno));
    50  
    51  #define nelems(_array) \
    52  	(sizeof(_array) / sizeof(_array[0]))
    53  #define lenof(_str) \
    54  	(sizeof(_str) - 1)
    55  
    56  #define MACHINE_ID_LEN		lenof("0123456789abcdef0123456789abcdef")
    57  #define MACHINE_NAME_LEN	lenof("rkt-01234567-89ab-cdef-0123-456789abcdef")
    58  
    59  #define UNMAPPED ((uid_t) -1)
    60  
    61  #ifndef CGROUP2_SUPER_MAGIC
    62  #define CGROUP2_SUPER_MAGIC 0x63677270
    63  #endif
    64  
    65  /* permission masks */
    66  #define WORLD_READABLE          0444
    67  #define WORLD_WRITABLE          0222
    68  
    69  typedef struct _dir_op_t {
    70  	const char	*name;
    71  	mode_t		mode;
    72  } dir_op_t;
    73  
    74  typedef struct _mount_point_t {
    75  	const char	*source;
    76  	const char	*target;
    77  	const char	*type;
    78  	const char	*options;
    79  	unsigned long	flags;
    80  	const bool	skip_if_dst_exists; // Only respected for files_mount_table
    81  } mount_point;
    82  
    83  #define dir(_name, _mode) \
    84  	{ .name = _name, .mode = _mode }
    85  
    86  static void mount_at(const char *root, const mount_point *mnt)
    87  {
    88  	char to[4096];
    89  	exit_if(snprintf(to, sizeof(to), "%s/%s", root, mnt->target) >= sizeof(to),
    90  		"Path too long: \"%s\"", to);
    91  	pexit_if(mount(mnt->source, to, mnt->type,
    92  		       mnt->flags, mnt->options) == -1,
    93  		 "Mounting \"%s\" on \"%s\" failed", mnt->source, to);
    94  }
    95  
    96  static int mount_sys_required(const char *root)
    97  {
    98  	FILE *f;
    99  	char *line = NULL;
   100  	size_t len = 0;
   101  	ssize_t read;
   102  
   103  	pexit_if((f = fopen("/proc/self/mountinfo", "re")) == NULL,
   104  		 "Unable to open /proc/self/mountinfo");
   105  
   106  	while ((read = getline(&line, &len, f)) != -1) {
   107  		char *sys_dir;
   108  		char *sys_subdir;
   109  		char *mountpoint;
   110  
   111  		exit_if(asprintf(&sys_dir, "%s/sys", root) == -1,
   112  			"Calling asprintf failed");
   113  		exit_if(asprintf(&sys_subdir, "%s/sys/", root) == -1,
   114  			"Calling asprintf failed");
   115  		sscanf(line, "%*s %*s %*s %*s %ms", &mountpoint);
   116  
   117  		// The mount point is exactly $ROOTFS/sys
   118  		if (strcmp(sys_dir, mountpoint) == 0) {
   119  			free(mountpoint);
   120  			return 0;
   121  		}
   122  		// The mount point is a subdirectory of $ROOTFS/sys
   123  		if (strncmp(sys_subdir, mountpoint, strlen(sys_subdir)) == 0) {
   124  			free(mountpoint);
   125  			return 0;
   126  		}
   127  
   128  		free(mountpoint);
   129  	}
   130  
   131  	pexit_if(fclose(f) != 0, "Unable to close /proc/self/mountinfo");
   132  
   133  	return 1;
   134  }
   135  static void mount_sys(const char *root)
   136  {
   137  	struct statfs fs;
   138  	const mount_point sys_bind_rec = { "/sys", "sys", "bind", NULL, MS_BIND|MS_REC, false };
   139  	const mount_point sys_bind = { "/sys", "sys", "bind", NULL, MS_BIND, false };
   140  
   141  	pexit_if(statfs("/sys/fs/cgroup", &fs) != 0,
   142  	         "Cannot statfs /sys/fs/cgroup");
   143  	if (fs.f_type == (typeof(fs.f_type)) CGROUP2_SUPER_MAGIC) {
   144  		/* With the unified cgroup hierarchy, recursive bind mounts
   145  		 * are fine. */
   146  		mount_at(root, &sys_bind_rec);
   147  		return;
   148  	}
   149  
   150  	// For security reasons recent Linux kernels do not allow to bind-mount non-recursively
   151  	// if it would give read-write access to other subdirectories mounted as read-only.
   152  	// Hence we have to check if we are in a user namespaced environment and bind mount recursively instead.
   153  	if (access("/proc/1/uid_map", F_OK) == 0) {
   154  		FILE *f;
   155  		int k;
   156  		uid_t uid_base, uid_shift, uid_range;
   157  
   158  		pexit_if((f = fopen("/proc/1/uid_map", "re")) == NULL,
   159  			 "Unable to open /proc/1/uid_map");
   160  
   161  		if (sizeof(uid_t) == 4) {
   162  			k = fscanf(f, "%"PRIu32" %"PRIu32" %"PRIu32,
   163  				   &uid_base, &uid_shift, &uid_range);
   164  		} else {
   165  			k = fscanf(f, "%"PRIu16" %"PRIu16" %"PRIu16,
   166  				   &uid_base, &uid_shift, &uid_range);
   167  		}
   168  		pexit_if(fclose(f) != 0, "Unable to close /proc/1/uid_map");
   169  		pexit_if(k != 3, "Invalid uid_map format");
   170  
   171  		// do a recursive bind mount if we are in a user namespace having a parent namespace set,
   172  		// i.e. either one of uid base, shift, or the range is set, see user_namespaces(7).
   173  		if (uid_base != 0 || uid_shift != 0 || uid_range != UNMAPPED) {
   174  			mount_at(root, &sys_bind_rec);
   175  			return;
   176  		}
   177  	}
   178  
   179  	/* With cgroup-v1, rkt and systemd-nspawn add more cgroup
   180  	 * bind-mounts to control which files are read-only. To avoid
   181  	 * a quadratic progression, prepare-app does not bind mount
   182  	 * /sys recursively. See:
   183  	 * https://github.com/rkt/rkt/issues/2351 */
   184  	mount_at(root, &sys_bind);
   185  }
   186  
   187  static void copy_volume_symlinks()
   188  {
   189  	DIR *volumes_dir;
   190  	struct dirent *de;
   191  	const char *rkt_volume_links_path = "/rkt/volumes";
   192  	const char *dev_rkt_path = "/dev/.rkt";
   193  
   194  	pexit_if(mkdir(dev_rkt_path, 0700) == -1 && errno != EEXIST,
   195  		"Failed to create directory \"%s\"", dev_rkt_path);
   196  
   197  	pexit_if((volumes_dir = opendir(rkt_volume_links_path)) == NULL && errno != ENOENT,
   198                   "Failed to open directory \"%s\"", rkt_volume_links_path);
   199  	while (volumes_dir) {
   200  		errno = 0;
   201  		if ((de = readdir(volumes_dir)) != NULL) {
   202  			char *link_path;
   203  			char *new_link;
   204  			char target[4096] = {0,};
   205  
   206  			if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
   207  			  continue;
   208  
   209  			exit_if(asprintf(&link_path, "%s/%s", rkt_volume_links_path, de->d_name) == -1,
   210  				"Calling asprintf failed");
   211  			exit_if(asprintf(&new_link, "%s/%s", dev_rkt_path, de->d_name) == -1,
   212  				"Calling asprintf failed");
   213  
   214  			pexit_if(readlink(link_path, target, sizeof(target)) == -1,
   215  				 "Error reading \"%s\" link", link_path);
   216  			pexit_if(symlink(target, new_link) == -1 && errno != EEXIST,
   217  				"Failed to create volume symlink \"%s\"", new_link);
   218  		} else {
   219  			pexit_if(errno != 0,
   220  				"Error reading \"%s\" directory", rkt_volume_links_path);
   221  			pexit_if(closedir(volumes_dir),
   222  				 "Error closing \"%s\" directory", rkt_volume_links_path);
   223  			return;
   224  		}
   225  	}
   226  }
   227  
   228  /* Determine if the specified ptmx device (or symlink to a device)
   229   * is usable by all users.
   230   *
   231   * dirfd: Open file descriptor of a root directory.
   232   * path: Relative path to ptmx device below the path specified by dirfd.
   233   *
   234   * Returns true on success, else false.
   235   */
   236  bool
   237  ptmx_device_usable (int dirfd, const char *path)
   238  {
   239  	struct stat st;
   240  	int perms;
   241  	bool world_readable, world_writable;
   242  	bool is_char;
   243  	bool dev_type;
   244  	dev_t expected_dev;
   245  
   246  	if (dirfd < 0 || ! path) {
   247  		return false;
   248  	}
   249  
   250  	expected_dev = makedev(5, 2);
   251  
   252  	if (fstatat (dirfd, path, &st, 0) < 0) {
   253  		return false;
   254  	}
   255  
   256  	is_char = S_ISCHR (st.st_mode);
   257  	dev_type = (expected_dev == st.st_rdev);
   258  
   259  	perms = (st.st_mode & ACCESSPERMS);
   260  
   261  	world_readable = (perms & WORLD_READABLE) == WORLD_READABLE;
   262  	world_writable = (perms & WORLD_WRITABLE) == WORLD_WRITABLE;
   263  
   264  	return (is_char && dev_type && world_readable && world_writable);
   265  }
   266  
   267  int main(int argc, char *argv[])
   268  {
   269  	static const char *unlink_paths[] = {
   270  		"dev/shm",
   271  		 NULL
   272  	};
   273  	static const dir_op_t dirs[] = {
   274  		dir("dev",	0755),
   275  		dir("dev/net",	0755),
   276  		dir("dev/shm",	0755),
   277  		dir("etc",	0755),
   278  		dir("proc",	0755),
   279  		dir("sys",	0755),
   280  		dir("tmp",	01777),
   281  		dir("dev/pts",	0755),
   282  		dir("run",			0755),
   283  		dir("run/systemd",		0755),
   284  		dir("run/systemd/journal",	0755),
   285  	};
   286  	static const char *devnodes[] = {
   287  		"/dev/null",
   288  		"/dev/zero",
   289  		"/dev/full",
   290  		"/dev/random",
   291  		"/dev/urandom",
   292  		"/dev/tty",
   293  		"/dev/net/tun",
   294  		"/dev/console",
   295  		NULL
   296  	};
   297  	static const mount_point dirs_mount_table[] = {
   298  		{ "/proc", "/proc", "bind", NULL, MS_BIND|MS_REC, false },
   299  		{ "/dev/shm", "/dev/shm", "bind", NULL, MS_BIND, false },
   300  		{ "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, false },
   301  		{ "/run/systemd/journal", "/run/systemd/journal", "bind", NULL, MS_BIND, false },
   302  		/* /sys is handled separately */
   303  	};
   304  	static const mount_point files_mount_table[] = {
   305  		{ "/etc/rkt-resolv.conf", "/etc/resolv.conf", "bind", NULL, MS_BIND, false },
   306  		{ "/etc/rkt-hosts", "/etc/hosts", "bind", NULL, MS_BIND, false },
   307  		{ "/etc/hosts-fallback", "/etc/hosts", "bind", NULL, MS_BIND, true }, // only create as fallback
   308  		{ "/proc/sys/kernel/hostname", "/etc/hostname", "bind", NULL, MS_BIND, false },
   309  		// TODO @alepuccetti this could be removed when https://github.com/systemd/systemd/issues/3544 is solved
   310  		{ "/run/systemd/notify", "/run/systemd/notify", "bind", NULL, MS_BIND, false },
   311  	};
   312  	const char *root;
   313  	int rootfd;
   314  	char to[4096];
   315  	int i;
   316  	bool ptmx_usable, pts_ptmx_usable;
   317  
   318  	exit_if(argc < 2,
   319  		"Usage: %s /path/to/root", argv[0]);
   320  
   321  	root = argv[1];
   322  
   323  	/* Make stage2's root a mount point. Chrooting an application in a
   324  	 * directory which is not a mount point is not nice because the
   325  	 * application would not be able to remount "/" it as private mount.
   326  	 * This allows Docker to run inside rkt.
   327  	 * The recursive flag is to preserve volumes mounted previously by
   328  	 * systemd-nspawn via "rkt run -volume".
   329  	 * */
   330  	pexit_if(mount(root, root, "bind", MS_BIND | MS_REC, NULL) == -1,
   331  			"Make / a mount point failed");
   332  
   333  	rootfd = open(root, O_DIRECTORY | O_CLOEXEC);
   334  	pexit_if(rootfd < 0,
   335  		"Failed to open directory \"%s\"", root);
   336  
   337  	/* Some images have annoying symlinks that are resolved as dangling
   338  	 * links before the chroot in stage1. E.g. "/dev/shm" -> "/run/shm"
   339  	 * Just remove the symlinks.
   340           */
   341  	for (i = 0; unlink_paths[i]; i++) {
   342  		pexit_if(unlinkat(rootfd, unlink_paths[i], 0) != 0
   343  			 && errno != ENOENT && errno != EISDIR,
   344  			 "Failed to unlink \"%s\"", unlink_paths[i])
   345  	}
   346  
   347  	/* Create the directories */
   348  	umask(0);
   349  	for (i = 0; i < nelems(dirs); i++) {
   350  		const dir_op_t *d = &dirs[i];
   351  		pexit_if(mkdirat(rootfd, d->name, d->mode) == -1 &&
   352  			 errno != EEXIST,
   353  			"Failed to create directory \"%s/%s\"", root, d->name);
   354  	}
   355  
   356  	close(rootfd);
   357  
   358  	/* systemd-nspawn already creates few /dev entries in the container
   359  	 * namespace: copy_devnodes()
   360  	 * http://cgit.freedesktop.org/systemd/systemd/tree/src/nspawn/nspawn.c?h=v219#n1345
   361  	 *
   362  	 * But they are not visible by the apps because they are "protected" by
   363  	 * the chroot.
   364  	 *
   365  	 * Bind mount them individually over the chroot border.
   366  	 *
   367  	 * Do NOT bind mount the whole directory /dev because it would shadow
   368  	 * potential individual bind mount by stage0 ("rkt run --volume...").
   369  	 *
   370  	 * Do NOT use mknod, it would not work for /dev/console because it is
   371  	 * a bind mount to a pts and pts device nodes only work when they live
   372  	 * on a devpts filesystem.
   373  	 */
   374  	for (i = 0; devnodes[i]; i++) {
   375  		const char *from = devnodes[i];
   376  		int fd;
   377  
   378  		/* If the file does not exist, skip it. It might be because
   379  		 * the kernel does not provide it (e.g. kernel compiled without
   380  		 * CONFIG_TUN) or because systemd-nspawn does not provide it
   381  		 * (/dev/net/tun is not available with systemd-nspawn < v217
   382  		 */
   383  		if (access(from, F_OK) != 0)
   384  			continue;
   385  
   386  		exit_if(snprintf(to, sizeof(to), "%s%s", root, from) >= sizeof(to),
   387  			"Path too long: \"%s\"", to);
   388  
   389  		/* The mode does not matter: it will be bind-mounted over.
   390  		 */
   391  		fd = open(to, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, 0644);
   392  		if (fd != -1)
   393  			close(fd);
   394  
   395  		pexit_if(mount(from, to, "bind", MS_BIND, NULL) == -1,
   396  				"Mounting \"%s\" on \"%s\" failed", from, to);
   397  	}
   398  
   399  	/* Bind mount directories */
   400  	for (i = 0; i < nelems(dirs_mount_table); i++) {
   401  		mount_at(root, &dirs_mount_table[i]);
   402  	}
   403  
   404  	/* Bind mount /sys: handled differently, depending on cgroups */
   405  	if (mount_sys_required(root))
   406  		mount_sys(root);
   407  
   408  	/* Bind mount files, if the source exists.
   409  	 * By default, overwrite dst unless skip_if_dst_exists is true. */
   410  	for (i = 0; i < nelems(files_mount_table); i++) {
   411  		const mount_point *mnt = &files_mount_table[i];
   412  		int fd;
   413  
   414  		exit_if(snprintf(to, sizeof(to), "%s/%s", root, mnt->target) >= sizeof(to),
   415  			"Path too long: \"%s\"", to);
   416  		if (access(mnt->source, F_OK) != 0)
   417  			continue;
   418  		if( mnt->skip_if_dst_exists && access(to, F_OK) == 0)
   419  			continue;
   420  		if (access(to, F_OK) != 0) {
   421  			pexit_if((fd = creat(to, 0644)) == -1,
   422  				"Cannot create file: \"%s\"", to);
   423  			pexit_if(close(fd) == -1,
   424  				"Cannot close file: \"%s\"", to);
   425  		}
   426  		pexit_if(mount(mnt->source, to, mnt->type,
   427  			       mnt->flags, mnt->options) == -1,
   428  				"Mounting \"%s\" on \"%s\" failed", mnt->source, to);
   429  	}
   430  
   431  	/* Now that all mounts have been handled, reopen the root
   432  	 * directory to special-case the handling of ptmx devices.
   433  	 */
   434  	rootfd = open(root, O_DIRECTORY | O_CLOEXEC);
   435  	pexit_if(rootfd < 0,
   436  		"Failed to open directory \"%s\"", root);
   437  
   438  	ptmx_usable = ptmx_device_usable (rootfd, "dev/ptmx");
   439  	pts_ptmx_usable = ptmx_device_usable (rootfd, "dev/pts/ptmx");
   440  
   441  	if (pts_ptmx_usable) {
   442  		if (! ptmx_usable) {
   443  			pexit_if(unlinkat(rootfd, "dev/ptmx", 0) != 0
   444  					&& errno != ENOENT,
   445  					"Failed to unlink \"%s\"", "dev/ptmx");
   446  			pexit_if(symlinkat("/dev/pts/ptmx", rootfd, "dev/ptmx") == -1,
   447  					"Failed to create /dev/ptmx symlink");
   448  		}
   449  	} else {
   450  		if (! ptmx_usable) {
   451  			int perms = (WORLD_READABLE + WORLD_WRITABLE);
   452  
   453  			pexit_if(unlinkat(rootfd, "dev/ptmx", 0) != 0
   454  					&& errno != ENOENT,
   455  					"Failed to unlink \"%s\"", "dev/ptmx");
   456  			pexit_if(mknodat (rootfd, "dev/ptmx", (S_IFCHR|perms), makedev (5, 2)) < 0,
   457  					"Failed to create device: \"%s\"", "dev/ptmx");
   458  		}
   459  	}
   460  
   461  	close(rootfd);
   462  
   463  	/* Copy symlinks to device node volumes to "/dev/.rkt" so they can be
   464  	 * used in the DeviceAllow= option of the app's unit file (systemd
   465  	 * needs the path to start with "/dev". */
   466  	copy_volume_symlinks();
   467  
   468  	/* /dev/log -> /run/systemd/journal/dev-log */
   469  	exit_if(snprintf(to, sizeof(to), "%s/dev/log", root) >= sizeof(to),
   470  		"Path too long: \"%s\"", to);
   471  	pexit_if(symlink("/run/systemd/journal/dev-log", to) == -1 && errno != EEXIST,
   472  		"Failed to create /dev/log symlink");
   473  
   474  	return EXIT_SUCCESS;
   475  }