github.com/chipaca/snappy@v0.0.0-20210104084008-1f06296fe8ad/cmd/snap-confine/mount-support.c (about)

     1  /*
     2   * Copyright (C) 2015 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  #ifdef HAVE_CONFIG_H
    18  #include "config.h"
    19  #endif
    20  
    21  #include "mount-support.h"
    22  
    23  #include <errno.h>
    24  #include <fcntl.h>
    25  #include <libgen.h>
    26  #include <limits.h>
    27  #include <mntent.h>
    28  #include <sched.h>
    29  #include <stdio.h>
    30  #include <stdlib.h>
    31  #include <string.h>
    32  #include <sys/mount.h>
    33  #include <sys/stat.h>
    34  #include <sys/syscall.h>
    35  #include <sys/types.h>
    36  #include <sys/wait.h>
    37  #include <unistd.h>
    38  
    39  #include "../libsnap-confine-private/apparmor-support.h"
    40  #include "../libsnap-confine-private/classic.h"
    41  #include "../libsnap-confine-private/cleanup-funcs.h"
    42  #include "../libsnap-confine-private/mount-opt.h"
    43  #include "../libsnap-confine-private/mountinfo.h"
    44  #include "../libsnap-confine-private/snap.h"
    45  #include "../libsnap-confine-private/string-utils.h"
    46  #include "../libsnap-confine-private/tool.h"
    47  #include "../libsnap-confine-private/utils.h"
    48  #include "mount-support-nvidia.h"
    49  
    50  #define MAX_BUF 1000
    51  
    52  static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode);
    53  
    54  // TODO: simplify this, after all it is just a tmpfs
    55  // TODO: fold this into bootstrap
    56  static void setup_private_mount(const char *snap_name)
    57  {
    58  	// Create a 0700 base directory. This is the "base" directory that is
    59  	// protected from other users. This directory name is NOT randomly
    60  	// generated. This has several properties:
    61  	//
    62  	// Users can relate to the name and can find the temporary directory as
    63  	// visible from within the snap. If this directory was random it would be
    64  	// harder to find because there may be situations in which multiple
    65  	// directories related to the same snap name would exist.
    66  	//
    67  	// Snapd can partially manage the directory. Specifically on snap remove
    68  	// snapd could remove the directory and everything in it, potentially
    69  	// avoiding runaway disk use on a machine that either never reboots or uses
    70  	// persistent /tmp directory.
    71  	//
    72  	// Underneath the base directory there is a "tmp" sub-directory that has
    73  	// mode 1777 and behaves as a typical /tmp directory would. That directory
    74  	// is used as a bind-mounted /tmp directory.
    75  	//
    76  	// Because the directories are reused across invocations by distinct users
    77  	// and because the directories are trivially guessable, each invocation
    78  	// unconditionally chowns/chmods them to appropriate values.
    79  	char base_dir[MAX_BUF] = { 0 };
    80  	char tmp_dir[MAX_BUF] = { 0 };
    81  	int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    82  	int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    83  	sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name);
    84  	sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir);
    85  
    86  	/* Switch to root group so that mkdir and open calls below create filesystem
    87  	 * elements that are not owned by the user calling into snap-confine. */
    88  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
    89  	// Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want
    90  	// to reuse and we will open with O_NOFOLLOW, below.
    91  	if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) {
    92  		die("cannot create base directory %s", base_dir);
    93  	}
    94  	base_dir_fd = open(base_dir,
    95  			   O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
    96  	if (base_dir_fd < 0) {
    97  		die("cannot open base directory %s", base_dir);
    98  	}
    99  	/* This seems redundant on first read but it has the non-obvious
   100  	 * property of changing existing directories  that have already existed
   101  	 * but had incorrect ownership or permission. This is possible due to
   102  	 * earlier bugs in snap-confine and due to the fact that some systems
   103  	 * use persistent /tmp directory and may not clean up leftover files
   104  	 * for arbitrarily long. This comment applies the following two pairs
   105  	 * of fchmod and fchown. */
   106  	if (fchmod(base_dir_fd, 0700) < 0) {
   107  		die("cannot chmod base directory %s to 0700", base_dir);
   108  	}
   109  	if (fchown(base_dir_fd, 0, 0) < 0) {
   110  		die("cannot chown base directory %s to root.root", base_dir);
   111  	}
   112  	// Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we
   113  	// want to reuse and we will open with O_NOFOLLOW, below.
   114  	if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) {
   115  		die("cannot create private tmp directory %s/tmp", base_dir);
   116  	}
   117  	(void)sc_set_effective_identity(old);
   118  	tmp_dir_fd = openat(base_dir_fd, "tmp",
   119  			    O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   120  	if (tmp_dir_fd < 0) {
   121  		die("cannot open private tmp directory %s/tmp", base_dir);
   122  	}
   123  	if (fchmod(tmp_dir_fd, 01777) < 0) {
   124  		die("cannot chmod private tmp directory %s/tmp to 01777",
   125  		    base_dir);
   126  	}
   127  	if (fchown(tmp_dir_fd, 0, 0) < 0) {
   128  		die("cannot chown private tmp directory %s/tmp to root.root",
   129  		    base_dir);
   130  	}
   131  	sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL);
   132  	sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL);
   133  }
   134  
   135  // TODO: fold this into bootstrap
   136  static void setup_private_pts(void)
   137  {
   138  	// See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt
   139  	//
   140  	// Ubuntu by default uses devpts 'single-instance' mode where
   141  	// /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change
   142  	// the startup scripts though, so we follow the instructions in point
   143  	// '4' of 'User-space changes' in the above doc. In other words, after
   144  	// unshare(CLONE_NEWNS), we mount devpts with -o
   145  	// newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto
   146  	// /dev/ptmx
   147  
   148  	struct stat st;
   149  
   150  	// Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode
   151  	// which doesn't provide the isolation we require.
   152  	if (stat("/dev/pts/ptmx", &st) != 0) {
   153  		die("cannot stat /dev/pts/ptmx");
   154  	}
   155  	// Make sure /dev/ptmx exists so we can bind mount over it
   156  	if (stat("/dev/ptmx", &st) != 0) {
   157  		die("cannot stat /dev/ptmx");
   158  	}
   159  	// Since multi-instance, use ptmxmode=0666. The other options are
   160  	// copied from /etc/default/devpts
   161  	sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
   162  		    "newinstance,ptmxmode=0666,mode=0620,gid=5");
   163  	sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0);
   164  }
   165  
// Description of a single host directory that sc_bootstrap_mount_namespace
// bind mounts into the scratch root filesystem. Arrays of these entries are
// terminated by an entry whose path is NULL.
struct sc_mount {
	// Path of the directory on the host. It is used as the source of the
	// recursive bind mount and, prefixed with the scratch directory, as
	// the destination.
	const char *path;
	// When true the directory is created if it is missing and the mount
	// is NOT switched to slave propagation afterwards. When false the
	// mount is made recursively slave after it is established.
	bool is_bidirectional;
	// Alternate path defines the rbind mount "alternative" of path.
	// It exists so that we can make /media on systems that use /run/media.
	// NULL when there is no alternative location.
	const char *altpath;
	// Optional mount points are not processed unless the source and
	// destination both exist.
	// NOTE(review): the existence check is presumably performed by
	// sc_do_optional_mount, which is defined elsewhere; the bootstrap code
	// only skips the entry when that helper returns false.
	bool is_optional;
};
   176  
// Input of sc_bootstrap_mount_namespace: describes how the root filesystem of
// the new mount namespace is assembled.
struct sc_mount_config {
	// Directory that is recursively bind mounted over the scratch
	// directory and thus becomes the foundation of the new root.
	const char *rootfs_dir;
	// The struct is terminated with an entry with NULL path.
	const struct sc_mount *mounts;
	// Distribution classification; selects distribution specific steps
	// (nvidia driver mounts on classic, tooling bind mount on
	// SC_DISTRO_CORE_OTHER, detaching of /writable views).
	sc_distro distro;
	// When true, selected /etc entries from rootfs_dir are mounted back
	// over the host /etc and SNAP_MOUNT_DIR is bind mounted to /snap.
	bool normal_mode;
	// Name of the base snap; any value other than "core" triggers the
	// bind mount of the snap-confine directory onto /usr/lib/snapd.
	const char *base_snap_name;
};
   185  
   186  /**
   187   * Bootstrap mount namespace.
   188   *
   189   * This is a chunk of tricky code that lets us have full control over the
   190   * layout and direction of propagation of mount events. The documentation below
   191   * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source
   192   * tree.
   193   *
   194   * As a reminder two definitions are quoted below:
   195   *
   196   *  A 'propagation event' is defined as event generated on a vfsmount
   197   *  that leads to mount or unmount actions in other vfsmounts.
   198   *
   199   *  A 'peer group' is defined as a group of vfsmounts that propagate
   200   *  events to each other.
   201   *
   202   * (end of quote).
   203   *
   204   * The main idea is to setup a mount namespace that has a root filesystem with
   205   * vfsmounts and peer groups that, depending on the location, either isolate
   206   * or share with the rest of the system.
   207   *
   208   * The vast majority of the filesystem is shared in one direction. Events from
   209   * the outside (from the main mount namespace) propagate inside (to namespaces
   210   * of particular snaps) so things like new snap revisions, mounted drives, etc,
   211   * just show up as expected but even if a snap is exploited or malicious in
   212   * nature it cannot affect anything in another namespace where it might cause
   213   * security or stability issues.
   214   *
   215   * Selected directories (today just /media) can be shared in both directions.
   216   * This allows snaps with sufficient privileges to either create, through the
   217   * mount system call, additional mount points that are visible by the rest of
   218   * the system (both the main mount namespace and namespaces of individual
   219   * snaps) or remove them, through the unmount system call.
   220   **/
   221  static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config)
   222  {
   223  	char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX";
   224  	char src[PATH_MAX] = { 0 };
   225  	char dst[PATH_MAX] = { 0 };
   226  	if (mkdtemp(scratch_dir) == NULL) {
   227  		die("cannot create temporary directory for the root file system");
   228  	}
   229  	// NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new
   230  	// mount namespace and have a private list of mounts.
   231  	debug("scratch directory for constructing namespace: %s", scratch_dir);
   232  	// Make the root filesystem recursively shared. This way propagation events
   233  	// will be shared with main mount namespace.
   234  	sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL);
   235  	// Bind mount the temporary scratch directory for root filesystem over
   236  	// itself so that it is a mount point. This is done so that it can become
   237  	// unbindable as explained below.
   238  	sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL);
   239  	// Make the scratch directory unbindable.
   240  	//
   241  	// This is necessary as otherwise a mount loop can occur and the kernel
   242  	// would crash. The term unbindable simply states that it cannot be bind
   243  	// mounted anywhere. When we construct recursive bind mounts below this
   244  	// guarantees that this directory will not be replicated anywhere.
   245  	sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL);
   246  	// Recursively bind mount desired root filesystem directory over the
   247  	// scratch directory. This puts the initial content into the scratch space
   248  	// and serves as a foundation for all subsequent operations below.
   249  	//
   250  	// The mount is recursive because it can either be applied to the root
   251  	// filesystem of a core system (aka all-snap) or the core snap on a classic
   252  	// system. In the former case we need recursive bind mounts to accurately
   253  	// replicate the state of the root filesystem into the scratch directory.
   254  	sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND,
   255  		    NULL);
   256  	// Make the scratch directory recursively slave. Nothing done there will be
   257  	// shared with the initial mount namespace. This effectively detaches us,
   258  	// in one way, from the original namespace and coupled with pivot_root
   259  	// below serves as the foundation of the mount sandbox.
   260  	sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL);
   261  	// Bind mount certain directories from the host filesystem to the scratch
   262  	// directory. By default mount events will propagate in both into and out
   263  	// of the peer group. This way the running application can alter any global
   264  	// state visible on the host and in other snaps. This can be restricted by
   265  	// disabling the "is_bidirectional" flag as can be seen below.
   266  	for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL;
   267  	     mnt++) {
   268  
   269  		if (mnt->is_bidirectional) {
   270  			sc_identity old =
   271  			    sc_set_effective_identity(sc_root_group_identity());
   272  			if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) {
   273  				die("cannot create %s", mnt->path);
   274  			}
   275  			(void)sc_set_effective_identity(old);
   276  		}
   277  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   278  				 mnt->path);
   279  		if (mnt->is_optional) {
   280  			bool ok = sc_do_optional_mount(mnt->path, dst, NULL,
   281  						       MS_REC | MS_BIND, NULL);
   282  			if (!ok) {
   283  				// If we cannot mount it, just continue.
   284  				continue;
   285  			}
   286  		} else {
   287  			sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND,
   288  				    NULL);
   289  		}
   290  		if (!mnt->is_bidirectional) {
   291  			// Mount events will only propagate inwards to the namespace. This
   292  			// way the running application cannot alter any global state apart
   293  			// from that of its own snap.
   294  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   295  		}
   296  		if (mnt->altpath == NULL) {
   297  			continue;
   298  		}
   299  		// An alternate path of mnt->path is provided at another location.
   300  		// It should behave exactly the same as the original.
   301  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   302  				 mnt->altpath);
   303  		struct stat stat_buf;
   304  		if (lstat(dst, &stat_buf) < 0) {
   305  			die("cannot lstat %s", dst);
   306  		}
   307  		if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) {
   308  			die("cannot bind mount alternate path over a symlink: %s", dst);
   309  		}
   310  		sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL);
   311  		if (!mnt->is_bidirectional) {
   312  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   313  		}
   314  	}
   315  	if (config->normal_mode) {
   316  		// Since we mounted /etc from the host filesystem to the scratch directory,
   317  		// we may need to put certain directories from the desired root filesystem
   318  		// (e.g. the core snap) back. This way the behavior of running snaps is not
   319  		// affected by the alternatives directory from the host, if one exists.
   320  		//
   321  		// Fixes the following bugs:
   322  		//  - https://bugs.launchpad.net/snap-confine/+bug/1580018
   323  		//  - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568
   324  		const char *dirs_from_core[] = {
   325  			"/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf",
   326  			// Some specifc and privileged interfaces (e.g docker-support) give
   327  			// access to apparmor_parser from the base snap which at a minimum
   328  			// needs to use matching configuration from the base snap instead
   329  			// of from the users host system.
   330  			"/etc/apparmor", "/etc/apparmor.d",
   331  			NULL
   332  		};
   333  		for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) {
   334  			const char *dir = *dirs;
   335  			if (access(dir, F_OK) != 0) {
   336  				continue;
   337  			}
   338  			struct stat dst_stat;
   339  			struct stat src_stat;
   340  			sc_must_snprintf(src, sizeof src, "%s%s",
   341  					 config->rootfs_dir, dir);
   342  			sc_must_snprintf(dst, sizeof dst, "%s%s",
   343  					 scratch_dir, dir);
   344  			if (lstat(src, &src_stat) != 0) {
   345  				if (errno == ENOENT) {
   346  					continue;
   347  				}
   348  				die("cannot stat %s from desired rootfs", src);
   349  			}
   350  			if (!S_ISREG(src_stat.st_mode)
   351  			    && !S_ISDIR(src_stat.st_mode)) {
   352  				debug
   353  				    ("entry %s from the desired rootfs is not a file or directory, skipping mount",
   354  				     src);
   355  				continue;
   356  			}
   357  
   358  			if (lstat(dst, &dst_stat) != 0) {
   359  				if (errno == ENOENT) {
   360  					continue;
   361  				}
   362  				die("cannot stat %s from host", src);
   363  			}
   364  			if (!S_ISREG(dst_stat.st_mode)
   365  			    && !S_ISDIR(dst_stat.st_mode)) {
   366  				debug
   367  				    ("entry %s from the host is not a file or directory, skipping mount",
   368  				     src);
   369  				continue;
   370  			}
   371  
   372  			if ((dst_stat.st_mode & S_IFMT) !=
   373  			    (src_stat.st_mode & S_IFMT)) {
   374  				debug
   375  				    ("entries %s and %s are of different types, skipping mount",
   376  				     dst, src);
   377  				continue;
   378  			}
   379  			// both source and destination exist where both are either files
   380  			// or both are directories
   381  			sc_do_mount(src, dst, NULL, MS_BIND, NULL);
   382  			sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   383  		}
   384  	}
   385  	// The "core" base snap is special as it contains snapd and friends.
   386  	// Other base snaps do not, so whenever a base snap other than core is
   387  	// in use we need extra provisions for setting up internal tooling to
   388  	// be available.
   389  	//
   390  	// However on a core18 (and similar) system the core snap is not
   391  	// a special base anymore and we should map our own tooling in.
   392  	if (config->distro == SC_DISTRO_CORE_OTHER
   393  	    || !sc_streq(config->base_snap_name, "core")) {
   394  		// when bases are used we need to bind-mount the libexecdir
   395  		// (that contains snap-exec) into /usr/lib/snapd of the
   396  		// base snap so that snap-exec is available for the snaps
   397  		// (base snaps do not ship snapd)
   398  
   399  		// dst is always /usr/lib/snapd as this is where snapd
   400  		// assumes to find snap-exec
   401  		sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd",
   402  				 scratch_dir);
   403  
   404  		// bind mount the current $ROOT/usr/lib/snapd path,
   405  		// where $ROOT is either "/" or the "/snap/{core,snapd}/current"
   406  		// that we are re-execing from
   407  		char *src = NULL;
   408  		char self[PATH_MAX + 1] = { 0 };
   409  		ssize_t nread;
   410  		nread = readlink("/proc/self/exe", self, sizeof self - 1);
   411  		if (nread < 0) {
   412  			die("cannot read /proc/self/exe");
   413  		}
   414  		// Though we initialized self to NULs and passed one less to
   415  		// readlink, therefore guaranteeing that self is
   416  		// zero-terminated, perform an explicit assignment to make
   417  		// Coverity happy.
   418  		self[nread] = '\0';
   419  		// this cannot happen except when the kernel is buggy
   420  		if (strstr(self, "/snap-confine") == NULL) {
   421  			die("cannot use result from readlink: %s", self);
   422  		}
   423  		src = dirname(self);
   424  		// dirname(path) might return '.' depending on path.
   425  		// /proc/self/exe should always point
   426  		// to an absolute path, but let's guarantee that.
   427  		if (src[0] != '/') {
   428  			die("cannot use the result of dirname(): %s", src);
   429  		}
   430  
   431  		sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL);
   432  		sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   433  	}
   434  	// Bind mount the directory where all snaps are mounted. The location of
   435  	// the this directory on the host filesystem may not match the location in
   436  	// the desired root filesystem. In the "core" and "ubuntu-core" snaps the
   437  	// directory is always /snap. On the host it is a build-time configuration
   438  	// option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not
   439  	// in normal mode), we don't need to do this because /snap is fixed and
   440  	// already contains the correct view of the mounted snaps.
   441  	if (config->normal_mode) {
   442  		sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir);
   443  		sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL);
   444  		sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   445  	}
   446  	// Create the hostfs directory if one is missing. This directory is a part
   447  	// of packaging now so perhaps this code can be removed later.
   448  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   449  	if (mkdir(SC_HOSTFS_DIR, 0755) < 0) {
   450  		if (errno != EEXIST) {
   451  			die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR);
   452  		}
   453  	}
   454  	(void)sc_set_effective_identity(old);
   455  	// Ensure that hostfs isgroup owned by root. We may have (now or earlier)
   456  	// created the directory as the user who first ran a snap on a given
   457  	// system and the group identity of that user is visilbe on disk.
   458  	// This was LP:#1665004
   459  	struct stat sb;
   460  	if (stat(SC_HOSTFS_DIR, &sb) < 0) {
   461  		die("cannot stat %s", SC_HOSTFS_DIR);
   462  	}
   463  	if (sb.st_uid != 0 || sb.st_gid != 0) {
   464  		if (chown(SC_HOSTFS_DIR, 0, 0) < 0) {
   465  			die("cannot change user/group owner of %s to root",
   466  			    SC_HOSTFS_DIR);
   467  		}
   468  	}
   469  	// Make the upcoming "put_old" directory for pivot_root private so that
   470  	// mount events don't propagate to any peer group. In practice pivot root
   471  	// has a number of undocumented requirements and one of them is that the
   472  	// "put_old" directory (the second argument) cannot be shared in any way.
   473  	sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR);
   474  	sc_do_mount(dst, dst, NULL, MS_BIND, NULL);
   475  	sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL);
   476  	// On classic mount the nvidia driver. Ideally this would be done in an
   477  	// uniform way after pivot_root but this is good enough and requires less
   478  	// code changes the nvidia code assumes it has access to the existing
   479  	// pre-pivot filesystem.
   480  	if (config->distro == SC_DISTRO_CLASSIC) {
   481  		sc_mount_nvidia_driver(scratch_dir);
   482  	}
   483  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   484  	//                    pivot_root
   485  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   486  	// Use pivot_root to "chroot" into the scratch directory.
   487  	//
   488  	// Q: Why are we using something as esoteric as pivot_root(2)?
   489  	// A: Because this makes apparmor handling easy. Using a normal chroot
   490  	// makes all apparmor rules conditional.  We are either running on an
   491  	// all-snap system where this would-be chroot didn't happen and all the
   492  	// rules see / as the root file system _OR_ we are running on top of a
   493  	// classic distribution and this chroot has now moved all paths to
   494  	// /tmp/snap.rootfs_*.
   495  	//
   496  	// Because we are using unshare(2) with CLONE_NEWNS we can essentially use
   497  	// pivot_root just like chroot but this makes apparmor unaware of the old
   498  	// root so everything works okay.
   499  	//
   500  	// HINT: If you are debugging this and are trying to see why pivot_root
   501  	// happens to return EINVAL with any changes you may be making, please
   502  	// consider applying
   503  	// misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree
   504  	// kernel.
   505  	debug("performing operation: pivot_root %s %s", scratch_dir, dst);
   506  	if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) {
   507  		die("cannot perform operation: pivot_root %s %s", scratch_dir,
   508  		    dst);
   509  	}
   510  	// Unmount the self-bind mount over the scratch directory created earlier
   511  	// in the original root filesystem (which is now mounted on SC_HOSTFS_DIR).
   512  	// This way we can remove the temporary directory we created and "clean up"
   513  	// after ourselves nicely.
   514  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir);
   515  	sc_do_umount(dst, UMOUNT_NOFOLLOW);
   516  	// Remove the scratch directory. Note that we are using the path that is
   517  	// based on the old root filesystem as after pivot_root we cannot guarantee
   518  	// what is present at the same location normally. (It is probably an empty
   519  	// /tmp directory that is populated in another place).
   520  	debug("performing operation: rmdir %s", dst);
   521  	if (rmdir(scratch_dir) < 0) {
   522  		die("cannot perform operation: rmdir %s", dst);
   523  	};
   524  	// Make the old root filesystem recursively slave. This way operations
   525  	// performed in this mount namespace will not propagate to the peer group.
   526  	// This is another essential part of the confinement system.
   527  	sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL);
   528  	// Detach the redundant hostfs version of sysfs since it shows up in the
   529  	// mount table and software inspecting the mount table may become confused
   530  	// (eg, docker and LP:# 162601).
   531  	sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR);
   532  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   533  	// Detach the redundant hostfs version of /dev since it shows up in the
   534  	// mount table and software inspecting the mount table may become confused.
   535  	sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR);
   536  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   537  	// Detach the redundant hostfs version of /proc since it shows up in the
   538  	// mount table and software inspecting the mount table may become confused.
   539  	sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR);
   540  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   541  	// Detach both views of /writable: the one from hostfs and the one directly
   542  	// visible in /writable. Interfaces don't grant access to this directory
   543  	// and it has a large duplicated view of many mount points.  Note that this
   544  	// is only applicable to ubuntu-core systems.
   545  	sc_detach_views_of_writable(config->distro, config->normal_mode);
   546  }
   547  
   548  static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode)
   549  {
   550  	// Note that prior to detaching either mount point we switch the
   551  	// propagation to private to both limit the change to just this view and to
   552  	// prevent otherwise occurring event propagation from self-conflicting and
   553  	// returning EBUSY. A similar approach is used by snap-update-ns and is
   554  	// documented in umount(2).
   555  	const char *writable_dir = "/writable";
   556  	const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable";
   557  
   558  	// Writable only exists on ubuntu-core.
   559  	if (distro == SC_DISTRO_CLASSIC) {
   560  		return;
   561  	}
   562  	// On all core distributions we see /var/lib/snapd/hostfs/writable that
   563  	// exposes writable, with a structure specific to ubuntu-core.
   564  	debug("detaching %s", hostfs_writable_dir);
   565  	sc_do_mount("none", hostfs_writable_dir, NULL,
   566  		    MS_REC | MS_PRIVATE, NULL);
   567  	sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH);
   568  
   569  	// On ubuntu-core 16, when the executed snap uses core as base we also see
   570  	// the /writable that we directly inherited from the initial mount
   571  	// namespace.
   572  	if (distro == SC_DISTRO_CORE16 && !normal_mode) {
   573  		debug("detaching %s", writable_dir);
   574  		sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE,
   575  			    NULL);
   576  		sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH);
   577  	}
   578  }
   579  
   580  /**
   581   * @path:    a pathname where / replaced with '\0'.
   582   * @offsetp: pointer to int showing which path segment was last seen.
   583   *           Updated on return to reflect the next segment.
   584   * @fulllen: full original path length.
   585   * Returns a pointer to the next path segment, or NULL if done.
   586   */
   587  static char * __attribute__((used))
   588      get_nextpath(char *path, size_t *offsetp, size_t fulllen)
   589  {
   590  	size_t offset = *offsetp;
   591  
   592  	if (offset >= fulllen)
   593  		return NULL;
   594  
   595  	while (offset < fulllen && path[offset] != '\0')
   596  		offset++;
   597  	while (offset < fulllen && path[offset] == '\0')
   598  		offset++;
   599  
   600  	*offsetp = offset;
   601  	return (offset < fulllen) ? &path[offset] : NULL;
   602  }
   603  
   604  /**
   605   * Check that @subdir is a subdir of @dir.
   606  **/
   607  static bool __attribute__((used))
   608      is_subdir(const char *subdir, const char *dir)
   609  {
   610  	size_t dirlen = strlen(dir);
   611  	size_t subdirlen = strlen(subdir);
   612  
   613  	// @dir has to be at least as long as @subdir
   614  	if (subdirlen < dirlen)
   615  		return false;
   616  	// @dir has to be a prefix of @subdir
   617  	if (strncmp(subdir, dir, dirlen) != 0)
   618  		return false;
   619  	// @dir can look like "path/" (that is, end with the directory separator).
   620  	// When that is the case then given the test above we can be sure @subdir
   621  	// is a real subdirectory.
   622  	if (dirlen > 0 && dir[dirlen - 1] == '/')
   623  		return true;
   624  	// @subdir can look like "path/stuff" and when the directory separator
   625  	// is exactly at the spot where @dir ends (that is, it was not caught
   626  	// by the test above) then @subdir is a real subdirectory.
   627  	if (subdir[dirlen] == '/' && dirlen > 0)
   628  		return true;
   629  	// If both @dir and @subdir have identical length then given that the
   630  	// prefix check above @subdir is a real subdirectory.
   631  	if (subdirlen == dirlen)
   632  		return true;
   633  	return false;
   634  }
   635  
   636  void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   637  			  const sc_invocation * inv, const gid_t real_gid,
   638  			  const gid_t saved_gid)
   639  {
   640  	// Classify the current distribution, as claimed by /etc/os-release.
   641  	sc_distro distro = sc_classify_distro();
   642  
   643  	// Check which mode we should run in, normal or legacy.
   644  	if (inv->is_normal_mode) {
   645  		// In normal mode we use the base snap as / and set up several bind mounts.
   646  		const struct sc_mount mounts[] = {
   647  			{"/dev"},	// because it contains devices on host OS
   648  			{"/etc"},	// because that's where /etc/resolv.conf lives, perhaps a bad idea
   649  			{"/home"},	// to support /home/*/snap and home interface
   650  			{"/root"},	// because that is $HOME for services
   651  			{"/proc"},	// fundamental filesystem
   652  			{"/sys"},	// fundamental filesystem
   653  			{"/tmp"},	// to get writable tmp
   654  			{"/var/snap"},	// to get access to global snap data
   655  			{"/var/lib/snapd"},	// to get access to snapd state and seccomp profiles
   656  			{"/var/tmp"},	// to get access to the other temporary directory
   657  			{"/run"},	// to get /run with sockets and what not
   658  			{"/lib/modules",.is_optional = true},	// access to the modules of the running kernel
   659  			{"/lib/firmware",.is_optional = true},	// access to the firmware of the running kernel
   660  			{"/usr/src"},	// FIXME: move to SecurityMounts in system-trace interface
   661  			{"/var/log"},	// FIXME: move to SecurityMounts in log-observe interface
   662  #ifdef MERGED_USR
   663  			{"/run/media", true, "/media"},	// access to the users removable devices
   664  #else
   665  			{"/media", true},	// access to the users removable devices
   666  #endif				// MERGED_USR
   667  			{"/run/netns", true},	// access to the 'ip netns' network namespaces
   668  			// The /mnt directory is optional in base snaps to ensure backwards
   669  			// compatibility with the first version of base snaps that was
   670  			// released.
   671  			{"/mnt",.is_optional = true},	// to support the removable-media interface
   672  			{"/var/lib/extrausers",.is_optional = true},	// access to UID/GID of extrausers (if available)
   673  			{},
   674  		};
   675  		struct sc_mount_config normal_config = {
   676  			.rootfs_dir = inv->rootfs_dir,
   677  			.mounts = mounts,
   678  			.distro = distro,
   679  			.normal_mode = true,
   680  			.base_snap_name = inv->base_snap_name,
   681  		};
   682  		sc_bootstrap_mount_namespace(&normal_config);
   683  	} else {
   684  		// In legacy mode we don't pivot and instead just arrange bi-
   685  		// directional mount propagation for two directories.
   686  		const struct sc_mount mounts[] = {
   687  			{"/media", true},
   688  			{"/run/netns", true},
   689  			{},
   690  		};
   691  		struct sc_mount_config legacy_config = {
   692  			.rootfs_dir = "/",
   693  			.mounts = mounts,
   694  			.distro = distro,
   695  			.normal_mode = false,
   696  			.base_snap_name = inv->base_snap_name,
   697  		};
   698  		sc_bootstrap_mount_namespace(&legacy_config);
   699  	}
   700  
   701  	// TODO: rename this and fold it into bootstrap
   702  	setup_private_mount(inv->snap_instance);
   703  	// set up private /dev/pts
   704  	// TODO: fold this into bootstrap
   705  	setup_private_pts();
   706  
   707  	// setup the security backend bind mounts
   708  	sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor);
   709  }
   710  
   711  static bool is_mounted_with_shared_option(const char *dir)
   712      __attribute__((nonnull(1)));
   713  
   714  static bool is_mounted_with_shared_option(const char *dir)
   715  {
   716  	sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   717  	sm = sc_parse_mountinfo(NULL);
   718  	if (sm == NULL) {
   719  		die("cannot parse /proc/self/mountinfo");
   720  	}
   721  	sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm);
   722  	while (entry != NULL) {
   723  		const char *mount_dir = entry->mount_dir;
   724  		if (sc_streq(mount_dir, dir)) {
   725  			const char *optional_fields = entry->optional_fields;
   726  			if (strstr(optional_fields, "shared:") != NULL) {
   727  				return true;
   728  			}
   729  		}
   730  		entry = sc_next_mountinfo_entry(entry);
   731  	}
   732  	return false;
   733  }
   734  
   735  void sc_ensure_shared_snap_mount(void)
   736  {
   737  	if (!is_mounted_with_shared_option("/")
   738  	    && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) {
   739  		// TODO: We could be more aggressive and refuse to function but since
   740  		// we have no data on actual environments that happen to limp along in
   741  		// this configuration let's not do that yet.  This code should be
   742  		// removed once we have a measurement and feedback mechanism that lets
   743  		// us decide based on measurable data.
   744  		sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none",
   745  			    MS_BIND | MS_REC, 0);
   746  		sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC,
   747  			    NULL);
   748  	}
   749  }
   750  
   751  void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   752  			  const char *snap_name)
   753  {
   754  	debug("%s: %s", __FUNCTION__, snap_name);
   755  
   756  	char profile_path[PATH_MAX];
   757  	struct stat st;
   758  
   759  	sc_must_snprintf(profile_path, sizeof(profile_path),
   760  			 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name);
   761  	if (stat(profile_path, &st) != 0) {
   762  		// It is ok for the user fstab to not exist.
   763  		return;
   764  	}
   765  
   766  	// In our new mount namespace, recursively change all mounts
   767  	// to slave mode, so we see changes from the parent namespace
   768  	// but don't propagate our own changes.
   769  	sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL);
   770  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   771  	sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor);
   772  	(void)sc_set_effective_identity(old);
   773  }
   774  
   775  void sc_ensure_snap_dir_shared_mounts(void)
   776  {
   777  	const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL };
   778  	for (int i = 0; dirs[i] != NULL; i++) {
   779  		const char *dir = dirs[i];
   780  		if (!is_mounted_with_shared_option(dir)) {
   781  			/* Since this directory isn't yet shared (but it should be),
   782  			 * recursively bind mount it, then recursively share it so that
   783  			 * changes to the host are seen in the snap and vice-versa. This
   784  			 * allows us to fine-tune propagation events elsewhere for this new
   785  			 * mountpoint.
   786  			 *
   787  			 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR,
   788  			 * since snaps are already mounted, and it's not needed for
   789  			 * /var/snap.
   790  			 */
   791  			sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, 0);
   792  			sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED,
   793  				    NULL);
   794  		}
   795  	}
   796  }
   797  
   798  void sc_setup_parallel_instance_classic_mounts(const char *snap_name,
   799  					       const char *snap_instance_name)
   800  {
   801  	char src[PATH_MAX] = { 0 };
   802  	char dst[PATH_MAX] = { 0 };
   803  
   804  	const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL };
   805  	for (int i = 0; dirs[i] != NULL; i++) {
   806  		const char *dir = dirs[i];
   807  		sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL);
   808  	}
   809  
   810  	/* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */
   811  	sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR,
   812  			 snap_instance_name);
   813  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name);
   814  	sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0);
   815  
   816  	/* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */
   817  	sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name);
   818  	sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name);
   819  	sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0);
   820  }