github.com/kubiko/snapd@v0.0.0-20201013125620-d4f3094d9ddf/cmd/snap-confine/mount-support.c (about)

     1  /*
     2   * Copyright (C) 2015 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  #ifdef HAVE_CONFIG_H
    18  #include "config.h"
    19  #endif
    20  
    21  #include "mount-support.h"
    22  
    23  #include <errno.h>
    24  #include <fcntl.h>
    25  #include <libgen.h>
    26  #include <limits.h>
    27  #include <mntent.h>
    28  #include <sched.h>
    29  #include <stdio.h>
    30  #include <stdlib.h>
    31  #include <string.h>
    32  #include <sys/mount.h>
    33  #include <sys/stat.h>
    34  #include <sys/syscall.h>
    35  #include <sys/types.h>
    36  #include <sys/wait.h>
    37  #include <unistd.h>
    38  
    39  #include "../libsnap-confine-private/apparmor-support.h"
    40  #include "../libsnap-confine-private/classic.h"
    41  #include "../libsnap-confine-private/cleanup-funcs.h"
    42  #include "../libsnap-confine-private/mount-opt.h"
    43  #include "../libsnap-confine-private/mountinfo.h"
    44  #include "../libsnap-confine-private/snap.h"
    45  #include "../libsnap-confine-private/string-utils.h"
    46  #include "../libsnap-confine-private/tool.h"
    47  #include "../libsnap-confine-private/utils.h"
    48  #include "mount-support-nvidia.h"
    49  
// Size, in bytes, of the fixed-size path buffers declared in
// setup_private_mount().
#define MAX_BUF 1000

// Forward declaration: the definition appears after
// sc_bootstrap_mount_namespace(), which calls it as its final step.
static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode);
    53  
    54  // TODO: simplify this, after all it is just a tmpfs
    55  // TODO: fold this into bootstrap
    56  static void setup_private_mount(const char *snap_name)
    57  {
    58  	// Create a 0700 base directory. This is the "base" directory that is
    59  	// protected from other users. This directory name is NOT randomly
    60  	// generated. This has several properties:
    61  	//
    62  	// Users can relate to the name and can find the temporary directory as
    63  	// visible from within the snap. If this directory was random it would be
    64  	// harder to find because there may be situations in which multiple
    65  	// directories related to the same snap name would exist.
    66  	//
    67  	// Snapd can partially manage the directory. Specifically on snap remove
    68  	// snapd could remove the directory and everything in it, potentially
    69  	// avoiding runaway disk use on a machine that either never reboots or uses
    70  	// persistent /tmp directory.
    71  	//
    72  	// Underneath the base directory there is a "tmp" sub-directory that has
    73  	// mode 1777 and behaves as a typical /tmp directory would. That directory
    74  	// is used as a bind-mounted /tmp directory.
    75  	//
    76  	// Because the directories are reused across invocations by distinct users
    77  	// and because the directories are trivially guessable, each invocation
    78  	// unconditionally chowns/chmods them to appropriate values.
    79  	char base_dir[MAX_BUF] = { 0 };
    80  	char tmp_dir[MAX_BUF] = { 0 };
    81  	int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    82  	int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    83  	sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name);
    84  	sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir);
    85  
    86  	/* Switch to root group so that mkdir and open calls below create filesystem
    87  	 * elements that are not owned by the user calling into snap-confine. */
    88  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
    89  	// Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want
    90  	// to reuse and we will open with O_NOFOLLOW, below.
    91  	if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) {
    92  		die("cannot create base directory %s", base_dir);
    93  	}
    94  	base_dir_fd = open(base_dir,
    95  			   O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
    96  	if (base_dir_fd < 0) {
    97  		die("cannot open base directory %s", base_dir);
    98  	}
    99  	/* This seems redundant on first read but it has the non-obvious
   100  	 * property of changing existing directories  that have already existed
   101  	 * but had incorrect ownership or permission. This is possible due to
   102  	 * earlier bugs in snap-confine and due to the fact that some systems
   103  	 * use persistent /tmp directory and may not clean up leftover files
   104  	 * for arbitrarily long. This comment applies the following two pairs
   105  	 * of fchmod and fchown. */
   106  	if (fchmod(base_dir_fd, 0700) < 0) {
   107  		die("cannot chmod base directory %s to 0700", base_dir);
   108  	}
   109  	if (fchown(base_dir_fd, 0, 0) < 0) {
   110  		die("cannot chown base directory %s to root.root", base_dir);
   111  	}
   112  	// Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we
   113  	// want to reuse and we will open with O_NOFOLLOW, below.
   114  	if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) {
   115  		die("cannot create private tmp directory %s/tmp", base_dir);
   116  	}
   117  	(void)sc_set_effective_identity(old);
   118  	tmp_dir_fd = openat(base_dir_fd, "tmp",
   119  			    O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   120  	if (tmp_dir_fd < 0) {
   121  		die("cannot open private tmp directory %s/tmp", base_dir);
   122  	}
   123  	if (fchmod(tmp_dir_fd, 01777) < 0) {
   124  		die("cannot chmod private tmp directory %s/tmp to 01777",
   125  		    base_dir);
   126  	}
   127  	if (fchown(tmp_dir_fd, 0, 0) < 0) {
   128  		die("cannot chown private tmp directory %s/tmp to root.root",
   129  		    base_dir);
   130  	}
   131  	sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL);
   132  	sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL);
   133  }
   134  
   135  // TODO: fold this into bootstrap
   136  static void setup_private_pts(void)
   137  {
   138  	// See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt
   139  	//
   140  	// Ubuntu by default uses devpts 'single-instance' mode where
   141  	// /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change
   142  	// the startup scripts though, so we follow the instructions in point
   143  	// '4' of 'User-space changes' in the above doc. In other words, after
   144  	// unshare(CLONE_NEWNS), we mount devpts with -o
   145  	// newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto
   146  	// /dev/ptmx
   147  
   148  	struct stat st;
   149  
   150  	// Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode
   151  	// which doesn't provide the isolation we require.
   152  	if (stat("/dev/pts/ptmx", &st) != 0) {
   153  		die("cannot stat /dev/pts/ptmx");
   154  	}
   155  	// Make sure /dev/ptmx exists so we can bind mount over it
   156  	if (stat("/dev/ptmx", &st) != 0) {
   157  		die("cannot stat /dev/ptmx");
   158  	}
   159  	// Since multi-instance, use ptmxmode=0666. The other options are
   160  	// copied from /etc/default/devpts
   161  	sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
   162  		    "newinstance,ptmxmode=0666,mode=0620,gid=5");
   163  	sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0);
   164  }
   165  
// Description of one bind mount performed by sc_bootstrap_mount_namespace().
//
// NOTE: callers in this file initialize this struct positionally (for example
// {"/media", true}), so the order of the fields must not change.
struct sc_mount {
	// Source path; it is recursively bind mounted at the same path
	// underneath the scratch root directory.
	const char *path;
	// When true the directory is created if missing and the mount is left
	// as-is after the bind mount; when false the mount is additionally
	// made recursively slave so events only propagate into the namespace.
	bool is_bidirectional;
	// Alternate path defines the rbind mount "alternative" of path.
	// It exists so that we can make /media on systems that use /run/media.
	// NULL means there is no alternate location.
	const char *altpath;
	// Optional mount points are not processed unless the source and
	// destination both exist.
	bool is_optional;
};
   176  
// Input of sc_bootstrap_mount_namespace(): what to use as the root filesystem
// and which additional directories to bind mount into it.
struct sc_mount_config {
	// Directory that is recursively bind mounted over the scratch directory
	// and thereby becomes the content of the new root filesystem.
	const char *rootfs_dir;
	// The struct is terminated with an entry with NULL path.
	const struct sc_mount *mounts;
	// Distribution classification; selects distribution-specific steps
	// (tooling bind mount, nvidia driver mount, detaching /writable).
	sc_distro distro;
	// True in normal mode, false in legacy mode.
	bool normal_mode;
	// Name of the base snap; compared against "core" when deciding whether
	// snapd tooling has to be bind mounted into the new root.
	const char *base_snap_name;
};
   185  
   186  /**
   187   * Bootstrap mount namespace.
   188   *
   189   * This is a chunk of tricky code that lets us have full control over the
   190   * layout and direction of propagation of mount events. The documentation below
   191   * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source
   192   * tree.
   193   *
   194   * As a reminder two definitions are quoted below:
   195   *
   196   *  A 'propagation event' is defined as event generated on a vfsmount
   197   *  that leads to mount or unmount actions in other vfsmounts.
   198   *
   199   *  A 'peer group' is defined as a group of vfsmounts that propagate
   200   *  events to each other.
   201   *
   202   * (end of quote).
   203   *
   204   * The main idea is to setup a mount namespace that has a root filesystem with
   205   * vfsmounts and peer groups that, depending on the location, either isolate
   206   * or share with the rest of the system.
   207   *
   208   * The vast majority of the filesystem is shared in one direction. Events from
   209   * the outside (from the main mount namespace) propagate inside (to namespaces
   210   * of particular snaps) so things like new snap revisions, mounted drives, etc,
   211   * just show up as expected but even if a snap is exploited or malicious in
   212   * nature it cannot affect anything in another namespace where it might cause
   213   * security or stability issues.
   214   *
   215   * Selected directories (today just /media) can be shared in both directions.
   216   * This allows snaps with sufficient privileges to either create, through the
   217   * mount system call, additional mount points that are visible by the rest of
   218   * the system (both the main mount namespace and namespaces of individual
   219   * snaps) or remove them, through the unmount system call.
   220   **/
   221  static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config)
   222  {
   223  	char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX";
   224  	char src[PATH_MAX] = { 0 };
   225  	char dst[PATH_MAX] = { 0 };
   226  	if (mkdtemp(scratch_dir) == NULL) {
   227  		die("cannot create temporary directory for the root file system");
   228  	}
   229  	// NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new
   230  	// mount namespace and have a private list of mounts.
   231  	debug("scratch directory for constructing namespace: %s", scratch_dir);
   232  	// Make the root filesystem recursively shared. This way propagation events
   233  	// will be shared with main mount namespace.
   234  	sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL);
   235  	// Bind mount the temporary scratch directory for root filesystem over
   236  	// itself so that it is a mount point. This is done so that it can become
   237  	// unbindable as explained below.
   238  	sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL);
   239  	// Make the scratch directory unbindable.
   240  	//
   241  	// This is necessary as otherwise a mount loop can occur and the kernel
   242  	// would crash. The term unbindable simply states that it cannot be bind
   243  	// mounted anywhere. When we construct recursive bind mounts below this
   244  	// guarantees that this directory will not be replicated anywhere.
   245  	sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL);
   246  	// Recursively bind mount desired root filesystem directory over the
   247  	// scratch directory. This puts the initial content into the scratch space
   248  	// and serves as a foundation for all subsequent operations below.
   249  	//
   250  	// The mount is recursive because it can either be applied to the root
   251  	// filesystem of a core system (aka all-snap) or the core snap on a classic
   252  	// system. In the former case we need recursive bind mounts to accurately
   253  	// replicate the state of the root filesystem into the scratch directory.
   254  	sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND,
   255  		    NULL);
   256  	// Make the scratch directory recursively slave. Nothing done there will be
   257  	// shared with the initial mount namespace. This effectively detaches us,
   258  	// in one way, from the original namespace and coupled with pivot_root
   259  	// below serves as the foundation of the mount sandbox.
   260  	sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL);
   261  	// Bind mount certain directories from the host filesystem to the scratch
   262  	// directory. By default mount events will propagate in both into and out
   263  	// of the peer group. This way the running application can alter any global
   264  	// state visible on the host and in other snaps. This can be restricted by
   265  	// disabling the "is_bidirectional" flag as can be seen below.
   266  	for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL;
   267  	     mnt++) {
   268  
   269  		if (mnt->is_bidirectional) {
   270  			sc_identity old =
   271  			    sc_set_effective_identity(sc_root_group_identity());
   272  			if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) {
   273  				die("cannot create %s", mnt->path);
   274  			}
   275  			(void)sc_set_effective_identity(old);
   276  		}
   277  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   278  				 mnt->path);
   279  		if (mnt->is_optional) {
   280  			bool ok = sc_do_optional_mount(mnt->path, dst, NULL,
   281  						       MS_REC | MS_BIND, NULL);
   282  			if (!ok) {
   283  				// If we cannot mount it, just continue.
   284  				continue;
   285  			}
   286  		} else {
   287  			sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND,
   288  				    NULL);
   289  		}
   290  		if (!mnt->is_bidirectional) {
   291  			// Mount events will only propagate inwards to the namespace. This
   292  			// way the running application cannot alter any global state apart
   293  			// from that of its own snap.
   294  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   295  		}
   296  		if (mnt->altpath == NULL) {
   297  			continue;
   298  		}
   299  		// An alternate path of mnt->path is provided at another location.
   300  		// It should behave exactly the same as the original.
   301  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   302  				 mnt->altpath);
   303  		struct stat stat_buf;
   304  		if (lstat(dst, &stat_buf) < 0) {
   305  			die("cannot lstat %s", dst);
   306  		}
   307  		if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) {
   308  			die("cannot bind mount alternate path over a symlink: %s", dst);
   309  		}
   310  		sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL);
   311  		if (!mnt->is_bidirectional) {
   312  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   313  		}
   314  	}
   315  	if (config->normal_mode) {
   316  		// Since we mounted /etc from the host filesystem to the scratch directory,
   317  		// we may need to put certain directories from the desired root filesystem
   318  		// (e.g. the core snap) back. This way the behavior of running snaps is not
   319  		// affected by the alternatives directory from the host, if one exists.
   320  		//
   321  		// Fixes the following bugs:
   322  		//  - https://bugs.launchpad.net/snap-confine/+bug/1580018
   323  		//  - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568
   324  		const char *dirs_from_core[] =
   325  		    { "/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf",
   326  			NULL
   327  		};
   328  		for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) {
   329  			const char *dir = *dirs;
   330  			if (access(dir, F_OK) != 0) {
   331  				continue;
   332  			}
   333  			struct stat dst_stat;
   334  			struct stat src_stat;
   335  			sc_must_snprintf(src, sizeof src, "%s%s",
   336  					 config->rootfs_dir, dir);
   337  			sc_must_snprintf(dst, sizeof dst, "%s%s",
   338  					 scratch_dir, dir);
   339  			if (lstat(src, &src_stat) != 0) {
   340  				if (errno == ENOENT) {
   341  					continue;
   342  				}
   343  				die("cannot stat %s from desired rootfs", src);
   344  			}
   345  			if (!S_ISREG(src_stat.st_mode)
   346  			    && !S_ISDIR(src_stat.st_mode)) {
   347  				debug
   348  				    ("entry %s from the desired rootfs is not a file or directory, skipping mount",
   349  				     src);
   350  				continue;
   351  			}
   352  
   353  			if (lstat(dst, &dst_stat) != 0) {
   354  				if (errno == ENOENT) {
   355  					continue;
   356  				}
   357  				die("cannot stat %s from host", src);
   358  			}
   359  			if (!S_ISREG(dst_stat.st_mode)
   360  			    && !S_ISDIR(dst_stat.st_mode)) {
   361  				debug
   362  				    ("entry %s from the host is not a file or directory, skipping mount",
   363  				     src);
   364  				continue;
   365  			}
   366  
   367  			if ((dst_stat.st_mode & S_IFMT) !=
   368  			    (src_stat.st_mode & S_IFMT)) {
   369  				debug
   370  				    ("entries %s and %s are of different types, skipping mount",
   371  				     dst, src);
   372  				continue;
   373  			}
   374  			// both source and destination exist where both are either files
   375  			// or both are directories
   376  			sc_do_mount(src, dst, NULL, MS_BIND, NULL);
   377  			sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   378  		}
   379  	}
   380  	// The "core" base snap is special as it contains snapd and friends.
   381  	// Other base snaps do not, so whenever a base snap other than core is
   382  	// in use we need extra provisions for setting up internal tooling to
   383  	// be available.
   384  	//
   385  	// However on a core18 (and similar) system the core snap is not
   386  	// a special base anymore and we should map our own tooling in.
   387  	if (config->distro == SC_DISTRO_CORE_OTHER
   388  	    || !sc_streq(config->base_snap_name, "core")) {
   389  		// when bases are used we need to bind-mount the libexecdir
   390  		// (that contains snap-exec) into /usr/lib/snapd of the
   391  		// base snap so that snap-exec is available for the snaps
   392  		// (base snaps do not ship snapd)
   393  
   394  		// dst is always /usr/lib/snapd as this is where snapd
   395  		// assumes to find snap-exec
   396  		sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd",
   397  				 scratch_dir);
   398  
   399  		// bind mount the current $ROOT/usr/lib/snapd path,
   400  		// where $ROOT is either "/" or the "/snap/{core,snapd}/current"
   401  		// that we are re-execing from
   402  		char *src = NULL;
   403  		char self[PATH_MAX + 1] = { 0 };
   404  		ssize_t nread;
   405  		nread = readlink("/proc/self/exe", self, sizeof self - 1);
   406  		if (nread < 0) {
   407  			die("cannot read /proc/self/exe");
   408  		}
   409  		// Though we initialized self to NULs and passed one less to
   410  		// readlink, therefore guaranteeing that self is
   411  		// zero-terminated, perform an explicit assignment to make
   412  		// Coverity happy.
   413  		self[nread] = '\0';
   414  		// this cannot happen except when the kernel is buggy
   415  		if (strstr(self, "/snap-confine") == NULL) {
   416  			die("cannot use result from readlink: %s", self);
   417  		}
   418  		src = dirname(self);
   419  		// dirname(path) might return '.' depending on path.
   420  		// /proc/self/exe should always point
   421  		// to an absolute path, but let's guarantee that.
   422  		if (src[0] != '/') {
   423  			die("cannot use the result of dirname(): %s", src);
   424  		}
   425  
   426  		sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL);
   427  		sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   428  	}
   429  	// Bind mount the directory where all snaps are mounted. The location of
   430  	// the this directory on the host filesystem may not match the location in
   431  	// the desired root filesystem. In the "core" and "ubuntu-core" snaps the
   432  	// directory is always /snap. On the host it is a build-time configuration
   433  	// option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not
   434  	// in normal mode), we don't need to do this because /snap is fixed and
   435  	// already contains the correct view of the mounted snaps.
   436  	if (config->normal_mode) {
   437  		sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir);
   438  		sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL);
   439  		sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   440  	}
   441  	// Create the hostfs directory if one is missing. This directory is a part
   442  	// of packaging now so perhaps this code can be removed later.
   443  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   444  	if (mkdir(SC_HOSTFS_DIR, 0755) < 0) {
   445  		if (errno != EEXIST) {
   446  			die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR);
   447  		}
   448  	}
   449  	(void)sc_set_effective_identity(old);
   450  	// Ensure that hostfs isgroup owned by root. We may have (now or earlier)
   451  	// created the directory as the user who first ran a snap on a given
   452  	// system and the group identity of that user is visilbe on disk.
   453  	// This was LP:#1665004
   454  	struct stat sb;
   455  	if (stat(SC_HOSTFS_DIR, &sb) < 0) {
   456  		die("cannot stat %s", SC_HOSTFS_DIR);
   457  	}
   458  	if (sb.st_uid != 0 || sb.st_gid != 0) {
   459  		if (chown(SC_HOSTFS_DIR, 0, 0) < 0) {
   460  			die("cannot change user/group owner of %s to root",
   461  			    SC_HOSTFS_DIR);
   462  		}
   463  	}
   464  	// Make the upcoming "put_old" directory for pivot_root private so that
   465  	// mount events don't propagate to any peer group. In practice pivot root
   466  	// has a number of undocumented requirements and one of them is that the
   467  	// "put_old" directory (the second argument) cannot be shared in any way.
   468  	sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR);
   469  	sc_do_mount(dst, dst, NULL, MS_BIND, NULL);
   470  	sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL);
   471  	// On classic mount the nvidia driver. Ideally this would be done in an
   472  	// uniform way after pivot_root but this is good enough and requires less
   473  	// code changes the nvidia code assumes it has access to the existing
   474  	// pre-pivot filesystem.
   475  	if (config->distro == SC_DISTRO_CLASSIC) {
   476  		sc_mount_nvidia_driver(scratch_dir);
   477  	}
   478  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   479  	//                    pivot_root
   480  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   481  	// Use pivot_root to "chroot" into the scratch directory.
   482  	//
   483  	// Q: Why are we using something as esoteric as pivot_root(2)?
   484  	// A: Because this makes apparmor handling easy. Using a normal chroot
   485  	// makes all apparmor rules conditional.  We are either running on an
   486  	// all-snap system where this would-be chroot didn't happen and all the
   487  	// rules see / as the root file system _OR_ we are running on top of a
   488  	// classic distribution and this chroot has now moved all paths to
   489  	// /tmp/snap.rootfs_*.
   490  	//
   491  	// Because we are using unshare(2) with CLONE_NEWNS we can essentially use
   492  	// pivot_root just like chroot but this makes apparmor unaware of the old
   493  	// root so everything works okay.
   494  	//
   495  	// HINT: If you are debugging this and are trying to see why pivot_root
   496  	// happens to return EINVAL with any changes you may be making, please
   497  	// consider applying
   498  	// misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree
   499  	// kernel.
   500  	debug("performing operation: pivot_root %s %s", scratch_dir, dst);
   501  	if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) {
   502  		die("cannot perform operation: pivot_root %s %s", scratch_dir,
   503  		    dst);
   504  	}
   505  	// Unmount the self-bind mount over the scratch directory created earlier
   506  	// in the original root filesystem (which is now mounted on SC_HOSTFS_DIR).
   507  	// This way we can remove the temporary directory we created and "clean up"
   508  	// after ourselves nicely.
   509  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir);
   510  	sc_do_umount(dst, UMOUNT_NOFOLLOW);
   511  	// Remove the scratch directory. Note that we are using the path that is
   512  	// based on the old root filesystem as after pivot_root we cannot guarantee
   513  	// what is present at the same location normally. (It is probably an empty
   514  	// /tmp directory that is populated in another place).
   515  	debug("performing operation: rmdir %s", dst);
   516  	if (rmdir(scratch_dir) < 0) {
   517  		die("cannot perform operation: rmdir %s", dst);
   518  	};
   519  	// Make the old root filesystem recursively slave. This way operations
   520  	// performed in this mount namespace will not propagate to the peer group.
   521  	// This is another essential part of the confinement system.
   522  	sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL);
   523  	// Detach the redundant hostfs version of sysfs since it shows up in the
   524  	// mount table and software inspecting the mount table may become confused
   525  	// (eg, docker and LP:# 162601).
   526  	sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR);
   527  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   528  	// Detach the redundant hostfs version of /dev since it shows up in the
   529  	// mount table and software inspecting the mount table may become confused.
   530  	sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR);
   531  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   532  	// Detach the redundant hostfs version of /proc since it shows up in the
   533  	// mount table and software inspecting the mount table may become confused.
   534  	sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR);
   535  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   536  	// Detach both views of /writable: the one from hostfs and the one directly
   537  	// visible in /writable. Interfaces don't grant access to this directory
   538  	// and it has a large duplicated view of many mount points.  Note that this
   539  	// is only applicable to ubuntu-core systems.
   540  	sc_detach_views_of_writable(config->distro, config->normal_mode);
   541  }
   542  
   543  static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode)
   544  {
   545  	// Note that prior to detaching either mount point we switch the
   546  	// propagation to private to both limit the change to just this view and to
   547  	// prevent otherwise occurring event propagation from self-conflicting and
   548  	// returning EBUSY. A similar approach is used by snap-update-ns and is
   549  	// documented in umount(2).
   550  	const char *writable_dir = "/writable";
   551  	const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable";
   552  
   553  	// Writable only exists on ubuntu-core.
   554  	if (distro == SC_DISTRO_CLASSIC) {
   555  		return;
   556  	}
   557  	// On all core distributions we see /var/lib/snapd/hostfs/writable that
   558  	// exposes writable, with a structure specific to ubuntu-core.
   559  	debug("detaching %s", hostfs_writable_dir);
   560  	sc_do_mount("none", hostfs_writable_dir, NULL,
   561  		    MS_REC | MS_PRIVATE, NULL);
   562  	sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH);
   563  
   564  	// On ubuntu-core 16, when the executed snap uses core as base we also see
   565  	// the /writable that we directly inherited from the initial mount
   566  	// namespace.
   567  	if (distro == SC_DISTRO_CORE16 && !normal_mode) {
   568  		debug("detaching %s", writable_dir);
   569  		sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE,
   570  			    NULL);
   571  		sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH);
   572  	}
   573  }
   574  
   575  /**
   576   * @path:    a pathname where / replaced with '\0'.
   577   * @offsetp: pointer to int showing which path segment was last seen.
   578   *           Updated on return to reflect the next segment.
   579   * @fulllen: full original path length.
   580   * Returns a pointer to the next path segment, or NULL if done.
   581   */
   582  static char * __attribute__((used))
   583      get_nextpath(char *path, size_t *offsetp, size_t fulllen)
   584  {
   585  	size_t offset = *offsetp;
   586  
   587  	if (offset >= fulllen)
   588  		return NULL;
   589  
   590  	while (offset < fulllen && path[offset] != '\0')
   591  		offset++;
   592  	while (offset < fulllen && path[offset] == '\0')
   593  		offset++;
   594  
   595  	*offsetp = offset;
   596  	return (offset < fulllen) ? &path[offset] : NULL;
   597  }
   598  
   599  /**
   600   * Check that @subdir is a subdir of @dir.
   601  **/
   602  static bool __attribute__((used))
   603      is_subdir(const char *subdir, const char *dir)
   604  {
   605  	size_t dirlen = strlen(dir);
   606  	size_t subdirlen = strlen(subdir);
   607  
   608  	// @dir has to be at least as long as @subdir
   609  	if (subdirlen < dirlen)
   610  		return false;
   611  	// @dir has to be a prefix of @subdir
   612  	if (strncmp(subdir, dir, dirlen) != 0)
   613  		return false;
   614  	// @dir can look like "path/" (that is, end with the directory separator).
   615  	// When that is the case then given the test above we can be sure @subdir
   616  	// is a real subdirectory.
   617  	if (dirlen > 0 && dir[dirlen - 1] == '/')
   618  		return true;
   619  	// @subdir can look like "path/stuff" and when the directory separator
   620  	// is exactly at the spot where @dir ends (that is, it was not caught
   621  	// by the test above) then @subdir is a real subdirectory.
   622  	if (subdir[dirlen] == '/' && dirlen > 0)
   623  		return true;
   624  	// If both @dir and @subdir have identical length then given that the
   625  	// prefix check above @subdir is a real subdirectory.
   626  	if (subdirlen == dirlen)
   627  		return true;
   628  	return false;
   629  }
   630  
   631  void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   632  			  const sc_invocation * inv, const gid_t real_gid,
   633  			  const gid_t saved_gid)
   634  {
   635  	// Classify the current distribution, as claimed by /etc/os-release.
   636  	sc_distro distro = sc_classify_distro();
   637  
   638  	// Check which mode we should run in, normal or legacy.
   639  	if (inv->is_normal_mode) {
   640  		// In normal mode we use the base snap as / and set up several bind mounts.
   641  		const struct sc_mount mounts[] = {
   642  			{"/dev"},	// because it contains devices on host OS
   643  			{"/etc"},	// because that's where /etc/resolv.conf lives, perhaps a bad idea
   644  			{"/home"},	// to support /home/*/snap and home interface
   645  			{"/root"},	// because that is $HOME for services
   646  			{"/proc"},	// fundamental filesystem
   647  			{"/sys"},	// fundamental filesystem
   648  			{"/tmp"},	// to get writable tmp
   649  			{"/var/snap"},	// to get access to global snap data
   650  			{"/var/lib/snapd"},	// to get access to snapd state and seccomp profiles
   651  			{"/var/tmp"},	// to get access to the other temporary directory
   652  			{"/run"},	// to get /run with sockets and what not
   653  			{"/lib/modules",.is_optional = true},	// access to the modules of the running kernel
   654  			{"/lib/firmware",.is_optional = true},	// access to the firmware of the running kernel
   655  			{"/usr/src"},	// FIXME: move to SecurityMounts in system-trace interface
   656  			{"/var/log"},	// FIXME: move to SecurityMounts in log-observe interface
   657  #ifdef MERGED_USR
   658  			{"/run/media", true, "/media"},	// access to the users removable devices
   659  #else
   660  			{"/media", true},	// access to the users removable devices
   661  #endif				// MERGED_USR
   662  			{"/run/netns", true},	// access to the 'ip netns' network namespaces
   663  			// The /mnt directory is optional in base snaps to ensure backwards
   664  			// compatibility with the first version of base snaps that was
   665  			// released.
   666  			{"/mnt",.is_optional = true},	// to support the removable-media interface
   667  			{"/var/lib/extrausers",.is_optional = true},	// access to UID/GID of extrausers (if available)
   668  			{},
   669  		};
   670  		struct sc_mount_config normal_config = {
   671  			.rootfs_dir = inv->rootfs_dir,
   672  			.mounts = mounts,
   673  			.distro = distro,
   674  			.normal_mode = true,
   675  			.base_snap_name = inv->base_snap_name,
   676  		};
   677  		sc_bootstrap_mount_namespace(&normal_config);
   678  	} else {
   679  		// In legacy mode we don't pivot and instead just arrange bi-
   680  		// directional mount propagation for two directories.
   681  		const struct sc_mount mounts[] = {
   682  			{"/media", true},
   683  			{"/run/netns", true},
   684  			{},
   685  		};
   686  		struct sc_mount_config legacy_config = {
   687  			.rootfs_dir = "/",
   688  			.mounts = mounts,
   689  			.distro = distro,
   690  			.normal_mode = false,
   691  			.base_snap_name = inv->base_snap_name,
   692  		};
   693  		sc_bootstrap_mount_namespace(&legacy_config);
   694  	}
   695  
   696  	// TODO: rename this and fold it into bootstrap
   697  	setup_private_mount(inv->snap_instance);
   698  	// set up private /dev/pts
   699  	// TODO: fold this into bootstrap
   700  	setup_private_pts();
   701  
   702  	// setup the security backend bind mounts
   703  	sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor);
   704  }
   705  
   706  static bool is_mounted_with_shared_option(const char *dir)
   707      __attribute__((nonnull(1)));
   708  
   709  static bool is_mounted_with_shared_option(const char *dir)
   710  {
   711  	sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   712  	sm = sc_parse_mountinfo(NULL);
   713  	if (sm == NULL) {
   714  		die("cannot parse /proc/self/mountinfo");
   715  	}
   716  	sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm);
   717  	while (entry != NULL) {
   718  		const char *mount_dir = entry->mount_dir;
   719  		if (sc_streq(mount_dir, dir)) {
   720  			const char *optional_fields = entry->optional_fields;
   721  			if (strstr(optional_fields, "shared:") != NULL) {
   722  				return true;
   723  			}
   724  		}
   725  		entry = sc_next_mountinfo_entry(entry);
   726  	}
   727  	return false;
   728  }
   729  
   730  void sc_ensure_shared_snap_mount(void)
   731  {
   732  	if (!is_mounted_with_shared_option("/")
   733  	    && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) {
   734  		// TODO: We could be more aggressive and refuse to function but since
   735  		// we have no data on actual environments that happen to limp along in
   736  		// this configuration let's not do that yet.  This code should be
   737  		// removed once we have a measurement and feedback mechanism that lets
   738  		// us decide based on measurable data.
   739  		sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none",
   740  			    MS_BIND | MS_REC, 0);
   741  		sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC,
   742  			    NULL);
   743  	}
   744  }
   745  
   746  void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   747  			  const char *snap_name)
   748  {
   749  	debug("%s: %s", __FUNCTION__, snap_name);
   750  
   751  	char profile_path[PATH_MAX];
   752  	struct stat st;
   753  
   754  	sc_must_snprintf(profile_path, sizeof(profile_path),
   755  			 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name);
   756  	if (stat(profile_path, &st) != 0) {
   757  		// It is ok for the user fstab to not exist.
   758  		return;
   759  	}
   760  
   761  	// In our new mount namespace, recursively change all mounts
   762  	// to slave mode, so we see changes from the parent namespace
   763  	// but don't propagate our own changes.
   764  	sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL);
   765  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   766  	sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor);
   767  	(void)sc_set_effective_identity(old);
   768  }
   769  
   770  void sc_ensure_snap_dir_shared_mounts(void)
   771  {
   772  	const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL };
   773  	for (int i = 0; dirs[i] != NULL; i++) {
   774  		const char *dir = dirs[i];
   775  		if (!is_mounted_with_shared_option(dir)) {
   776  			/* Since this directory isn't yet shared (but it should be),
   777  			 * recursively bind mount it, then recursively share it so that
   778  			 * changes to the host are seen in the snap and vice-versa. This
   779  			 * allows us to fine-tune propagation events elsewhere for this new
   780  			 * mountpoint.
   781  			 *
   782  			 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR,
   783  			 * since snaps are already mounted, and it's not needed for
   784  			 * /var/snap.
   785  			 */
   786  			sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, 0);
   787  			sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED,
   788  				    NULL);
   789  		}
   790  	}
   791  }
   792  
   793  void sc_setup_parallel_instance_classic_mounts(const char *snap_name,
   794  					       const char *snap_instance_name)
   795  {
   796  	char src[PATH_MAX] = { 0 };
   797  	char dst[PATH_MAX] = { 0 };
   798  
   799  	const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL };
   800  	for (int i = 0; dirs[i] != NULL; i++) {
   801  		const char *dir = dirs[i];
   802  		sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL);
   803  	}
   804  
   805  	/* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */
   806  	sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR,
   807  			 snap_instance_name);
   808  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name);
   809  	sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0);
   810  
   811  	/* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */
   812  	sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name);
   813  	sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name);
   814  	sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0);
   815  }