github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/mount-support.c (about)

     1  /*
     2   * Copyright (C) 2015 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  #ifdef HAVE_CONFIG_H
    18  #include "config.h"
    19  #endif
    20  
    21  #include "mount-support.h"
    22  
    23  #include <errno.h>
    24  #include <fcntl.h>
    25  #include <libgen.h>
    26  #include <limits.h>
    27  #include <mntent.h>
    28  #include <sched.h>
    29  #include <stdio.h>
    30  #include <stdlib.h>
    31  #include <string.h>
    32  #include <sys/mount.h>
    33  #include <sys/stat.h>
    34  #include <sys/syscall.h>
    35  #include <sys/types.h>
    36  #include <sys/types.h>
    37  #include <sys/wait.h>
    38  #include <unistd.h>
    39  
    40  #include "../libsnap-confine-private/apparmor-support.h"
    41  #include "../libsnap-confine-private/classic.h"
    42  #include "../libsnap-confine-private/cleanup-funcs.h"
    43  #include "../libsnap-confine-private/mount-opt.h"
    44  #include "../libsnap-confine-private/mountinfo.h"
    45  #include "../libsnap-confine-private/snap.h"
    46  #include "../libsnap-confine-private/string-utils.h"
    47  #include "../libsnap-confine-private/tool.h"
    48  #include "../libsnap-confine-private/utils.h"
    49  #include "mount-support-nvidia.h"
    50  
    51  #define MAX_BUF 1000
    52  
    53  // TODO: simplify this, after all it is just a tmpfs
    54  // TODO: fold this into bootstrap
    55  static void setup_private_mount(const char *snap_name)
    56  {
    57  	// Create a 0700 base directory. This is the "base" directory that is
    58  	// protected from other users. This directory name is NOT randomly
    59  	// generated. This has several properties:
    60  	//
    61  	// Users can relate to the name and can find the temporary directory as
    62  	// visible from within the snap. If this directory was random it would be
    63  	// harder to find because there may be situations in which multiple
    64  	// directories related to the same snap name would exist.
    65  	//
    66  	// Snapd can partially manage the directory. Specifically on snap remove
    67  	// snapd could remove the directory and everything in it, potentially
    68  	// avoiding runaway disk use on a machine that either never reboots or uses
    69  	// persistent /tmp directory.
    70  	//
    71  	// Underneath the base directory there is a "tmp" sub-directory that has
    72  	// mode 1777 and behaves as a typical /tmp directory would. That directory
    73  	// is used as a bind-mounted /tmp directory.
    74  	//
    75  	// Because the directories are reused across invocations by distinct users
    76  	// and because the directories are trivially guessable, each invocation
    77  	// unconditionally chowns/chmods them to appropriate values.
    78  	char base_dir[MAX_BUF] = { 0 };
    79  	char tmp_dir[MAX_BUF] = { 0 };
    80  	int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    81  	int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    82  	sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name);
    83  	sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir);
    84  
    85  	// Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want
    86  	// to reuse and we will open with O_NOFOLLOW, below.
    87  	if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) {
    88  		die("cannot create base directory %s", base_dir);
    89  	}
    90  	base_dir_fd = open(base_dir,
    91  			   O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
    92  	if (base_dir_fd < 0) {
    93  		die("cannot open base directory %s", base_dir);
    94  	}
    95  	if (fchmod(base_dir_fd, 0700) < 0) {
    96  		die("cannot chmod base directory %s to 0700", base_dir);
    97  	}
    98  	if (fchown(base_dir_fd, 0, 0) < 0) {
    99  		die("cannot chown base directory %s to root.root", base_dir);
   100  	}
   101  	// Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we
   102  	// want to reuse and we will open with O_NOFOLLOW, below.
   103  	if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) {
   104  		die("cannot create private tmp directory %s/tmp", base_dir);
   105  	}
   106  	tmp_dir_fd = openat(base_dir_fd, "tmp",
   107  			    O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   108  	if (tmp_dir_fd < 0) {
   109  		die("cannot open private tmp directory %s/tmp", base_dir);
   110  	}
   111  	if (fchmod(tmp_dir_fd, 01777) < 0) {
   112  		die("cannot chmod private tmp directory %s/tmp to 01777",
   113  		    base_dir);
   114  	}
   115  	if (fchown(tmp_dir_fd, 0, 0) < 0) {
   116  		die("cannot chown private tmp directory %s/tmp to root.root",
   117  		    base_dir);
   118  	}
   119  	sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL);
   120  	sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL);
   121  }
   122  
   123  // TODO: fold this into bootstrap
   124  static void setup_private_pts(void)
   125  {
   126  	// See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt
   127  	//
   128  	// Ubuntu by default uses devpts 'single-instance' mode where
   129  	// /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change
   130  	// the startup scripts though, so we follow the instructions in point
   131  	// '4' of 'User-space changes' in the above doc. In other words, after
   132  	// unshare(CLONE_NEWNS), we mount devpts with -o
   133  	// newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto
   134  	// /dev/ptmx
   135  
   136  	struct stat st;
   137  
   138  	// Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode
   139  	// which doesn't provide the isolation we require.
   140  	if (stat("/dev/pts/ptmx", &st) != 0) {
   141  		die("cannot stat /dev/pts/ptmx");
   142  	}
   143  	// Make sure /dev/ptmx exists so we can bind mount over it
   144  	if (stat("/dev/ptmx", &st) != 0) {
   145  		die("cannot stat /dev/ptmx");
   146  	}
   147  	// Since multi-instance, use ptmxmode=0666. The other options are
   148  	// copied from /etc/default/devpts
   149  	sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
   150  		    "newinstance,ptmxmode=0666,mode=0620,gid=5");
   151  	sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0);
   152  }
   153  
// Description of a single directory that sc_bootstrap_mount_namespace bind
// mounts from the host into the scratch root filesystem.
//
// NOTE: callers initialize this struct positionally, so the order of the
// fields must not change.
struct sc_mount {
	// Path used both as the source of the recursive bind mount and,
	// prefixed with the scratch directory, as its destination.
	const char *path;
	// When true, the path is created on the host if missing and the mount
	// is left as-is after the bind. When false, the mount is additionally
	// made recursively slave so that events only propagate inwards.
	bool is_bidirectional;
	// Alternate path defines the rbind mount "alternative" of path.
	// It exists so that we can make /media on systems that use /run/media.
	// May be NULL, in which case no alternate mount is made.
	const char *altpath;
	// Optional mount points are not processed unless the source and
	// destination both exist.
	// They are mounted with sc_do_optional_mount and silently skipped when
	// that call reports failure.
	bool is_optional;
};
   164  
// Input describing how sc_bootstrap_mount_namespace should construct the
// mount namespace.
struct sc_mount_config {
	// Directory that is recursively bind mounted over the scratch directory
	// and thus becomes the new root filesystem.
	const char *rootfs_dir;
	// The struct is terminated with an entry with NULL path.
	const struct sc_mount *mounts;
	// Distribution classification; compared against SC_DISTRO_CORE_OTHER
	// and SC_DISTRO_CLASSIC during bootstrap.
	sc_distro distro;
	// True in normal mode, false in legacy mode. Normal mode enables the
	// extra /etc and /snap mounts.
	bool normal_mode;
	// Name of the base snap; any value other than "core" causes the
	// snap-confine directory to be mounted at /usr/lib/snapd.
	const char *base_snap_name;
};
   173  
   174  /**
   175   * Bootstrap mount namespace.
   176   *
   177   * This is a chunk of tricky code that lets us have full control over the
   178   * layout and direction of propagation of mount events. The documentation below
   179   * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source
   180   * tree.
   181   *
   182   * As a reminder two definitions are quoted below:
   183   *
   184   *  A 'propagation event' is defined as event generated on a vfsmount
   185   *  that leads to mount or unmount actions in other vfsmounts.
   186   *
   187   *  A 'peer group' is defined as a group of vfsmounts that propagate
   188   *  events to each other.
   189   *
   190   * (end of quote).
   191   *
   192   * The main idea is to setup a mount namespace that has a root filesystem with
   193   * vfsmounts and peer groups that, depending on the location, either isolate
   194   * or share with the rest of the system.
   195   *
   196   * The vast majority of the filesystem is shared in one direction. Events from
   197   * the outside (from the main mount namespace) propagate inside (to namespaces
   198   * of particular snaps) so things like new snap revisions, mounted drives, etc,
   199   * just show up as expected but even if a snap is exploited or malicious in
   200   * nature it cannot affect anything in another namespace where it might cause
   201   * security or stability issues.
   202   *
   203   * Selected directories (today just /media) can be shared in both directions.
   204   * This allows snaps with sufficient privileges to either create, through the
   205   * mount system call, additional mount points that are visible by the rest of
   206   * the system (both the main mount namespace and namespaces of individual
   207   * snaps) or remove them, through the unmount system call.
   208   **/
   209  static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config)
   210  {
   211  	char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX";
   212  	char src[PATH_MAX] = { 0 };
   213  	char dst[PATH_MAX] = { 0 };
   214  	if (mkdtemp(scratch_dir) == NULL) {
   215  		die("cannot create temporary directory for the root file system");
   216  	}
   217  	// NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new
   218  	// mount namespace and have a private list of mounts.
   219  	debug("scratch directory for constructing namespace: %s", scratch_dir);
   220  	// Make the root filesystem recursively shared. This way propagation events
   221  	// will be shared with main mount namespace.
   222  	sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL);
   223  	// Bind mount the temporary scratch directory for root filesystem over
   224  	// itself so that it is a mount point. This is done so that it can become
   225  	// unbindable as explained below.
   226  	sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL);
   227  	// Make the scratch directory unbindable.
   228  	//
   229  	// This is necessary as otherwise a mount loop can occur and the kernel
   230  	// would crash. The term unbindable simply states that it cannot be bind
   231  	// mounted anywhere. When we construct recursive bind mounts below this
   232  	// guarantees that this directory will not be replicated anywhere.
   233  	sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL);
   234  	// Recursively bind mount desired root filesystem directory over the
   235  	// scratch directory. This puts the initial content into the scratch space
   236  	// and serves as a foundation for all subsequent operations below.
   237  	//
   238  	// The mount is recursive because it can either be applied to the root
   239  	// filesystem of a core system (aka all-snap) or the core snap on a classic
   240  	// system. In the former case we need recursive bind mounts to accurately
   241  	// replicate the state of the root filesystem into the scratch directory.
   242  	sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND,
   243  		    NULL);
   244  	// Make the scratch directory recursively slave. Nothing done there will be
   245  	// shared with the initial mount namespace. This effectively detaches us,
   246  	// in one way, from the original namespace and coupled with pivot_root
   247  	// below serves as the foundation of the mount sandbox.
   248  	sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL);
   249  	// Bind mount certain directories from the host filesystem to the scratch
   250  	// directory. By default mount events will propagate in both into and out
   251  	// of the peer group. This way the running application can alter any global
   252  	// state visible on the host and in other snaps. This can be restricted by
   253  	// disabling the "is_bidirectional" flag as can be seen below.
   254  	for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL;
   255  	     mnt++) {
   256  		if (mnt->is_bidirectional && mkdir(mnt->path, 0755) < 0 &&
   257  		    errno != EEXIST) {
   258  			die("cannot create %s", mnt->path);
   259  		}
   260  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   261  				 mnt->path);
   262  		if (mnt->is_optional) {
   263  			bool ok = sc_do_optional_mount(mnt->path, dst, NULL,
   264  						       MS_REC | MS_BIND, NULL);
   265  			if (!ok) {
   266  				// If we cannot mount it, just continue.
   267  				continue;
   268  			}
   269  		} else {
   270  			sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND,
   271  				    NULL);
   272  		}
   273  		if (!mnt->is_bidirectional) {
   274  			// Mount events will only propagate inwards to the namespace. This
   275  			// way the running application cannot alter any global state apart
   276  			// from that of its own snap.
   277  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   278  		}
   279  		if (mnt->altpath == NULL) {
   280  			continue;
   281  		}
   282  		// An alternate path of mnt->path is provided at another location.
   283  		// It should behave exactly the same as the original.
   284  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   285  				 mnt->altpath);
   286  		struct stat stat_buf;
   287  		if (lstat(dst, &stat_buf) < 0) {
   288  			die("cannot lstat %s", dst);
   289  		}
   290  		if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) {
   291  			die("cannot bind mount alternate path over a symlink: %s", dst);
   292  		}
   293  		sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL);
   294  		if (!mnt->is_bidirectional) {
   295  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   296  		}
   297  	}
   298  	if (config->normal_mode) {
   299  		// Since we mounted /etc from the host filesystem to the scratch directory,
   300  		// we may need to put certain directories from the desired root filesystem
   301  		// (e.g. the core snap) back. This way the behavior of running snaps is not
   302  		// affected by the alternatives directory from the host, if one exists.
   303  		//
   304  		// Fixes the following bugs:
   305  		//  - https://bugs.launchpad.net/snap-confine/+bug/1580018
   306  		//  - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568
   307  		const char *dirs_from_core[] =
   308  		    { "/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf",
   309  			NULL
   310  		};
   311  		for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) {
   312  			const char *dir = *dirs;
   313  			if (access(dir, F_OK) != 0) {
   314  				continue;
   315  			}
   316  			struct stat dst_stat;
   317  			struct stat src_stat;
   318  			sc_must_snprintf(src, sizeof src, "%s%s",
   319  					 config->rootfs_dir, dir);
   320  			sc_must_snprintf(dst, sizeof dst, "%s%s",
   321  					 scratch_dir, dir);
   322  			if (lstat(src, &src_stat) != 0) {
   323  				if (errno == ENOENT) {
   324  					continue;
   325  				}
   326  				die("cannot stat %s from desired rootfs", src);
   327  			}
   328  			if (!S_ISREG(src_stat.st_mode)
   329  			    && !S_ISDIR(src_stat.st_mode)) {
   330  				debug
   331  				    ("entry %s from the desired rootfs is not a file or directory, skipping mount",
   332  				     src);
   333  				continue;
   334  			}
   335  
   336  			if (lstat(dst, &dst_stat) != 0) {
   337  				if (errno == ENOENT) {
   338  					continue;
   339  				}
   340  				die("cannot stat %s from host", src);
   341  			}
   342  			if (!S_ISREG(dst_stat.st_mode)
   343  			    && !S_ISDIR(dst_stat.st_mode)) {
   344  				debug
   345  				    ("entry %s from the host is not a file or directory, skipping mount",
   346  				     src);
   347  				continue;
   348  			}
   349  
   350  			if ((dst_stat.st_mode & S_IFMT) !=
   351  			    (src_stat.st_mode & S_IFMT)) {
   352  				debug
   353  				    ("entries %s and %s are of different types, skipping mount",
   354  				     dst, src);
   355  				continue;
   356  			}
   357  			// both source and destination exist where both are either files
   358  			// or both are directories
   359  			sc_do_mount(src, dst, NULL, MS_BIND, NULL);
   360  			sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   361  		}
   362  	}
   363  	// The "core" base snap is special as it contains snapd and friends.
   364  	// Other base snaps do not, so whenever a base snap other than core is
   365  	// in use we need extra provisions for setting up internal tooling to
   366  	// be available.
   367  	//
   368  	// However on a core18 (and similar) system the core snap is not
   369  	// a special base anymore and we should map our own tooling in.
   370  	if (config->distro == SC_DISTRO_CORE_OTHER
   371  	    || !sc_streq(config->base_snap_name, "core")) {
   372  		// when bases are used we need to bind-mount the libexecdir
   373  		// (that contains snap-exec) into /usr/lib/snapd of the
   374  		// base snap so that snap-exec is available for the snaps
   375  		// (base snaps do not ship snapd)
   376  
   377  		// dst is always /usr/lib/snapd as this is where snapd
   378  		// assumes to find snap-exec
   379  		sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd",
   380  				 scratch_dir);
   381  
   382  		// bind mount the current $ROOT/usr/lib/snapd path,
   383  		// where $ROOT is either "/" or the "/snap/{core,snapd}/current"
   384  		// that we are re-execing from
   385  		char *src = NULL;
   386  		char self[PATH_MAX + 1] = { 0 };
   387  		if (readlink("/proc/self/exe", self, sizeof(self) - 1) < 0) {
   388  			die("cannot read /proc/self/exe");
   389  		}
   390  		// this cannot happen except when the kernel is buggy
   391  		if (strstr(self, "/snap-confine") == NULL) {
   392  			die("cannot use result from readlink: %s", self);
   393  		}
   394  		src = dirname(self);
   395  		// dirname(path) might return '.' depending on path.
   396  		// /proc/self/exe should always point
   397  		// to an absolute path, but let's guarantee that.
   398  		if (src[0] != '/') {
   399  			die("cannot use the result of dirname(): %s", src);
   400  		}
   401  
   402  		sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL);
   403  		sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   404  	}
   405  	// Bind mount the directory where all snaps are mounted. The location of
   406  	// the this directory on the host filesystem may not match the location in
   407  	// the desired root filesystem. In the "core" and "ubuntu-core" snaps the
   408  	// directory is always /snap. On the host it is a build-time configuration
   409  	// option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not
   410  	// in normal mode), we don't need to do this because /snap is fixed and
   411  	// already contains the correct view of the mounted snaps.
   412  	if (config->normal_mode) {
   413  		sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir);
   414  		sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL);
   415  		sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   416  	}
   417  	// Create the hostfs directory if one is missing. This directory is a part
   418  	// of packaging now so perhaps this code can be removed later.
   419  	if (access(SC_HOSTFS_DIR, F_OK) != 0) {
   420  		debug("creating missing hostfs directory");
   421  		if (mkdir(SC_HOSTFS_DIR, 0755) != 0) {
   422  			die("cannot perform operation: mkdir %s",
   423  			    SC_HOSTFS_DIR);
   424  		}
   425  	}
   426  	// Ensure that hostfs isgroup owned by root. We may have (now or earlier)
   427  	// created the directory as the user who first ran a snap on a given
   428  	// system and the group identity of that user is visilbe on disk.
   429  	// This was LP:#1665004
   430  	struct stat sb;
   431  	if (stat(SC_HOSTFS_DIR, &sb) < 0) {
   432  		die("cannot stat %s", SC_HOSTFS_DIR);
   433  	}
   434  	if (sb.st_uid != 0 || sb.st_gid != 0) {
   435  		if (chown(SC_HOSTFS_DIR, 0, 0) < 0) {
   436  			die("cannot change user/group owner of %s to root",
   437  			    SC_HOSTFS_DIR);
   438  		}
   439  	}
   440  	// Make the upcoming "put_old" directory for pivot_root private so that
   441  	// mount events don't propagate to any peer group. In practice pivot root
   442  	// has a number of undocumented requirements and one of them is that the
   443  	// "put_old" directory (the second argument) cannot be shared in any way.
   444  	sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR);
   445  	sc_do_mount(dst, dst, NULL, MS_BIND, NULL);
   446  	sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL);
   447  	// On classic mount the nvidia driver. Ideally this would be done in an
   448  	// uniform way after pivot_root but this is good enough and requires less
   449  	// code changes the nvidia code assumes it has access to the existing
   450  	// pre-pivot filesystem.
   451  	if (config->distro == SC_DISTRO_CLASSIC) {
   452  		sc_mount_nvidia_driver(scratch_dir);
   453  	}
   454  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   455  	//                    pivot_root
   456  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   457  	// Use pivot_root to "chroot" into the scratch directory.
   458  	//
   459  	// Q: Why are we using something as esoteric as pivot_root(2)?
   460  	// A: Because this makes apparmor handling easy. Using a normal chroot
   461  	// makes all apparmor rules conditional.  We are either running on an
   462  	// all-snap system where this would-be chroot didn't happen and all the
   463  	// rules see / as the root file system _OR_ we are running on top of a
   464  	// classic distribution and this chroot has now moved all paths to
   465  	// /tmp/snap.rootfs_*.
   466  	//
   467  	// Because we are using unshare(2) with CLONE_NEWNS we can essentially use
   468  	// pivot_root just like chroot but this makes apparmor unaware of the old
   469  	// root so everything works okay.
   470  	//
   471  	// HINT: If you are debugging this and are trying to see why pivot_root
   472  	// happens to return EINVAL with any changes you may be making, please
   473  	// consider applying
   474  	// misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree
   475  	// kernel.
   476  	debug("performing operation: pivot_root %s %s", scratch_dir, dst);
   477  	if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) {
   478  		die("cannot perform operation: pivot_root %s %s", scratch_dir,
   479  		    dst);
   480  	}
   481  	// Unmount the self-bind mount over the scratch directory created earlier
   482  	// in the original root filesystem (which is now mounted on SC_HOSTFS_DIR).
   483  	// This way we can remove the temporary directory we created and "clean up"
   484  	// after ourselves nicely.
   485  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir);
   486  	sc_do_umount(dst, UMOUNT_NOFOLLOW);
   487  	// Remove the scratch directory. Note that we are using the path that is
   488  	// based on the old root filesystem as after pivot_root we cannot guarantee
   489  	// what is present at the same location normally. (It is probably an empty
   490  	// /tmp directory that is populated in another place).
   491  	debug("performing operation: rmdir %s", dst);
   492  	if (rmdir(scratch_dir) < 0) {
   493  		die("cannot perform operation: rmdir %s", dst);
   494  	};
   495  	// Make the old root filesystem recursively slave. This way operations
   496  	// performed in this mount namespace will not propagate to the peer group.
   497  	// This is another essential part of the confinement system.
   498  	sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL);
   499  	// Detach the redundant hostfs version of sysfs since it shows up in the
   500  	// mount table and software inspecting the mount table may become confused
   501  	// (eg, docker and LP:# 162601).
   502  	sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR);
   503  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   504  	// Detach the redundant hostfs version of /dev since it shows up in the
   505  	// mount table and software inspecting the mount table may become confused.
   506  	sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR);
   507  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   508  	// Detach the redundant hostfs version of /proc since it shows up in the
   509  	// mount table and software inspecting the mount table may become confused.
   510  	sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR);
   511  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   512  }
   513  
   514  /**
   515   * @path:    a pathname where / replaced with '\0'.
   516   * @offsetp: pointer to int showing which path segment was last seen.
   517   *           Updated on return to reflect the next segment.
   518   * @fulllen: full original path length.
   519   * Returns a pointer to the next path segment, or NULL if done.
   520   */
   521  static char * __attribute__((used))
   522      get_nextpath(char *path, size_t *offsetp, size_t fulllen)
   523  {
   524  	size_t offset = *offsetp;
   525  
   526  	if (offset >= fulllen)
   527  		return NULL;
   528  
   529  	while (offset < fulllen && path[offset] != '\0')
   530  		offset++;
   531  	while (offset < fulllen && path[offset] == '\0')
   532  		offset++;
   533  
   534  	*offsetp = offset;
   535  	return (offset < fulllen) ? &path[offset] : NULL;
   536  }
   537  
   538  /**
   539   * Check that @subdir is a subdir of @dir.
   540  **/
   541  static bool __attribute__((used))
   542      is_subdir(const char *subdir, const char *dir)
   543  {
   544  	size_t dirlen = strlen(dir);
   545  	size_t subdirlen = strlen(subdir);
   546  
   547  	// @dir has to be at least as long as @subdir
   548  	if (subdirlen < dirlen)
   549  		return false;
   550  	// @dir has to be a prefix of @subdir
   551  	if (strncmp(subdir, dir, dirlen) != 0)
   552  		return false;
   553  	// @dir can look like "path/" (that is, end with the directory separator).
   554  	// When that is the case then given the test above we can be sure @subdir
   555  	// is a real subdirectory.
   556  	if (dirlen > 0 && dir[dirlen - 1] == '/')
   557  		return true;
   558  	// @subdir can look like "path/stuff" and when the directory separator
   559  	// is exactly at the spot where @dir ends (that is, it was not caught
   560  	// by the test above) then @subdir is a real subdirectory.
   561  	if (subdir[dirlen] == '/' && dirlen > 0)
   562  		return true;
   563  	// If both @dir and @subdir have identical length then given that the
   564  	// prefix check above @subdir is a real subdirectory.
   565  	if (subdirlen == dirlen)
   566  		return true;
   567  	return false;
   568  }
   569  
   570  void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   571  			  const sc_invocation * inv)
   572  {
   573  	// Classify the current distribution, as claimed by /etc/os-release.
   574  	sc_distro distro = sc_classify_distro();
   575  
   576  	// Check which mode we should run in, normal or legacy.
   577  	if (inv->is_normal_mode) {
   578  		// In normal mode we use the base snap as / and set up several bind mounts.
   579  		const struct sc_mount mounts[] = {
   580  			{"/dev"},	// because it contains devices on host OS
   581  			{"/etc"},	// because that's where /etc/resolv.conf lives, perhaps a bad idea
   582  			{"/home"},	// to support /home/*/snap and home interface
   583  			{"/root"},	// because that is $HOME for services
   584  			{"/proc"},	// fundamental filesystem
   585  			{"/sys"},	// fundamental filesystem
   586  			{"/tmp"},	// to get writable tmp
   587  			{"/var/snap"},	// to get access to global snap data
   588  			{"/var/lib/snapd"},	// to get access to snapd state and seccomp profiles
   589  			{"/var/tmp"},	// to get access to the other temporary directory
   590  			{"/run"},	// to get /run with sockets and what not
   591  			{"/lib/modules",.is_optional = true},	// access to the modules of the running kernel
   592  			{"/lib/firmware",.is_optional = true},	// access to the firmware of the running kernel
   593  			{"/usr/src"},	// FIXME: move to SecurityMounts in system-trace interface
   594  			{"/var/log"},	// FIXME: move to SecurityMounts in log-observe interface
   595  #ifdef MERGED_USR
   596  			{"/run/media", true, "/media"},	// access to the users removable devices
   597  #else
   598  			{"/media", true},	// access to the users removable devices
   599  #endif				// MERGED_USR
   600  			{"/run/netns", true},	// access to the 'ip netns' network namespaces
   601  			// The /mnt directory is optional in base snaps to ensure backwards
   602  			// compatibility with the first version of base snaps that was
   603  			// released.
   604  			{"/mnt",.is_optional = true},	// to support the removable-media interface
   605  			{"/var/lib/extrausers",.is_optional = true},	// access to UID/GID of extrausers (if available)
   606  			{},
   607  		};
   608  		struct sc_mount_config normal_config = {
   609  			.rootfs_dir = inv->rootfs_dir,
   610  			.mounts = mounts,
   611  			.distro = distro,
   612  			.normal_mode = true,
   613  			.base_snap_name = inv->base_snap_name,
   614  		};
   615  		sc_bootstrap_mount_namespace(&normal_config);
   616  	} else {
   617  		// In legacy mode we don't pivot and instead just arrange bi-
   618  		// directional mount propagation for two directories.
   619  		const struct sc_mount mounts[] = {
   620  			{"/media", true},
   621  			{"/run/netns", true},
   622  			{},
   623  		};
   624  		struct sc_mount_config legacy_config = {
   625  			.rootfs_dir = "/",
   626  			.mounts = mounts,
   627  			.distro = distro,
   628  			.normal_mode = false,
   629  			.base_snap_name = inv->base_snap_name,
   630  		};
   631  		sc_bootstrap_mount_namespace(&legacy_config);
   632  	}
   633  
   634  	// set up private mounts
   635  	// TODO: rename this and fold it into bootstrap
   636  	setup_private_mount(inv->snap_instance);
   637  
   638  	// set up private /dev/pts
   639  	// TODO: fold this into bootstrap
   640  	setup_private_pts();
   641  
   642  	// setup the security backend bind mounts
   643  	sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor);
   644  }
   645  
   646  static bool is_mounted_with_shared_option(const char *dir)
   647      __attribute__((nonnull(1)));
   648  
   649  static bool is_mounted_with_shared_option(const char *dir)
   650  {
   651  	sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   652  	sm = sc_parse_mountinfo(NULL);
   653  	if (sm == NULL) {
   654  		die("cannot parse /proc/self/mountinfo");
   655  	}
   656  	sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm);
   657  	while (entry != NULL) {
   658  		const char *mount_dir = entry->mount_dir;
   659  		if (sc_streq(mount_dir, dir)) {
   660  			const char *optional_fields = entry->optional_fields;
   661  			if (strstr(optional_fields, "shared:") != NULL) {
   662  				return true;
   663  			}
   664  		}
   665  		entry = sc_next_mountinfo_entry(entry);
   666  	}
   667  	return false;
   668  }
   669  
   670  void sc_ensure_shared_snap_mount(void)
   671  {
   672  	if (!is_mounted_with_shared_option("/")
   673  	    && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) {
   674  		// TODO: We could be more aggressive and refuse to function but since
   675  		// we have no data on actual environments that happen to limp along in
   676  		// this configuration let's not do that yet.  This code should be
   677  		// removed once we have a measurement and feedback mechanism that lets
   678  		// us decide based on measurable data.
   679  		sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none",
   680  			    MS_BIND | MS_REC, 0);
   681  		sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC,
   682  			    NULL);
   683  	}
   684  }
   685  
   686  void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   687  			  const char *snap_name)
   688  {
   689  	debug("%s: %s", __FUNCTION__, snap_name);
   690  
   691  	char profile_path[PATH_MAX];
   692  	struct stat st;
   693  
   694  	sc_must_snprintf(profile_path, sizeof(profile_path),
   695  			 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name);
   696  	if (stat(profile_path, &st) != 0) {
   697  		// It is ok for the user fstab to not exist.
   698  		return;
   699  	}
   700  
   701  	// In our new mount namespace, recursively change all mounts
   702  	// to slave mode, so we see changes from the parent namespace
   703  	// but don't propagate our own changes.
   704  	sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL);
   705  	sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor);
   706  }