github.com/meulengracht/snapd@v0.0.0-20210719210640-8bde69bcc84e/cmd/snap-confine/mount-support.c (about)

     1  /*
     2   * Copyright (C) 2015 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  #ifdef HAVE_CONFIG_H
    18  #include "config.h"
    19  #endif
    20  
    21  #include "mount-support.h"
    22  
    23  #include <errno.h>
    24  #include <fcntl.h>
    25  #include <libgen.h>
    26  #include <limits.h>
    27  #include <mntent.h>
    28  #include <sched.h>
    29  #include <stdio.h>
    30  #include <stdlib.h>
    31  #include <string.h>
    32  #include <sys/mount.h>
    33  #include <sys/stat.h>
    34  #include <sys/syscall.h>
    35  #include <sys/types.h>
    36  #include <sys/wait.h>
    37  #include <unistd.h>
    38  
    39  #include "../libsnap-confine-private/apparmor-support.h"
    40  #include "../libsnap-confine-private/classic.h"
    41  #include "../libsnap-confine-private/cleanup-funcs.h"
    42  #include "../libsnap-confine-private/mount-opt.h"
    43  #include "../libsnap-confine-private/mountinfo.h"
    44  #include "../libsnap-confine-private/snap.h"
    45  #include "../libsnap-confine-private/string-utils.h"
    46  #include "../libsnap-confine-private/tool.h"
    47  #include "../libsnap-confine-private/utils.h"
    48  #include "mount-support-nvidia.h"
    49  
// Size of the stack buffers that hold the private /tmp paths built in
// setup_private_mount().
#define MAX_BUF 1000

// Forward declaration: the definition follows sc_bootstrap_mount_namespace(),
// which calls this function.
static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode);
    53  
    54  // TODO: simplify this, after all it is just a tmpfs
    55  // TODO: fold this into bootstrap
    56  static void setup_private_mount(const char *snap_name)
    57  {
    58  	// Create a 0700 base directory. This is the "base" directory that is
    59  	// protected from other users. This directory name is NOT randomly
    60  	// generated. This has several properties:
    61  	//
    62  	// Users can relate to the name and can find the temporary directory as
    63  	// visible from within the snap. If this directory was random it would be
    64  	// harder to find because there may be situations in which multiple
    65  	// directories related to the same snap name would exist.
    66  	//
    67  	// Snapd can partially manage the directory. Specifically on snap remove
    68  	// snapd could remove the directory and everything in it, potentially
    69  	// avoiding runaway disk use on a machine that either never reboots or uses
    70  	// persistent /tmp directory.
    71  	//
    72  	// Underneath the base directory there is a "tmp" sub-directory that has
    73  	// mode 1777 and behaves as a typical /tmp directory would. That directory
    74  	// is used as a bind-mounted /tmp directory.
    75  	//
    76  	// Because the directories are reused across invocations by distinct users
    77  	// and because the directories are trivially guessable, each invocation
    78  	// unconditionally chowns/chmods them to appropriate values.
    79  	char base_dir[MAX_BUF] = { 0 };
    80  	char tmp_dir[MAX_BUF] = { 0 };
    81  	int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    82  	int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
    83  	sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name);
    84  	sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir);
    85  
    86  	/* Switch to root group so that mkdir and open calls below create filesystem
    87  	 * elements that are not owned by the user calling into snap-confine. */
    88  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
    89  	// Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want
    90  	// to reuse and we will open with O_NOFOLLOW, below.
    91  	if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) {
    92  		die("cannot create base directory %s", base_dir);
    93  	}
    94  	base_dir_fd = open(base_dir,
    95  			   O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
    96  	if (base_dir_fd < 0) {
    97  		die("cannot open base directory %s", base_dir);
    98  	}
    99  	/* This seems redundant on first read but it has the non-obvious
   100  	 * property of changing existing directories  that have already existed
   101  	 * but had incorrect ownership or permission. This is possible due to
   102  	 * earlier bugs in snap-confine and due to the fact that some systems
   103  	 * use persistent /tmp directory and may not clean up leftover files
   104  	 * for arbitrarily long. This comment applies the following two pairs
   105  	 * of fchmod and fchown. */
   106  	if (fchmod(base_dir_fd, 0700) < 0) {
   107  		die("cannot chmod base directory %s to 0700", base_dir);
   108  	}
   109  	if (fchown(base_dir_fd, 0, 0) < 0) {
   110  		die("cannot chown base directory %s to root.root", base_dir);
   111  	}
   112  	// Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we
   113  	// want to reuse and we will open with O_NOFOLLOW, below.
   114  	if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) {
   115  		die("cannot create private tmp directory %s/tmp", base_dir);
   116  	}
   117  	(void)sc_set_effective_identity(old);
   118  	tmp_dir_fd = openat(base_dir_fd, "tmp",
   119  			    O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   120  	if (tmp_dir_fd < 0) {
   121  		die("cannot open private tmp directory %s/tmp", base_dir);
   122  	}
   123  	if (fchmod(tmp_dir_fd, 01777) < 0) {
   124  		die("cannot chmod private tmp directory %s/tmp to 01777",
   125  		    base_dir);
   126  	}
   127  	if (fchown(tmp_dir_fd, 0, 0) < 0) {
   128  		die("cannot chown private tmp directory %s/tmp to root.root",
   129  		    base_dir);
   130  	}
   131  	sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL);
   132  	sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL);
   133  }
   134  
   135  // TODO: fold this into bootstrap
   136  static void setup_private_pts(void)
   137  {
   138  	// See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt
   139  	//
   140  	// Ubuntu by default uses devpts 'single-instance' mode where
   141  	// /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change
   142  	// the startup scripts though, so we follow the instructions in point
   143  	// '4' of 'User-space changes' in the above doc. In other words, after
   144  	// unshare(CLONE_NEWNS), we mount devpts with -o
   145  	// newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto
   146  	// /dev/ptmx
   147  
   148  	struct stat st;
   149  
   150  	// Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode
   151  	// which doesn't provide the isolation we require.
   152  	if (stat("/dev/pts/ptmx", &st) != 0) {
   153  		die("cannot stat /dev/pts/ptmx");
   154  	}
   155  	// Make sure /dev/ptmx exists so we can bind mount over it
   156  	if (stat("/dev/ptmx", &st) != 0) {
   157  		die("cannot stat /dev/ptmx");
   158  	}
   159  	// Since multi-instance, use ptmxmode=0666. The other options are
   160  	// copied from /etc/default/devpts
   161  	sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
   162  		    "newinstance,ptmxmode=0666,mode=0620,gid=5");
   163  	sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, NULL);
   164  }
   165  
// One entry of the list of host directories that
// sc_bootstrap_mount_namespace() bind mounts into the scratch root.
struct sc_mount {
	// Path on the host; it is recursively bind mounted to the same path
	// below the scratch directory. A NULL path terminates the list.
	const char *path;
	// When false the mount is made recursively slave right after it is
	// created, so mount events only propagate inwards. When true the path
	// is created on the host if it is missing and the propagation is left
	// as is.
	bool is_bidirectional;
	// Alternate path defines the rbind mount "alternative" of path.
	// It exists so that we can make /media on systems that use /run/media.
	const char *altpath;
	// Optional mount points are not processed unless the source and
	// destination both exist.
	bool is_optional;
};
   176  
// Input of sc_bootstrap_mount_namespace().
struct sc_mount_config {
	// Directory that is recursively bind mounted over the scratch directory
	// and thereby becomes the root filesystem after pivot_root.
	const char *rootfs_dir;
	// The struct is terminated with an entry with NULL path.
	const struct sc_mount *mounts;
	// Distribution classification; selects the distro specific steps
	// (host /etc/ssl handling, tooling bind mount, nvidia driver, /writable).
	sc_distro distro;
	// True in normal mode; enables restoring selected /etc entries from the
	// desired rootfs and bind mounting SNAP_MOUNT_DIR to /snap.
	bool normal_mode;
	// Name of the base snap; compared against "core" by the bootstrap code.
	const char *base_snap_name;
};
   185  
   186  /**
   187   * Bootstrap mount namespace.
   188   *
   189   * This is a chunk of tricky code that lets us have full control over the
   190   * layout and direction of propagation of mount events. The documentation below
   191   * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source
   192   * tree.
   193   *
   194   * As a reminder two definitions are quoted below:
   195   *
   196   *  A 'propagation event' is defined as event generated on a vfsmount
   197   *  that leads to mount or unmount actions in other vfsmounts.
   198   *
   199   *  A 'peer group' is defined as a group of vfsmounts that propagate
   200   *  events to each other.
   201   *
   202   * (end of quote).
   203   *
   204   * The main idea is to setup a mount namespace that has a root filesystem with
   205   * vfsmounts and peer groups that, depending on the location, either isolate
   206   * or share with the rest of the system.
   207   *
   208   * The vast majority of the filesystem is shared in one direction. Events from
   209   * the outside (from the main mount namespace) propagate inside (to namespaces
   210   * of particular snaps) so things like new snap revisions, mounted drives, etc,
   211   * just show up as expected but even if a snap is exploited or malicious in
   212   * nature it cannot affect anything in another namespace where it might cause
   213   * security or stability issues.
   214   *
   215   * Selected directories (today just /media) can be shared in both directions.
   216   * This allows snaps with sufficient privileges to either create, through the
   217   * mount system call, additional mount points that are visible by the rest of
   218   * the system (both the main mount namespace and namespaces of individual
   219   * snaps) or remove them, through the unmount system call.
   220   **/
   221  static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config)
   222  {
   223  	char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX";
   224  	char src[PATH_MAX] = { 0 };
   225  	char dst[PATH_MAX] = { 0 };
   226  	if (mkdtemp(scratch_dir) == NULL) {
   227  		die("cannot create temporary directory for the root file system");
   228  	}
   229  	// NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new
   230  	// mount namespace and have a private list of mounts.
   231  	debug("scratch directory for constructing namespace: %s", scratch_dir);
   232  	// Make the root filesystem recursively shared. This way propagation events
   233  	// will be shared with main mount namespace.
   234  	sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL);
   235  	// Bind mount the temporary scratch directory for root filesystem over
   236  	// itself so that it is a mount point. This is done so that it can become
   237  	// unbindable as explained below.
   238  	sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL);
   239  	// Make the scratch directory unbindable.
   240  	//
   241  	// This is necessary as otherwise a mount loop can occur and the kernel
   242  	// would crash. The term unbindable simply states that it cannot be bind
   243  	// mounted anywhere. When we construct recursive bind mounts below this
   244  	// guarantees that this directory will not be replicated anywhere.
   245  	sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL);
   246  	// Recursively bind mount desired root filesystem directory over the
   247  	// scratch directory. This puts the initial content into the scratch space
   248  	// and serves as a foundation for all subsequent operations below.
   249  	//
   250  	// The mount is recursive because it can either be applied to the root
   251  	// filesystem of a core system (aka all-snap) or the core snap on a classic
   252  	// system. In the former case we need recursive bind mounts to accurately
   253  	// replicate the state of the root filesystem into the scratch directory.
   254  	sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND,
   255  		    NULL);
   256  	// Make the scratch directory recursively slave. Nothing done there will be
   257  	// shared with the initial mount namespace. This effectively detaches us,
   258  	// in one way, from the original namespace and coupled with pivot_root
   259  	// below serves as the foundation of the mount sandbox.
   260  	sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL);
   261  	// Bind mount certain directories from the host filesystem to the scratch
   262  	// directory. By default mount events will propagate in both into and out
   263  	// of the peer group. This way the running application can alter any global
   264  	// state visible on the host and in other snaps. This can be restricted by
   265  	// disabling the "is_bidirectional" flag as can be seen below.
   266  	for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL;
   267  	     mnt++) {
   268  
   269  		if (mnt->is_bidirectional) {
   270  			sc_identity old =
   271  			    sc_set_effective_identity(sc_root_group_identity());
   272  			if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) {
   273  				die("cannot create %s", mnt->path);
   274  			}
   275  			(void)sc_set_effective_identity(old);
   276  		}
   277  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   278  				 mnt->path);
   279  		if (mnt->is_optional) {
   280  			bool ok = sc_do_optional_mount(mnt->path, dst, NULL,
   281  						       MS_REC | MS_BIND, NULL);
   282  			if (!ok) {
   283  				// If we cannot mount it, just continue.
   284  				continue;
   285  			}
   286  		} else {
   287  			sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND,
   288  				    NULL);
   289  		}
   290  		if (!mnt->is_bidirectional) {
   291  			// Mount events will only propagate inwards to the namespace. This
   292  			// way the running application cannot alter any global state apart
   293  			// from that of its own snap.
   294  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   295  		}
   296  		if (mnt->altpath == NULL) {
   297  			continue;
   298  		}
   299  		// An alternate path of mnt->path is provided at another location.
   300  		// It should behave exactly the same as the original.
   301  		sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir,
   302  				 mnt->altpath);
   303  		struct stat stat_buf;
   304  		if (lstat(dst, &stat_buf) < 0) {
   305  			die("cannot lstat %s", dst);
   306  		}
   307  		if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) {
   308  			die("cannot bind mount alternate path over a symlink: %s", dst);
   309  		}
   310  		sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL);
   311  		if (!mnt->is_bidirectional) {
   312  			sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   313  		}
   314  	}
   315  	if (config->normal_mode) {
   316  		// Since we mounted /etc from the host filesystem to the scratch directory,
   317  		// we may need to put certain directories from the desired root filesystem
   318  		// (e.g. the core snap) back. This way the behavior of running snaps is not
   319  		// affected by the alternatives directory from the host, if one exists.
   320  		//
   321  		// Fixes the following bugs:
   322  		//  - https://bugs.launchpad.net/snap-confine/+bug/1580018
   323  		//  - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568
   324  		const char *dirs_from_core[] = {
   325  			"/etc/alternatives", "/etc/nsswitch.conf",
   326  			// Some specific and privileged interfaces (e.g docker-support) give
   327  			// access to apparmor_parser from the base snap which at a minimum
   328  			// needs to use matching configuration from the base snap instead
   329  			// of from the users host system.
   330  			"/etc/apparmor", "/etc/apparmor.d",
   331  			// Use ssl certs from the base by default unless
   332  			// using Debian/Ubuntu classic (see below)
   333  			"/etc/ssl",
   334  			NULL
   335  		};
   336  
   337  		for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) {
   338  			const char *dir = *dirs;
   339  
   340  			// Special case for ubuntu/debian based
   341  			// classic distros that use the core* snap:
   342  			// here we use the host /etc/ssl
   343  			// to support custom ca-cert setups
   344  			if (sc_streq(dir, "/etc/ssl") &&
   345  			    config->distro == SC_DISTRO_CLASSIC &&
   346  			    sc_is_debian_like() &&
   347  			    sc_startswith(config->base_snap_name, "core")) {
   348  				continue;
   349  			}
   350  
   351  			if (access(dir, F_OK) != 0) {
   352  				continue;
   353  			}
   354  			struct stat dst_stat;
   355  			struct stat src_stat;
   356  			sc_must_snprintf(src, sizeof src, "%s%s",
   357  					 config->rootfs_dir, dir);
   358  			sc_must_snprintf(dst, sizeof dst, "%s%s",
   359  					 scratch_dir, dir);
   360  			if (lstat(src, &src_stat) != 0) {
   361  				if (errno == ENOENT) {
   362  					continue;
   363  				}
   364  				die("cannot stat %s from desired rootfs", src);
   365  			}
   366  			if (!S_ISREG(src_stat.st_mode)
   367  			    && !S_ISDIR(src_stat.st_mode)) {
   368  				debug
   369  				    ("entry %s from the desired rootfs is not a file or directory, skipping mount",
   370  				     src);
   371  				continue;
   372  			}
   373  
   374  			if (lstat(dst, &dst_stat) != 0) {
   375  				if (errno == ENOENT) {
   376  					continue;
   377  				}
   378  				die("cannot stat %s from host", src);
   379  			}
   380  			if (!S_ISREG(dst_stat.st_mode)
   381  			    && !S_ISDIR(dst_stat.st_mode)) {
   382  				debug
   383  				    ("entry %s from the host is not a file or directory, skipping mount",
   384  				     src);
   385  				continue;
   386  			}
   387  
   388  			if ((dst_stat.st_mode & S_IFMT) !=
   389  			    (src_stat.st_mode & S_IFMT)) {
   390  				debug
   391  				    ("entries %s and %s are of different types, skipping mount",
   392  				     dst, src);
   393  				continue;
   394  			}
   395  			// both source and destination exist where both are either files
   396  			// or both are directories
   397  			sc_do_mount(src, dst, NULL, MS_BIND, NULL);
   398  			sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   399  		}
   400  	}
   401  	// The "core" base snap is special as it contains snapd and friends.
   402  	// Other base snaps do not, so whenever a base snap other than core is
   403  	// in use we need extra provisions for setting up internal tooling to
   404  	// be available.
   405  	//
   406  	// However on a core18 (and similar) system the core snap is not
   407  	// a special base anymore and we should map our own tooling in.
   408  	if (config->distro == SC_DISTRO_CORE_OTHER
   409  	    || !sc_streq(config->base_snap_name, "core")) {
   410  		// when bases are used we need to bind-mount the libexecdir
   411  		// (that contains snap-exec) into /usr/lib/snapd of the
   412  		// base snap so that snap-exec is available for the snaps
   413  		// (base snaps do not ship snapd)
   414  
   415  		// dst is always /usr/lib/snapd as this is where snapd
   416  		// assumes to find snap-exec
   417  		sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd",
   418  				 scratch_dir);
   419  
   420  		// bind mount the current $ROOT/usr/lib/snapd path,
   421  		// where $ROOT is either "/" or the "/snap/{core,snapd}/current"
   422  		// that we are re-execing from
   423  		char *src = NULL;
   424  		char self[PATH_MAX + 1] = { 0 };
   425  		ssize_t nread;
   426  		nread = readlink("/proc/self/exe", self, sizeof self - 1);
   427  		if (nread < 0) {
   428  			die("cannot read /proc/self/exe");
   429  		}
   430  		// Though we initialized self to NULs and passed one less to
   431  		// readlink, therefore guaranteeing that self is
   432  		// zero-terminated, perform an explicit assignment to make
   433  		// Coverity happy.
   434  		self[nread] = '\0';
   435  		// this cannot happen except when the kernel is buggy
   436  		if (strstr(self, "/snap-confine") == NULL) {
   437  			die("cannot use result from readlink: %s", self);
   438  		}
   439  		src = dirname(self);
   440  		// dirname(path) might return '.' depending on path.
   441  		// /proc/self/exe should always point
   442  		// to an absolute path, but let's guarantee that.
   443  		if (src[0] != '/') {
   444  			die("cannot use the result of dirname(): %s", src);
   445  		}
   446  
   447  		sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL);
   448  		sc_do_mount("none", dst, NULL, MS_SLAVE, NULL);
   449  	}
   450  	// Bind mount the directory where all snaps are mounted. The location of
   451  	// the this directory on the host filesystem may not match the location in
   452  	// the desired root filesystem. In the "core" and "ubuntu-core" snaps the
   453  	// directory is always /snap. On the host it is a build-time configuration
   454  	// option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not
   455  	// in normal mode), we don't need to do this because /snap is fixed and
   456  	// already contains the correct view of the mounted snaps.
   457  	if (config->normal_mode) {
   458  		sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir);
   459  		sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL);
   460  		sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL);
   461  	}
   462  	// Create the hostfs directory if one is missing. This directory is a part
   463  	// of packaging now so perhaps this code can be removed later.
   464  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   465  	if (mkdir(SC_HOSTFS_DIR, 0755) < 0) {
   466  		if (errno != EEXIST) {
   467  			die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR);
   468  		}
   469  	}
   470  	(void)sc_set_effective_identity(old);
   471  	// Ensure that hostfs isgroup owned by root. We may have (now or earlier)
   472  	// created the directory as the user who first ran a snap on a given
   473  	// system and the group identity of that user is visilbe on disk.
   474  	// This was LP:#1665004
   475  	struct stat sb;
   476  	if (stat(SC_HOSTFS_DIR, &sb) < 0) {
   477  		die("cannot stat %s", SC_HOSTFS_DIR);
   478  	}
   479  	if (sb.st_uid != 0 || sb.st_gid != 0) {
   480  		if (chown(SC_HOSTFS_DIR, 0, 0) < 0) {
   481  			die("cannot change user/group owner of %s to root",
   482  			    SC_HOSTFS_DIR);
   483  		}
   484  	}
   485  	// Make the upcoming "put_old" directory for pivot_root private so that
   486  	// mount events don't propagate to any peer group. In practice pivot root
   487  	// has a number of undocumented requirements and one of them is that the
   488  	// "put_old" directory (the second argument) cannot be shared in any way.
   489  	sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR);
   490  	sc_do_mount(dst, dst, NULL, MS_BIND, NULL);
   491  	sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL);
   492  	// On classic mount the nvidia driver. Ideally this would be done in an
   493  	// uniform way after pivot_root but this is good enough and requires less
   494  	// code changes the nvidia code assumes it has access to the existing
   495  	// pre-pivot filesystem.
   496  	if (config->distro == SC_DISTRO_CLASSIC) {
   497  		sc_mount_nvidia_driver(scratch_dir);
   498  	}
   499  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   500  	//                    pivot_root
   501  	// XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   502  	// Use pivot_root to "chroot" into the scratch directory.
   503  	//
   504  	// Q: Why are we using something as esoteric as pivot_root(2)?
   505  	// A: Because this makes apparmor handling easy. Using a normal chroot
   506  	// makes all apparmor rules conditional.  We are either running on an
   507  	// all-snap system where this would-be chroot didn't happen and all the
   508  	// rules see / as the root file system _OR_ we are running on top of a
   509  	// classic distribution and this chroot has now moved all paths to
   510  	// /tmp/snap.rootfs_*.
   511  	//
   512  	// Because we are using unshare(2) with CLONE_NEWNS we can essentially use
   513  	// pivot_root just like chroot but this makes apparmor unaware of the old
   514  	// root so everything works okay.
   515  	//
   516  	// HINT: If you are debugging this and are trying to see why pivot_root
   517  	// happens to return EINVAL with any changes you may be making, please
   518  	// consider applying
   519  	// misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree
   520  	// kernel.
   521  	debug("performing operation: pivot_root %s %s", scratch_dir, dst);
   522  	if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) {
   523  		die("cannot perform operation: pivot_root %s %s", scratch_dir,
   524  		    dst);
   525  	}
   526  	// Unmount the self-bind mount over the scratch directory created earlier
   527  	// in the original root filesystem (which is now mounted on SC_HOSTFS_DIR).
   528  	// This way we can remove the temporary directory we created and "clean up"
   529  	// after ourselves nicely.
   530  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir);
   531  	sc_do_umount(dst, UMOUNT_NOFOLLOW);
   532  	// Remove the scratch directory. Note that we are using the path that is
   533  	// based on the old root filesystem as after pivot_root we cannot guarantee
   534  	// what is present at the same location normally. (It is probably an empty
   535  	// /tmp directory that is populated in another place).
   536  	debug("performing operation: rmdir %s", dst);
   537  	if (rmdir(scratch_dir) < 0) {
   538  		die("cannot perform operation: rmdir %s", dst);
   539  	};
   540  	// Make the old root filesystem recursively slave. This way operations
   541  	// performed in this mount namespace will not propagate to the peer group.
   542  	// This is another essential part of the confinement system.
   543  	sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL);
   544  	// Detach the redundant hostfs version of sysfs since it shows up in the
   545  	// mount table and software inspecting the mount table may become confused
   546  	// (eg, docker and LP:# 162601).
   547  	sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR);
   548  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   549  	// Detach the redundant hostfs version of /dev since it shows up in the
   550  	// mount table and software inspecting the mount table may become confused.
   551  	sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR);
   552  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   553  	// Detach the redundant hostfs version of /proc since it shows up in the
   554  	// mount table and software inspecting the mount table may become confused.
   555  	sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR);
   556  	sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH);
   557  	// Detach both views of /writable: the one from hostfs and the one directly
   558  	// visible in /writable. Interfaces don't grant access to this directory
   559  	// and it has a large duplicated view of many mount points.  Note that this
   560  	// is only applicable to ubuntu-core systems.
   561  	sc_detach_views_of_writable(config->distro, config->normal_mode);
   562  }
   563  
   564  static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode)
   565  {
   566  	// Note that prior to detaching either mount point we switch the
   567  	// propagation to private to both limit the change to just this view and to
   568  	// prevent otherwise occurring event propagation from self-conflicting and
   569  	// returning EBUSY. A similar approach is used by snap-update-ns and is
   570  	// documented in umount(2).
   571  	const char *writable_dir = "/writable";
   572  	const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable";
   573  
   574  	// Writable only exists on ubuntu-core.
   575  	if (distro == SC_DISTRO_CLASSIC) {
   576  		return;
   577  	}
   578  	// On all core distributions we see /var/lib/snapd/hostfs/writable that
   579  	// exposes writable, with a structure specific to ubuntu-core.
   580  	debug("detaching %s", hostfs_writable_dir);
   581  	sc_do_mount("none", hostfs_writable_dir, NULL,
   582  		    MS_REC | MS_PRIVATE, NULL);
   583  	sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH);
   584  
   585  	// On ubuntu-core 16, when the executed snap uses core as base we also see
   586  	// the /writable that we directly inherited from the initial mount
   587  	// namespace.
   588  	if (distro == SC_DISTRO_CORE16 && !normal_mode) {
   589  		debug("detaching %s", writable_dir);
   590  		sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE,
   591  			    NULL);
   592  		sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH);
   593  	}
   594  }
   595  
   596  /**
   597   * @path:    a pathname where / replaced with '\0'.
   598   * @offsetp: pointer to int showing which path segment was last seen.
   599   *           Updated on return to reflect the next segment.
   600   * @fulllen: full original path length.
   601   * Returns a pointer to the next path segment, or NULL if done.
   602   */
   603  static char * __attribute__((used))
   604      get_nextpath(char *path, size_t *offsetp, size_t fulllen)
   605  {
   606  	size_t offset = *offsetp;
   607  
   608  	if (offset >= fulllen)
   609  		return NULL;
   610  
   611  	while (offset < fulllen && path[offset] != '\0')
   612  		offset++;
   613  	while (offset < fulllen && path[offset] == '\0')
   614  		offset++;
   615  
   616  	*offsetp = offset;
   617  	return (offset < fulllen) ? &path[offset] : NULL;
   618  }
   619  
   620  /**
   621   * Check that @subdir is a subdir of @dir.
   622  **/
   623  static bool __attribute__((used))
   624      is_subdir(const char *subdir, const char *dir)
   625  {
   626  	size_t dirlen = strlen(dir);
   627  	size_t subdirlen = strlen(subdir);
   628  
   629  	// @dir has to be at least as long as @subdir
   630  	if (subdirlen < dirlen)
   631  		return false;
   632  	// @dir has to be a prefix of @subdir
   633  	if (strncmp(subdir, dir, dirlen) != 0)
   634  		return false;
   635  	// @dir can look like "path/" (that is, end with the directory separator).
   636  	// When that is the case then given the test above we can be sure @subdir
   637  	// is a real subdirectory.
   638  	if (dirlen > 0 && dir[dirlen - 1] == '/')
   639  		return true;
   640  	// @subdir can look like "path/stuff" and when the directory separator
   641  	// is exactly at the spot where @dir ends (that is, it was not caught
   642  	// by the test above) then @subdir is a real subdirectory.
   643  	if (subdir[dirlen] == '/' && dirlen > 0)
   644  		return true;
   645  	// If both @dir and @subdir have identical length then given that the
   646  	// prefix check above @subdir is a real subdirectory.
   647  	if (subdirlen == dirlen)
   648  		return true;
   649  	return false;
   650  }
   651  
   652  void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   653  			  const sc_invocation * inv, const gid_t real_gid,
   654  			  const gid_t saved_gid)
   655  {
   656  	// Classify the current distribution, as claimed by /etc/os-release.
   657  	sc_distro distro = sc_classify_distro();
   658  
   659  	// Check which mode we should run in, normal or legacy.
   660  	if (inv->is_normal_mode) {
   661  		// In normal mode we use the base snap as / and set up several bind mounts.
   662  		const struct sc_mount mounts[] = {
   663  			{"/dev"},	// because it contains devices on host OS
   664  			{"/etc"},	// because that's where /etc/resolv.conf lives, perhaps a bad idea
   665  			{"/home"},	// to support /home/*/snap and home interface
   666  			{"/root"},	// because that is $HOME for services
   667  			{"/proc"},	// fundamental filesystem
   668  			{"/sys"},	// fundamental filesystem
   669  			{"/tmp"},	// to get writable tmp
   670  			{"/var/snap"},	// to get access to global snap data
   671  			{"/var/lib/snapd"},	// to get access to snapd state and seccomp profiles
   672  			{"/var/tmp"},	// to get access to the other temporary directory
   673  			{"/run"},	// to get /run with sockets and what not
   674  			{"/lib/modules",.is_optional = true},	// access to the modules of the running kernel
   675  			{"/lib/firmware",.is_optional = true},	// access to the firmware of the running kernel
   676  			{"/usr/src"},	// FIXME: move to SecurityMounts in system-trace interface
   677  			{"/var/log"},	// FIXME: move to SecurityMounts in log-observe interface
   678  #ifdef MERGED_USR
   679  			{"/run/media", true, "/media"},	// access to the users removable devices
   680  #else
   681  			{"/media", true},	// access to the users removable devices
   682  #endif				// MERGED_USR
   683  			{"/run/netns", true},	// access to the 'ip netns' network namespaces
   684  			// The /mnt directory is optional in base snaps to ensure backwards
   685  			// compatibility with the first version of base snaps that was
   686  			// released.
   687  			{"/mnt",.is_optional = true},	// to support the removable-media interface
   688  			{"/var/lib/extrausers",.is_optional = true},	// access to UID/GID of extrausers (if available)
   689  			{},
   690  		};
   691  		struct sc_mount_config normal_config = {
   692  			.rootfs_dir = inv->rootfs_dir,
   693  			.mounts = mounts,
   694  			.distro = distro,
   695  			.normal_mode = true,
   696  			.base_snap_name = inv->base_snap_name,
   697  		};
   698  		sc_bootstrap_mount_namespace(&normal_config);
   699  	} else {
   700  		// In legacy mode we don't pivot and instead just arrange bi-
   701  		// directional mount propagation for two directories.
   702  		const struct sc_mount mounts[] = {
   703  			{"/media", true},
   704  			{"/run/netns", true},
   705  			{},
   706  		};
   707  		struct sc_mount_config legacy_config = {
   708  			.rootfs_dir = "/",
   709  			.mounts = mounts,
   710  			.distro = distro,
   711  			.normal_mode = false,
   712  			.base_snap_name = inv->base_snap_name,
   713  		};
   714  		sc_bootstrap_mount_namespace(&legacy_config);
   715  	}
   716  
   717  	// TODO: rename this and fold it into bootstrap
   718  	setup_private_mount(inv->snap_instance);
   719  	// set up private /dev/pts
   720  	// TODO: fold this into bootstrap
   721  	setup_private_pts();
   722  
   723  	// setup the security backend bind mounts
   724  	sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor);
   725  }
   726  
   727  static bool is_mounted_with_shared_option(const char *dir)
   728      __attribute__((nonnull(1)));
   729  
   730  static bool is_mounted_with_shared_option(const char *dir)
   731  {
   732  	sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   733  	sm = sc_parse_mountinfo(NULL);
   734  	if (sm == NULL) {
   735  		die("cannot parse /proc/self/mountinfo");
   736  	}
   737  	sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm);
   738  	while (entry != NULL) {
   739  		const char *mount_dir = entry->mount_dir;
   740  		if (sc_streq(mount_dir, dir)) {
   741  			const char *optional_fields = entry->optional_fields;
   742  			if (strstr(optional_fields, "shared:") != NULL) {
   743  				return true;
   744  			}
   745  		}
   746  		entry = sc_next_mountinfo_entry(entry);
   747  	}
   748  	return false;
   749  }
   750  
   751  void sc_ensure_shared_snap_mount(void)
   752  {
   753  	if (!is_mounted_with_shared_option("/")
   754  	    && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) {
   755  		// TODO: We could be more aggressive and refuse to function but since
   756  		// we have no data on actual environments that happen to limp along in
   757  		// this configuration let's not do that yet.  This code should be
   758  		// removed once we have a measurement and feedback mechanism that lets
   759  		// us decide based on measurable data.
   760  		sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none",
   761  			    MS_BIND | MS_REC, NULL);
   762  		sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC,
   763  			    NULL);
   764  	}
   765  }
   766  
   767  void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd,
   768  			  const char *snap_name)
   769  {
   770  	debug("%s: %s", __FUNCTION__, snap_name);
   771  
   772  	char profile_path[PATH_MAX];
   773  	struct stat st;
   774  
   775  	sc_must_snprintf(profile_path, sizeof(profile_path),
   776  			 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name);
   777  	if (stat(profile_path, &st) != 0) {
   778  		// It is ok for the user fstab to not exist.
   779  		return;
   780  	}
   781  
   782  	// In our new mount namespace, recursively change all mounts
   783  	// to slave mode, so we see changes from the parent namespace
   784  	// but don't propagate our own changes.
   785  	sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL);
   786  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   787  	sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor);
   788  	(void)sc_set_effective_identity(old);
   789  }
   790  
   791  void sc_ensure_snap_dir_shared_mounts(void)
   792  {
   793  	const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL };
   794  	for (int i = 0; dirs[i] != NULL; i++) {
   795  		const char *dir = dirs[i];
   796  		if (!is_mounted_with_shared_option(dir)) {
   797  			/* Since this directory isn't yet shared (but it should be),
   798  			 * recursively bind mount it, then recursively share it so that
   799  			 * changes to the host are seen in the snap and vice-versa. This
   800  			 * allows us to fine-tune propagation events elsewhere for this new
   801  			 * mountpoint.
   802  			 *
   803  			 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR,
   804  			 * since snaps are already mounted, and it's not needed for
   805  			 * /var/snap.
   806  			 */
   807  			sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, NULL);
   808  			sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED,
   809  				    NULL);
   810  		}
   811  	}
   812  }
   813  
   814  void sc_setup_parallel_instance_classic_mounts(const char *snap_name,
   815  					       const char *snap_instance_name)
   816  {
   817  	char src[PATH_MAX] = { 0 };
   818  	char dst[PATH_MAX] = { 0 };
   819  
   820  	const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL };
   821  	for (int i = 0; dirs[i] != NULL; i++) {
   822  		const char *dir = dirs[i];
   823  		sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL);
   824  	}
   825  
   826  	/* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */
   827  	sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR,
   828  			 snap_instance_name);
   829  	sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name);
   830  	sc_do_mount(src, dst, "none", MS_BIND | MS_REC, NULL);
   831  
   832  	/* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */
   833  	sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name);
   834  	sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name);
   835  	sc_do_mount(src, dst, "none", MS_BIND | MS_REC, NULL);
   836  }