github.com/kubiko/snapd@v0.0.0-20201013125620-d4f3094d9ddf/cmd/snap-confine/ns-support.c (about)

     1  /*
     2   * Copyright (C) 2016 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  
    18  #include "ns-support.h"
    19  
    20  #ifdef HAVE_CONFIG_H
    21  #include "config.h"
    22  #endif
    23  
    24  #include <errno.h>
    25  #include <fcntl.h>
    26  #include <linux/magic.h>
    27  #include <sched.h>
    28  #include <signal.h>
    29  #include <string.h>
    30  #include <sys/eventfd.h>
    31  #include <sys/file.h>
    32  #include <sys/mount.h>
    33  #include <sys/prctl.h>
    34  #include <sys/stat.h>
    35  #include <sys/sysmacros.h>
    36  #include <sys/types.h>
    37  #include <sys/vfs.h>
    38  #include <sys/wait.h>
    39  #include <unistd.h>
    40  
    41  #include "../libsnap-confine-private/cgroup-freezer-support.h"
    42  #include "../libsnap-confine-private/cgroup-support.h"
    43  #include "../libsnap-confine-private/classic.h"
    44  #include "../libsnap-confine-private/cleanup-funcs.h"
    45  #include "../libsnap-confine-private/feature.h"
    46  #include "../libsnap-confine-private/infofile.h"
    47  #include "../libsnap-confine-private/locking.h"
    48  #include "../libsnap-confine-private/mountinfo.h"
    49  #include "../libsnap-confine-private/string-utils.h"
    50  #include "../libsnap-confine-private/tool.h"
    51  #include "../libsnap-confine-private/utils.h"
    52  #include "user-support.h"
    53  #include "mount-support.h"
    54  
/**
 * Directory where snap-confine keeps namespace files.
 **/
#define SC_NS_DIR "/run/snapd/ns"

/**
 * Effective value of SC_NS_DIR.
 *
 * We use 'const char *' so we can update sc_ns_dir in the testsuite
 **/
static const char *sc_ns_dir = SC_NS_DIR;

/**
 * Commands sent by the master process to the helper process over the command
 * pipe. helper_main() reads one command at a time, acts on it and writes the
 * same value back as an acknowledgement.
 **/
enum {
	// Stop the command loop; the helper process then exits.
	HELPER_CMD_EXIT,
	// Handled by helper_capture_ns().
	HELPER_CMD_CAPTURE_MOUNT_NS,
	// Handled by helper_capture_per_user_ns().
	HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS,
};
    72  
    73  void sc_reassociate_with_pid1_mount_ns(void)
    74  {
    75  	int init_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    76  	int self_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    77  	const char *path_pid_1 = "/proc/1/ns/mnt";
    78  	const char *path_pid_self = "/proc/self/ns/mnt";
    79  
    80  	init_mnt_fd = open(path_pid_1,
    81  			   O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
    82  	if (init_mnt_fd < 0) {
    83  		die("cannot open path %s", path_pid_1);
    84  	}
    85  	self_mnt_fd = open(path_pid_self,
    86  			   O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
    87  	if (self_mnt_fd < 0) {
    88  		die("cannot open path %s", path_pid_1);
    89  	}
    90  	char init_buf[128] = { 0 };
    91  	char self_buf[128] = { 0 };
    92  	memset(init_buf, 0, sizeof init_buf);
    93  	if (readlinkat(init_mnt_fd, "", init_buf, sizeof init_buf) < 0) {
    94  		if (errno == ENOENT) {
    95  			// According to namespaces(7) on a pre 3.8 kernel the namespace
    96  			// files are hardlinks, not symlinks. If that happens readlinkat
    97  			// fails with ENOENT. As a quick workaround for this special-case
    98  			// functionality, just bail out and do nothing without raising an
    99  			// error.
   100  			return;
   101  		}
   102  		die("cannot read mount namespace identifier of pid 1");
   103  	}
   104  	memset(self_buf, 0, sizeof self_buf);
   105  	if (readlinkat(self_mnt_fd, "", self_buf, sizeof self_buf) < 0) {
   106  		die("cannot read mount namespace identifier of the current process");
   107  	}
   108  	if (memcmp(init_buf, self_buf, sizeof init_buf) != 0) {
   109  		debug("moving to mount namespace of pid 1");
   110  		// We cannot use O_NOFOLLOW here because that file will always be a
   111  		// symbolic link. We actually want to open it this way.
   112  		int init_mnt_fd_real SC_CLEANUP(sc_cleanup_close) = -1;
   113  		init_mnt_fd_real = open(path_pid_1, O_RDONLY | O_CLOEXEC);
   114  		if (init_mnt_fd_real < 0) {
   115  			die("cannot open %s", path_pid_1);
   116  		}
   117  		if (setns(init_mnt_fd_real, CLONE_NEWNS) < 0) {
   118  			die("cannot join mount namespace of pid 1");
   119  		}
   120  	}
   121  }
   122  
   123  void sc_initialize_mount_ns(unsigned int experimental_features)
   124  {
   125  	debug("unsharing snap namespace directory");
   126  
   127  	/* Ensure that /run/snapd/ns is a directory. */
   128  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   129  	if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) {
   130  		die("cannot create directory %s", sc_ns_dir);
   131  	}
   132  	(void)sc_set_effective_identity(old);
   133  
   134  	/* Read and analyze the mount table. We need to see whether /run/snapd/ns
   135  	 * is a mount point with private event propagation. */
   136  	sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   137  	info = sc_parse_mountinfo(NULL);
   138  	if (info == NULL) {
   139  		die("cannot parse /proc/self/mountinfo");
   140  	}
   141  
   142  	bool is_mnt = false;
   143  	bool is_private = false;
   144  	for (sc_mountinfo_entry * entry = sc_first_mountinfo_entry(info);
   145  	     entry != NULL; entry = sc_next_mountinfo_entry(entry)) {
   146  		/* Find /run/snapd/ns */
   147  		if (!sc_streq(entry->mount_dir, sc_ns_dir)) {
   148  			continue;
   149  		}
   150  		is_mnt = true;
   151  		if (strstr(entry->optional_fields, "shared:") == NULL) {
   152  			/* Mount event propagation is not set to shared, good. */
   153  			is_private = true;
   154  		}
   155  		break;
   156  	}
   157  
   158  	if (!is_mnt) {
   159  		if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) <
   160  		    0) {
   161  			die("cannot self-bind mount %s", sc_ns_dir);
   162  		}
   163  	}
   164  
   165  	if (!is_private) {
   166  		if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) {
   167  			die("cannot change propagation type to MS_PRIVATE in %s", sc_ns_dir);
   168  		}
   169  	}
   170  
   171  	/* code that follows is experimental */
   172  	if (experimental_features & SC_FEATURE_PARALLEL_INSTANCES) {
   173  		// Ensure that SNAP_MOUNT_DIR and /var/snap are shared mount points
   174  		debug
   175  		    ("(experimental) ensuring snap mount and data directories are mount points");
   176  		sc_ensure_snap_dir_shared_mounts();
   177  	}
   178  }
   179  
// State kept by snap-confine while working with the preserved mount namespace
// of one namespace group.
struct sc_mount_ns {
	// Name of the namespace group ($SNAP_NAME).
	char *name;
	// Descriptor to the namespace group control directory.  This descriptor is
	// opened with O_PATH|O_DIRECTORY so it's only used for openat() calls.
	int dir_fd;
	// Two pipes, each stored as a (read end, write end) pair of file
	// descriptors, that snap-confine uses to send messages to the helper
	// process and back. The helper reads commands from pipe_master[0] and
	// writes acknowledgements to pipe_helper[1]; the master process keeps
	// the opposite ends.
	int pipe_helper[2];
	int pipe_master[2];
	// Identifier of the child process that is used during the one-time (per
	// group) initialization and capture process. Zero means that no helper
	// has been forked.
	pid_t child;
};
   195  
   196  static struct sc_mount_ns *sc_alloc_mount_ns(void)
   197  {
   198  	struct sc_mount_ns *group = calloc(1, sizeof *group);
   199  	if (group == NULL) {
   200  		die("cannot allocate memory for sc_mount_ns");
   201  	}
   202  	group->dir_fd = -1;
   203  	group->pipe_helper[0] = -1;
   204  	group->pipe_helper[1] = -1;
   205  	group->pipe_master[0] = -1;
   206  	group->pipe_master[1] = -1;
   207  	// Redundant with calloc but some functions check for the non-zero value so
   208  	// I'd like to keep this explicit in the code.
   209  	group->child = 0;
   210  	return group;
   211  }
   212  
   213  struct sc_mount_ns *sc_open_mount_ns(const char *group_name)
   214  {
   215  	struct sc_mount_ns *group = sc_alloc_mount_ns();
   216  	group->dir_fd = open(sc_ns_dir,
   217  			     O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW);
   218  	if (group->dir_fd < 0) {
   219  		die("cannot open directory %s", sc_ns_dir);
   220  	}
   221  	group->name = sc_strdup(group_name);
   222  	return group;
   223  }
   224  
   225  void sc_close_mount_ns(struct sc_mount_ns *group)
   226  {
   227  	if (group->child != 0) {
   228  		sc_wait_for_helper(group);
   229  	}
   230  	sc_cleanup_close(&group->dir_fd);
   231  	sc_cleanup_close(&group->pipe_master[0]);
   232  	sc_cleanup_close(&group->pipe_master[1]);
   233  	sc_cleanup_close(&group->pipe_helper[0]);
   234  	sc_cleanup_close(&group->pipe_helper[1]);
   235  	free(group->name);
   236  	free(group);
   237  }
   238  
   239  static dev_t find_base_snap_device(const char *base_snap_name,
   240  				   const char *base_snap_rev)
   241  {
   242  	// Find the backing device of the base snap.
   243  	// TODO: add support for "try mode" base snaps that also need
   244  	// consideration of the mie->root component.
   245  	dev_t base_snap_dev = 0;
   246  	char base_squashfs_path[PATH_MAX];
   247  	sc_must_snprintf(base_squashfs_path,
   248  			 sizeof base_squashfs_path, "%s/%s/%s",
   249  			 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev);
   250  	sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   251  	mi = sc_parse_mountinfo(NULL);
   252  	if (mi == NULL) {
   253  		die("cannot parse mountinfo of the current process");
   254  	}
   255  	bool found = false;
   256  	for (sc_mountinfo_entry * mie =
   257  	     sc_first_mountinfo_entry(mi); mie != NULL;
   258  	     mie = sc_next_mountinfo_entry(mie)) {
   259  		if (sc_streq(mie->mount_dir, base_squashfs_path)) {
   260  			base_snap_dev = makedev(mie->dev_major, mie->dev_minor);
   261  			debug("block device of snap %s, revision %s is %d:%d",
   262  			      base_snap_name, base_snap_rev, mie->dev_major,
   263  			      mie->dev_minor);
   264  			// Don't break when found, we are interested in the last
   265  			// entry as this is the "effective" one.
   266  			found = true;
   267  		}
   268  	}
   269  	if (!found) {
   270  		die("cannot find mount entry for snap %s revision %s",
   271  		    base_snap_name, base_snap_rev);
   272  	}
   273  	return base_snap_dev;
   274  }
   275  
   276  static bool should_discard_current_ns(dev_t base_snap_dev)
   277  {
   278  	// Inspect the namespace and check if we should discard it.
   279  	//
   280  	// The namespace may become "stale" when the rootfs is not the same
   281  	// device we found above. This will happen whenever the base snap is
   282  	// refreshed since the namespace was first created.
   283  	sc_mountinfo_entry *mie;
   284  	sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   285  
   286  	mi = sc_parse_mountinfo(NULL);
   287  	if (mi == NULL) {
   288  		die("cannot parse mountinfo of the current process");
   289  	}
   290  	for (mie = sc_first_mountinfo_entry(mi); mie != NULL;
   291  	     mie = sc_next_mountinfo_entry(mie)) {
   292  		if (!sc_streq(mie->mount_dir, "/")) {
   293  			continue;
   294  		}
   295  		// NOTE: we want the initial rootfs just in case overmount
   296  		// was used to do something weird. The initial rootfs was
   297  		// set up by snap-confine and that is the one we want to
   298  		// measure.
   299  		debug("block device of the root filesystem is %d:%d",
   300  		      mie->dev_major, mie->dev_minor);
   301  		return base_snap_dev != makedev(mie->dev_major, mie->dev_minor);
   302  	}
   303  	die("cannot find mount entry of the root filesystem");
   304  }
   305  
/**
 * Verdict sent by the inspecting child process to its parent through an
 * eventfd. Values start at 1 because, as noted at the point where the value
 * is written, 0 cannot be used given the semantics of eventfd(2).
 **/
enum sc_discard_vote {
	/**
	 * SC_DISCARD_NO denotes that the mount namespace doesn't have to be
	 * discarded. This happens when the base snap has not changed.
	 **/
	SC_DISCARD_NO = 1,
	/**
	 * SC_DISCARD_SHOULD indicates that the mount namespace should be discarded
	 * but may be reused if it is still inhabited by processes. This only
	 * happens when the base snap revision changes but the name of the base
	 * snap is the same as before.
	 **/
	SC_DISCARD_SHOULD = 2,
	/**
	 * SC_DISCARD_MUST indicates that the mount namespace must be discarded
	 * even if it is still inhabited by processes. This only happens when the
	 * name of the base snap changes.
	 **/
	SC_DISCARD_MUST = 3,
};
   326  
   327  /**
   328   * is_base_transition returns true if a base transition is occurring.
   329   *
   330   * The function inspects /run/snapd/ns/snap.$SNAP_INSTANCE_NAME.info as well
   331   * as the invocation parameters of snap-confine. If the base snap name, as
   332   * encoded in the info file and as described by the invocation parameters
   333   * differ then a base transition is occurring. If the info file is absent or
   334   * does not record the name of the base snap then transition cannot be
   335   * detected.
   336  **/
   337  static bool is_base_transition(const sc_invocation * inv)
   338  {
   339  	char info_path[PATH_MAX] = { 0 };
   340  	sc_must_snprintf(info_path,
   341  			 sizeof info_path,
   342  			 "/run/snapd/ns/snap.%s.info", inv->snap_instance);
   343  
   344  	FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL;
   345  	stream = fopen(info_path, "r");
   346  	if (stream == NULL && errno == ENOENT) {
   347  		// If the info file is absent then we cannot decide if a transition had
   348  		// occurred. For people upgrading from snap-confine without the info
   349  		// file, that is the best we can do.
   350  		return false;
   351  	}
   352  	if (stream == NULL) {
   353  		die("cannot open %s", info_path);
   354  	}
   355  
   356  	char *base_snap_name SC_CLEANUP(sc_cleanup_string) = NULL;
   357  	sc_error *err = NULL;
   358  	if (sc_infofile_get_key
   359  	    (stream, "base-snap-name", &base_snap_name, &err) < 0) {
   360  		sc_die_on_error(err);
   361  	}
   362  
   363  	if (base_snap_name == NULL) {
   364  		// If the info file doesn't record the name of the base snap then,
   365  		// again, we cannot decide if a transition had occurred.
   366  		return false;
   367  	}
   368  
   369  	return !sc_streq(inv->orig_base_snap_name, base_snap_name);
   370  }
   371  
   372  // The namespace may be stale. To check this we must actually switch into it
   373  // but then we use up our setns call (the kernel misbehaves if we setns twice).
   374  // To work around this we'll fork a child and use it to probe. The child will
   375  // inspect the namespace and send information back via eventfd and then exit
   376  // unconditionally.
   377  static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
   378  						 const sc_invocation * inv,
   379  						 int snap_discard_ns_fd)
   380  {
   381  	char base_snap_rev[PATH_MAX] = { 0 };
   382  	dev_t base_snap_dev;
   383  	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;
   384  
   385  	// Read the revision of the base snap by looking at the current symlink.
   386  	if (readlink(inv->rootfs_dir, base_snap_rev, sizeof base_snap_rev) < 0) {
   387  		die("cannot read current revision of snap %s",
   388  		    inv->snap_instance);
   389  	}
   390  	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
   391  		die("cannot read current revision of snap %s: value too long",
   392  		    inv->snap_instance);
   393  	}
   394  	// Find the device that is backing the current revision of the base snap.
   395  	base_snap_dev =
   396  	    find_base_snap_device(inv->base_snap_name, base_snap_rev);
   397  
   398  	// Store the PID of this process. This is done instead of calls to
   399  	// getppid() below because then we can reliably track the PID of the
   400  	// parent even if the child process is re-parented.
   401  	pid_t parent = getpid();
   402  
   403  	// Create an eventfd for the communication with the child.
   404  	event_fd = eventfd(0, EFD_CLOEXEC);
   405  	if (event_fd < 0) {
   406  		die("cannot create eventfd");
   407  	}
   408  	// Fork a child, it will do the inspection for us.
   409  	pid_t child = fork();
   410  	if (child < 0) {
   411  		die("cannot fork support process");
   412  	}
   413  
   414  	if (child == 0) {
   415  		// This is the child process which will inspect the mount namespace.
   416  		//
   417  		// Configure the child to die as soon as the parent dies. In an odd
   418  		// case where the parent is killed then we don't want to complete our
   419  		// task or wait for anything.
   420  		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
   421  			die("cannot set parent process death notification signal to SIGINT");
   422  		}
   423  		// Check that parent process is still alive. If this is the case then
   424  		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
   425  		// us up from eventfd_read() below. In the rare case that the PID
   426  		// numbers overflow and the now-dead parent PID is recycled we will
   427  		// still hang forever on the read from eventfd below.
   428  		if (kill(parent, 0) < 0) {
   429  			switch (errno) {
   430  			case ESRCH:
   431  				debug("parent process has terminated");
   432  				abort();
   433  			default:
   434  				die("cannot confirm that parent process is alive");
   435  				break;
   436  			}
   437  		}
   438  
   439  		debug("joining preserved mount namespace for inspection");
   440  		// Move to the mount namespace of the snap we're trying to inspect.
   441  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   442  			die("cannot join preserved mount namespace");
   443  		}
   444  		// Check if the namespace needs to be discarded.
   445  		eventfd_t value = SC_DISCARD_NO;
   446  		const char *value_str = "no";
   447  
   448  		// TODO: enable this for core distributions. This is complex because on
   449  		// core the rootfs is mounted in initrd and is _not_ changed (no
   450  		// pivot_root) and the base snap is again mounted (2nd time) by
   451  		// systemd. This makes us end up in a situation where the outer base
   452  		// snap will never match the rootfs inside the mount namespace.
   453  		if (inv->is_normal_mode
   454  		    && should_discard_current_ns(base_snap_dev)) {
   455  			value = SC_DISCARD_SHOULD;
   456  			value_str = "should";
   457  
   458  		}
   459  		// If the base snap changed, we must discard the mount namespace and
   460  		// start over to allow the newly started process to see the requested
   461  		// base snap. Due to the TODO above always perform explicit transition
   462  		// check to protect against LP:#1819875 and LP:#1861901
   463  		if (is_base_transition(inv)) {
   464  			// The base snap has changed. We must discard ...
   465  			value = SC_DISCARD_MUST;
   466  			value_str = "must";
   467  		}
   468  		// Send this back to the parent: 3 - force discard 2 - prefer discard, 1 - keep.
   469  		// Note that we cannot just use 0 and 1 because of the semantics of eventfd(2).
   470  		if (eventfd_write(event_fd, value) < 0) {
   471  			die("cannot send information to %s preserved mount namespace", value_str);
   472  		}
   473  		// Exit, we're done.
   474  		exit(0);
   475  	}
   476  	// This is back in the parent process.
   477  	//
   478  	// Enable a sanity timeout in case the read blocks for unbound amount of
   479  	// time. This will ensure we will not hang around while holding the lock.
   480  	// Next, read the value written by the child process.
   481  	sc_enable_sanity_timeout();
   482  	eventfd_t value = 0;
   483  	if (eventfd_read(event_fd, &value) < 0) {
   484  		die("cannot read from eventfd");
   485  	}
   486  	sc_disable_sanity_timeout();
   487  
   488  	// Wait for the child process to exit and collect its exit status.
   489  	errno = 0;
   490  	int status = 0;
   491  	if (waitpid(child, &status, 0) < 0) {
   492  		die("cannot wait for the support process for mount namespace inspection");
   493  	}
   494  	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
   495  		die("support process for mount namespace inspection exited abnormally");
   496  	}
   497  	// If the namespace is up-to-date then we are done.
   498  	switch (value) {
   499  	case SC_DISCARD_NO:
   500  		debug("preserved mount is not stale, reusing");
   501  		return 0;
   502  	case SC_DISCARD_SHOULD:
   503  		if (sc_cgroup_is_v2()) {
   504  			debug
   505  			    ("WARNING: cgroup v2 detected, preserved mount namespace process presence check unsupported, discarding");
   506  			break;
   507  		}
   508  		if (sc_cgroup_freezer_occupied(inv->snap_instance)) {
   509  			// Some processes are still using the namespace so we cannot discard it
   510  			// as that would fracture the view that the set of processes inside
   511  			// have on what is mounted.
   512  			debug
   513  			    ("preserved mount namespace is stale but occupied, reusing");
   514  			return 0;
   515  		}
   516  		break;
   517  	case SC_DISCARD_MUST:
   518  		debug
   519  		    ("preserved mount namespace is stale and base snap has changed, discarding");
   520  		break;
   521  	}
   522  	sc_call_snap_discard_ns(snap_discard_ns_fd, inv->snap_instance);
   523  	return EAGAIN;
   524  }
   525  
// Forward declarations of the helper process functions defined further down
// in this file.
static void helper_fork(struct sc_mount_ns *group,
			struct sc_apparmor *apparmor);
static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor,
			pid_t parent);
static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent);
static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent);
   532  
   533  int sc_join_preserved_ns(struct sc_mount_ns *group, struct sc_apparmor
   534  			 *apparmor, const sc_invocation * inv,
   535  			 int snap_discard_ns_fd)
   536  {
   537  	// Open the mount namespace file.
   538  	char mnt_fname[PATH_MAX] = { 0 };
   539  	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.mnt", group->name);
   540  	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
   541  	// NOTE: There is no O_EXCL here because the file can be around but
   542  	// doesn't have to be a mounted namespace.
   543  	mnt_fd = openat(group->dir_fd, mnt_fname,
   544  			O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
   545  	if (mnt_fd < 0 && errno == ENOENT) {
   546  		return ESRCH;
   547  	}
   548  	if (mnt_fd < 0) {
   549  		die("cannot open preserved mount namespace %s", group->name);
   550  	}
   551  	// Check if we got an nsfs-based or procfs file or a regular file. This can
   552  	// be reliably tested because nsfs has an unique filesystem type
   553  	// NSFS_MAGIC.  On older kernels that don't support nsfs yet we can look
   554  	// for PROC_SUPER_MAGIC instead.
   555  	// We can just ensure that this is the case thanks to fstatfs.
   556  	struct statfs ns_statfs_buf;
   557  	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
   558  		die("cannot inspect filesystem of preserved mount namespace file");
   559  	}
   560  	// Stat the mount namespace as well, this is later used to check if the
   561  	// namespace is used by other processes if we are considering discarding a
   562  	// stale namespace.
   563  	struct stat ns_stat_buf;
   564  	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
   565  		die("cannot inspect preserved mount namespace file");
   566  	}
   567  #ifndef NSFS_MAGIC
   568  // Account for kernel headers old enough to not know about NSFS_MAGIC.
   569  #define NSFS_MAGIC 0x6e736673
   570  #endif
   571  	if (ns_statfs_buf.f_type == NSFS_MAGIC
   572  	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
   573  
   574  		// Inspect and perhaps discard the preserved mount namespace.
   575  		if (sc_inspect_and_maybe_discard_stale_ns
   576  		    (mnt_fd, inv, snap_discard_ns_fd) == EAGAIN) {
   577  			return ESRCH;
   578  		}
   579  		// Move to the mount namespace of the snap we're trying to start.
   580  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   581  			die("cannot join preserved mount namespace %s",
   582  			    group->name);
   583  		}
   584  		debug("joined preserved mount namespace %s", group->name);
   585  		return 0;
   586  	}
   587  	return ESRCH;
   588  }
   589  
   590  int sc_join_preserved_per_user_ns(struct sc_mount_ns *group,
   591  				  const char *snap_name)
   592  {
   593  	uid_t uid = getuid();
   594  	char mnt_fname[PATH_MAX] = { 0 };
   595  	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.%d.mnt", group->name,
   596  			 (int)uid);
   597  
   598  	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
   599  	mnt_fd = openat(group->dir_fd, mnt_fname,
   600  			O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
   601  	if (mnt_fd < 0 && errno == ENOENT) {
   602  		return ESRCH;
   603  	}
   604  	if (mnt_fd < 0) {
   605  		die("cannot open preserved mount namespace %s", group->name);
   606  	}
   607  	struct statfs ns_statfs_buf;
   608  	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
   609  		die("cannot inspect filesystem of preserved mount namespace file");
   610  	}
   611  	struct stat ns_stat_buf;
   612  	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
   613  		die("cannot inspect preserved mount namespace file");
   614  	}
   615  #ifndef NSFS_MAGIC
   616  	/* Define NSFS_MAGIC for Ubuntu 14.04 and other older systems. */
   617  #define NSFS_MAGIC 0x6e736673
   618  #endif
   619  	if (ns_statfs_buf.f_type == NSFS_MAGIC
   620  	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
   621  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   622  			die("cannot join preserved per-user mount namespace %s",
   623  			    group->name);
   624  		}
   625  		debug("joined preserved mount namespace %s", group->name);
   626  		return 0;
   627  	}
   628  	return ESRCH;
   629  }
   630  
   631  static void setup_signals_for_helper(void)
   632  {
   633  	/* Ignore the SIGPIPE signal so that we get EPIPE on the read / write
   634  	 * operations attempting to work with a closed pipe. This ensures that we
   635  	 * are not killed by the default disposition (terminate) and can return a
   636  	 * non-signal-death return code to the program invoking snap-confine. */
   637  	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
   638  		die("cannot install ignore handler for SIGPIPE");
   639  	}
   640  }
   641  
   642  static void teardown_signals_for_helper(void)
   643  {
   644  	/* Undo operations done by setup_signals_for_helper. */
   645  	if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) {
   646  		die("cannot restore default handler for SIGPIPE");
   647  	}
   648  }
   649  
   650  static void helper_fork(struct sc_mount_ns *group, struct sc_apparmor *apparmor)
   651  {
   652  	// Create a pipe for sending commands to the helper process.
   653  	if (pipe2(group->pipe_master, O_CLOEXEC | O_DIRECT) < 0) {
   654  		die("cannot create pipes for commanding the helper process");
   655  	}
   656  	if (pipe2(group->pipe_helper, O_CLOEXEC | O_DIRECT) < 0) {
   657  		die("cannot create pipes for responding to master process");
   658  	}
   659  	// Store the PID of the "parent" process. This done instead of calls to
   660  	// getppid() because then we can reliably track the PID of the parent even
   661  	// if the child process is re-parented.
   662  	pid_t parent = getpid();
   663  
   664  	// For rationale of forking see this:
   665  	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
   666  	pid_t pid = fork();
   667  	if (pid < 0) {
   668  		die("cannot fork helper process for mount namespace capture");
   669  	}
   670  	if (pid == 0) {
   671  		/* helper */
   672  		sc_cleanup_close(&group->pipe_master[1]);
   673  		sc_cleanup_close(&group->pipe_helper[0]);
   674  		helper_main(group, apparmor, parent);
   675  	} else {
   676  		setup_signals_for_helper();
   677  
   678  		/* master */
   679  		sc_cleanup_close(&group->pipe_master[0]);
   680  		sc_cleanup_close(&group->pipe_helper[1]);
   681  
   682  		// Glibc defines pid as a signed 32bit integer. There's no standard way to
   683  		// print pid's portably so this is the best we can do.
   684  		debug("forked support process %d", (int)pid);
   685  		group->child = pid;
   686  	}
   687  }
   688  
   689  static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor,
   690  			pid_t parent)
   691  {
   692  	// This is the child process which will capture the mount namespace.
   693  	//
   694  	// It will do so by bind-mounting the .mnt after the parent process calls
   695  	// unshare() and finishes setting up the namespace completely. Change the
   696  	// hat to a sub-profile that has limited permissions necessary to
   697  	// accomplish the capture of the mount namespace.
   698  	sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0);
   699  	// Configure the child to die as soon as the parent dies. In an odd
   700  	// case where the parent is killed then we don't want to complete our
   701  	// task or wait for anything.
   702  	if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
   703  		die("cannot set parent process death notification signal to SIGINT");
   704  	}
   705  	// Check that parent process is still alive. If this is the case then we
   706  	// can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake us up
   707  	// from read(2) below. In the rare case that the PID numbers overflow and
   708  	// the now-dead parent PID is recycled we will still hang forever on the
   709  	// read from the pipe below.
   710  	if (kill(parent, 0) < 0) {
   711  		switch (errno) {
   712  		case ESRCH:
   713  			// When snap-confine executes it will fork a helper process. That
   714  			// process establishes an elaborate dance to ensure both itself and
   715  			// the parent are operating exactly as specified, so that no
   716  			// processes are left behind for unbound amount of time. As a part
   717  			// of that dance the child pings the parent to ensure it is still
   718  			// alive after establishing a notification signal to be sent in
   719  			// case the parent dies. This is a race avoidance mechanism, we set
   720  			// up the notification and then check if the parent is alive by the
   721  			// time we are done.
   722  			//
   723  			// In the case when the parent does go away we used to call
   724  			// abort(). On some distributions this would trigger an unclean
   725  			// process termination error report to be sent. One such example is
   726  			// the Ubuntu error tracker. Since the parent process can be
   727  			// legitimately interrupted and killed, this should not generate an
   728  			// error report. As such, perform clean exit in this specific case.
   729  			debug("parent process has terminated");
   730  			exit(0);
   731  		default:
   732  			die("cannot confirm that parent process is alive");
   733  			break;
   734  		}
   735  	}
   736  	if (fchdir(group->dir_fd) < 0) {
   737  		die("cannot move to directory with preserved namespaces");
   738  	}
   739  	int command = -1;
   740  	int run = 1;
   741  	while (run) {
   742  		debug("helper process waiting for command");
   743  		sc_enable_sanity_timeout();
   744  		if (read(group->pipe_master[0], &command, sizeof command) < 0) {
   745  			int saved_errno = errno;
   746  			// This will ensure we get the correct error message
   747  			// if there is a read error because the timeout
   748  			// expired.
   749  			sc_disable_sanity_timeout();
   750  			errno = saved_errno;
   751  			die("cannot read command from the pipe");
   752  		}
   753  		sc_disable_sanity_timeout();
   754  		debug("helper process received command %d", command);
   755  		switch (command) {
   756  		case HELPER_CMD_EXIT:
   757  			run = 0;
   758  			break;
   759  		case HELPER_CMD_CAPTURE_MOUNT_NS:
   760  			helper_capture_ns(group, parent);
   761  			break;
   762  		case HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS:
   763  			helper_capture_per_user_ns(group, parent);
   764  			break;
   765  		}
   766  		if (write(group->pipe_helper[1], &command, sizeof command) < 0) {
   767  			die("cannot write ack");
   768  		}
   769  	}
   770  	debug("helper process exiting");
   771  	exit(0);
   772  }
   773  
   774  static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent)
   775  {
   776  	char src[PATH_MAX] = { 0 };
   777  	char dst[PATH_MAX] = { 0 };
   778  
   779  	debug("capturing per-snap mount namespace");
   780  	sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
   781  	sc_must_snprintf(dst, sizeof dst, "%s.mnt", group->name);
   782  
   783  	/* Ensure the bind mount destination exists. */
   784  	int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600);
   785  	if (fd < 0) {
   786  		die("cannot create file %s", dst);
   787  	}
   788  	close(fd);
   789  
   790  	if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
   791  		die("cannot preserve mount namespace of process %d as %s",
   792  		    (int)parent, dst);
   793  	}
   794  	debug("mount namespace of process %d preserved as %s",
   795  	      (int)parent, dst);
   796  }
   797  
   798  static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent)
   799  {
   800  	char src[PATH_MAX] = { 0 };
   801  	char dst[PATH_MAX] = { 0 };
   802  	uid_t uid = getuid();
   803  
   804  	debug("capturing per-snap, per-user mount namespace");
   805  	sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
   806  	sc_must_snprintf(dst, sizeof dst, "%s.%d.mnt", group->name, (int)uid);
   807  
   808  	/* Ensure the bind mount destination exists. */
   809  	int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600);
   810  	if (fd < 0) {
   811  		die("cannot create file %s", dst);
   812  	}
   813  	close(fd);
   814  
   815  	if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
   816  		die("cannot preserve per-user mount namespace of process %d as %s", (int)parent, dst);
   817  	}
   818  	debug("per-user mount namespace of process %d preserved as %s",
   819  	      (int)parent, dst);
   820  }
   821  
   822  static void sc_message_capture_helper(struct sc_mount_ns *group, int command_id)
   823  {
   824  	int ack;
   825  	if (group->child == 0) {
   826  		die("precondition failed: we don't have a helper process");
   827  	}
   828  	if (group->pipe_master[1] < 0) {
   829  		die("precondition failed: we don't have a pipe");
   830  	}
   831  	if (group->pipe_helper[0] < 0) {
   832  		die("precondition failed: we don't have a pipe");
   833  	}
   834  	debug("sending command %d to helper process (pid: %d)",
   835  	      command_id, group->child);
   836  	if (write(group->pipe_master[1], &command_id, sizeof command_id) < 0) {
   837  		die("cannot send command %d to helper process", command_id);
   838  	}
   839  	debug("waiting for response from helper");
   840  	int read_n = read(group->pipe_helper[0], &ack, sizeof ack);
   841  	if (read_n < 0) {
   842  		die("cannot receive ack from helper process");
   843  	}
   844  	if (read_n == 0) {
   845  		die("unexpected eof from helper process");
   846  	}
   847  }
   848  
   849  static void sc_wait_for_capture_helper(struct sc_mount_ns *group)
   850  {
   851  	if (group->child == 0) {
   852  		die("precondition failed: we don't have a helper process");
   853  	}
   854  	debug("waiting for the helper process to exit");
   855  	int status = 0;
   856  	errno = 0;
   857  	if (waitpid(group->child, &status, 0) < 0) {
   858  		die("cannot wait for the helper process");
   859  	}
   860  	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
   861  		die("helper process exited abnormally");
   862  	}
   863  	debug("helper process exited normally");
   864  	group->child = 0;
   865  	teardown_signals_for_helper();
   866  }
   867  
   868  void sc_fork_helper(struct sc_mount_ns *group, struct sc_apparmor *apparmor)
   869  {
   870  	helper_fork(group, apparmor);
   871  }
   872  
   873  void sc_preserve_populated_mount_ns(struct sc_mount_ns *group)
   874  {
   875  	sc_message_capture_helper(group, HELPER_CMD_CAPTURE_MOUNT_NS);
   876  }
   877  
   878  void sc_preserve_populated_per_user_mount_ns(struct sc_mount_ns *group)
   879  {
   880  	sc_message_capture_helper(group, HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS);
   881  }
   882  
   883  void sc_wait_for_helper(struct sc_mount_ns *group)
   884  {
   885  	sc_message_capture_helper(group, HELPER_CMD_EXIT);
   886  	sc_wait_for_capture_helper(group);
   887  }
   888  
   889  void sc_store_ns_info(const sc_invocation * inv)
   890  {
   891  	FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL;
   892  	char info_path[PATH_MAX] = { 0 };
   893  	sc_must_snprintf(info_path, sizeof info_path,
   894  			 "/run/snapd/ns/snap.%s.info", inv->snap_instance);
   895  	int fd = -1;
   896  	fd = open(info_path,
   897  		  O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW, 0644);
   898  	if (fd < 0) {
   899  		die("cannot open %s", info_path);
   900  	}
   901  	if (fchown(fd, 0, 0) < 0) {
   902  		die("cannot chown %s to root.root", info_path);
   903  	}
   904  	// The stream now owns the file descriptor.
   905  	stream = fdopen(fd, "w");
   906  	if (stream == NULL) {
   907  		die("cannot get stream from file descriptor");
   908  	}
   909  	fprintf(stream, "base-snap-name=%s\n", inv->orig_base_snap_name);
   910  	if (ferror(stream) != 0) {
   911  		die("I/O error when writing to %s", info_path);
   912  	}
   913  	if (fflush(stream) == EOF) {
   914  		die("cannot flush %s", info_path);
   915  	}
   916  	debug("saved mount namespace meta-data to %s", info_path);
   917  }