github.com/tompreston/snapd@v0.0.0-20210817193607-954edfcb9611/cmd/snap-confine/ns-support.c

github.com/tompreston/snapd@v0.0.0-20210817193607-954edfcb9611/cmd/snap-confine/ns-support.c (about)

     1  /*
     2   * Copyright (C) 2016 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  
    18  #include "ns-support.h"
    19  
    20  #ifdef HAVE_CONFIG_H
    21  #include "config.h"
    22  #endif
    23  
    24  #include <errno.h>
    25  #include <fcntl.h>
    26  #include <linux/magic.h>
    27  #include <sched.h>
    28  #include <signal.h>
    29  #include <string.h>
    30  #include <sys/eventfd.h>
    31  #include <sys/file.h>
    32  #include <sys/mount.h>
    33  #include <sys/prctl.h>
    34  #include <sys/stat.h>
    35  #include <sys/sysmacros.h>
    36  #include <sys/types.h>
    37  #include <sys/vfs.h>
    38  #include <sys/wait.h>
    39  #include <unistd.h>
    40  
    41  #include "../libsnap-confine-private/cgroup-freezer-support.h"
    42  #include "../libsnap-confine-private/cgroup-support.h"
    43  #include "../libsnap-confine-private/classic.h"
    44  #include "../libsnap-confine-private/cleanup-funcs.h"
    45  #include "../libsnap-confine-private/feature.h"
    46  #include "../libsnap-confine-private/infofile.h"
    47  #include "../libsnap-confine-private/locking.h"
    48  #include "../libsnap-confine-private/mountinfo.h"
    49  #include "../libsnap-confine-private/string-utils.h"
    50  #include "../libsnap-confine-private/tool.h"
    51  #include "../libsnap-confine-private/utils.h"
    52  #include "user-support.h"
    53  #include "mount-support.h"
    54  
/**
 * Directory where snap-confine keeps namespace files.
 **/
#define SC_NS_DIR "/run/snapd/ns"

/**
 * Effective value of SC_NS_DIR.
 *
 * We use 'const char *' so we can update sc_ns_dir in the testsuite
 **/
static const char *sc_ns_dir = SC_NS_DIR;

/**
 * Commands understood by the helper process (see helper_main).
 *
 * A command is transferred as a single int over the command pipe. After the
 * helper has handled a command it writes the same value back on the response
 * pipe as an acknowledgement.
 **/
enum {
	// Leave the command loop; the helper process exits afterwards.
	HELPER_CMD_EXIT,
	// Run helper_capture_ns() in the helper process.
	HELPER_CMD_CAPTURE_MOUNT_NS,
	// Run helper_capture_per_user_ns() in the helper process.
	HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS,
};
    72  
    73  void sc_reassociate_with_pid1_mount_ns(void)
    74  {
    75  	int init_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    76  	int self_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    77  	const char *path_pid_1 = "/proc/1/ns/mnt";
    78  	const char *path_pid_self = "/proc/self/ns/mnt";
    79  
    80  	init_mnt_fd = open(path_pid_1,
    81  			   O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
    82  	if (init_mnt_fd < 0) {
    83  		die("cannot open path %s", path_pid_1);
    84  	}
    85  	self_mnt_fd = open(path_pid_self,
    86  			   O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
    87  	if (self_mnt_fd < 0) {
    88  		die("cannot open path %s", path_pid_1);
    89  	}
    90  	char init_buf[128] = { 0 };
    91  	char self_buf[128] = { 0 };
    92  	memset(init_buf, 0, sizeof init_buf);
    93  	if (readlinkat(init_mnt_fd, "", init_buf, sizeof init_buf) < 0) {
    94  		if (errno == ENOENT) {
    95  			// According to namespaces(7) on a pre 3.8 kernel the namespace
    96  			// files are hardlinks, not symlinks. If that happens readlinkat
    97  			// fails with ENOENT. As a quick workaround for this special-case
    98  			// functionality, just bail out and do nothing without raising an
    99  			// error.
   100  			return;
   101  		}
   102  		die("cannot read mount namespace identifier of pid 1");
   103  	}
   104  	memset(self_buf, 0, sizeof self_buf);
   105  	if (readlinkat(self_mnt_fd, "", self_buf, sizeof self_buf) < 0) {
   106  		die("cannot read mount namespace identifier of the current process");
   107  	}
   108  	if (memcmp(init_buf, self_buf, sizeof init_buf) != 0) {
   109  		debug("moving to mount namespace of pid 1");
   110  		// We cannot use O_NOFOLLOW here because that file will always be a
   111  		// symbolic link. We actually want to open it this way.
   112  		int init_mnt_fd_real SC_CLEANUP(sc_cleanup_close) = -1;
   113  		init_mnt_fd_real = open(path_pid_1, O_RDONLY | O_CLOEXEC);
   114  		if (init_mnt_fd_real < 0) {
   115  			die("cannot open %s", path_pid_1);
   116  		}
   117  		if (setns(init_mnt_fd_real, CLONE_NEWNS) < 0) {
   118  			die("cannot join mount namespace of pid 1");
   119  		}
   120  	}
   121  }
   122  
   123  void sc_initialize_mount_ns(unsigned int experimental_features)
   124  {
   125  	debug("unsharing snap namespace directory");
   126  
   127  	/* Ensure that /run/snapd/ns is a directory. */
   128  	sc_identity old = sc_set_effective_identity(sc_root_group_identity());
   129  	if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) {
   130  		die("cannot create directory %s", sc_ns_dir);
   131  	}
   132  	(void)sc_set_effective_identity(old);
   133  
   134  	/* Read and analyze the mount table. We need to see whether /run/snapd/ns
   135  	 * is a mount point with private event propagation. */
   136  	sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   137  	info = sc_parse_mountinfo(NULL);
   138  	if (info == NULL) {
   139  		die("cannot parse /proc/self/mountinfo");
   140  	}
   141  
   142  	bool is_mnt = false;
   143  	bool is_private = false;
   144  	for (sc_mountinfo_entry * entry = sc_first_mountinfo_entry(info);
   145  	     entry != NULL; entry = sc_next_mountinfo_entry(entry)) {
   146  		/* Find /run/snapd/ns */
   147  		if (!sc_streq(entry->mount_dir, sc_ns_dir)) {
   148  			continue;
   149  		}
   150  		is_mnt = true;
   151  		if (strstr(entry->optional_fields, "shared:") == NULL) {
   152  			/* Mount event propagation is not set to shared, good. */
   153  			is_private = true;
   154  		}
   155  		break;
   156  	}
   157  
   158  	if (!is_mnt) {
   159  		if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) <
   160  		    0) {
   161  			die("cannot self-bind mount %s", sc_ns_dir);
   162  		}
   163  	}
   164  
   165  	if (!is_private) {
   166  		if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) {
   167  			die("cannot change propagation type to MS_PRIVATE in %s", sc_ns_dir);
   168  		}
   169  	}
   170  
   171  	/* code that follows is experimental */
   172  	if (experimental_features & SC_FEATURE_PARALLEL_INSTANCES) {
   173  		// Ensure that SNAP_MOUNT_DIR and /var/snap are shared mount points
   174  		debug
   175  		    ("(experimental) ensuring snap mount and data directories are mount points");
   176  		sc_ensure_snap_dir_shared_mounts();
   177  	}
   178  }
   179  
// State of one mount namespace group: the directory holding preserved
// namespace files plus what is needed to talk to the helper process.
struct sc_mount_ns {
	// Name of the namespace group ($SNAP_NAME).
	char *name;
	// Descriptor to the namespace group control directory.  This descriptor is
	// opened with O_PATH|O_DIRECTORY so it is only usable as a directory
	// reference, e.g. for openat() and fchdir() calls.
	int dir_fd;
	// Two pipes (each a pair of read end, write end) that connect
	// snap-confine and the helper process. pipe_master carries commands from
	// snap-confine to the helper; pipe_helper carries acknowledgements from
	// the helper back to snap-confine.
	int pipe_helper[2];
	int pipe_master[2];
	// Identifier of the child process that is used during the one-time (per
	// group) initialization and capture process. Zero when no helper has been
	// forked.
	pid_t child;
};
   195  
   196  static struct sc_mount_ns *sc_alloc_mount_ns(void)
   197  {
   198  	struct sc_mount_ns *group = calloc(1, sizeof *group);
   199  	if (group == NULL) {
   200  		die("cannot allocate memory for sc_mount_ns");
   201  	}
   202  	group->dir_fd = -1;
   203  	group->pipe_helper[0] = -1;
   204  	group->pipe_helper[1] = -1;
   205  	group->pipe_master[0] = -1;
   206  	group->pipe_master[1] = -1;
   207  	// Redundant with calloc but some functions check for the non-zero value so
   208  	// I'd like to keep this explicit in the code.
   209  	group->child = 0;
   210  	return group;
   211  }
   212  
   213  struct sc_mount_ns *sc_open_mount_ns(const char *group_name)
   214  {
   215  	struct sc_mount_ns *group = sc_alloc_mount_ns();
   216  	group->dir_fd = open(sc_ns_dir,
   217  			     O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW);
   218  	if (group->dir_fd < 0) {
   219  		die("cannot open directory %s", sc_ns_dir);
   220  	}
   221  	group->name = sc_strdup(group_name);
   222  	return group;
   223  }
   224  
   225  void sc_close_mount_ns(struct sc_mount_ns *group)
   226  {
   227  	if (group->child != 0) {
   228  		sc_wait_for_helper(group);
   229  	}
   230  	sc_cleanup_close(&group->dir_fd);
   231  	sc_cleanup_close(&group->pipe_master[0]);
   232  	sc_cleanup_close(&group->pipe_master[1]);
   233  	sc_cleanup_close(&group->pipe_helper[0]);
   234  	sc_cleanup_close(&group->pipe_helper[1]);
   235  	free(group->name);
   236  	free(group);
   237  }
   238  
   239  static dev_t find_base_snap_device(const char *base_snap_name,
   240  				   const char *base_snap_rev)
   241  {
   242  	// Find the backing device of the base snap.
   243  	// TODO: add support for "try mode" base snaps that also need
   244  	// consideration of the mie->root component.
   245  	dev_t base_snap_dev = 0;
   246  	char base_squashfs_path[PATH_MAX];
   247  	sc_must_snprintf(base_squashfs_path,
   248  			 sizeof base_squashfs_path, "%s/%s/%s",
   249  			 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev);
   250  	sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   251  	mi = sc_parse_mountinfo(NULL);
   252  	if (mi == NULL) {
   253  		die("cannot parse mountinfo of the current process");
   254  	}
   255  	bool found = false;
   256  	for (sc_mountinfo_entry * mie =
   257  	     sc_first_mountinfo_entry(mi); mie != NULL;
   258  	     mie = sc_next_mountinfo_entry(mie)) {
   259  		if (sc_streq(mie->mount_dir, base_squashfs_path)) {
   260  			base_snap_dev = makedev(mie->dev_major, mie->dev_minor);
   261  			debug("block device of snap %s, revision %s is %d:%d",
   262  			      base_snap_name, base_snap_rev, mie->dev_major,
   263  			      mie->dev_minor);
   264  			// Don't break when found, we are interested in the last
   265  			// entry as this is the "effective" one.
   266  			found = true;
   267  		}
   268  	}
   269  	if (!found) {
   270  		die("cannot find mount entry for snap %s revision %s",
   271  		    base_snap_name, base_snap_rev);
   272  	}
   273  	return base_snap_dev;
   274  }
   275  
   276  static bool should_discard_current_ns(dev_t base_snap_dev)
   277  {
   278  	// Inspect the namespace and check if we should discard it.
   279  	//
   280  	// The namespace may become "stale" when the rootfs is not the same
   281  	// device we found above. This will happen whenever the base snap is
   282  	// refreshed since the namespace was first created.
   283  	sc_mountinfo_entry *mie;
   284  	sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   285  
   286  	mi = sc_parse_mountinfo(NULL);
   287  	if (mi == NULL) {
   288  		die("cannot parse mountinfo of the current process");
   289  	}
   290  	for (mie = sc_first_mountinfo_entry(mi); mie != NULL;
   291  	     mie = sc_next_mountinfo_entry(mie)) {
   292  		if (!sc_streq(mie->mount_dir, "/")) {
   293  			continue;
   294  		}
   295  		// NOTE: we want the initial rootfs just in case overmount
   296  		// was used to do something weird. The initial rootfs was
   297  		// set up by snap-confine and that is the one we want to
   298  		// measure.
   299  		debug("block device of the root filesystem is %d:%d",
   300  		      mie->dev_major, mie->dev_minor);
   301  		return base_snap_dev != makedev(mie->dev_major, mie->dev_minor);
   302  	}
   303  	die("cannot find mount entry of the root filesystem");
   304  }
   305  
/**
 * Verdict of the inspection of a preserved mount namespace.
 *
 * The value is passed from the inspecting child process to its parent
 * through an eventfd, which is why none of the constants is zero.
 **/
enum sc_discard_vote {
	/**
	 * SC_DISCARD_NO denotes that the mount namespace doesn't have to be
	 * discarded. This happens when the base snap has not changed.
	 **/
	SC_DISCARD_NO = 1,
	/**
	 * SC_DISCARD_SHOULD indicates that the mount namespace should be discarded
	 * but may be reused if it is still inhabited by processes. This only
	 * happens when the base snap revision changes but the name of the base
	 * snap is the same as before.
	 **/
	SC_DISCARD_SHOULD = 2,
	/**
	 * SC_DISCARD_MUST indicates that the mount namespace must be discarded
	 * even if it is still inhabited by processes. This only happens when the
	 * name of the base snap changes.
	 **/
	SC_DISCARD_MUST = 3,
};
   326  
   327  /**
   328   * is_base_transition returns true if a base transition is occurring.
   329   *
   330   * The function inspects /run/snapd/ns/snap.$SNAP_INSTANCE_NAME.info as well
   331   * as the invocation parameters of snap-confine. If the base snap name, as
   332   * encoded in the info file and as described by the invocation parameters
   333   * differ then a base transition is occurring. If the info file is absent or
   334   * does not record the name of the base snap then transition cannot be
   335   * detected.
   336  **/
   337  static bool is_base_transition(const sc_invocation * inv)
   338  {
   339  	char info_path[PATH_MAX] = { 0 };
   340  	sc_must_snprintf(info_path,
   341  			 sizeof info_path,
   342  			 "/run/snapd/ns/snap.%s.info", inv->snap_instance);
   343  
   344  	FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL;
   345  	stream = fopen(info_path, "r");
   346  	if (stream == NULL && errno == ENOENT) {
   347  		// If the info file is absent then we cannot decide if a transition had
   348  		// occurred. For people upgrading from snap-confine without the info
   349  		// file, that is the best we can do.
   350  		return false;
   351  	}
   352  	if (stream == NULL) {
   353  		die("cannot open %s", info_path);
   354  	}
   355  
   356  	char *base_snap_name SC_CLEANUP(sc_cleanup_string) = NULL;
   357  	sc_error *err = NULL;
   358  	if (sc_infofile_get_key
   359  	    (stream, "base-snap-name", &base_snap_name, &err) < 0) {
   360  		sc_die_on_error(err);
   361  	}
   362  
   363  	if (base_snap_name == NULL) {
   364  		// If the info file doesn't record the name of the base snap then,
   365  		// again, we cannot decide if a transition had occurred.
   366  		return false;
   367  	}
   368  
   369  	return !sc_streq(inv->orig_base_snap_name, base_snap_name);
   370  }
   371  
// Forward declaration; used below when deciding whether a stale namespace
// can be discarded.
static bool sc_is_mount_ns_in_use(const char *snap_instance);

// The namespace may be stale. To check this we must actually switch into it
// but then we use up our setns call (the kernel misbehaves if we setns twice).
// To work around this we'll fork a child and use it to probe. The child will
// inspect the namespace and send information back via eventfd and then exit
// unconditionally.
//
// Parameters:
//   mnt_fd             - descriptor of the preserved mount namespace file; the
//                        child passes it to setns().
//   inv                - invocation parameters (rootfs_dir, base snap names,
//                        snap instance name and is_normal_mode are read).
//   snap_discard_ns_fd - passed on to sc_call_snap_discard_ns() when the
//                        namespace is discarded.
//
// Returns 0 when the preserved namespace is to be reused and EAGAIN after
// sc_call_snap_discard_ns() was called for it. All errors are fatal.
static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
						 const sc_invocation * inv,
						 int snap_discard_ns_fd)
{
	char base_snap_rev[PATH_MAX] = { 0 };
	dev_t base_snap_dev;
	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;

	// Read the revision of the base snap by looking at the current symlink.
	if (readlink(inv->rootfs_dir, base_snap_rev, sizeof base_snap_rev) < 0) {
		die("cannot read current revision of snap %s",
		    inv->snap_instance);
	}
	// readlink() does not terminate the result. The buffer was zeroed above,
	// so a non-zero last byte means the link target filled the whole buffer
	// and may have been truncated.
	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
		die("cannot read current revision of snap %s: value too long",
		    inv->snap_instance);
	}
	// Find the device that is backing the current revision of the base snap.
	// This is done here, before the fork, i.e. in the mount namespace of the
	// calling process and not in the preserved one.
	base_snap_dev =
	    find_base_snap_device(inv->base_snap_name, base_snap_rev);

	// Store the PID of this process. This is done instead of calls to
	// getppid() below because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();

	// Create an eventfd for the communication with the child.
	event_fd = eventfd(0, EFD_CLOEXEC);
	if (event_fd < 0) {
		die("cannot create eventfd");
	}
	// Fork a child, it will do the inspection for us.
	pid_t child = fork();
	if (child < 0) {
		die("cannot fork support process");
	}

	if (child == 0) {
		// This is the child process which will inspect the mount namespace.
		//
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that parent process is still alive. If this is the case then
		// we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake
		// us up from eventfd_read() below. In the rare case that the PID
		// numbers overflow and the now-dead parent PID is recycled we will
		// still hang forever on the read from eventfd below.
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has terminated");
				abort();
			default:
				die("cannot confirm that parent process is alive");
				break;
			}
		}

		debug("joining preserved mount namespace for inspection");
		// Move to the mount namespace of the snap we're trying to inspect.
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot join preserved mount namespace");
		}
		// Check if the namespace needs to be discarded.
		eventfd_t value = SC_DISCARD_NO;
		const char *value_str = "no";

		// TODO: enable this for core distributions. This is complex because on
		// core the rootfs is mounted in initrd and is _not_ changed (no
		// pivot_root) and the base snap is again mounted (2nd time) by
		// systemd. This makes us end up in a situation where the outer base
		// snap will never match the rootfs inside the mount namespace.
		if (inv->is_normal_mode
		    && should_discard_current_ns(base_snap_dev)) {
			value = SC_DISCARD_SHOULD;
			value_str = "should";
		}
		// If the base snap changed, we must discard the mount namespace and
		// start over to allow the newly started process to see the requested
		// base snap. Due to the TODO above always perform explicit transition
		// check to protect against LP:#1819875 and LP:#1861901
		//
		// This check comes last so that "must" overrides a "should" verdict
		// from the check above.
		if (is_base_transition(inv)) {
			// The base snap has changed. We must discard ...
			value = SC_DISCARD_MUST;
			value_str = "must";
		}
		// Send this back to the parent: 3 - force discard 2 - prefer discard, 1 - keep.
		// Note that we cannot just use 0 and 1 because of the semantics of eventfd(2).
		if (eventfd_write(event_fd, value) < 0) {
			die("cannot send information to %s preserved mount namespace", value_str);
		}
		// Exit, we're done.
		exit(0);
	}
	// This is back in the parent process.
	//
	// Enable a sanity timeout in case the read blocks for unbound amount of
	// time. This will ensure we will not hang around while holding the lock.
	// Next, read the value written by the child process.
	sc_enable_sanity_timeout();
	eventfd_t value = 0;
	if (eventfd_read(event_fd, &value) < 0) {
		die("cannot read from eventfd");
	}
	sc_disable_sanity_timeout();

	// Wait for the child process to exit and collect its exit status.
	errno = 0;
	int status = 0;
	if (waitpid(child, &status, 0) < 0) {
		die("cannot wait for the support process for mount namespace inspection");
	}
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("support process for mount namespace inspection exited abnormally");
	}
	// If the namespace is up-to-date then we are done.
	// NOTE: there is no default case; a value that is none of the three
	// votes leaves the switch and ends up discarding the namespace as well.
	switch (value) {
	case SC_DISCARD_NO:
		debug("preserved mount is not stale, reusing");
		return 0;
	case SC_DISCARD_SHOULD:
		if (sc_is_mount_ns_in_use(inv->snap_instance)) {
			// Some processes are still using the namespace so we cannot discard it
			// as that would fracture the view that the set of processes inside
			// have on what is mounted.
			debug
			    ("preserved mount namespace is stale but occupied, reusing");
			return 0;
		}
		// Stale and unoccupied: leave the switch and discard.
		break;
	case SC_DISCARD_MUST:
		debug
		    ("preserved mount namespace is stale and base snap has changed, discarding");
		break;
	}
	// Discard the namespace; EAGAIN tells the caller that it did not get a
	// namespace to join.
	sc_call_snap_discard_ns(snap_discard_ns_fd, inv->snap_instance);
	return EAGAIN;
}
   521  
// Forward declarations of the functions implementing the helper process.
// Their definitions follow further down in this file.
static void helper_fork(struct sc_mount_ns *group,
			struct sc_apparmor *apparmor);
static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor,
			pid_t parent);
static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent);
static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent);
   528  
   529  int sc_join_preserved_ns(struct sc_mount_ns *group, struct sc_apparmor
   530  			 *apparmor, const sc_invocation * inv,
   531  			 int snap_discard_ns_fd)
   532  {
   533  	// Open the mount namespace file.
   534  	char mnt_fname[PATH_MAX] = { 0 };
   535  	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.mnt", group->name);
   536  	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
   537  	// NOTE: There is no O_EXCL here because the file can be around but
   538  	// doesn't have to be a mounted namespace.
   539  	mnt_fd = openat(group->dir_fd, mnt_fname,
   540  			O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
   541  	if (mnt_fd < 0 && errno == ENOENT) {
   542  		return ESRCH;
   543  	}
   544  	if (mnt_fd < 0) {
   545  		die("cannot open preserved mount namespace %s", group->name);
   546  	}
   547  	// Check if we got an nsfs-based or procfs file or a regular file. This can
   548  	// be reliably tested because nsfs has an unique filesystem type
   549  	// NSFS_MAGIC.  On older kernels that don't support nsfs yet we can look
   550  	// for PROC_SUPER_MAGIC instead.
   551  	// We can just ensure that this is the case thanks to fstatfs.
   552  	struct statfs ns_statfs_buf;
   553  	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
   554  		die("cannot inspect filesystem of preserved mount namespace file");
   555  	}
   556  	// Stat the mount namespace as well, this is later used to check if the
   557  	// namespace is used by other processes if we are considering discarding a
   558  	// stale namespace.
   559  	struct stat ns_stat_buf;
   560  	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
   561  		die("cannot inspect preserved mount namespace file");
   562  	}
   563  #ifndef NSFS_MAGIC
   564  // Account for kernel headers old enough to not know about NSFS_MAGIC.
   565  #define NSFS_MAGIC 0x6e736673
   566  #endif
   567  	if (ns_statfs_buf.f_type == NSFS_MAGIC
   568  	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
   569  
   570  		// Inspect and perhaps discard the preserved mount namespace.
   571  		if (sc_inspect_and_maybe_discard_stale_ns
   572  		    (mnt_fd, inv, snap_discard_ns_fd) == EAGAIN) {
   573  			return ESRCH;
   574  		}
   575  		// Move to the mount namespace of the snap we're trying to start.
   576  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   577  			die("cannot join preserved mount namespace %s",
   578  			    group->name);
   579  		}
   580  		debug("joined preserved mount namespace %s", group->name);
   581  		return 0;
   582  	}
   583  	return ESRCH;
   584  }
   585  
   586  int sc_join_preserved_per_user_ns(struct sc_mount_ns *group,
   587  				  const char *snap_name)
   588  {
   589  	uid_t uid = getuid();
   590  	char mnt_fname[PATH_MAX] = { 0 };
   591  	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.%d.mnt", group->name,
   592  			 (int)uid);
   593  
   594  	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
   595  	mnt_fd = openat(group->dir_fd, mnt_fname,
   596  			O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
   597  	if (mnt_fd < 0 && errno == ENOENT) {
   598  		return ESRCH;
   599  	}
   600  	if (mnt_fd < 0) {
   601  		die("cannot open preserved mount namespace %s", group->name);
   602  	}
   603  	struct statfs ns_statfs_buf;
   604  	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
   605  		die("cannot inspect filesystem of preserved mount namespace file");
   606  	}
   607  	struct stat ns_stat_buf;
   608  	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
   609  		die("cannot inspect preserved mount namespace file");
   610  	}
   611  #ifndef NSFS_MAGIC
   612  	/* Define NSFS_MAGIC for Ubuntu 14.04 and other older systems. */
   613  #define NSFS_MAGIC 0x6e736673
   614  #endif
   615  	if (ns_statfs_buf.f_type == NSFS_MAGIC
   616  	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
   617  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   618  			die("cannot join preserved per-user mount namespace %s",
   619  			    group->name);
   620  		}
   621  		debug("joined preserved mount namespace %s", group->name);
   622  		return 0;
   623  	}
   624  	return ESRCH;
   625  }
   626  
   627  static void setup_signals_for_helper(void)
   628  {
   629  	/* Ignore the SIGPIPE signal so that we get EPIPE on the read / write
   630  	 * operations attempting to work with a closed pipe. This ensures that we
   631  	 * are not killed by the default disposition (terminate) and can return a
   632  	 * non-signal-death return code to the program invoking snap-confine. */
   633  	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
   634  		die("cannot install ignore handler for SIGPIPE");
   635  	}
   636  }
   637  
   638  static void teardown_signals_for_helper(void)
   639  {
   640  	/* Undo operations done by setup_signals_for_helper. */
   641  	if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) {
   642  		die("cannot restore default handler for SIGPIPE");
   643  	}
   644  }
   645  
   646  static void helper_fork(struct sc_mount_ns *group, struct sc_apparmor *apparmor)
   647  {
   648  	// Create a pipe for sending commands to the helper process.
   649  	if (pipe2(group->pipe_master, O_CLOEXEC | O_DIRECT) < 0) {
   650  		die("cannot create pipes for commanding the helper process");
   651  	}
   652  	if (pipe2(group->pipe_helper, O_CLOEXEC | O_DIRECT) < 0) {
   653  		die("cannot create pipes for responding to master process");
   654  	}
   655  	// Store the PID of the "parent" process. This done instead of calls to
   656  	// getppid() because then we can reliably track the PID of the parent even
   657  	// if the child process is re-parented.
   658  	pid_t parent = getpid();
   659  
   660  	// For rationale of forking see this:
   661  	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
   662  	pid_t pid = fork();
   663  	if (pid < 0) {
   664  		die("cannot fork helper process for mount namespace capture");
   665  	}
   666  	if (pid == 0) {
   667  		/* helper */
   668  		sc_cleanup_close(&group->pipe_master[1]);
   669  		sc_cleanup_close(&group->pipe_helper[0]);
   670  		helper_main(group, apparmor, parent);
   671  	} else {
   672  		setup_signals_for_helper();
   673  
   674  		/* master */
   675  		sc_cleanup_close(&group->pipe_master[0]);
   676  		sc_cleanup_close(&group->pipe_helper[1]);
   677  
   678  		// Glibc defines pid as a signed 32bit integer. There's no standard way to
   679  		// print pid's portably so this is the best we can do.
   680  		debug("forked support process %d", (int)pid);
   681  		group->child = pid;
   682  	}
   683  }
   684  
   685  static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor,
   686  			pid_t parent)
   687  {
   688  	// This is the child process which will capture the mount namespace.
   689  	//
   690  	// It will do so by bind-mounting the .mnt after the parent process calls
   691  	// unshare() and finishes setting up the namespace completely. Change the
   692  	// hat to a sub-profile that has limited permissions necessary to
   693  	// accomplish the capture of the mount namespace.
   694  	sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0);
   695  	// Configure the child to die as soon as the parent dies. In an odd
   696  	// case where the parent is killed then we don't want to complete our
   697  	// task or wait for anything.
   698  	if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
   699  		die("cannot set parent process death notification signal to SIGINT");
   700  	}
   701  	// Check that parent process is still alive. If this is the case then we
   702  	// can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake us up
   703  	// from read(2) below. In the rare case that the PID numbers overflow and
   704  	// the now-dead parent PID is recycled we will still hang forever on the
   705  	// read from the pipe below.
   706  	if (kill(parent, 0) < 0) {
   707  		switch (errno) {
   708  		case ESRCH:
   709  			// When snap-confine executes it will fork a helper process. That
   710  			// process establishes an elaborate dance to ensure both itself and
   711  			// the parent are operating exactly as specified, so that no
   712  			// processes are left behind for unbound amount of time. As a part
   713  			// of that dance the child pings the parent to ensure it is still
   714  			// alive after establishing a notification signal to be sent in
   715  			// case the parent dies. This is a race avoidance mechanism, we set
   716  			// up the notification and then check if the parent is alive by the
   717  			// time we are done.
   718  			//
   719  			// In the case when the parent does go away we used to call
   720  			// abort(). On some distributions this would trigger an unclean
   721  			// process termination error report to be sent. One such example is
   722  			// the Ubuntu error tracker. Since the parent process can be
   723  			// legitimately interrupted and killed, this should not generate an
   724  			// error report. As such, perform clean exit in this specific case.
   725  			debug("parent process has terminated");
   726  			exit(0);
   727  		default:
   728  			die("cannot confirm that parent process is alive");
   729  			break;
   730  		}
   731  	}
   732  	if (fchdir(group->dir_fd) < 0) {
   733  		die("cannot move to directory with preserved namespaces");
   734  	}
   735  	int command = -1;
   736  	int run = 1;
   737  	while (run) {
   738  		debug("helper process waiting for command");
   739  		sc_enable_sanity_timeout();
   740  		if (read(group->pipe_master[0], &command, sizeof command) < 0) {
   741  			int saved_errno = errno;
   742  			// This will ensure we get the correct error message
   743  			// if there is a read error because the timeout
   744  			// expired.
   745  			sc_disable_sanity_timeout();
   746  			errno = saved_errno;
   747  			die("cannot read command from the pipe");
   748  		}
   749  		sc_disable_sanity_timeout();
   750  		debug("helper process received command %d", command);
   751  		switch (command) {
   752  		case HELPER_CMD_EXIT:
   753  			run = 0;
   754  			break;
   755  		case HELPER_CMD_CAPTURE_MOUNT_NS:
   756  			helper_capture_ns(group, parent);
   757  			break;
   758  		case HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS:
   759  			helper_capture_per_user_ns(group, parent);
   760  			break;
   761  		}
   762  		if (write(group->pipe_helper[1], &command, sizeof command) < 0) {
   763  			die("cannot write ack");
   764  		}
   765  	}
   766  	debug("helper process exiting");
   767  	exit(0);
   768  }
   769  
   770  static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent)
   771  {
   772  	char src[PATH_MAX] = { 0 };
   773  	char dst[PATH_MAX] = { 0 };
   774  
   775  	debug("capturing per-snap mount namespace");
   776  	sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
   777  	sc_must_snprintf(dst, sizeof dst, "%s.mnt", group->name);
   778  
   779  	/* Ensure the bind mount destination exists. */
   780  	int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600);
   781  	if (fd < 0) {
   782  		die("cannot create file %s", dst);
   783  	}
   784  	close(fd);
   785  
   786  	if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
   787  		die("cannot preserve mount namespace of process %d as %s",
   788  		    (int)parent, dst);
   789  	}
   790  	debug("mount namespace of process %d preserved as %s",
   791  	      (int)parent, dst);
   792  }
   793  
   794  static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent)
   795  {
   796  	char src[PATH_MAX] = { 0 };
   797  	char dst[PATH_MAX] = { 0 };
   798  	uid_t uid = getuid();
   799  
   800  	debug("capturing per-snap, per-user mount namespace");
   801  	sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
   802  	sc_must_snprintf(dst, sizeof dst, "%s.%d.mnt", group->name, (int)uid);
   803  
   804  	/* Ensure the bind mount destination exists. */
   805  	int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600);
   806  	if (fd < 0) {
   807  		die("cannot create file %s", dst);
   808  	}
   809  	close(fd);
   810  
   811  	if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
   812  		die("cannot preserve per-user mount namespace of process %d as %s", (int)parent, dst);
   813  	}
   814  	debug("per-user mount namespace of process %d preserved as %s",
   815  	      (int)parent, dst);
   816  }
   817  
   818  static void sc_message_capture_helper(struct sc_mount_ns *group, int command_id)
   819  {
   820  	int ack;
   821  	if (group->child == 0) {
   822  		die("precondition failed: we don't have a helper process");
   823  	}
   824  	if (group->pipe_master[1] < 0) {
   825  		die("precondition failed: we don't have a pipe");
   826  	}
   827  	if (group->pipe_helper[0] < 0) {
   828  		die("precondition failed: we don't have a pipe");
   829  	}
   830  	debug("sending command %d to helper process (pid: %d)",
   831  	      command_id, group->child);
   832  	if (write(group->pipe_master[1], &command_id, sizeof command_id) < 0) {
   833  		die("cannot send command %d to helper process", command_id);
   834  	}
   835  	debug("waiting for response from helper");
   836  	int read_n = read(group->pipe_helper[0], &ack, sizeof ack);
   837  	if (read_n < 0) {
   838  		die("cannot receive ack from helper process");
   839  	}
   840  	if (read_n == 0) {
   841  		die("unexpected eof from helper process");
   842  	}
   843  }
   844  
   845  static void sc_wait_for_capture_helper(struct sc_mount_ns *group)
   846  {
   847  	if (group->child == 0) {
   848  		die("precondition failed: we don't have a helper process");
   849  	}
   850  	debug("waiting for the helper process to exit");
   851  	int status = 0;
   852  	errno = 0;
   853  	if (waitpid(group->child, &status, 0) < 0) {
   854  		die("cannot wait for the helper process");
   855  	}
   856  	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
   857  		die("helper process exited abnormally");
   858  	}
   859  	debug("helper process exited normally");
   860  	group->child = 0;
   861  	teardown_signals_for_helper();
   862  }
   863  
/*
 * Public entry point that forwards both arguments unchanged to
 * helper_fork().
 *
 * NOTE(review): presumably this is what starts the helper process that the
 * other functions in this file send commands to - confirm against
 * helper_fork().
 */
void sc_fork_helper(struct sc_mount_ns *group, struct sc_apparmor *apparmor)
{
	helper_fork(group, apparmor);
}
   868  
/*
 * Ask the helper process to preserve the per-snap mount namespace.
 *
 * Sends HELPER_CMD_CAPTURE_MOUNT_NS through sc_message_capture_helper(),
 * which does not return until the helper has acknowledged the command.
 */
void sc_preserve_populated_mount_ns(struct sc_mount_ns *group)
{
	sc_message_capture_helper(group, HELPER_CMD_CAPTURE_MOUNT_NS);
}
   873  
/*
 * Ask the helper process to preserve the per-snap, per-user mount namespace.
 *
 * Sends HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS through
 * sc_message_capture_helper(), which does not return until the helper has
 * acknowledged the command.
 */
void sc_preserve_populated_per_user_mount_ns(struct sc_mount_ns *group)
{
	sc_message_capture_helper(group, HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS);
}
   878  
/*
 * Shut the helper process down and reap it.
 *
 * First sends HELPER_CMD_EXIT and waits for the acknowledgement, then calls
 * sc_wait_for_capture_helper(), which waits for the child and reports through
 * die() unless it exited normally with status 0. The order matters: the
 * helper is only waited for after it has been told to exit.
 */
void sc_wait_for_helper(struct sc_mount_ns *group)
{
	sc_message_capture_helper(group, HELPER_CMD_EXIT);
	sc_wait_for_capture_helper(group);
}
   884  
   885  void sc_store_ns_info(const sc_invocation * inv)
   886  {
   887  	FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL;
   888  	char info_path[PATH_MAX] = { 0 };
   889  	sc_must_snprintf(info_path, sizeof info_path,
   890  			 "/run/snapd/ns/snap.%s.info", inv->snap_instance);
   891  	int fd = -1;
   892  	fd = open(info_path,
   893  		  O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW, 0644);
   894  	if (fd < 0) {
   895  		die("cannot open %s", info_path);
   896  	}
   897  	if (fchown(fd, 0, 0) < 0) {
   898  		die("cannot chown %s to root.root", info_path);
   899  	}
   900  	// The stream now owns the file descriptor.
   901  	stream = fdopen(fd, "w");
   902  	if (stream == NULL) {
   903  		die("cannot get stream from file descriptor");
   904  	}
   905  	fprintf(stream, "base-snap-name=%s\n", inv->orig_base_snap_name);
   906  	if (ferror(stream) != 0) {
   907  		die("I/O error when writing to %s", info_path);
   908  	}
   909  	if (fflush(stream) == EOF) {
   910  		die("cannot flush %s", info_path);
   911  	}
   912  	debug("saved mount namespace meta-data to %s", info_path);
   913  }
   914  
   915  bool sc_is_mount_ns_in_use(const char *snap_instance)
   916  {
   917  	// perform an indirect check of whether the mount namespace is occupied,
   918  	// with cgroups v1, each snap process is attached to a group under the
   919  	// freezer controller, however with cgroups v2, we must check for any groups
   920  	// tracking the snap
   921  	bool occupied = false;
   922  	if (sc_cgroup_is_v2()) {
   923  		// cgroup v2 must consult the tracking groups
   924  		occupied = sc_cgroup_v2_is_tracking_snap(snap_instance);
   925  	} else {
   926  		occupied = sc_cgroup_freezer_occupied(snap_instance);
   927  	}
   928  	return occupied;
   929  }