github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/ns-support.c

github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/ns-support.c (about)

     1  /*
     2   * Copyright (C) 2016 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  
    18  #include "ns-support.h"
    19  
    20  #ifdef HAVE_CONFIG_H
    21  #include "config.h"
    22  #endif
    23  
    24  #include <errno.h>
    25  #include <fcntl.h>
    26  #include <linux/magic.h>
    27  #include <sched.h>
    28  #include <signal.h>
    29  #include <string.h>
    30  #include <sys/eventfd.h>
    31  #include <sys/file.h>
    32  #include <sys/mount.h>
    33  #include <sys/prctl.h>
    34  #include <sys/stat.h>
    35  #include <sys/sysmacros.h>
    36  #include <sys/types.h>
    37  #include <sys/vfs.h>
    38  #include <sys/wait.h>
    39  #include <unistd.h>
    40  
    41  #include "../libsnap-confine-private/cgroup-freezer-support.h"
    42  #include "../libsnap-confine-private/cgroup-support.h"
    43  #include "../libsnap-confine-private/classic.h"
    44  #include "../libsnap-confine-private/cleanup-funcs.h"
    45  #include "../libsnap-confine-private/infofile.h"
    46  #include "../libsnap-confine-private/locking.h"
    47  #include "../libsnap-confine-private/mountinfo.h"
    48  #include "../libsnap-confine-private/string-utils.h"
    49  #include "../libsnap-confine-private/tool.h"
    50  #include "../libsnap-confine-private/utils.h"
    51  #include "user-support.h"
    52  
/**
 * Default directory where snap-confine keeps preserved namespace files.
 **/
#define SC_NS_DIR "/run/snapd/ns"

/**
 * Effective value of SC_NS_DIR.
 *
 * This is a 'const char *' variable rather than a direct use of the macro so
 * that the test suite can point sc_ns_dir at a different directory.
 **/
static const char *sc_ns_dir = SC_NS_DIR;
    64  
/**
 * Commands understood by the helper process (see helper_main).
 *
 * Each command is read by the helper as a single 'int' from the command pipe
 * and echoed back on the response pipe as an acknowledgement.
 **/
enum {
	// Leave the command loop; the helper then exits.
	HELPER_CMD_EXIT,
	// Run helper_capture_ns.
	HELPER_CMD_CAPTURE_MOUNT_NS,
	// Run helper_capture_per_user_ns.
	HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS,
};
    70  
    71  void sc_reassociate_with_pid1_mount_ns(void)
    72  {
    73  	int init_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    74  	int self_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
    75  	const char *path_pid_1 = "/proc/1/ns/mnt";
    76  	const char *path_pid_self = "/proc/self/ns/mnt";
    77  
    78  	init_mnt_fd = open(path_pid_1,
    79  			   O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
    80  	if (init_mnt_fd < 0) {
    81  		die("cannot open path %s", path_pid_1);
    82  	}
    83  	self_mnt_fd = open(path_pid_self,
    84  			   O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
    85  	if (self_mnt_fd < 0) {
    86  		die("cannot open path %s", path_pid_1);
    87  	}
    88  	char init_buf[128] = { 0 };
    89  	char self_buf[128] = { 0 };
    90  	memset(init_buf, 0, sizeof init_buf);
    91  	if (readlinkat(init_mnt_fd, "", init_buf, sizeof init_buf) < 0) {
    92  		if (errno == ENOENT) {
    93  			// According to namespaces(7) on a pre 3.8 kernel the namespace
    94  			// files are hardlinks, not sylinks. If that happens readlinkat
    95  			// fails with ENOENT. As a quick workaround for this special-case
    96  			// functionality, just bail out and do nothing without raising an
    97  			// error.
    98  			return;
    99  		}
   100  		die("cannot read mount namespace identifier of pid 1");
   101  	}
   102  	memset(self_buf, 0, sizeof self_buf);
   103  	if (readlinkat(self_mnt_fd, "", self_buf, sizeof self_buf) < 0) {
   104  		die("cannot read mount namespace identifier of the current process");
   105  	}
   106  	if (memcmp(init_buf, self_buf, sizeof init_buf) != 0) {
   107  		debug("moving to mount namespace of pid 1");
   108  		// We cannot use O_NOFOLLOW here because that file will always be a
   109  		// symbolic link. We actually want to open it this way.
   110  		int init_mnt_fd_real SC_CLEANUP(sc_cleanup_close) = -1;
   111  		init_mnt_fd_real = open(path_pid_1, O_RDONLY | O_CLOEXEC);
   112  		if (init_mnt_fd_real < 0) {
   113  			die("cannot open %s", path_pid_1);
   114  		}
   115  		if (setns(init_mnt_fd_real, CLONE_NEWNS) < 0) {
   116  			die("cannot join mount namespace of pid 1");
   117  		}
   118  	}
   119  }
   120  
   121  void sc_initialize_mount_ns(void)
   122  {
   123  	/* Ensure that /run/snapd/ns is a directory. */
   124  	if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) {
   125  		die("cannot create directory %s", sc_ns_dir);
   126  	}
   127  
   128  	/* Read and analyze the mount table. We need to see whether /run/snapd/ns
   129  	 * is a mount point with private event propagation. */
   130  	sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   131  	info = sc_parse_mountinfo(NULL);
   132  	if (info == NULL) {
   133  		die("cannot parse /proc/self/mountinfo");
   134  	}
   135  
   136  	bool is_mnt = false;
   137  	bool is_private = false;
   138  	for (sc_mountinfo_entry * entry = sc_first_mountinfo_entry(info);
   139  	     entry != NULL; entry = sc_next_mountinfo_entry(entry)) {
   140  		/* Find /run/snapd/ns */
   141  		if (!sc_streq(entry->mount_dir, sc_ns_dir)) {
   142  			continue;
   143  		}
   144  		is_mnt = true;
   145  		if (strstr(entry->optional_fields, "shared:") == NULL) {
   146  			/* Mount event propagation is not set to shared, good. */
   147  			is_private = true;
   148  		}
   149  		break;
   150  	}
   151  
   152  	if (!is_mnt) {
   153  		if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) <
   154  		    0) {
   155  			die("cannot self-bind mount %s", sc_ns_dir);
   156  		}
   157  	}
   158  
   159  	if (!is_private) {
   160  		if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) {
   161  			die("cannot change propagation type to MS_PRIVATE in %s", sc_ns_dir);
   162  		}
   163  	}
   164  }
   165  
// State associated with one namespace group while snap-confine works on it.
struct sc_mount_ns {
	// Name of the namespace group ($SNAP_NAME).
	char *name;
	// Descriptor to the namespace group control directory.  This descriptor is
	// opened with O_PATH|O_DIRECTORY so it's only used for openat() calls.
	int dir_fd;
	// Two pipes, each stored as (read end, write end), that snap-confine
	// uses to exchange messages with the helper process:
	//  - pipe_master carries commands: snap-confine keeps the write end and
	//    the helper reads commands from the read end.
	//  - pipe_helper carries acknowledgements: the helper writes to the
	//    write end and snap-confine keeps the read end.
	int pipe_helper[2];
	int pipe_master[2];
	// Identifier of the child process that is used during the one-time (per
	// group) initialization and capture process. Zero when no helper has
	// been forked.
	pid_t child;
};
   181  
   182  static struct sc_mount_ns *sc_alloc_mount_ns(void)
   183  {
   184  	struct sc_mount_ns *group = calloc(1, sizeof *group);
   185  	if (group == NULL) {
   186  		die("cannot allocate memory for sc_mount_ns");
   187  	}
   188  	group->dir_fd = -1;
   189  	group->pipe_helper[0] = -1;
   190  	group->pipe_helper[1] = -1;
   191  	group->pipe_master[0] = -1;
   192  	group->pipe_master[1] = -1;
   193  	// Redundant with calloc but some functions check for the non-zero value so
   194  	// I'd like to keep this explicit in the code.
   195  	group->child = 0;
   196  	return group;
   197  }
   198  
   199  struct sc_mount_ns *sc_open_mount_ns(const char *group_name)
   200  {
   201  	struct sc_mount_ns *group = sc_alloc_mount_ns();
   202  	group->dir_fd = open(sc_ns_dir,
   203  			     O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW);
   204  	if (group->dir_fd < 0) {
   205  		die("cannot open directory %s", sc_ns_dir);
   206  	}
   207  	group->name = sc_strdup(group_name);
   208  	return group;
   209  }
   210  
   211  void sc_close_mount_ns(struct sc_mount_ns *group)
   212  {
   213  	if (group->child != 0) {
   214  		sc_wait_for_helper(group);
   215  	}
   216  	sc_cleanup_close(&group->dir_fd);
   217  	sc_cleanup_close(&group->pipe_master[0]);
   218  	sc_cleanup_close(&group->pipe_master[1]);
   219  	sc_cleanup_close(&group->pipe_helper[0]);
   220  	sc_cleanup_close(&group->pipe_helper[1]);
   221  	free(group->name);
   222  	free(group);
   223  }
   224  
   225  static dev_t find_base_snap_device(const char *base_snap_name,
   226  				   const char *base_snap_rev)
   227  {
   228  	// Find the backing device of the base snap.
   229  	// TODO: add support for "try mode" base snaps that also need
   230  	// consideration of the mie->root component.
   231  	dev_t base_snap_dev = 0;
   232  	char base_squashfs_path[PATH_MAX];
   233  	sc_must_snprintf(base_squashfs_path,
   234  			 sizeof base_squashfs_path, "%s/%s/%s",
   235  			 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev);
   236  	sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   237  	mi = sc_parse_mountinfo(NULL);
   238  	if (mi == NULL) {
   239  		die("cannot parse mountinfo of the current process");
   240  	}
   241  	bool found = false;
   242  	for (sc_mountinfo_entry * mie =
   243  	     sc_first_mountinfo_entry(mi); mie != NULL;
   244  	     mie = sc_next_mountinfo_entry(mie)) {
   245  		if (sc_streq(mie->mount_dir, base_squashfs_path)) {
   246  			base_snap_dev = makedev(mie->dev_major, mie->dev_minor);
   247  			debug("block device of snap %s, revision %s is %d:%d",
   248  			      base_snap_name, base_snap_rev, mie->dev_major,
   249  			      mie->dev_minor);
   250  			// Don't break when found, we are interested in the last
   251  			// entry as this is the "effective" one.
   252  			found = true;
   253  		}
   254  	}
   255  	if (!found) {
   256  		die("cannot find mount entry for snap %s revision %s",
   257  		    base_snap_name, base_snap_rev);
   258  	}
   259  	return base_snap_dev;
   260  }
   261  
   262  static bool should_discard_current_ns(dev_t base_snap_dev)
   263  {
   264  	// Inspect the namespace and check if we should discard it.
   265  	//
   266  	// The namespace may become "stale" when the rootfs is not the same
   267  	// device we found above. This will happen whenever the base snap is
   268  	// refreshed since the namespace was first created.
   269  	sc_mountinfo_entry *mie;
   270  	sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL;
   271  
   272  	mi = sc_parse_mountinfo(NULL);
   273  	if (mi == NULL) {
   274  		die("cannot parse mountinfo of the current process");
   275  	}
   276  	for (mie = sc_first_mountinfo_entry(mi); mie != NULL;
   277  	     mie = sc_next_mountinfo_entry(mie)) {
   278  		if (!sc_streq(mie->mount_dir, "/")) {
   279  			continue;
   280  		}
   281  		// NOTE: we want the initial rootfs just in case overmount
   282  		// was used to do something weird. The initial rootfs was
   283  		// set up by snap-confine and that is the one we want to
   284  		// measure.
   285  		debug("block device of the root filesystem is %d:%d",
   286  		      mie->dev_major, mie->dev_minor);
   287  		return base_snap_dev != makedev(mie->dev_major, mie->dev_minor);
   288  	}
   289  	die("cannot find mount entry of the root filesystem");
   290  }
   291  
/**
 * Verdict about a preserved mount namespace, as sent from the inspecting
 * child process to its parent over an eventfd.
 *
 * The values deliberately start at one: they travel as eventfd counter
 * values, and plain 0 and 1 cannot be used because of the semantics of
 * eventfd(2).
 **/
enum sc_discard_vote {
	/**
	 * SC_DISCARD_NO denotes that the mount namespace doesn't have to be
	 * discarded. This happens when the base snap has not changed.
	 **/
	SC_DISCARD_NO = 1,
	/**
	 * SC_DISCARD_SHOULD indicates that the mount namespace should be discarded
	 * but may be reused if it is still inhabited by processes. This only
	 * happens when the base snap revision changes but the name of the base
	 * snap is the same as before.
	 **/
	SC_DISCARD_SHOULD = 2,
	/**
	 * SC_DISCARD_MUST indicates that the mount namespace must be discarded
	 * even if it is still inhabited by processes. This only happens when the
	 * name of the base snap changes.
	 **/
	SC_DISCARD_MUST = 3,
};
   312  
   313  /**
   314   * is_base_transition returns true if a base transition is occurring.
   315   *
   316   * The function inspects /run/snapd/ns/snap.$SNAP_INSTANCE_NAME.info as well
   317   * as the invocation parameters of snap-confine. If the base snap name, as
   318   * encoded in the info file and as described by the invocation parameters
   319   * differ then a base transition is occurring. If the info file is absent or
   320   * does not record the name of the base snap then transition cannot be
   321   * detected.
   322  **/
   323  static bool is_base_transition(const sc_invocation * inv)
   324  {
   325  	char info_path[PATH_MAX] = { 0 };
   326  	sc_must_snprintf(info_path,
   327  			 sizeof info_path,
   328  			 "/run/snapd/ns/snap.%s.info", inv->snap_instance);
   329  
   330  	FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL;
   331  	stream = fopen(info_path, "r");
   332  	if (stream == NULL && errno == ENOENT) {
   333  		// If the info file is absent then we cannot decide if a transition had
   334  		// occurred. For people upgrading from snap-confine without the info
   335  		// file, that is the best we can do.
   336  		return false;
   337  	}
   338  	if (stream == NULL) {
   339  		die("cannot open %s", info_path);
   340  	}
   341  
   342  	char *base_snap_name SC_CLEANUP(sc_cleanup_string) = NULL;
   343  	sc_error *err = NULL;
   344  	if (sc_infofile_get_key
   345  	    (stream, "base-snap-name", &base_snap_name, &err) < 0) {
   346  		sc_die_on_error(err);
   347  	}
   348  
   349  	if (base_snap_name == NULL) {
   350  		// If the info file doesn't record the name of the base snap then,
   351  		// again, we cannot decide if a transition had occurred.
   352  		return false;
   353  	}
   354  
   355  	return !sc_streq(inv->orig_base_snap_name, base_snap_name);
   356  }
   357  
// The namespace may be stale. To check this we must actually switch into it
// but then we use up our setns call (the kernel misbehaves if we setns twice).
// To work around this we'll fork a child and use it to probe. The child will
// inspect the namespace and send information back via eventfd and then exit
// unconditionally.
//
// Parameters:
//  - mnt_fd: descriptor of the preserved mount namespace to inspect.
//  - inv: invocation parameters; the fields rootfs_dir, base_snap_name,
//    snap_instance and is_normal_mode are read here.
//  - snap_discard_ns_fd: passed through to sc_call_snap_discard_ns().
//
// Returns 0 when the preserved namespace is to be reused and EAGAIN after
// the namespace has been handed to sc_call_snap_discard_ns(). All failures
// are fatal.
static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd,
						 const sc_invocation * inv,
						 int snap_discard_ns_fd)
{
	// Zero-filled so that the "value too long" check below can rely on the
	// last byte: readlink() does not terminate the data it stores.
	char base_snap_rev[PATH_MAX] = { 0 };
	dev_t base_snap_dev;
	int event_fd SC_CLEANUP(sc_cleanup_close) = -1;

	// Read the revision of the base snap by looking at the current symlink.
	if (readlink(inv->rootfs_dir, base_snap_rev, sizeof base_snap_rev) < 0) {
		die("cannot read current revision of snap %s",
		    inv->snap_instance);
	}
	// A non-zero last byte means readlink() filled the entire buffer, so the
	// value may have been truncated.
	if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') {
		die("cannot read current revision of snap %s: value too long",
		    inv->snap_instance);
	}
	// Find the device that is backing the current revision of the base snap.
	// This is done before forking, i.e. in the mount namespace of the caller.
	base_snap_dev =
	    find_base_snap_device(inv->base_snap_name, base_snap_rev);

	// Store the PID of this process. This is done instead of calls to
	// getppid() below because then we can reliably track the PID of the
	// parent even if the child process is re-parented.
	pid_t parent = getpid();

	// Create an eventfd for the communication with the child.
	event_fd = eventfd(0, EFD_CLOEXEC);
	if (event_fd < 0) {
		die("cannot create eventfd");
	}
	// Fork a child, it will do the inspection for us.
	pid_t child = fork();
	if (child < 0) {
		die("cannot fork support process");
	}

	if (child == 0) {
		// This is the child process which will inspect the mount namespace.
		//
		// Configure the child to die as soon as the parent dies. In an odd
		// case where the parent is killed then we don't want to complete our
		// task or wait for anything.
		if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
			die("cannot set parent process death notification signal to SIGINT");
		}
		// Check that the parent process is still alive, to cover the case of
		// the parent dying before the death signal was configured above. In
		// the rare case that the PID numbers overflow and the now-dead
		// parent PID is recycled this check cannot tell the difference.
		if (kill(parent, 0) < 0) {
			switch (errno) {
			case ESRCH:
				debug("parent process has terminated");
				abort();
			default:
				die("cannot confirm that parent process is alive");
				break;
			}
		}

		debug("joining preserved mount namespace for inspection");
		// Move to the mount namespace of the snap we're trying to inspect.
		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
			die("cannot join preserved mount namespace");
		}
		// Check if the namespace needs to be discarded.
		eventfd_t value = SC_DISCARD_NO;
		const char *value_str = "no";

		// TODO: enable this for core distributions. This is complex because on
		// core the rootfs is mounted in initrd and is _not_ changed (no
		// pivot_root) and the base snap is again mounted (2nd time) by
		// systemd. This makes us end up in a situation where the outer base
		// snap will never match the rootfs inside the mount namespace.
		if (inv->is_normal_mode
		    && should_discard_current_ns(base_snap_dev)) {
			value = SC_DISCARD_SHOULD;
			value_str = "should";

			// The namespace is stale so also check if we must discard it due to the
			// base snap changing. If the base snap changed, we must discard since even
			// though currently running processes from this snap will continue to see
			// the old base, we want new processes to use the new base. See LP:
			// #1819875 for details.
			if (is_base_transition(inv)) {
				// The base snap has changed. We must discard ...
				value = SC_DISCARD_MUST;
				value_str = "must";
			}
		}
		// Send this back to the parent: 3 - force discard 2 - prefer discard, 1 - keep.
		// Note that we cannot just use 0 and 1 because of the semantics of eventfd(2).
		if (eventfd_write(event_fd, value) < 0) {
			die("cannot send information to %s preserved mount namespace", value_str);
		}
		// Exit, we're done.
		exit(0);
	}
	// This is back in the parent process.
	//
	// Enable a sanity timeout in case the read blocks for unbound amount of
	// time. This will ensure we will not hang around while holding the lock.
	// Next, read the value written by the child process.
	sc_enable_sanity_timeout();
	eventfd_t value = 0;
	if (eventfd_read(event_fd, &value) < 0) {
		die("cannot read from eventfd");
	}
	sc_disable_sanity_timeout();

	// Wait for the child process to exit and collect its exit status.
	errno = 0;
	int status = 0;
	if (waitpid(child, &status, 0) < 0) {
		die("cannot wait for the support process for mount namespace inspection");
	}
	// Anything but a normal exit with status zero is treated as a failure,
	// even though a value was already received above.
	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		die("support process for mount namespace inspection exited abnormally");
	}
	// If the namespace is up-to-date then we are done.
	// NOTE(review): the switch has no default label, so any value other than
	// the three votes falls through to the discard call below.
	switch (value) {
	case SC_DISCARD_NO:
		debug("preserved mount is not stale, reusing");
		return 0;
	case SC_DISCARD_SHOULD:
		if (sc_cgroup_is_v2()) {
			debug
			    ("WARNING: cgroup v2 detected, preserved mount namespace process presence check unsupported, discarding");
			break;
		}
		if (sc_cgroup_freezer_occupied(inv->snap_instance)) {
			// Some processes are still using the namespace so we cannot discard it
			// as that would fracture the view that the set of processes inside
			// have on what is mounted.
			debug
			    ("preserved mount namespace is stale but occupied, reusing");
			return 0;
		}
		break;
	case SC_DISCARD_MUST:
		debug
		    ("preserved mount namespace is stale and base snap has changed, discarding");
		break;
	}
	// Discard the namespace and tell the caller that it is gone.
	sc_call_snap_discard_ns(snap_discard_ns_fd, inv->snap_instance);
	return EAGAIN;
}
   512  
// Forward declarations of the functions implementing the helper process; the
// definitions follow later in this file.
static void helper_fork(struct sc_mount_ns *group,
			struct sc_apparmor *apparmor);
static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor,
			pid_t parent);
static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent);
static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent);
   519  
   520  int sc_join_preserved_ns(struct sc_mount_ns *group, struct sc_apparmor
   521  			 *apparmor, const sc_invocation * inv,
   522  			 int snap_discard_ns_fd)
   523  {
   524  	// Open the mount namespace file.
   525  	char mnt_fname[PATH_MAX] = { 0 };
   526  	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.mnt", group->name);
   527  	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
   528  	// NOTE: There is no O_EXCL here because the file can be around but
   529  	// doesn't have to be a mounted namespace.
   530  	mnt_fd = openat(group->dir_fd, mnt_fname,
   531  			O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
   532  	if (mnt_fd < 0 && errno == ENOENT) {
   533  		return ESRCH;
   534  	}
   535  	if (mnt_fd < 0) {
   536  		die("cannot open preserved mount namespace %s", group->name);
   537  	}
   538  	// Check if we got an nsfs-based or procfs file or a regular file. This can
   539  	// be reliably tested because nsfs has an unique filesystem type
   540  	// NSFS_MAGIC.  On older kernels that don't support nsfs yet we can look
   541  	// for PROC_SUPER_MAGIC instead.
   542  	// We can just ensure that this is the case thanks to fstatfs.
   543  	struct statfs ns_statfs_buf;
   544  	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
   545  		die("cannot inspect filesystem of preserved mount namespace file");
   546  	}
   547  	// Stat the mount namespace as well, this is later used to check if the
   548  	// namespace is used by other processes if we are considering discarding a
   549  	// stale namespace.
   550  	struct stat ns_stat_buf;
   551  	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
   552  		die("cannot inspect preserved mount namespace file");
   553  	}
   554  #ifndef NSFS_MAGIC
   555  // Account for kernel headers old enough to not know about NSFS_MAGIC.
   556  #define NSFS_MAGIC 0x6e736673
   557  #endif
   558  	if (ns_statfs_buf.f_type == NSFS_MAGIC
   559  	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
   560  
   561  		// Inspect and perhaps discard the preserved mount namespace.
   562  		if (sc_inspect_and_maybe_discard_stale_ns
   563  		    (mnt_fd, inv, snap_discard_ns_fd) == EAGAIN) {
   564  			return ESRCH;
   565  		}
   566  		// Move to the mount namespace of the snap we're trying to start.
   567  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   568  			die("cannot join preserved mount namespace %s",
   569  			    group->name);
   570  		}
   571  		debug("joined preserved mount namespace %s", group->name);
   572  		return 0;
   573  	}
   574  	return ESRCH;
   575  }
   576  
   577  int sc_join_preserved_per_user_ns(struct sc_mount_ns *group,
   578  				  const char *snap_name)
   579  {
   580  	uid_t uid = getuid();
   581  	char mnt_fname[PATH_MAX] = { 0 };
   582  	sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.%d.mnt", group->name,
   583  			 (int)uid);
   584  
   585  	int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1;
   586  	mnt_fd = openat(group->dir_fd, mnt_fname,
   587  			O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600);
   588  	if (mnt_fd < 0 && errno == ENOENT) {
   589  		return ESRCH;
   590  	}
   591  	if (mnt_fd < 0) {
   592  		die("cannot open preserved mount namespace %s", group->name);
   593  	}
   594  	struct statfs ns_statfs_buf;
   595  	if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) {
   596  		die("cannot inspect filesystem of preserved mount namespace file");
   597  	}
   598  	struct stat ns_stat_buf;
   599  	if (fstat(mnt_fd, &ns_stat_buf) < 0) {
   600  		die("cannot inspect preserved mount namespace file");
   601  	}
   602  #ifndef NSFS_MAGIC
   603  	/* Define NSFS_MAGIC for Ubuntu 14.04 and other older systems. */
   604  #define NSFS_MAGIC 0x6e736673
   605  #endif
   606  	if (ns_statfs_buf.f_type == NSFS_MAGIC
   607  	    || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) {
   608  		if (setns(mnt_fd, CLONE_NEWNS) < 0) {
   609  			die("cannot join preserved per-user mount namespace %s",
   610  			    group->name);
   611  		}
   612  		debug("joined preserved mount namespace %s", group->name);
   613  		return 0;
   614  	}
   615  	return ESRCH;
   616  }
   617  
   618  static void setup_signals_for_helper(void)
   619  {
   620  	/* Ignore the SIGPIPE signal so that we get EPIPE on the read / write
   621  	 * operations attempting to work with a closed pipe. This ensures that we
   622  	 * are not killed by the default disposition (terminate) and can return a
   623  	 * non-signal-death return code to the program invoking snap-confine. */
   624  	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
   625  		die("cannot install ignore handler for SIGPIPE");
   626  	}
   627  }
   628  
   629  static void teardown_signals_for_helper(void)
   630  {
   631  	/* Undo operations done by setup_signals_for_helper. */
   632  	if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) {
   633  		die("cannot restore default handler for SIGPIPE");
   634  	}
   635  }
   636  
   637  static void helper_fork(struct sc_mount_ns *group, struct sc_apparmor *apparmor)
   638  {
   639  	// Create a pipe for sending commands to the helper process.
   640  	if (pipe2(group->pipe_master, O_CLOEXEC | O_DIRECT) < 0) {
   641  		die("cannot create pipes for commanding the helper process");
   642  	}
   643  	if (pipe2(group->pipe_helper, O_CLOEXEC | O_DIRECT) < 0) {
   644  		die("cannot create pipes for responding to master process");
   645  	}
   646  	// Store the PID of the "parent" process. This done instead of calls to
   647  	// getppid() because then we can reliably track the PID of the parent even
   648  	// if the child process is re-parented.
   649  	pid_t parent = getpid();
   650  
   651  	// For rationale of forking see this:
   652  	// https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html
   653  	pid_t pid = fork();
   654  	if (pid < 0) {
   655  		die("cannot fork helper process for mount namespace capture");
   656  	}
   657  	if (pid == 0) {
   658  		/* helper */
   659  		sc_cleanup_close(&group->pipe_master[1]);
   660  		sc_cleanup_close(&group->pipe_helper[0]);
   661  		helper_main(group, apparmor, parent);
   662  	} else {
   663  		setup_signals_for_helper();
   664  
   665  		/* master */
   666  		sc_cleanup_close(&group->pipe_master[0]);
   667  		sc_cleanup_close(&group->pipe_helper[1]);
   668  
   669  		// Glibc defines pid as a signed 32bit integer. There's no standard way to
   670  		// print pid's portably so this is the best we can do.
   671  		debug("forked support process %d", (int)pid);
   672  		group->child = pid;
   673  	}
   674  }
   675  
   676  static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor,
   677  			pid_t parent)
   678  {
   679  	// This is the child process which will capture the mount namespace.
   680  	//
   681  	// It will do so by bind-mounting the .mnt after the parent process calls
   682  	// unshare() and finishes setting up the namespace completely. Change the
   683  	// hat to a sub-profile that has limited permissions necessary to
   684  	// accomplish the capture of the mount namespace.
   685  	sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0);
   686  	// Configure the child to die as soon as the parent dies. In an odd
   687  	// case where the parent is killed then we don't want to complete our
   688  	// task or wait for anything.
   689  	if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) {
   690  		die("cannot set parent process death notification signal to SIGINT");
   691  	}
   692  	// Check that parent process is still alive. If this is the case then we
   693  	// can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake us up
   694  	// from read(2) below. In the rare case that the PID numbers overflow and
   695  	// the now-dead parent PID is recycled we will still hang forever on the
   696  	// read from the pipe below.
   697  	if (kill(parent, 0) < 0) {
   698  		switch (errno) {
   699  		case ESRCH:
   700  			// When snap-confine executes it will fork a helper process. That
   701  			// process establishes an elaborate dance to ensure both itself and
   702  			// the parent are operating exactly as specified, so that no
   703  			// processes are left behind for unbound amount of time. As a part
   704  			// of that dance the child pings the parent to ensure it is still
   705  			// alive after establishing a notification signal to be sent in
   706  			// case the parent dies. This is a race avoidance mechanism, we set
   707  			// up the notification and then check if the parent is alive by the
   708  			// time we are done.
   709  			//
   710  			// In the case when the parent does go away we used to call
   711  			// abort(). On some distributions this would trigger an unclean
   712  			// process termination error report to be sent. One such example is
   713  			// the Ubuntu error tracker. Since the parent process can be
   714  			// legitimately interrupted and killed, this should not generate an
   715  			// error report. As such, perform clean exit in this specific case.
   716  			debug("parent process has terminated");
   717  			exit(0);
   718  		default:
   719  			die("cannot confirm that parent process is alive");
   720  			break;
   721  		}
   722  	}
   723  	if (fchdir(group->dir_fd) < 0) {
   724  		die("cannot move to directory with preserved namespaces");
   725  	}
   726  	int command = -1;
   727  	int run = 1;
   728  	while (run) {
   729  		debug("helper process waiting for command");
   730  		sc_enable_sanity_timeout();
   731  		if (read(group->pipe_master[0], &command, sizeof command) < 0) {
   732  			int saved_errno = errno;
   733  			// This will ensure we get the correct error message
   734  			// if there is a read error because the timeout
   735  			// expired.
   736  			sc_disable_sanity_timeout();
   737  			errno = saved_errno;
   738  			die("cannot read command from the pipe");
   739  		}
   740  		sc_disable_sanity_timeout();
   741  		debug("helper process received command %d", command);
   742  		switch (command) {
   743  		case HELPER_CMD_EXIT:
   744  			run = 0;
   745  			break;
   746  		case HELPER_CMD_CAPTURE_MOUNT_NS:
   747  			helper_capture_ns(group, parent);
   748  			break;
   749  		case HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS:
   750  			helper_capture_per_user_ns(group, parent);
   751  			break;
   752  		}
   753  		if (write(group->pipe_helper[1], &command, sizeof command) < 0) {
   754  			die("cannot write ack");
   755  		}
   756  	}
   757  	debug("helper process exiting");
   758  	exit(0);
   759  }
   760  
   761  static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent)
   762  {
   763  	char src[PATH_MAX] = { 0 };
   764  	char dst[PATH_MAX] = { 0 };
   765  
   766  	debug("capturing per-snap mount namespace");
   767  	sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
   768  	sc_must_snprintf(dst, sizeof dst, "%s.mnt", group->name);
   769  
   770  	/* Ensure the bind mount destination exists. */
   771  	int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600);
   772  	if (fd < 0) {
   773  		die("cannot create file %s", dst);
   774  	}
   775  	close(fd);
   776  
   777  	if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
   778  		die("cannot preserve mount namespace of process %d as %s",
   779  		    (int)parent, dst);
   780  	}
   781  	debug("mount namespace of process %d preserved as %s",
   782  	      (int)parent, dst);
   783  }
   784  
   785  static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent)
   786  {
   787  	char src[PATH_MAX] = { 0 };
   788  	char dst[PATH_MAX] = { 0 };
   789  	uid_t uid = getuid();
   790  
   791  	debug("capturing per-snap, per-user mount namespace");
   792  	sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent);
   793  	sc_must_snprintf(dst, sizeof dst, "%s.%d.mnt", group->name, (int)uid);
   794  
   795  	/* Ensure the bind mount destination exists. */
   796  	int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600);
   797  	if (fd < 0) {
   798  		die("cannot create file %s", dst);
   799  	}
   800  	close(fd);
   801  
   802  	if (mount(src, dst, NULL, MS_BIND, NULL) < 0) {
   803  		die("cannot preserve per-user mount namespace of process %d as %s", (int)parent, dst);
   804  	}
   805  	debug("per-user mount namespace of process %d preserved as %s",
   806  	      (int)parent, dst);
   807  }
   808  
   809  static void sc_message_capture_helper(struct sc_mount_ns *group, int command_id)
   810  {
   811  	int ack;
   812  	if (group->child == 0) {
   813  		die("precondition failed: we don't have a helper process");
   814  	}
   815  	if (group->pipe_master[1] < 0) {
   816  		die("precondition failed: we don't have a pipe");
   817  	}
   818  	if (group->pipe_helper[0] < 0) {
   819  		die("precondition failed: we don't have a pipe");
   820  	}
   821  	debug("sending command %d to helper process (pid: %d)",
   822  	      command_id, group->child);
   823  	if (write(group->pipe_master[1], &command_id, sizeof command_id) < 0) {
   824  		die("cannot send command %d to helper process", command_id);
   825  	}
   826  	debug("waiting for response from helper");
   827  	int read_n = read(group->pipe_helper[0], &ack, sizeof ack);
   828  	if (read_n < 0) {
   829  		die("cannot receive ack from helper process");
   830  	}
   831  	if (read_n == 0) {
   832  		die("unexpected eof from helper process");
   833  	}
   834  }
   835  
   836  static void sc_wait_for_capture_helper(struct sc_mount_ns *group)
   837  {
   838  	if (group->child == 0) {
   839  		die("precondition failed: we don't have a helper process");
   840  	}
   841  	debug("waiting for the helper process to exit");
   842  	int status = 0;
   843  	errno = 0;
   844  	if (waitpid(group->child, &status, 0) < 0) {
   845  		die("cannot wait for the helper process");
   846  	}
   847  	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
   848  		die("helper process exited abnormally");
   849  	}
   850  	debug("helper process exited normally");
   851  	group->child = 0;
   852  	teardown_signals_for_helper();
   853  }
   854  
   855  void sc_fork_helper(struct sc_mount_ns *group, struct sc_apparmor *apparmor)
   856  {
   857  	helper_fork(group, apparmor);
   858  }
   859  
   860  void sc_preserve_populated_mount_ns(struct sc_mount_ns *group)
   861  {
   862  	sc_message_capture_helper(group, HELPER_CMD_CAPTURE_MOUNT_NS);
   863  }
   864  
   865  void sc_preserve_populated_per_user_mount_ns(struct sc_mount_ns *group)
   866  {
   867  	sc_message_capture_helper(group, HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS);
   868  }
   869  
   870  void sc_wait_for_helper(struct sc_mount_ns *group)
   871  {
   872  	sc_message_capture_helper(group, HELPER_CMD_EXIT);
   873  	sc_wait_for_capture_helper(group);
   874  }
   875  
   876  void sc_store_ns_info(const sc_invocation * inv)
   877  {
   878  	FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL;
   879  	char info_path[PATH_MAX] = { 0 };
   880  	sc_must_snprintf(info_path, sizeof info_path,
   881  			 "/run/snapd/ns/snap.%s.info", inv->snap_instance);
   882  	int fd = -1;
   883  	fd = open(info_path,
   884  		  O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW, 0644);
   885  	if (fd < 0) {
   886  		die("cannot open %s", info_path);
   887  	}
   888  	if (fchown(fd, 0, 0) < 0) {
   889  		die("cannot chown %s to root.root", info_path);
   890  	}
   891  	// The stream now owns the file descriptor.
   892  	stream = fdopen(fd, "w");
   893  	if (stream == NULL) {
   894  		die("cannot get stream from file descriptor");
   895  	}
   896  	fprintf(stream, "base-snap-name=%s\n", inv->orig_base_snap_name);
   897  	if (ferror(stream) != 0) {
   898  		die("I/O error when writing to %s", info_path);
   899  	}
   900  	if (fflush(stream) == EOF) {
   901  		die("cannot flush %s", info_path);
   902  	}
   903  	debug("saved mount namespace meta-data to %s", info_path);
   904  }