github.com/meulengracht/snapd@v0.0.0-20210719210640-8bde69bcc84e/cmd/snap-confine/snap-confine.c (about)

     1  /*
     2   * Copyright (C) 2015-2018 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  #ifdef HAVE_CONFIG_H
    18  #include "config.h"
    19  #endif
    20  
    21  #include <errno.h>
    22  #include <fcntl.h>
    23  #include <glob.h>
    24  #include <sched.h>
    25  #include <signal.h>
    26  #include <stdbool.h>
    27  #include <stdio.h>
    28  #include <stdlib.h>
    29  #include <string.h>
    30  #include <sys/capability.h>
    31  #include <sys/stat.h>
    32  #include <sys/types.h>
    33  #include <unistd.h>
    34  
    35  #include "../libsnap-confine-private/apparmor-support.h"
    36  #include "../libsnap-confine-private/cgroup-freezer-support.h"
    37  #include "../libsnap-confine-private/cgroup-support.h"
    38  #include "../libsnap-confine-private/classic.h"
    39  #include "../libsnap-confine-private/cleanup-funcs.h"
    40  #include "../libsnap-confine-private/feature.h"
    41  #include "../libsnap-confine-private/locking.h"
    42  #include "../libsnap-confine-private/secure-getenv.h"
    43  #include "../libsnap-confine-private/snap.h"
    44  #include "../libsnap-confine-private/string-utils.h"
    45  #include "../libsnap-confine-private/tool.h"
    46  #include "../libsnap-confine-private/utils.h"
    47  #include "cookie-support.h"
    48  #include "mount-support.h"
    49  #include "ns-support.h"
    50  #include "seccomp-support.h"
    51  #include "snap-confine-args.h"
    52  #include "snap-confine-invocation.h"
    53  #include "udev-support.h"
    54  #include "user-support.h"
    55  #ifdef HAVE_SELINUX
    56  #include "selinux-support.h"
    57  #endif
    58  
    59  // sc_maybe_fixup_permissions fixes incorrect permissions
    60  // inside the mount namespace for /var/lib. Before 1ccce4
    61  // this directory was created with permissions 1777.
    62  static void sc_maybe_fixup_permissions(void)
    63  {
    64  	int fd SC_CLEANUP(sc_cleanup_close) = -1;
    65  	struct stat buf;
    66  	fd = open("/var/lib", O_PATH | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
    67  	if (fd < 0) {
    68  		die("cannot open /var/lib");
    69  	}
    70  	if (fstat(fd, &buf) < 0) {
    71  		die("cannot stat /var/lib");
    72  	}
    73  	if ((buf.st_mode & 0777) == 0777) {
    74  		if (fchmod(fd, 0755) != 0) {
    75  			die("cannot chmod /var/lib");
    76  		}
    77  		if (fchown(fd, 0, 0) != 0) {
    78  			die("cannot chown /var/lib");
    79  		}
    80  	}
    81  }
    82  
    83  // sc_maybe_fixup_udev will remove incorrectly created udev tags
    84  // that cause libudev on 16.04 to fail with "udev_enumerate_scan failed".
    85  // See also:
    86  // https://forum.snapcraft.io/t/weird-udev-enumerate-error/2360/17
    87  static void sc_maybe_fixup_udev(void)
    88  {
    89  	glob_t glob_res SC_CLEANUP(globfree) = {
    90  		.gl_pathv = NULL,.gl_pathc = 0,.gl_offs = 0,
    91  	};
    92  	const char *glob_pattern = "/run/udev/tags/snap_*/*nvidia*";
    93  	int err = glob(glob_pattern, 0, NULL, &glob_res);
    94  	if (err == GLOB_NOMATCH) {
    95  		return;
    96  	}
    97  	if (err != 0) {
    98  		die("cannot search using glob pattern %s: %d",
    99  		    glob_pattern, err);
   100  	}
   101  	// kill bogus udev tags for nvidia. They confuse udev, this
   102  	// undoes the damage from github.com/snapcore/snapd/pull/3671.
   103  	//
   104  	// The udev tagging of nvidia got reverted in:
   105  	// https://github.com/snapcore/snapd/pull/4022
   106  	// but leftover files need to get removed or apps won't start
   107  	for (size_t i = 0; i < glob_res.gl_pathc; ++i) {
   108  		unlink(glob_res.gl_pathv[i]);
   109  	}
   110  }
   111  
   112  /**
   113   * sc_preserved_process_state remembers clobbered state to restore.
   114   *
   115   * The umask is preserved and restored to ensure consistent permissions for
   116   * runtime system. The value is preserved and restored perfectly.
   117  **/
   118  typedef struct sc_preserved_process_state {
   119  	mode_t orig_umask;
   120  	int orig_cwd_fd;
   121  	struct stat file_info_orig_cwd;
   122  } sc_preserved_process_state;
   123  
   124  /**
   125   * sc_preserve_and_sanitize_process_state sanitizes process state.
   126   *
   127   * The following process state is sanitized:
   128   *  - the umask is set to 0
   129   *  - the current working directory is set to /
   130   *
   131   * The original values are stored to be restored later. Currently only the
   132   * umask is altered. It is set to zero to make the ownership of created files
   133   * and directories more predictable.
   134  **/
   135  static void sc_preserve_and_sanitize_process_state(sc_preserved_process_state *
   136  						   proc_state)
   137  {
   138  	/* Reset umask to zero, storing the old value. */
   139  	proc_state->orig_umask = umask(0);
   140  	debug("umask reset, old umask was %#4o", proc_state->orig_umask);
   141  	/* Remember a file descriptor corresponding to the original working
   142  	 * directory. This is an O_PATH file descriptor. The descriptor is
   143  	 * used as explained below. */
   144  	proc_state->orig_cwd_fd =
   145  	    openat(AT_FDCWD, ".",
   146  		   O_PATH | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   147  	if (proc_state->orig_cwd_fd < 0) {
   148  		die("cannot open path of the current working directory");
   149  	}
   150  	if (fstat(proc_state->orig_cwd_fd, &proc_state->file_info_orig_cwd) < 0) {
   151  		die("cannot stat path of the current working directory");
   152  	}
   153  	/* Move to the root directory. */
   154  	if (chdir("/") < 0) {
   155  		die("cannot move to /");
   156  	}
   157  }
   158  
   159  /**
   160   *  sc_restore_process_state restores values stored earlier.
   161  **/
   162  static void sc_restore_process_state(const sc_preserved_process_state *
   163  				     proc_state)
   164  {
   165  	/* Restore original umask */
   166  	umask(proc_state->orig_umask);
   167  	debug("umask restored to %#4o", proc_state->orig_umask);
   168  
   169  	/* Restore original current working directory.
   170  	 *
   171  	 * This part is more involved for the following reasons. While we hold an
   172  	 * O_PATH file descriptor that still points to the original working
   173  	 * directory, that directory may not be representable in the target mount
   174  	 * namespace. A quick example may be /custom that exists on the host but
   175  	 * not in the base snap of the application.
   176  	 *
   177  	 * Also consider when the path of the original working directory now
   178  	 * maps to a different inode we cannot use fchdir(2). One example of
   179  	 * that is the /tmp directory, which exists in both the host mount
   180  	 * namespace and the per-snap mount namespace but actually represents a
   181  	 * different directory.
   182  	 **/
   183  
   184  	/* Read the target of symlink at /proc/self/fd/<fd-of-orig-cwd> */
   185  	char fd_path[PATH_MAX];
   186  	char orig_cwd[PATH_MAX];
   187  	ssize_t nread;
   188  	/* If the original working directory cannot be used for whatever reason then
   189  	 * move the process to a special void directory. */
   190  	const char *sc_void_dir = "/var/lib/snapd/void";
   191  	int void_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
   192  
   193  	sc_must_snprintf(fd_path, sizeof fd_path, "/proc/self/fd/%d",
   194  			 proc_state->orig_cwd_fd);
   195  	nread = readlink(fd_path, orig_cwd, sizeof orig_cwd);
   196  	if (nread < 0) {
   197  		die("cannot read symbolic link target %s", fd_path);
   198  	}
   199  	if (nread == sizeof orig_cwd) {
   200  		die("cannot fit symbolic link target %s", fd_path);
   201  	}
   202  
   203  	/* Open path corresponding to the original working directory in the
   204  	 * execution environment. This may normally fail if the path no longer
   205  	 * exists here, this is not a fatal error. It may also fail if we don't
   206  	 * have permissions to view that path, that is not a fatal error either. */
   207  	int inner_cwd_fd SC_CLEANUP(sc_cleanup_close) = -1;
   208  	inner_cwd_fd =
   209  	    open(orig_cwd, O_PATH | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   210  	if (inner_cwd_fd < 0) {
   211  		if (errno == EPERM || errno == EACCES || errno == ENOENT) {
   212  			debug
   213  			    ("cannot open path of the original working directory %s",
   214  			     orig_cwd);
   215  			goto the_void;
   216  		}
   217  		/* Any error other than the three above is unexpected. */
   218  		die("cannot open path of the original working directory %s",
   219  		    orig_cwd);
   220  	}
   221  
   222  	/* The original working directory exists in the execution environment
   223  	 * which lets us check if it points to the same inode as before. */
   224  	struct stat file_info_inner;
   225  	if (fstat(inner_cwd_fd, &file_info_inner) < 0) {
   226  		die("cannot stat path of working directory in the execution environment");
   227  	}
   228  
   229  	/* Note that we cannot use proc_state->orig_cwd_fd as that points to the
   230  	 * directory but in another mount namespace and using that causes
   231  	 * weird and undesired effects.
   232  	 *
   233  	 * By the time this code runs we are already running as the
   234  	 * designated user so UNIX permissions are in effect. */
   235  	if (fchdir(inner_cwd_fd) < 0) {
   236  		if (errno == EPERM || errno == EACCES) {
   237  			debug("cannot access original working directory %s",
   238  			      orig_cwd);
   239  			goto the_void;
   240  		}
   241  		die("cannot restore original working directory via path");
   242  	}
   243  	/* The distinction below is only logged and not acted upon. Perhaps someday
   244  	 * this will be somehow communicated to cooperating applications that can
   245  	 * instruct the user and avoid potential confusion. This mostly applies to
   246  	 * tools that are invoked from /tmp. */
   247  	if (proc_state->file_info_orig_cwd.st_dev ==
   248  	    file_info_inner.st_dev
   249  	    && proc_state->file_info_orig_cwd.st_ino ==
   250  	    file_info_inner.st_ino) {
   251  		/* The path of the original working directory points to the same
   252  		 * inode as before. */
   253  		debug("working directory restored to %s", orig_cwd);
   254  	} else {
   255  		/* The path of the original working directory points to a different
   256  		 * inode inside inside the execution environment than the host
   257  		 * environment. */
   258  		debug("working directory re-interpreted to %s", orig_cwd);
   259  	}
   260  	return;
   261   the_void:
   262  	/* The void directory may be absent. On core18 system, and other
   263  	 * systems using bootable base snap coupled with snapd snap, the
   264  	 * /var/lib/snapd directory structure is not provided with packages but
   265  	 * created on demand. */
   266  	void_dir_fd = open(sc_void_dir,
   267  			   O_DIRECTORY | O_PATH | O_NOFOLLOW | O_CLOEXEC);
   268  	if (void_dir_fd < 0 && errno == ENOENT) {
   269  		if (mkdir(sc_void_dir, 0111) < 0) {
   270  			die("cannot create void directory: %s", sc_void_dir);
   271  		}
   272  		if (lchown(sc_void_dir, 0, 0) < 0) {
   273  			die("cannot change ownership of void directory %s",
   274  			    sc_void_dir);
   275  		}
   276  		void_dir_fd = open(sc_void_dir,
   277  				   O_DIRECTORY | O_PATH | O_NOFOLLOW |
   278  				   O_CLOEXEC);
   279  	}
   280  	if (void_dir_fd < 0) {
   281  		die("cannot open the void directory %s", sc_void_dir);
   282  	}
   283  	if (fchdir(void_dir_fd) < 0) {
   284  		die("cannot move to void directory %s", sc_void_dir);
   285  	}
   286  	debug("the process has been placed in the special void directory");
   287  }
   288  
   289  /**
   290   *  sc_cleanup_preserved_process_state releases system resources.
   291  **/
   292  static void sc_cleanup_preserved_process_state(sc_preserved_process_state *
   293  					       proc_state)
   294  {
   295  	sc_cleanup_close(&proc_state->orig_cwd_fd);
   296  }
   297  
   298  static void enter_classic_execution_environment(const sc_invocation * inv,
   299  						gid_t real_gid,
   300  						gid_t saved_gid);
   301  static void enter_non_classic_execution_environment(sc_invocation * inv,
   302  						    struct sc_apparmor *aa,
   303  						    uid_t real_uid,
   304  						    gid_t real_gid,
   305  						    gid_t saved_gid);
   306  
   307  int main(int argc, char **argv)
   308  {
   309  	// Use our super-defensive parser to figure out what we've been asked to do.
   310  	sc_error *err = NULL;
   311  	struct sc_args *args SC_CLEANUP(sc_cleanup_args) = NULL;
   312  	sc_preserved_process_state proc_state
   313  	    SC_CLEANUP(sc_cleanup_preserved_process_state) = {
   314  		.orig_umask = 0,.orig_cwd_fd = -1
   315  	};
   316  	args = sc_nonfatal_parse_args(&argc, &argv, &err);
   317  	sc_die_on_error(err);
   318  
   319  	// Remember certain properties of the process that are clobbered by
   320  	// snap-confine during execution. Those are restored just before calling
   321  	// execv.
   322  	sc_preserve_and_sanitize_process_state(&proc_state);
   323  
   324  	// We've been asked to print the version string so let's just do that.
   325  	if (sc_args_is_version_query(args)) {
   326  		printf("%s %s\n", PACKAGE, PACKAGE_VERSION);
   327  		return 0;
   328  	}
   329  
   330  	/* Collect all invocation parameters. This gives us authoritative
   331  	 * information about what needs to be invoked and how. The data comes
   332  	 * from either the environment or from command line arguments */
   333  	sc_invocation SC_CLEANUP(sc_cleanup_invocation) invocation;
   334  	const char *snap_instance_name_env = getenv("SNAP_INSTANCE_NAME");
   335  	if (snap_instance_name_env == NULL) {
   336  		die("SNAP_INSTANCE_NAME is not set");
   337  	}
   338  	sc_init_invocation(&invocation, args, snap_instance_name_env);
   339  
   340  	// Who are we?
   341  	uid_t real_uid, effective_uid, saved_uid;
   342  	gid_t real_gid, effective_gid, saved_gid;
   343  	if (getresuid(&real_uid, &effective_uid, &saved_uid) != 0) {
   344  		die("getresuid failed");
   345  	}
   346  	if (getresgid(&real_gid, &effective_gid, &saved_gid) != 0) {
   347  		die("getresgid failed");
   348  	}
   349  	debug("ruid: %d, euid: %d, suid: %d",
   350  	      real_uid, effective_uid, saved_uid);
   351  	debug("rgid: %d, egid: %d, sgid: %d",
   352  	      real_gid, effective_gid, saved_gid);
   353  
   354  	// snap-confine needs to run as root for cgroup/udev/mount/apparmor/etc setup.
   355  	if (effective_uid != 0) {
   356  		die("need to run as root or suid");
   357  	}
   358  
   359  	char *snap_context SC_CLEANUP(sc_cleanup_string) = NULL;
   360  	// Do no get snap context value if running a hook (we don't want to overwrite hook's SNAP_COOKIE)
   361  	if (!sc_is_hook_security_tag(invocation.security_tag)) {
   362  		sc_error *err SC_CLEANUP(sc_cleanup_error) = NULL;
   363  		snap_context =
   364  		    sc_cookie_get_from_snapd(invocation.snap_instance, &err);
   365  		/* While the cookie is normally present due to various protection
   366  		 * mechanisms ensuring its creation from snapd, we are not considering
   367  		 * it a critical error for snap-confine in the case it is absent. When
   368  		 * absent snaps attempting to utilize snapctl to interact with snapd
   369  		 * will fail but it is more important to run a little than break
   370  		 * entirely in case snapd-side code is incorrect. Therefore error
   371  		 * information is collected but discarded. */
   372  	}
   373  
   374  	struct sc_apparmor apparmor;
   375  	sc_init_apparmor_support(&apparmor);
   376  	if (!apparmor.is_confined && apparmor.mode != SC_AA_NOT_APPLICABLE
   377  	    && getuid() != 0 && geteuid() == 0) {
   378  		// Refuse to run when this process is running unconfined on a system
   379  		// that supports AppArmor when the effective uid is root and the real
   380  		// id is non-root.  This protects against, for example, unprivileged
   381  		// users trying to leverage the snap-confine in the core snap to
   382  		// escalate privileges.
   383  		die("snap-confine has elevated permissions and is not confined"
   384  		    " but should be. Refusing to continue to avoid"
   385  		    " permission escalation attacks");
   386  	}
   387  
   388  	/* perform global initialization of mount namespace support for non-classic
   389  	 * snaps or both classic and non-classic when parallel-instances feature is
   390  	 * enabled */
   391  	if (!invocation.classic_confinement ||
   392  	    sc_feature_enabled(SC_FEATURE_PARALLEL_INSTANCES)) {
   393  
   394  		/* snap-confine uses privately-shared /run/snapd/ns to store bind-mounted
   395  		 * mount namespaces of each snap. In the case that snap-confine is invoked
   396  		 * from the mount namespace it typically constructs, the said directory
   397  		 * does not contain mount entries for preserved namespaces as those are
   398  		 * only visible in the main, outer namespace.
   399  		 *
   400  		 * In order to operate in such an environment snap-confine must first
   401  		 * re-associate its own process with another namespace in which the
   402  		 * /run/snapd/ns directory is visible. The most obvious candidate is pid
   403  		 * one, which definitely doesn't run in a snap-specific namespace, has a
   404  		 * predictable PID and is long lived.
   405  		 */
   406  		sc_reassociate_with_pid1_mount_ns();
   407  		// Do global initialization:
   408  		int global_lock_fd = sc_lock_global();
   409  		// Ensure that "/" or "/snap" is mounted with the
   410  		// "shared" option on legacy systems, see LP:#1668659
   411  		debug("ensuring that snap mount directory is shared");
   412  		sc_ensure_shared_snap_mount();
   413  		unsigned int experimental_features = 0;
   414  		if (sc_feature_enabled(SC_FEATURE_PARALLEL_INSTANCES)) {
   415  			experimental_features |= SC_FEATURE_PARALLEL_INSTANCES;
   416  		}
   417  		sc_initialize_mount_ns(experimental_features);
   418  		sc_unlock(global_lock_fd);
   419  	}
   420  
   421  	if (invocation.classic_confinement) {
   422  		enter_classic_execution_environment(&invocation, real_gid,
   423  						    saved_gid);
   424  	} else {
   425  		enter_non_classic_execution_environment(&invocation,
   426  							&apparmor,
   427  							real_uid,
   428  							real_gid, saved_gid);
   429  	}
   430  	// Temporarily drop privileges back to the calling user until we can
   431  	// permanently drop (which we can't do just yet due to seccomp, see
   432  	// below).
   433  	sc_identity real_user_identity = {
   434  		.uid = real_uid,
   435  		.gid = real_gid,
   436  		.change_uid = 1,
   437  		.change_gid = 1,
   438  	};
   439  	sc_set_effective_identity(real_user_identity);
   440  	// Ensure that the user data path exists. When creating it use the identity
   441  	// of the calling user (by using real user and group identifiers). This
   442  	// allows the creation of directories inside ~/ on NFS with root_squash
   443  	// attribute.
   444  	setup_user_data();
   445  #if 0
   446  	setup_user_xdg_runtime_dir();
   447  #endif
   448  	// https://wiki.ubuntu.com/SecurityTeam/Specifications/SnappyConfinement
   449  	sc_maybe_aa_change_onexec(&apparmor, invocation.security_tag);
   450  #ifdef HAVE_SELINUX
   451  	// For classic and confined snaps
   452  	sc_selinux_set_snap_execcon();
   453  #endif
   454  	if (snap_context != NULL) {
   455  		setenv("SNAP_COOKIE", snap_context, 1);
   456  		// for compatibility, if facing older snapd.
   457  		setenv("SNAP_CONTEXT", snap_context, 1);
   458  	}
   459  	// Normally setuid/setgid not only permanently drops the UID/GID, but
   460  	// also clears the capabilities bounding sets (see "Effect of user ID
   461  	// changes on capabilities" in 'man capabilities'). To load a seccomp
   462  	// profile, we need either CAP_SYS_ADMIN or PR_SET_NO_NEW_PRIVS. Since
   463  	// NNP causes issues with AppArmor and exec transitions in certain
   464  	// snapd interfaces, keep CAP_SYS_ADMIN temporarily when we are
   465  	// permanently dropping privileges.
   466  	if (getresuid(&real_uid, &effective_uid, &saved_uid) != 0) {
   467  		die("getresuid failed");
   468  	}
   469  	debug("ruid: %d, euid: %d, suid: %d",
   470  	      real_uid, effective_uid, saved_uid);
   471  	struct __user_cap_header_struct hdr =
   472  	    { _LINUX_CAPABILITY_VERSION_3, 0 };
   473  	struct __user_cap_data_struct cap_data[2] = { {0} };
   474  
   475  	// At this point in time, if we are going to permanently drop our
   476  	// effective_uid will not be '0' but our saved_uid will be '0'. Detect
   477  	// and save when we are in the this state so know when to setup the
   478  	// capabilities bounding set, regain CAP_SYS_ADMIN and later drop it.
   479  	bool keep_sys_admin = effective_uid != 0 && saved_uid == 0;
   480  	if (keep_sys_admin) {
   481  		debug("setting capabilities bounding set");
   482  		// clear all 32 bit caps but SYS_ADMIN, with none inheritable
   483  		cap_data[0].effective = CAP_TO_MASK(CAP_SYS_ADMIN);
   484  		cap_data[0].permitted = cap_data[0].effective;
   485  		cap_data[0].inheritable = 0;
   486  		// clear all 64 bit caps
   487  		cap_data[1].effective = 0;
   488  		cap_data[1].permitted = 0;
   489  		cap_data[1].inheritable = 0;
   490  		if (capset(&hdr, cap_data) != 0) {
   491  			die("capset failed");
   492  		}
   493  	}
   494  	// Permanently drop if not root
   495  	if (effective_uid == 0) {
   496  		// Note that we do not call setgroups() here because its ok
   497  		// that the user keeps the groups he already belongs to
   498  		if (setgid(real_gid) != 0)
   499  			die("setgid failed");
   500  		if (setuid(real_uid) != 0)
   501  			die("setuid failed");
   502  
   503  		if (real_gid != 0 && (getuid() == 0 || geteuid() == 0))
   504  			die("permanently dropping privs did not work");
   505  		if (real_uid != 0 && (getgid() == 0 || getegid() == 0))
   506  			die("permanently dropping privs did not work");
   507  	}
   508  	// Now that we've permanently dropped, regain SYS_ADMIN
   509  	if (keep_sys_admin) {
   510  		debug("regaining SYS_ADMIN");
   511  		cap_data[0].effective = CAP_TO_MASK(CAP_SYS_ADMIN);
   512  		cap_data[0].permitted = cap_data[0].effective;
   513  		if (capset(&hdr, cap_data) != 0) {
   514  			die("capset regain failed");
   515  		}
   516  	}
   517  	// Now that we've dropped and regained SYS_ADMIN, we can load the
   518  	// seccomp profiles.
   519  	if (sc_apply_seccomp_profile_for_security_tag(invocation.security_tag)) {
   520  		// If the process is not explicitly unconfined then load the
   521  		// global profile as well.
   522  		sc_apply_global_seccomp_profile();
   523  	}
   524  	// Even though we set inheritable to 0, let's clear SYS_ADMIN
   525  	// explicitly
   526  	if (keep_sys_admin) {
   527  		debug("clearing SYS_ADMIN");
   528  		cap_data[0].effective = 0;
   529  		cap_data[0].permitted = cap_data[0].effective;
   530  		if (capset(&hdr, cap_data) != 0) {
   531  			die("capset clear failed");
   532  		}
   533  	}
   534  	// and exec the new executable
   535  	argv[0] = (char *)invocation.executable;
   536  	debug("execv(%s, %s...)", invocation.executable, argv[0]);
   537  	for (int i = 1; i < argc; ++i) {
   538  		debug(" argv[%i] = %s", i, argv[i]);
   539  	}
   540  	// Restore process state that was recorded earlier.
   541  	sc_restore_process_state(&proc_state);
   542  	execv(invocation.executable, (char *const *)&argv[0]);
   543  	perror("execv failed");
   544  	return 1;
   545  }
   546  
   547  static void enter_classic_execution_environment(const sc_invocation * inv,
   548  						gid_t real_gid, gid_t saved_gid)
   549  {
   550  	/* with parallel-instances enabled, main() reassociated with the mount ns of
   551  	 * PID 1 to make /run/snapd/ns visible */
   552  
   553  	/* 'classic confinement' is designed to run without the sandbox inside the
   554  	 * shared namespace. Specifically:
   555  	 * - snap-confine skips using the snap-specific, private, mount namespace
   556  	 * - snap-confine skips using device cgroups
   557  	 * - snapd sets up a lenient AppArmor profile for snap-confine to use
   558  	 * - snapd sets up a lenient seccomp profile for snap-confine to use
   559  	 */
   560  	debug("preparing classic execution environment");
   561  
   562  	if (!sc_feature_enabled(SC_FEATURE_PARALLEL_INSTANCES)) {
   563  		return;
   564  	}
   565  
   566  	/* all of the following code is experimental and part of parallel instances
   567  	 * of classic snaps support */
   568  
   569  	debug
   570  	    ("(experimental) unsharing the mount namespace (per-classic-snap)");
   571  
   572  	/* Construct a mount namespace where the snap instance directories are
   573  	 * visible under the regular snap name. In order to do that we will:
   574  	 *
   575  	 * - convert SNAP_MOUNT_DIR into a mount point (global init)
   576  	 * - convert /var/snap into a mount point (global init)
   577  	 * - always create a new mount namespace
   578  	 * - for snaps with non empty instance key:
   579  	 *   - set slave propagation recursively on SNAP_MOUNT_DIR and /var/snap
   580  	 *   - recursively bind mount SNAP_MOUNT_DIR/<snap>_<key> on top of SNAP_MOUNT_DIR/<snap>
   581  	 *   - recursively bind mount /var/snap/<snap>_<key> on top of /var/snap/<snap>
   582  	 *
   583  	 * The destination directories /var/snap/<snap> and SNAP_MOUNT_DIR/<snap>
   584  	 * are guaranteed to exist and were created during installation of a given
   585  	 * instance.
   586  	 */
   587  
   588  	if (unshare(CLONE_NEWNS) < 0) {
   589  		die("cannot unshare the mount namespace for parallel installed classic snap");
   590  	}
   591  
   592  	/* Parallel installed classic snap get special handling */
   593  	if (!sc_streq(inv->snap_instance, inv->snap_name)) {
   594  		debug
   595  		    ("(experimental) setting up environment for classic snap instance %s",
   596  		     inv->snap_instance);
   597  
   598  		/* set up mappings for snap and data directories */
   599  		sc_setup_parallel_instance_classic_mounts(inv->snap_name,
   600  							  inv->snap_instance);
   601  	}
   602  }
   603  
   604  static void enter_non_classic_execution_environment(sc_invocation * inv,
   605  						    struct sc_apparmor *aa,
   606  						    uid_t real_uid,
   607  						    gid_t real_gid,
   608  						    gid_t saved_gid)
   609  {
   610  	// main() reassociated with the mount ns of PID 1 to make /run/snapd/ns
   611  	// visible
   612  
   613  	// Find and open snap-update-ns and snap-discard-ns from the same
   614  	// path as where we (snap-confine) were called.
   615  	int snap_update_ns_fd SC_CLEANUP(sc_cleanup_close) = -1;
   616  	snap_update_ns_fd = sc_open_snap_update_ns();
   617  	int snap_discard_ns_fd SC_CLEANUP(sc_cleanup_close) = -1;
   618  	snap_discard_ns_fd = sc_open_snap_discard_ns();
   619  
   620  	// Do per-snap initialization.
   621  	int snap_lock_fd = sc_lock_snap(inv->snap_instance);
   622  	debug("initializing mount namespace: %s", inv->snap_instance);
   623  	struct sc_mount_ns *group = NULL;
   624  	group = sc_open_mount_ns(inv->snap_instance);
   625  
   626  	// Init and check rootfs_dir, apply any fallback behaviors.
   627  	sc_check_rootfs_dir(inv);
   628  
   629  	/** Conditionally create, populate and join the device cgroup. */
   630  	sc_setup_device_cgroup(inv->security_tag);
   631  
   632  	/**
   633  	 * is_normal_mode controls if we should pivot into the base snap.
   634  	 *
   635  	 * There are two modes of execution for snaps that are not using classic
   636  	 * confinement: normal and legacy. The normal mode is where snap-confine
   637  	 * sets up a rootfs and then pivots into it using pivot_root(2). The legacy
   638  	 * mode is when snap-confine just unshares the initial mount namespace,
   639  	 * makes some extra changes but largely runs with what was presented to it
   640  	 * initially.
   641  	 *
   642  	 * Historically the ubuntu-core distribution used the now-legacy mode. This
   643  	 * was sensible then since snaps already (kind of) have the right root
   644  	 * file-system and just need some privacy and isolation features applied.
   645  	 * With the introduction of snaps to classic distributions as well as the
   646  	 * introduction of bases, where each snap can use a different root
   647  	 * filesystem, this lost sensibility and thus became legacy.
   648  	 *
   649  	 * For compatibility with current installations of ubuntu-core
   650  	 * distributions the legacy mode is used when: the distribution is
   651  	 * SC_DISTRO_CORE16 or when the base snap name is not "core" or
   652  	 * "ubuntu-core".
   653  	 *
   654  	 * The SC_DISTRO_CORE16 is applied to systems that boot with the "core",
   655  	 * "ubuntu-core" or "core16" snap. Systems using the "core18" base snap do
   656  	 * not qualify for that classification.
   657  	 **/
   658  	sc_distro distro = sc_classify_distro();
   659  	inv->is_normal_mode = distro != SC_DISTRO_CORE16 ||
   660  	    !sc_streq(inv->orig_base_snap_name, "core");
   661  
   662  	/* Stale mount namespace discarded or no mount namespace to
   663  	   join. We need to construct a new mount namespace ourselves.
   664  	   To capture it we will need a helper process so make one. */
   665  	sc_fork_helper(group, aa);
   666  	int retval = sc_join_preserved_ns(group, aa, inv, snap_discard_ns_fd);
   667  	if (retval == ESRCH) {
   668  		/* Create and populate the mount namespace. This performs all
   669  		   of the bootstrapping mounts, pivots into the new root filesystem and
   670  		   applies the per-snap mount profile using snap-update-ns. */
   671  		debug("unsharing the mount namespace (per-snap)");
   672  		if (unshare(CLONE_NEWNS) < 0) {
   673  			die("cannot unshare the mount namespace");
   674  		}
   675  		sc_populate_mount_ns(aa, snap_update_ns_fd, inv, real_gid,
   676  				     saved_gid);
   677  		sc_store_ns_info(inv);
   678  
   679  		/* Preserve the mount namespace. */
   680  		sc_preserve_populated_mount_ns(group);
   681  	}
   682  
   683  	/* Older versions of snap-confine created incorrect 777 permissions
   684  	   for /var/lib and we need to fixup for systems that had their NS created
   685  	   with an old version. */
   686  	sc_maybe_fixup_permissions();
   687  	sc_maybe_fixup_udev();
   688  
   689  	/* User mount profiles do not apply to non-root users. */
   690  	if (real_uid != 0) {
   691  		debug("joining preserved per-user mount namespace");
   692  		retval =
   693  		    sc_join_preserved_per_user_ns(group, inv->snap_instance);
   694  		if (retval == ESRCH) {
   695  			debug("unsharing the mount namespace (per-user)");
   696  			if (unshare(CLONE_NEWNS) < 0) {
   697  				die("cannot unshare the mount namespace");
   698  			}
   699  			sc_setup_user_mounts(aa, snap_update_ns_fd,
   700  					     inv->snap_instance);
   701  			/* Preserve the mount per-user namespace. But only if the
   702  			 * experimental feature is enabled. This way if the feature is
   703  			 * disabled user mount namespaces will still exist but will be
   704  			 * entirely ephemeral. In addition the call
   705  			 * sc_join_preserved_user_ns() will never find a preserved mount
   706  			 * namespace and will always enter this code branch. */
   707  			if (sc_feature_enabled
   708  			    (SC_FEATURE_PER_USER_MOUNT_NAMESPACE)) {
   709  				sc_preserve_populated_per_user_mount_ns(group);
   710  			} else {
   711  				debug
   712  				    ("NOT preserving per-user mount namespace");
   713  			}
   714  		}
   715  	}
   716  	// With cgroups v1, associate each snap process with a dedicated
   717  	// snap freezer cgroup and snap pids cgroup. All snap processes
   718  	// belonging to one snap share the freezer cgroup. All snap
   719  	// processes belonging to one app or one hook share the pids cgroup.
   720  	//
   721  	// This simplifies testing if any processes belonging to a given snap are
   722  	// still alive as well as to properly account for each application and
   723  	// service.
   724  	//
   725  	// Note that with cgroups v2 there is no separate freeezer controller,
   726  	// but the freezer is associated with each group. The call chain when
   727  	// starting the snap application has already ensure that the process has
   728  	// been put in a dedicated group.
   729  	if (!sc_cgroup_is_v2()) {
   730  		sc_cgroup_freezer_join(inv->snap_instance, getpid());
   731  	}
   732  
   733  	sc_unlock(snap_lock_fd);
   734  
   735  	sc_close_mount_ns(group);
   736  
   737  	// Reset path as we cannot rely on the path from the host OS to make sense.
   738  	// The classic distribution may use any PATH that makes sense but we cannot
   739  	// assume it makes sense for the core snap layout. Note that the /usr/local
   740  	// directories are explicitly left out as they are not part of the core
   741  	// snap.
   742  	debug("resetting PATH to values in sync with core snap");
   743  	setenv("PATH",
   744  	       "/usr/local/sbin:"
   745  	       "/usr/local/bin:"
   746  	       "/usr/sbin:"
   747  	       "/usr/bin:"
   748  	       "/sbin:" "/bin:" "/usr/games:" "/usr/local/games", 1);
   749  	// Ensure we set the various TMPDIRs to /tmp. One of the parts of setting
   750  	// up the mount namespace is to create a private /tmp directory (this is
   751  	// done in sc_populate_mount_ns() above). The host environment may point to
   752  	// a directory not accessible by snaps so we need to reset it here.
   753  	const char *tmpd[] = { "TMPDIR", "TEMPDIR", NULL };
   754  	int i;
   755  	for (i = 0; tmpd[i] != NULL; i++) {
   756  		if (setenv(tmpd[i], "/tmp", 1) != 0) {
   757  			die("cannot set environment variable '%s'", tmpd[i]);
   758  		}
   759  	}
   760  }