github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/snap-confine.c

github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/snap-confine.c (about)

     1  /*
     2   * Copyright (C) 2015-2018 Canonical Ltd
     3   *
     4   * This program is free software: you can redistribute it and/or modify
     5   * it under the terms of the GNU General Public License version 3 as
     6   * published by the Free Software Foundation.
     7   *
     8   * This program is distributed in the hope that it will be useful,
     9   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    10   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11   * GNU General Public License for more details.
    12   *
    13   * You should have received a copy of the GNU General Public License
    14   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15   *
    16   */
    17  #ifdef HAVE_CONFIG_H
    18  #include "config.h"
    19  #endif
    20  
    21  #include <errno.h>
    22  #include <fcntl.h>
    23  #include <glob.h>
    24  #include <sched.h>
    25  #include <signal.h>
    26  #include <stdbool.h>
    27  #include <stdio.h>
    28  #include <stdlib.h>
    29  #include <string.h>
    30  #include <sys/capability.h>
    31  #include <sys/stat.h>
    32  #include <sys/types.h>
    33  #include <unistd.h>
    34  
    35  #include "../libsnap-confine-private/apparmor-support.h"
    36  #include "../libsnap-confine-private/cgroup-freezer-support.h"
    37  #include "../libsnap-confine-private/cgroup-pids-support.h"
    38  #include "../libsnap-confine-private/cgroup-support.h"
    39  #include "../libsnap-confine-private/classic.h"
    40  #include "../libsnap-confine-private/cleanup-funcs.h"
    41  #include "../libsnap-confine-private/feature.h"
    42  #include "../libsnap-confine-private/locking.h"
    43  #include "../libsnap-confine-private/secure-getenv.h"
    44  #include "../libsnap-confine-private/snap.h"
    45  #include "../libsnap-confine-private/string-utils.h"
    46  #include "../libsnap-confine-private/tool.h"
    47  #include "../libsnap-confine-private/utils.h"
    48  #include "cookie-support.h"
    49  #include "mount-support.h"
    50  #include "ns-support.h"
    51  #include "seccomp-support.h"
    52  #include "snap-confine-args.h"
    53  #include "snap-confine-invocation.h"
    54  #include "udev-support.h"
    55  #include "user-support.h"
    56  #ifdef HAVE_SELINUX
    57  #include "selinux-support.h"
    58  #endif
    59  
    60  // sc_maybe_fixup_permissions fixes incorrect permissions
    61  // inside the mount namespace for /var/lib. Before 1ccce4
    62  // this directory was created with permissions 1777.
    63  static void sc_maybe_fixup_permissions(void)
    64  {
    65  	struct stat buf;
    66  	if (stat("/var/lib", &buf) != 0) {
    67  		die("cannot stat /var/lib");
    68  	}
    69  	if ((buf.st_mode & 0777) == 0777) {
    70  		if (chmod("/var/lib", 0755) != 0) {
    71  			die("cannot chmod /var/lib");
    72  		}
    73  		if (chown("/var/lib", 0, 0) != 0) {
    74  			die("cannot chown /var/lib");
    75  		}
    76  	}
    77  }
    78  
    79  // sc_maybe_fixup_udev will remove incorrectly created udev tags
    80  // that cause libudev on 16.04 to fail with "udev_enumerate_scan failed".
    81  // See also:
    82  // https://forum.snapcraft.io/t/weird-udev-enumerate-error/2360/17
    83  static void sc_maybe_fixup_udev(void)
    84  {
    85  	glob_t glob_res SC_CLEANUP(globfree) = {
    86  		.gl_pathv = NULL,.gl_pathc = 0,.gl_offs = 0,
    87  	};
    88  	const char *glob_pattern = "/run/udev/tags/snap_*/*nvidia*";
    89  	int err = glob(glob_pattern, 0, NULL, &glob_res);
    90  	if (err == GLOB_NOMATCH) {
    91  		return;
    92  	}
    93  	if (err != 0) {
    94  		die("cannot search using glob pattern %s: %d",
    95  		    glob_pattern, err);
    96  	}
    97  	// kill bogus udev tags for nvidia. They confuse udev, this
    98  	// undoes the damage from github.com/snapcore/snapd/pull/3671.
    99  	//
   100  	// The udev tagging of nvidia got reverted in:
   101  	// https://github.com/snapcore/snapd/pull/4022
   102  	// but leftover files need to get removed or apps won't start
   103  	for (size_t i = 0; i < glob_res.gl_pathc; ++i) {
   104  		unlink(glob_res.gl_pathv[i]);
   105  	}
   106  }
   107  
   108  /**
   109   * sc_preserved_process_state remembers clobbered state to restore.
   110   *
   111   * The umask is preserved and restored to ensure consistent permissions for
   112   * runtime system. The value is preserved and restored perfectly.
   113  **/
   114  typedef struct sc_preserved_process_state {
   115  	mode_t orig_umask;
   116  	int orig_cwd_fd;
   117  	struct stat file_info_orig_cwd;
   118  } sc_preserved_process_state;
   119  
   120  /**
   121   * sc_preserve_and_sanitize_process_state sanitizes process state.
   122   *
   123   * The following process state is sanitised:
   124   *  - the umask is set to 0
   125   *  - the current working directory is set to /
   126   *
   127   * The original values are stored to be restored later. Currently only the
   128   * umask is altered. It is set to zero to make the ownership of created files
   129   * and directories more predictable.
   130  **/
   131  static void sc_preserve_and_sanitize_process_state(sc_preserved_process_state *
   132  						   proc_state)
   133  {
   134  	/* Reset umask to zero, storing the old value. */
   135  	proc_state->orig_umask = umask(0);
   136  	debug("umask reset, old umask was %#4o", proc_state->orig_umask);
   137  	/* Remember a file descriptor corresponding to the original working
   138  	 * directory. This is an O_PATH file descriptor. The descriptor is
   139  	 * used as explained below. */
   140  	proc_state->orig_cwd_fd =
   141  	    openat(AT_FDCWD, ".",
   142  		   O_PATH | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   143  	if (proc_state->orig_cwd_fd < 0) {
   144  		die("cannot open path of the current working directory");
   145  	}
   146  	if (fstat(proc_state->orig_cwd_fd, &proc_state->file_info_orig_cwd) < 0) {
   147  		die("cannot stat path of the current working directory");
   148  	}
   149  	/* Move to the root directory. */
   150  	if (chdir("/") < 0) {
   151  		die("cannot move to /");
   152  	}
   153  }
   154  
   155  /**
   156   *  sc_restore_process_state restores values stored earlier.
   157  **/
   158  static void sc_restore_process_state(const sc_preserved_process_state *
   159  				     proc_state)
   160  {
   161  	/* Restore original umask */
   162  	umask(proc_state->orig_umask);
   163  	debug("umask restored to %#4o", proc_state->orig_umask);
   164  
   165  	/* Restore original current working directory.
   166  	 *
   167  	 * This part is more involved for the following reasons. While we hold an
   168  	 * O_PATH file descriptor that still points to the original working
   169  	 * directory, that directory may not be representable in the target mount
   170  	 * namespace. A quick example may be /custom that exists on the host but
   171  	 * not in the base snap of the application.
   172  	 *
   173  	 * Also consider when the path of the original working directory now
   174  	 * maps to a different inode we cannot use fchdir(2). One example of
   175  	 * that is the /tmp directory, which exists in both the host mount
   176  	 * namespace and the per-snap mount namespace but actually represents a
   177  	 * different directory.
   178  	 **/
   179  
   180  	/* Read the target of symlink at /proc/self/fd/<fd-of-orig-cwd> */
   181  	char fd_path[PATH_MAX];
   182  	char orig_cwd[PATH_MAX];
   183  	ssize_t nread;
   184  	/* If the original working directory cannot be used for whatever reason then
   185  	 * move the process to a special void directory. */
   186  	const char *sc_void_dir = "/var/lib/snapd/void";
   187  	int void_dir_fd SC_CLEANUP(sc_cleanup_close) = -1;
   188  
   189  	sc_must_snprintf(fd_path, sizeof fd_path, "/proc/self/fd/%d",
   190  			 proc_state->orig_cwd_fd);
   191  	nread = readlink(fd_path, orig_cwd, sizeof orig_cwd);
   192  	if (nread < 0) {
   193  		die("cannot read symbolic link target %s", fd_path);
   194  	}
   195  	if (nread == sizeof orig_cwd) {
   196  		die("cannot fit symbolic link target %s", fd_path);
   197  	}
   198  
   199  	/* Open path corresponding to the original working directory in the
   200  	 * execution environment. This may normally fail if the path no longer
   201  	 * exists here, this is not a fatal error. It may also fail if we don't
   202  	 * have permissions to view that path, that is not a fatal error either. */
   203  	int inner_cwd_fd SC_CLEANUP(sc_cleanup_close) = -1;
   204  	inner_cwd_fd =
   205  	    open(orig_cwd, O_PATH | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW);
   206  	if (inner_cwd_fd < 0) {
   207  		if (errno == EPERM || errno == EACCES || errno == ENOENT) {
   208  			debug
   209  			    ("cannot open path of the original working directory %s",
   210  			     orig_cwd);
   211  			goto the_void;
   212  		}
   213  		/* Any error other than the three above is unexpected. */
   214  		die("cannot open path of the original working directory %s",
   215  		    orig_cwd);
   216  	}
   217  
   218  	/* The original working directory exists in the execution environment
   219  	 * which lets us check if it points to the same inode as before. */
   220  	struct stat file_info_inner;
   221  	if (fstat(inner_cwd_fd, &file_info_inner) < 0) {
   222  		die("cannot stat path of working directory in the execution environment");
   223  	}
   224  
   225  	/* Note that we cannot use proc_state->orig_cwd_fd as that points to the
   226  	 * directory but in another mount namespace and using that causes
   227  	 * weird and undesired effects.
   228  	 *
   229  	 * By the time this code runs we are already running as the
   230  	 * designated user so UNIX permissions are in effect. */
   231  	if (fchdir(inner_cwd_fd) < 0) {
   232  		if (errno == EPERM || errno == EACCES) {
   233  			debug("cannot access original working directory %s",
   234  			      orig_cwd);
   235  			goto the_void;
   236  		}
   237  		die("cannot restore original working directory via path");
   238  	}
   239  	/* The distinction below is only logged and not acted upon. Perhaps someday
   240  	 * this will be somehow communicated to cooperating applications that can
   241  	 * instruct the user and avoid potential confusion. This mostly applies to
   242  	 * tools that are invoked from /tmp. */
   243  	if (proc_state->file_info_orig_cwd.st_dev ==
   244  	    file_info_inner.st_dev
   245  	    && proc_state->file_info_orig_cwd.st_ino ==
   246  	    file_info_inner.st_ino) {
   247  		/* The path of the original working directory points to the same
   248  		 * inode as before. */
   249  		debug("working directory restored to %s", orig_cwd);
   250  	} else {
   251  		/* The path of the original working directory points to a different
   252  		 * inode inside inside the execution environment than the host
   253  		 * environment. */
   254  		debug("working directory re-interpreted to %s", orig_cwd);
   255  	}
   256  	return;
   257   the_void:
   258  	/* The void directory may be absent. On core18 system, and other
   259  	 * systems using bootable base snap coupled with snapd snap, the
   260  	 * /var/lib/snapd directory structure is not provided with packages but
   261  	 * created on demand. */
   262  	void_dir_fd = open(sc_void_dir,
   263  			   O_DIRECTORY | O_PATH | O_NOFOLLOW | O_CLOEXEC);
   264  	if (void_dir_fd < 0 && errno == ENOENT) {
   265  		if (mkdir(sc_void_dir, 0111) < 0) {
   266  			die("cannot create void directory: %s", sc_void_dir);
   267  		}
   268  		if (lchown(sc_void_dir, 0, 0) < 0) {
   269  			die("cannot change ownership of void directory %s",
   270  			    sc_void_dir);
   271  		}
   272  		void_dir_fd = open(sc_void_dir,
   273  				   O_DIRECTORY | O_PATH | O_NOFOLLOW |
   274  				   O_CLOEXEC);
   275  	}
   276  	if (void_dir_fd < 0) {
   277  		die("cannot open the void directory %s", sc_void_dir);
   278  	}
   279  	if (fchdir(void_dir_fd) < 0) {
   280  		die("cannot move to void directory %s", sc_void_dir);
   281  	}
   282  	debug("the process has been placed in the special void directory");
   283  }
   284  
   285  /**
   286   *  sc_cleanup_preserved_process_state releases system resources.
   287  **/
   288  static void sc_cleanup_preserved_process_state(sc_preserved_process_state *
   289  					       proc_state)
   290  {
   291  	sc_cleanup_close(&proc_state->orig_cwd_fd);
   292  }
   293  
   294  static void enter_classic_execution_environment(void);
   295  static void enter_non_classic_execution_environment(sc_invocation * inv,
   296  						    struct sc_apparmor *aa,
   297  						    uid_t real_uid,
   298  						    gid_t real_gid,
   299  						    gid_t saved_gid);
   300  
   301  int main(int argc, char **argv)
   302  {
   303  	// Use our super-defensive parser to figure out what we've been asked to do.
   304  	sc_error *err = NULL;
   305  	struct sc_args *args SC_CLEANUP(sc_cleanup_args) = NULL;
   306  	sc_preserved_process_state proc_state
   307  	    SC_CLEANUP(sc_cleanup_preserved_process_state) = {
   308  		.orig_umask = 0,.orig_cwd_fd = -1
   309  	};
   310  	args = sc_nonfatal_parse_args(&argc, &argv, &err);
   311  	sc_die_on_error(err);
   312  
   313  	// Remember certain properties of the process that are clobbered by
   314  	// snap-confine during execution. Those are restored just before calling
   315  	// execv.
   316  	sc_preserve_and_sanitize_process_state(&proc_state);
   317  
   318  	// We've been asked to print the version string so let's just do that.
   319  	if (sc_args_is_version_query(args)) {
   320  		printf("%s %s\n", PACKAGE, PACKAGE_VERSION);
   321  		return 0;
   322  	}
   323  
   324  	/* Collect all invocation parameters. This gives us authoritative
   325  	 * information about what needs to be invoked and how. The data comes
   326  	 * from either the environment or from command line arguments */
   327  	sc_invocation SC_CLEANUP(sc_cleanup_invocation) invocation;
   328  	const char *snap_instance_name_env = getenv("SNAP_INSTANCE_NAME");
   329  	if (snap_instance_name_env == NULL) {
   330  		die("SNAP_INSTANCE_NAME is not set");
   331  	}
   332  	sc_init_invocation(&invocation, args, snap_instance_name_env);
   333  
   334  	// Who are we?
   335  	uid_t real_uid, effective_uid, saved_uid;
   336  	gid_t real_gid, effective_gid, saved_gid;
   337  	if (getresuid(&real_uid, &effective_uid, &saved_uid) != 0) {
   338  		die("getresuid failed");
   339  	}
   340  	if (getresgid(&real_gid, &effective_gid, &saved_gid) != 0) {
   341  		die("getresgid failed");
   342  	}
   343  	debug("ruid: %d, euid: %d, suid: %d",
   344  	      real_uid, effective_uid, saved_uid);
   345  	debug("rgid: %d, egid: %d, sgid: %d",
   346  	      real_gid, effective_gid, saved_gid);
   347  
   348  	// snap-confine runs as both setuid root and setgid root.
   349  	// Temporarily drop group privileges here and reraise later
   350  	// as needed.
   351  	if (effective_gid == 0 && real_gid != 0) {
   352  		if (setegid(real_gid) != 0) {
   353  			die("cannot set effective group id to %d", real_gid);
   354  		}
   355  	}
   356  #ifndef CAPS_OVER_SETUID
   357  	// this code always needs to run as root for the cgroup/udev setup,
   358  	// however for the tests we allow it to run as non-root
   359  	if (geteuid() != 0 && secure_getenv("SNAP_CONFINE_NO_ROOT") == NULL) {
   360  		die("need to run as root or suid");
   361  	}
   362  #endif
   363  
   364  	char *snap_context SC_CLEANUP(sc_cleanup_string) = NULL;
   365  	// Do no get snap context value if running a hook (we don't want to overwrite hook's SNAP_COOKIE)
   366  	if (!sc_is_hook_security_tag(invocation.security_tag)) {
   367  		sc_error *err SC_CLEANUP(sc_cleanup_error) = NULL;
   368  		snap_context =
   369  		    sc_cookie_get_from_snapd(invocation.snap_instance, &err);
   370  		/* While the cookie is normally present due to various protection
   371  		 * mechanisms ensuring its creation from snapd, we are not considering
   372  		 * it a critical error for snap-confine in the case it is absent. When
   373  		 * absent snaps attempting to utilize snapctl to interact with snapd
   374  		 * will fail but it is more important to run a little than break
   375  		 * entirely in case snapd-side code is incorrect. Therefore error
   376  		 * information is collected but discarded. */
   377  	}
   378  
   379  	struct sc_apparmor apparmor;
   380  	sc_init_apparmor_support(&apparmor);
   381  	if (!apparmor.is_confined && apparmor.mode != SC_AA_NOT_APPLICABLE
   382  	    && getuid() != 0 && geteuid() == 0) {
   383  		// Refuse to run when this process is running unconfined on a system
   384  		// that supports AppArmor when the effective uid is root and the real
   385  		// id is non-root.  This protects against, for example, unprivileged
   386  		// users trying to leverage the snap-confine in the core snap to
   387  		// escalate privileges.
   388  		die("snap-confine has elevated permissions and is not confined"
   389  		    " but should be. Refusing to continue to avoid"
   390  		    " permission escalation attacks");
   391  	}
   392  	// TODO: check for similar situation and linux capabilities.
   393  	if (geteuid() == 0) {
   394  		if (invocation.classic_confinement) {
   395  			enter_classic_execution_environment();
   396  		} else {
   397  			enter_non_classic_execution_environment(&invocation,
   398  								&apparmor,
   399  								real_uid,
   400  								real_gid,
   401  								saved_gid);
   402  		}
   403  		// The rest does not so temporarily drop privs back to calling
   404  		// user (we'll permanently drop after loading seccomp)
   405  		if (setegid(real_gid) != 0)
   406  			die("setegid failed");
   407  		if (seteuid(real_uid) != 0)
   408  			die("seteuid failed");
   409  
   410  		if (real_gid != 0 && geteuid() == 0)
   411  			die("dropping privs did not work");
   412  		if (real_uid != 0 && getegid() == 0)
   413  			die("dropping privs did not work");
   414  	}
   415  	// Ensure that the user data path exists.
   416  	setup_user_data();
   417  #if 0
   418  	setup_user_xdg_runtime_dir();
   419  #endif
   420  	// https://wiki.ubuntu.com/SecurityTeam/Specifications/SnappyConfinement
   421  	sc_maybe_aa_change_onexec(&apparmor, invocation.security_tag);
   422  #ifdef HAVE_SELINUX
   423  	// For classic and confined snaps
   424  	sc_selinux_set_snap_execcon();
   425  #endif
   426  	if (snap_context != NULL) {
   427  		setenv("SNAP_COOKIE", snap_context, 1);
   428  		// for compatibility, if facing older snapd.
   429  		setenv("SNAP_CONTEXT", snap_context, 1);
   430  	}
   431  	// Normally setuid/setgid not only permanently drops the UID/GID, but
   432  	// also clears the capabilities bounding sets (see "Effect of user ID
   433  	// changes on capabilities" in 'man capabilities'). To load a seccomp
   434  	// profile, we need either CAP_SYS_ADMIN or PR_SET_NO_NEW_PRIVS. Since
   435  	// NNP causes issues with AppArmor and exec transitions in certain
   436  	// snapd interfaces, keep CAP_SYS_ADMIN temporarily when we are
   437  	// permanently dropping privileges.
   438  	if (getresuid(&real_uid, &effective_uid, &saved_uid) != 0) {
   439  		die("getresuid failed");
   440  	}
   441  	debug("ruid: %d, euid: %d, suid: %d",
   442  	      real_uid, effective_uid, saved_uid);
   443  	struct __user_cap_header_struct hdr =
   444  	    { _LINUX_CAPABILITY_VERSION_3, 0 };
   445  	struct __user_cap_data_struct cap_data[2] = { {0} };
   446  
   447  	// At this point in time, if we are going to permanently drop our
   448  	// effective_uid will not be '0' but our saved_uid will be '0'. Detect
   449  	// and save when we are in the this state so know when to setup the
   450  	// capabilities bounding set, regain CAP_SYS_ADMIN and later drop it.
   451  	bool keep_sys_admin = effective_uid != 0 && saved_uid == 0;
   452  	if (keep_sys_admin) {
   453  		debug("setting capabilities bounding set");
   454  		// clear all 32 bit caps but SYS_ADMIN, with none inheritable
   455  		cap_data[0].effective = CAP_TO_MASK(CAP_SYS_ADMIN);
   456  		cap_data[0].permitted = cap_data[0].effective;
   457  		cap_data[0].inheritable = 0;
   458  		// clear all 64 bit caps
   459  		cap_data[1].effective = 0;
   460  		cap_data[1].permitted = 0;
   461  		cap_data[1].inheritable = 0;
   462  		if (capset(&hdr, cap_data) != 0) {
   463  			die("capset failed");
   464  		}
   465  	}
   466  	// Permanently drop if not root
   467  	if (effective_uid == 0) {
   468  		// Note that we do not call setgroups() here because its ok
   469  		// that the user keeps the groups he already belongs to
   470  		if (setgid(real_gid) != 0)
   471  			die("setgid failed");
   472  		if (setuid(real_uid) != 0)
   473  			die("setuid failed");
   474  
   475  		if (real_gid != 0 && (getuid() == 0 || geteuid() == 0))
   476  			die("permanently dropping privs did not work");
   477  		if (real_uid != 0 && (getgid() == 0 || getegid() == 0))
   478  			die("permanently dropping privs did not work");
   479  	}
   480  	// Now that we've permanently dropped, regain SYS_ADMIN
   481  	if (keep_sys_admin) {
   482  		debug("regaining SYS_ADMIN");
   483  		cap_data[0].effective = CAP_TO_MASK(CAP_SYS_ADMIN);
   484  		cap_data[0].permitted = cap_data[0].effective;
   485  		if (capset(&hdr, cap_data) != 0) {
   486  			die("capset regain failed");
   487  		}
   488  	}
   489  	// Now that we've dropped and regained SYS_ADMIN, we can load the
   490  	// seccomp profiles.
   491  	if (sc_apply_seccomp_profile_for_security_tag(invocation.security_tag)) {
   492  		// If the process is not explicitly unconfined then load the
   493  		// global profile as well.
   494  		sc_apply_global_seccomp_profile();
   495  	}
   496  	// Even though we set inheritable to 0, let's clear SYS_ADMIN
   497  	// explicitly
   498  	if (keep_sys_admin) {
   499  		debug("clearing SYS_ADMIN");
   500  		cap_data[0].effective = 0;
   501  		cap_data[0].permitted = cap_data[0].effective;
   502  		if (capset(&hdr, cap_data) != 0) {
   503  			die("capset clear failed");
   504  		}
   505  	}
   506  	// and exec the new executable
   507  	argv[0] = (char *)invocation.executable;
   508  	debug("execv(%s, %s...)", invocation.executable, argv[0]);
   509  	for (int i = 1; i < argc; ++i) {
   510  		debug(" argv[%i] = %s", i, argv[i]);
   511  	}
   512  	// Restore process state that was recorded earlier.
   513  	sc_restore_process_state(&proc_state);
   514  	execv(invocation.executable, (char *const *)&argv[0]);
   515  	perror("execv failed");
   516  	return 1;
   517  }
   518  
   519  static void enter_classic_execution_environment(void)
   520  {
   521  	/* 'classic confinement' is designed to run without the sandbox inside the
   522  	 * shared namespace. Specifically:
   523  	 * - snap-confine skips using the snap-specific mount namespace
   524  	 * - snap-confine skips using device cgroups
   525  	 * - snapd sets up a lenient AppArmor profile for snap-confine to use
   526  	 * - snapd sets up a lenient seccomp profile for snap-confine to use
   527  	 */
   528  	debug("skipping sandbox setup, classic confinement in use");
   529  }
   530  
   531  static void enter_non_classic_execution_environment(sc_invocation * inv,
   532  						    struct sc_apparmor *aa,
   533  						    uid_t real_uid,
   534  						    gid_t real_gid,
   535  						    gid_t saved_gid)
   536  {
   537  	/* snap-confine uses privately-shared /run/snapd/ns to store bind-mounted
   538  	 * mount namespaces of each snap. In the case that snap-confine is invoked
   539  	 * from the mount namespace it typically constructs, the said directory
   540  	 * does not contain mount entries for preserved namespaces as those are
   541  	 * only visible in the main, outer namespace.
   542  	 *
   543  	 * In order to operate in such an environment snap-confine must first
   544  	 * re-associate its own process with another namespace in which the
   545  	 * /run/snapd/ns directory is visible. The most obvious candidate is pid
   546  	 * one, which definitely doesn't run in a snap-specific namespace, has a
   547  	 * predictable PID and is long lived.
   548  	 */
   549  	sc_reassociate_with_pid1_mount_ns();
   550  	// Do global initialization:
   551  	int global_lock_fd = sc_lock_global();
   552  	// ensure that "/" or "/snap" is mounted with the
   553  	// "shared" option, see LP:#1668659
   554  	debug("ensuring that snap mount directory is shared");
   555  	sc_ensure_shared_snap_mount();
   556  	debug("unsharing snap namespace directory");
   557  	sc_initialize_mount_ns();
   558  	sc_unlock(global_lock_fd);
   559  
   560  	// Find and open snap-update-ns and snap-discard-ns from the same
   561  	// path as where we (snap-confine) were called.
   562  	int snap_update_ns_fd SC_CLEANUP(sc_cleanup_close) = -1;
   563  	snap_update_ns_fd = sc_open_snap_update_ns();
   564  	int snap_discard_ns_fd SC_CLEANUP(sc_cleanup_close) = -1;
   565  	snap_discard_ns_fd = sc_open_snap_discard_ns();
   566  
   567  	// Do per-snap initialization.
   568  	int snap_lock_fd = sc_lock_snap(inv->snap_instance);
   569  	debug("initializing mount namespace: %s", inv->snap_instance);
   570  	struct sc_mount_ns *group = NULL;
   571  	group = sc_open_mount_ns(inv->snap_instance);
   572  
   573  	// Init and check rootfs_dir, apply any fallback behaviors.
   574  	sc_check_rootfs_dir(inv);
   575  
   576  	/** Populate and join the device control group. */
   577  	struct snappy_udev udev_s;
   578  	if (snappy_udev_init(inv->security_tag, &udev_s) == 0) {
   579  		if (!sc_cgroup_is_v2()) {
   580  			setup_devices_cgroup(inv->security_tag, &udev_s);
   581  		}
   582  	}
   583  	snappy_udev_cleanup(&udev_s);
   584  
   585  	/**
   586  	 * is_normal_mode controls if we should pivot into the base snap.
   587  	 *
   588  	 * There are two modes of execution for snaps that are not using classic
   589  	 * confinement: normal and legacy. The normal mode is where snap-confine
   590  	 * sets up a rootfs and then pivots into it using pivot_root(2). The legacy
   591  	 * mode is when snap-confine just unshares the initial mount namespace,
   592  	 * makes some extra changes but largely runs with what was presented to it
   593  	 * initially.
   594  	 *
   595  	 * Historically the ubuntu-core distribution used the now-legacy mode. This
   596  	 * was sensible then since snaps already (kind of) have the right root
   597  	 * file-system and just need some privacy and isolation features applied.
   598  	 * With the introduction of snaps to classic distributions as well as the
   599  	 * introduction of bases, where each snap can use a different root
   600  	 * filesystem, this lost sensibility and thus became legacy.
   601  	 *
   602  	 * For compatibility with current installations of ubuntu-core
   603  	 * distributions the legacy mode is used when: the distribution is
   604  	 * SC_DISTRO_CORE16 or when the base snap name is not "core" or
   605  	 * "ubuntu-core".
   606  	 *
   607  	 * The SC_DISTRO_CORE16 is applied to systems that boot with the "core",
   608  	 * "ubuntu-core" or "core16" snap. Systems using the "core18" base snap do
   609  	 * not qualify for that classification.
   610  	 **/
   611  	sc_distro distro = sc_classify_distro();
   612  	inv->is_normal_mode = distro != SC_DISTRO_CORE16 ||
   613  	    !sc_streq(inv->orig_base_snap_name, "core");
   614  
   615  	/* Stale mount namespace discarded or no mount namespace to
   616  	   join. We need to construct a new mount namespace ourselves.
   617  	   To capture it we will need a helper process so make one. */
   618  	sc_fork_helper(group, aa);
   619  	int retval = sc_join_preserved_ns(group, aa, inv, snap_discard_ns_fd);
   620  	if (retval == ESRCH) {
   621  		/* Create and populate the mount namespace. This performs all
   622  		   of the bootstrapping mounts, pivots into the new root filesystem and
   623  		   applies the per-snap mount profile using snap-update-ns. */
   624  		debug("unsharing the mount namespace (per-snap)");
   625  		if (unshare(CLONE_NEWNS) < 0) {
   626  			die("cannot unshare the mount namespace");
   627  		}
   628  		sc_populate_mount_ns(aa, snap_update_ns_fd, inv);
   629  		sc_store_ns_info(inv);
   630  
   631  		/* Preserve the mount namespace. */
   632  		sc_preserve_populated_mount_ns(group);
   633  	}
   634  
   635  	/* Older versions of snap-confine created incorrect 777 permissions
   636  	   for /var/lib and we need to fixup for systems that had their NS created
   637  	   with an old version. */
   638  	sc_maybe_fixup_permissions();
   639  	sc_maybe_fixup_udev();
   640  
   641  	/* User mount profiles do not apply to non-root users. */
   642  	if (real_uid != 0) {
   643  		debug("joining preserved per-user mount namespace");
   644  		retval =
   645  		    sc_join_preserved_per_user_ns(group, inv->snap_instance);
   646  		if (retval == ESRCH) {
   647  			debug("unsharing the mount namespace (per-user)");
   648  			if (unshare(CLONE_NEWNS) < 0) {
   649  				die("cannot unshare the mount namespace");
   650  			}
   651  			sc_setup_user_mounts(aa, snap_update_ns_fd,
   652  					     inv->snap_instance);
   653  			/* Preserve the mount per-user namespace. But only if the
   654  			 * experimental feature is enabled. This way if the feature is
   655  			 * disabled user mount namespaces will still exist but will be
   656  			 * entirely ephemeral. In addition the call
   657  			 * sc_join_preserved_user_ns() will never find a preserved mount
   658  			 * namespace and will always enter this code branch. */
   659  			if (sc_feature_enabled
   660  			    (SC_FEATURE_PER_USER_MOUNT_NAMESPACE)) {
   661  				sc_preserve_populated_per_user_mount_ns(group);
   662  			} else {
   663  				debug
   664  				    ("NOT preserving per-user mount namespace");
   665  			}
   666  		}
   667  	}
   668  	// Associate each snap process with a dedicated snap freezer cgroup and
   669  	// snap pids cgroup. All snap processes belonging to one snap share the
   670  	// freezer cgroup. All snap processes belonging to one app or one hook
   671  	// share the pids cgroup.
   672  	//
   673  	// This simplifies testing if any processes belonging to a given snap are
   674  	// still alive as well as to properly account for each application and
   675  	// service.
   676  	if (getegid() != 0 && saved_gid == 0) {
   677  		// Temporarily raise egid so we can chown the freezer cgroup under LXD.
   678  		if (setegid(0) != 0) {
   679  			die("cannot set effective group id to root");
   680  		}
   681  	}
   682  	if (!sc_cgroup_is_v2()) {
   683  		sc_cgroup_freezer_join(inv->snap_instance, getpid());
   684  		if (sc_feature_enabled(SC_FEATURE_REFRESH_APP_AWARENESS)) {
   685  			sc_cgroup_pids_join(inv->security_tag, getpid());
   686  		}
   687  	}
   688  	if (geteuid() == 0 && real_gid != 0) {
   689  		if (setegid(real_gid) != 0) {
   690  			die("cannot set effective group id to %d", real_gid);
   691  		}
   692  	}
   693  
   694  	sc_unlock(snap_lock_fd);
   695  
   696  	sc_close_mount_ns(group);
   697  
   698  	// Reset path as we cannot rely on the path from the host OS to make sense.
   699  	// The classic distribution may use any PATH that makes sense but we cannot
   700  	// assume it makes sense for the core snap layout. Note that the /usr/local
   701  	// directories are explicitly left out as they are not part of the core
   702  	// snap.
   703  	debug("resetting PATH to values in sync with core snap");
   704  	setenv("PATH",
   705  	       "/usr/local/sbin:"
   706  	       "/usr/local/bin:"
   707  	       "/usr/sbin:"
   708  	       "/usr/bin:"
   709  	       "/sbin:" "/bin:" "/usr/games:" "/usr/local/games", 1);
   710  	// Ensure we set the various TMPDIRs to /tmp. One of the parts of setting
   711  	// up the mount namespace is to create a private /tmp directory (this is
   712  	// done in sc_populate_mount_ns() above). The host environment may point to
   713  	// a directory not accessible by snaps so we need to reset it here.
   714  	const char *tmpd[] = { "TMPDIR", "TEMPDIR", NULL };
   715  	int i;
   716  	for (i = 0; tmpd[i] != NULL; i++) {
   717  		if (setenv(tmpd[i], "/tmp", 1) != 0) {
   718  			die("cannot set environment variable '%s'", tmpd[i]);
   719  		}
   720  	}
   721  }