github.com/kubiko/snapd@v0.0.0-20201013125620-d4f3094d9ddf/cmd/snap-confine/mount-support.c (about) 1 /* 2 * Copyright (C) 2015 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 #ifdef HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "mount-support.h" 22 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libgen.h> 26 #include <limits.h> 27 #include <mntent.h> 28 #include <sched.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/mount.h> 33 #include <sys/stat.h> 34 #include <sys/syscall.h> 35 #include <sys/types.h> 36 #include <sys/wait.h> 37 #include <unistd.h> 38 39 #include "../libsnap-confine-private/apparmor-support.h" 40 #include "../libsnap-confine-private/classic.h" 41 #include "../libsnap-confine-private/cleanup-funcs.h" 42 #include "../libsnap-confine-private/mount-opt.h" 43 #include "../libsnap-confine-private/mountinfo.h" 44 #include "../libsnap-confine-private/snap.h" 45 #include "../libsnap-confine-private/string-utils.h" 46 #include "../libsnap-confine-private/tool.h" 47 #include "../libsnap-confine-private/utils.h" 48 #include "mount-support-nvidia.h" 49 50 #define MAX_BUF 1000 51 52 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode); 53 54 // TODO: simplify this, after all it is just a tmpfs 55 // TODO: fold this into bootstrap 56 static void setup_private_mount(const char *snap_name) 57 { 58 // Create a 0700 base directory. This is the "base" directory that is 59 // protected from other users. This directory name is NOT randomly 60 // generated. This has several properties: 61 // 62 // Users can relate to the name and can find the temporary directory as 63 // visible from within the snap. If this directory was random it would be 64 // harder to find because there may be situations in which multiple 65 // directories related to the same snap name would exist. 66 // 67 // Snapd can partially manage the directory. Specifically on snap remove 68 // snapd could remove the directory and everything in it, potentially 69 // avoiding runaway disk use on a machine that either never reboots or uses 70 // persistent /tmp directory. 71 // 72 // Underneath the base directory there is a "tmp" sub-directory that has 73 // mode 1777 and behaves as a typical /tmp directory would. That directory 74 // is used as a bind-mounted /tmp directory. 75 // 76 // Because the directories are reused across invocations by distinct users 77 // and because the directories are trivially guessable, each invocation 78 // unconditionally chowns/chmods them to appropriate values. 79 char base_dir[MAX_BUF] = { 0 }; 80 char tmp_dir[MAX_BUF] = { 0 }; 81 int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 82 int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 83 sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name); 84 sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir); 85 86 /* Switch to root group so that mkdir and open calls below create filesystem 87 * elements that are not owned by the user calling into snap-confine. */ 88 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 89 // Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want 90 // to reuse and we will open with O_NOFOLLOW, below. 91 if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) { 92 die("cannot create base directory %s", base_dir); 93 } 94 base_dir_fd = open(base_dir, 95 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 96 if (base_dir_fd < 0) { 97 die("cannot open base directory %s", base_dir); 98 } 99 /* This seems redundant on first read but it has the non-obvious 100 * property of changing existing directories that have already existed 101 * but had incorrect ownership or permission. This is possible due to 102 * earlier bugs in snap-confine and due to the fact that some systems 103 * use persistent /tmp directory and may not clean up leftover files 104 * for arbitrarily long. This comment applies the following two pairs 105 * of fchmod and fchown. */ 106 if (fchmod(base_dir_fd, 0700) < 0) { 107 die("cannot chmod base directory %s to 0700", base_dir); 108 } 109 if (fchown(base_dir_fd, 0, 0) < 0) { 110 die("cannot chown base directory %s to root.root", base_dir); 111 } 112 // Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we 113 // want to reuse and we will open with O_NOFOLLOW, below. 114 if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) { 115 die("cannot create private tmp directory %s/tmp", base_dir); 116 } 117 (void)sc_set_effective_identity(old); 118 tmp_dir_fd = openat(base_dir_fd, "tmp", 119 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 120 if (tmp_dir_fd < 0) { 121 die("cannot open private tmp directory %s/tmp", base_dir); 122 } 123 if (fchmod(tmp_dir_fd, 01777) < 0) { 124 die("cannot chmod private tmp directory %s/tmp to 01777", 125 base_dir); 126 } 127 if (fchown(tmp_dir_fd, 0, 0) < 0) { 128 die("cannot chown private tmp directory %s/tmp to root.root", 129 base_dir); 130 } 131 sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL); 132 sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL); 133 } 134 135 // TODO: fold this into bootstrap 136 static void setup_private_pts(void) 137 { 138 // See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt 139 // 140 // Ubuntu by default uses devpts 'single-instance' mode where 141 // /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change 142 // the startup scripts though, so we follow the instructions in point 143 // '4' of 'User-space changes' in the above doc. In other words, after 144 // unshare(CLONE_NEWNS), we mount devpts with -o 145 // newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto 146 // /dev/ptmx 147 148 struct stat st; 149 150 // Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode 151 // which doesn't provide the isolation we require. 152 if (stat("/dev/pts/ptmx", &st) != 0) { 153 die("cannot stat /dev/pts/ptmx"); 154 } 155 // Make sure /dev/ptmx exists so we can bind mount over it 156 if (stat("/dev/ptmx", &st) != 0) { 157 die("cannot stat /dev/ptmx"); 158 } 159 // Since multi-instance, use ptmxmode=0666. The other options are 160 // copied from /etc/default/devpts 161 sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, 162 "newinstance,ptmxmode=0666,mode=0620,gid=5"); 163 sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0); 164 } 165 166 struct sc_mount { 167 const char *path; 168 bool is_bidirectional; 169 // Alternate path defines the rbind mount "alternative" of path. 170 // It exists so that we can make /media on systems that use /run/media. 171 const char *altpath; 172 // Optional mount points are not processed unless the source and 173 // destination both exist. 174 bool is_optional; 175 }; 176 177 struct sc_mount_config { 178 const char *rootfs_dir; 179 // The struct is terminated with an entry with NULL path. 180 const struct sc_mount *mounts; 181 sc_distro distro; 182 bool normal_mode; 183 const char *base_snap_name; 184 }; 185 186 /** 187 * Bootstrap mount namespace. 188 * 189 * This is a chunk of tricky code that lets us have full control over the 190 * layout and direction of propagation of mount events. The documentation below 191 * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source 192 * tree. 193 * 194 * As a reminder two definitions are quoted below: 195 * 196 * A 'propagation event' is defined as event generated on a vfsmount 197 * that leads to mount or unmount actions in other vfsmounts. 198 * 199 * A 'peer group' is defined as a group of vfsmounts that propagate 200 * events to each other. 201 * 202 * (end of quote). 203 * 204 * The main idea is to setup a mount namespace that has a root filesystem with 205 * vfsmounts and peer groups that, depending on the location, either isolate 206 * or share with the rest of the system. 207 * 208 * The vast majority of the filesystem is shared in one direction. Events from 209 * the outside (from the main mount namespace) propagate inside (to namespaces 210 * of particular snaps) so things like new snap revisions, mounted drives, etc, 211 * just show up as expected but even if a snap is exploited or malicious in 212 * nature it cannot affect anything in another namespace where it might cause 213 * security or stability issues. 214 * 215 * Selected directories (today just /media) can be shared in both directions. 216 * This allows snaps with sufficient privileges to either create, through the 217 * mount system call, additional mount points that are visible by the rest of 218 * the system (both the main mount namespace and namespaces of individual 219 * snaps) or remove them, through the unmount system call. 220 **/ 221 static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config) 222 { 223 char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX"; 224 char src[PATH_MAX] = { 0 }; 225 char dst[PATH_MAX] = { 0 }; 226 if (mkdtemp(scratch_dir) == NULL) { 227 die("cannot create temporary directory for the root file system"); 228 } 229 // NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new 230 // mount namespace and have a private list of mounts. 231 debug("scratch directory for constructing namespace: %s", scratch_dir); 232 // Make the root filesystem recursively shared. This way propagation events 233 // will be shared with main mount namespace. 234 sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL); 235 // Bind mount the temporary scratch directory for root filesystem over 236 // itself so that it is a mount point. This is done so that it can become 237 // unbindable as explained below. 238 sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL); 239 // Make the scratch directory unbindable. 240 // 241 // This is necessary as otherwise a mount loop can occur and the kernel 242 // would crash. The term unbindable simply states that it cannot be bind 243 // mounted anywhere. When we construct recursive bind mounts below this 244 // guarantees that this directory will not be replicated anywhere. 245 sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL); 246 // Recursively bind mount desired root filesystem directory over the 247 // scratch directory. This puts the initial content into the scratch space 248 // and serves as a foundation for all subsequent operations below. 249 // 250 // The mount is recursive because it can either be applied to the root 251 // filesystem of a core system (aka all-snap) or the core snap on a classic 252 // system. In the former case we need recursive bind mounts to accurately 253 // replicate the state of the root filesystem into the scratch directory. 254 sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND, 255 NULL); 256 // Make the scratch directory recursively slave. Nothing done there will be 257 // shared with the initial mount namespace. This effectively detaches us, 258 // in one way, from the original namespace and coupled with pivot_root 259 // below serves as the foundation of the mount sandbox. 260 sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL); 261 // Bind mount certain directories from the host filesystem to the scratch 262 // directory. By default mount events will propagate in both into and out 263 // of the peer group. This way the running application can alter any global 264 // state visible on the host and in other snaps. This can be restricted by 265 // disabling the "is_bidirectional" flag as can be seen below. 266 for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL; 267 mnt++) { 268 269 if (mnt->is_bidirectional) { 270 sc_identity old = 271 sc_set_effective_identity(sc_root_group_identity()); 272 if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) { 273 die("cannot create %s", mnt->path); 274 } 275 (void)sc_set_effective_identity(old); 276 } 277 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 278 mnt->path); 279 if (mnt->is_optional) { 280 bool ok = sc_do_optional_mount(mnt->path, dst, NULL, 281 MS_REC | MS_BIND, NULL); 282 if (!ok) { 283 // If we cannot mount it, just continue. 284 continue; 285 } 286 } else { 287 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, 288 NULL); 289 } 290 if (!mnt->is_bidirectional) { 291 // Mount events will only propagate inwards to the namespace. This 292 // way the running application cannot alter any global state apart 293 // from that of its own snap. 294 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 295 } 296 if (mnt->altpath == NULL) { 297 continue; 298 } 299 // An alternate path of mnt->path is provided at another location. 300 // It should behave exactly the same as the original. 301 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 302 mnt->altpath); 303 struct stat stat_buf; 304 if (lstat(dst, &stat_buf) < 0) { 305 die("cannot lstat %s", dst); 306 } 307 if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) { 308 die("cannot bind mount alternate path over a symlink: %s", dst); 309 } 310 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL); 311 if (!mnt->is_bidirectional) { 312 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 313 } 314 } 315 if (config->normal_mode) { 316 // Since we mounted /etc from the host filesystem to the scratch directory, 317 // we may need to put certain directories from the desired root filesystem 318 // (e.g. the core snap) back. This way the behavior of running snaps is not 319 // affected by the alternatives directory from the host, if one exists. 320 // 321 // Fixes the following bugs: 322 // - https://bugs.launchpad.net/snap-confine/+bug/1580018 323 // - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568 324 const char *dirs_from_core[] = 325 { "/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf", 326 NULL 327 }; 328 for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) { 329 const char *dir = *dirs; 330 if (access(dir, F_OK) != 0) { 331 continue; 332 } 333 struct stat dst_stat; 334 struct stat src_stat; 335 sc_must_snprintf(src, sizeof src, "%s%s", 336 config->rootfs_dir, dir); 337 sc_must_snprintf(dst, sizeof dst, "%s%s", 338 scratch_dir, dir); 339 if (lstat(src, &src_stat) != 0) { 340 if (errno == ENOENT) { 341 continue; 342 } 343 die("cannot stat %s from desired rootfs", src); 344 } 345 if (!S_ISREG(src_stat.st_mode) 346 && !S_ISDIR(src_stat.st_mode)) { 347 debug 348 ("entry %s from the desired rootfs is not a file or directory, skipping mount", 349 src); 350 continue; 351 } 352 353 if (lstat(dst, &dst_stat) != 0) { 354 if (errno == ENOENT) { 355 continue; 356 } 357 die("cannot stat %s from host", src); 358 } 359 if (!S_ISREG(dst_stat.st_mode) 360 && !S_ISDIR(dst_stat.st_mode)) { 361 debug 362 ("entry %s from the host is not a file or directory, skipping mount", 363 src); 364 continue; 365 } 366 367 if ((dst_stat.st_mode & S_IFMT) != 368 (src_stat.st_mode & S_IFMT)) { 369 debug 370 ("entries %s and %s are of different types, skipping mount", 371 dst, src); 372 continue; 373 } 374 // both source and destination exist where both are either files 375 // or both are directories 376 sc_do_mount(src, dst, NULL, MS_BIND, NULL); 377 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 378 } 379 } 380 // The "core" base snap is special as it contains snapd and friends. 381 // Other base snaps do not, so whenever a base snap other than core is 382 // in use we need extra provisions for setting up internal tooling to 383 // be available. 384 // 385 // However on a core18 (and similar) system the core snap is not 386 // a special base anymore and we should map our own tooling in. 387 if (config->distro == SC_DISTRO_CORE_OTHER 388 || !sc_streq(config->base_snap_name, "core")) { 389 // when bases are used we need to bind-mount the libexecdir 390 // (that contains snap-exec) into /usr/lib/snapd of the 391 // base snap so that snap-exec is available for the snaps 392 // (base snaps do not ship snapd) 393 394 // dst is always /usr/lib/snapd as this is where snapd 395 // assumes to find snap-exec 396 sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd", 397 scratch_dir); 398 399 // bind mount the current $ROOT/usr/lib/snapd path, 400 // where $ROOT is either "/" or the "/snap/{core,snapd}/current" 401 // that we are re-execing from 402 char *src = NULL; 403 char self[PATH_MAX + 1] = { 0 }; 404 ssize_t nread; 405 nread = readlink("/proc/self/exe", self, sizeof self - 1); 406 if (nread < 0) { 407 die("cannot read /proc/self/exe"); 408 } 409 // Though we initialized self to NULs and passed one less to 410 // readlink, therefore guaranteeing that self is 411 // zero-terminated, perform an explicit assignment to make 412 // Coverity happy. 413 self[nread] = '\0'; 414 // this cannot happen except when the kernel is buggy 415 if (strstr(self, "/snap-confine") == NULL) { 416 die("cannot use result from readlink: %s", self); 417 } 418 src = dirname(self); 419 // dirname(path) might return '.' depending on path. 420 // /proc/self/exe should always point 421 // to an absolute path, but let's guarantee that. 422 if (src[0] != '/') { 423 die("cannot use the result of dirname(): %s", src); 424 } 425 426 sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL); 427 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 428 } 429 // Bind mount the directory where all snaps are mounted. The location of 430 // the this directory on the host filesystem may not match the location in 431 // the desired root filesystem. In the "core" and "ubuntu-core" snaps the 432 // directory is always /snap. On the host it is a build-time configuration 433 // option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not 434 // in normal mode), we don't need to do this because /snap is fixed and 435 // already contains the correct view of the mounted snaps. 436 if (config->normal_mode) { 437 sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir); 438 sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL); 439 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 440 } 441 // Create the hostfs directory if one is missing. This directory is a part 442 // of packaging now so perhaps this code can be removed later. 443 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 444 if (mkdir(SC_HOSTFS_DIR, 0755) < 0) { 445 if (errno != EEXIST) { 446 die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR); 447 } 448 } 449 (void)sc_set_effective_identity(old); 450 // Ensure that hostfs isgroup owned by root. We may have (now or earlier) 451 // created the directory as the user who first ran a snap on a given 452 // system and the group identity of that user is visilbe on disk. 453 // This was LP:#1665004 454 struct stat sb; 455 if (stat(SC_HOSTFS_DIR, &sb) < 0) { 456 die("cannot stat %s", SC_HOSTFS_DIR); 457 } 458 if (sb.st_uid != 0 || sb.st_gid != 0) { 459 if (chown(SC_HOSTFS_DIR, 0, 0) < 0) { 460 die("cannot change user/group owner of %s to root", 461 SC_HOSTFS_DIR); 462 } 463 } 464 // Make the upcoming "put_old" directory for pivot_root private so that 465 // mount events don't propagate to any peer group. In practice pivot root 466 // has a number of undocumented requirements and one of them is that the 467 // "put_old" directory (the second argument) cannot be shared in any way. 468 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR); 469 sc_do_mount(dst, dst, NULL, MS_BIND, NULL); 470 sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL); 471 // On classic mount the nvidia driver. Ideally this would be done in an 472 // uniform way after pivot_root but this is good enough and requires less 473 // code changes the nvidia code assumes it has access to the existing 474 // pre-pivot filesystem. 475 if (config->distro == SC_DISTRO_CLASSIC) { 476 sc_mount_nvidia_driver(scratch_dir); 477 } 478 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 479 // pivot_root 480 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 481 // Use pivot_root to "chroot" into the scratch directory. 482 // 483 // Q: Why are we using something as esoteric as pivot_root(2)? 484 // A: Because this makes apparmor handling easy. Using a normal chroot 485 // makes all apparmor rules conditional. We are either running on an 486 // all-snap system where this would-be chroot didn't happen and all the 487 // rules see / as the root file system _OR_ we are running on top of a 488 // classic distribution and this chroot has now moved all paths to 489 // /tmp/snap.rootfs_*. 490 // 491 // Because we are using unshare(2) with CLONE_NEWNS we can essentially use 492 // pivot_root just like chroot but this makes apparmor unaware of the old 493 // root so everything works okay. 494 // 495 // HINT: If you are debugging this and are trying to see why pivot_root 496 // happens to return EINVAL with any changes you may be making, please 497 // consider applying 498 // misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree 499 // kernel. 500 debug("performing operation: pivot_root %s %s", scratch_dir, dst); 501 if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) { 502 die("cannot perform operation: pivot_root %s %s", scratch_dir, 503 dst); 504 } 505 // Unmount the self-bind mount over the scratch directory created earlier 506 // in the original root filesystem (which is now mounted on SC_HOSTFS_DIR). 507 // This way we can remove the temporary directory we created and "clean up" 508 // after ourselves nicely. 509 sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir); 510 sc_do_umount(dst, UMOUNT_NOFOLLOW); 511 // Remove the scratch directory. Note that we are using the path that is 512 // based on the old root filesystem as after pivot_root we cannot guarantee 513 // what is present at the same location normally. (It is probably an empty 514 // /tmp directory that is populated in another place). 515 debug("performing operation: rmdir %s", dst); 516 if (rmdir(scratch_dir) < 0) { 517 die("cannot perform operation: rmdir %s", dst); 518 }; 519 // Make the old root filesystem recursively slave. This way operations 520 // performed in this mount namespace will not propagate to the peer group. 521 // This is another essential part of the confinement system. 522 sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL); 523 // Detach the redundant hostfs version of sysfs since it shows up in the 524 // mount table and software inspecting the mount table may become confused 525 // (eg, docker and LP:# 162601). 526 sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR); 527 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 528 // Detach the redundant hostfs version of /dev since it shows up in the 529 // mount table and software inspecting the mount table may become confused. 530 sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR); 531 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 532 // Detach the redundant hostfs version of /proc since it shows up in the 533 // mount table and software inspecting the mount table may become confused. 534 sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR); 535 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 536 // Detach both views of /writable: the one from hostfs and the one directly 537 // visible in /writable. Interfaces don't grant access to this directory 538 // and it has a large duplicated view of many mount points. Note that this 539 // is only applicable to ubuntu-core systems. 540 sc_detach_views_of_writable(config->distro, config->normal_mode); 541 } 542 543 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode) 544 { 545 // Note that prior to detaching either mount point we switch the 546 // propagation to private to both limit the change to just this view and to 547 // prevent otherwise occurring event propagation from self-conflicting and 548 // returning EBUSY. A similar approach is used by snap-update-ns and is 549 // documented in umount(2). 550 const char *writable_dir = "/writable"; 551 const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable"; 552 553 // Writable only exists on ubuntu-core. 554 if (distro == SC_DISTRO_CLASSIC) { 555 return; 556 } 557 // On all core distributions we see /var/lib/snapd/hostfs/writable that 558 // exposes writable, with a structure specific to ubuntu-core. 559 debug("detaching %s", hostfs_writable_dir); 560 sc_do_mount("none", hostfs_writable_dir, NULL, 561 MS_REC | MS_PRIVATE, NULL); 562 sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 563 564 // On ubuntu-core 16, when the executed snap uses core as base we also see 565 // the /writable that we directly inherited from the initial mount 566 // namespace. 567 if (distro == SC_DISTRO_CORE16 && !normal_mode) { 568 debug("detaching %s", writable_dir); 569 sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE, 570 NULL); 571 sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 572 } 573 } 574 575 /** 576 * @path: a pathname where / replaced with '\0'. 577 * @offsetp: pointer to int showing which path segment was last seen. 578 * Updated on return to reflect the next segment. 579 * @fulllen: full original path length. 580 * Returns a pointer to the next path segment, or NULL if done. 581 */ 582 static char * __attribute__((used)) 583 get_nextpath(char *path, size_t *offsetp, size_t fulllen) 584 { 585 size_t offset = *offsetp; 586 587 if (offset >= fulllen) 588 return NULL; 589 590 while (offset < fulllen && path[offset] != '\0') 591 offset++; 592 while (offset < fulllen && path[offset] == '\0') 593 offset++; 594 595 *offsetp = offset; 596 return (offset < fulllen) ? &path[offset] : NULL; 597 } 598 599 /** 600 * Check that @subdir is a subdir of @dir. 601 **/ 602 static bool __attribute__((used)) 603 is_subdir(const char *subdir, const char *dir) 604 { 605 size_t dirlen = strlen(dir); 606 size_t subdirlen = strlen(subdir); 607 608 // @dir has to be at least as long as @subdir 609 if (subdirlen < dirlen) 610 return false; 611 // @dir has to be a prefix of @subdir 612 if (strncmp(subdir, dir, dirlen) != 0) 613 return false; 614 // @dir can look like "path/" (that is, end with the directory separator). 615 // When that is the case then given the test above we can be sure @subdir 616 // is a real subdirectory. 617 if (dirlen > 0 && dir[dirlen - 1] == '/') 618 return true; 619 // @subdir can look like "path/stuff" and when the directory separator 620 // is exactly at the spot where @dir ends (that is, it was not caught 621 // by the test above) then @subdir is a real subdirectory. 622 if (subdir[dirlen] == '/' && dirlen > 0) 623 return true; 624 // If both @dir and @subdir have identical length then given that the 625 // prefix check above @subdir is a real subdirectory. 626 if (subdirlen == dirlen) 627 return true; 628 return false; 629 } 630 631 void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd, 632 const sc_invocation * inv, const gid_t real_gid, 633 const gid_t saved_gid) 634 { 635 // Classify the current distribution, as claimed by /etc/os-release. 636 sc_distro distro = sc_classify_distro(); 637 638 // Check which mode we should run in, normal or legacy. 639 if (inv->is_normal_mode) { 640 // In normal mode we use the base snap as / and set up several bind mounts. 641 const struct sc_mount mounts[] = { 642 {"/dev"}, // because it contains devices on host OS 643 {"/etc"}, // because that's where /etc/resolv.conf lives, perhaps a bad idea 644 {"/home"}, // to support /home/*/snap and home interface 645 {"/root"}, // because that is $HOME for services 646 {"/proc"}, // fundamental filesystem 647 {"/sys"}, // fundamental filesystem 648 {"/tmp"}, // to get writable tmp 649 {"/var/snap"}, // to get access to global snap data 650 {"/var/lib/snapd"}, // to get access to snapd state and seccomp profiles 651 {"/var/tmp"}, // to get access to the other temporary directory 652 {"/run"}, // to get /run with sockets and what not 653 {"/lib/modules",.is_optional = true}, // access to the modules of the running kernel 654 {"/lib/firmware",.is_optional = true}, // access to the firmware of the running kernel 655 {"/usr/src"}, // FIXME: move to SecurityMounts in system-trace interface 656 {"/var/log"}, // FIXME: move to SecurityMounts in log-observe interface 657 #ifdef MERGED_USR 658 {"/run/media", true, "/media"}, // access to the users removable devices 659 #else 660 {"/media", true}, // access to the users removable devices 661 #endif // MERGED_USR 662 {"/run/netns", true}, // access to the 'ip netns' network namespaces 663 // The /mnt directory is optional in base snaps to ensure backwards 664 // compatibility with the first version of base snaps that was 665 // released. 666 {"/mnt",.is_optional = true}, // to support the removable-media interface 667 {"/var/lib/extrausers",.is_optional = true}, // access to UID/GID of extrausers (if available) 668 {}, 669 }; 670 struct sc_mount_config normal_config = { 671 .rootfs_dir = inv->rootfs_dir, 672 .mounts = mounts, 673 .distro = distro, 674 .normal_mode = true, 675 .base_snap_name = inv->base_snap_name, 676 }; 677 sc_bootstrap_mount_namespace(&normal_config); 678 } else { 679 // In legacy mode we don't pivot and instead just arrange bi- 680 // directional mount propagation for two directories. 681 const struct sc_mount mounts[] = { 682 {"/media", true}, 683 {"/run/netns", true}, 684 {}, 685 }; 686 struct sc_mount_config legacy_config = { 687 .rootfs_dir = "/", 688 .mounts = mounts, 689 .distro = distro, 690 .normal_mode = false, 691 .base_snap_name = inv->base_snap_name, 692 }; 693 sc_bootstrap_mount_namespace(&legacy_config); 694 } 695 696 // TODO: rename this and fold it into bootstrap 697 setup_private_mount(inv->snap_instance); 698 // set up private /dev/pts 699 // TODO: fold this into bootstrap 700 setup_private_pts(); 701 702 // setup the security backend bind mounts 703 sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor); 704 } 705 706 static bool is_mounted_with_shared_option(const char *dir) 707 __attribute__((nonnull(1))); 708 709 static bool is_mounted_with_shared_option(const char *dir) 710 { 711 sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 712 sm = sc_parse_mountinfo(NULL); 713 if (sm == NULL) { 714 die("cannot parse /proc/self/mountinfo"); 715 } 716 sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm); 717 while (entry != NULL) { 718 const char *mount_dir = entry->mount_dir; 719 if (sc_streq(mount_dir, dir)) { 720 const char *optional_fields = entry->optional_fields; 721 if (strstr(optional_fields, "shared:") != NULL) { 722 return true; 723 } 724 } 725 entry = sc_next_mountinfo_entry(entry); 726 } 727 return false; 728 } 729 730 void sc_ensure_shared_snap_mount(void) 731 { 732 if (!is_mounted_with_shared_option("/") 733 && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) { 734 // TODO: We could be more aggressive and refuse to function but since 735 // we have no data on actual environments that happen to limp along in 736 // this configuration let's not do that yet. This code should be 737 // removed once we have a measurement and feedback mechanism that lets 738 // us decide based on measurable data. 739 sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none", 740 MS_BIND | MS_REC, 0); 741 sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC, 742 NULL); 743 } 744 } 745 746 void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd, 747 const char *snap_name) 748 { 749 debug("%s: %s", __FUNCTION__, snap_name); 750 751 char profile_path[PATH_MAX]; 752 struct stat st; 753 754 sc_must_snprintf(profile_path, sizeof(profile_path), 755 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name); 756 if (stat(profile_path, &st) != 0) { 757 // It is ok for the user fstab to not exist. 758 return; 759 } 760 761 // In our new mount namespace, recursively change all mounts 762 // to slave mode, so we see changes from the parent namespace 763 // but don't propagate our own changes. 764 sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL); 765 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 766 sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor); 767 (void)sc_set_effective_identity(old); 768 } 769 770 void sc_ensure_snap_dir_shared_mounts(void) 771 { 772 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 773 for (int i = 0; dirs[i] != NULL; i++) { 774 const char *dir = dirs[i]; 775 if (!is_mounted_with_shared_option(dir)) { 776 /* Since this directory isn't yet shared (but it should be), 777 * recursively bind mount it, then recursively share it so that 778 * changes to the host are seen in the snap and vice-versa. This 779 * allows us to fine-tune propagation events elsewhere for this new 780 * mountpoint. 781 * 782 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR, 783 * since snaps are already mounted, and it's not needed for 784 * /var/snap. 785 */ 786 sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, 0); 787 sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED, 788 NULL); 789 } 790 } 791 } 792 793 void sc_setup_parallel_instance_classic_mounts(const char *snap_name, 794 const char *snap_instance_name) 795 { 796 char src[PATH_MAX] = { 0 }; 797 char dst[PATH_MAX] = { 0 }; 798 799 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 800 for (int i = 0; dirs[i] != NULL; i++) { 801 const char *dir = dirs[i]; 802 sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL); 803 } 804 805 /* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */ 806 sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR, 807 snap_instance_name); 808 sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name); 809 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0); 810 811 /* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */ 812 sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name); 813 sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name); 814 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0); 815 }