github.com/hugh712/snapd@v0.0.0-20200910133618-1a99902bd583/cmd/snap-confine/mount-support.c (about) 1 /* 2 * Copyright (C) 2015 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 #ifdef HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "mount-support.h" 22 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libgen.h> 26 #include <limits.h> 27 #include <mntent.h> 28 #include <sched.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/mount.h> 33 #include <sys/stat.h> 34 #include <sys/syscall.h> 35 #include <sys/types.h> 36 #include <sys/types.h> 37 #include <sys/wait.h> 38 #include <unistd.h> 39 40 #include "../libsnap-confine-private/apparmor-support.h" 41 #include "../libsnap-confine-private/classic.h" 42 #include "../libsnap-confine-private/cleanup-funcs.h" 43 #include "../libsnap-confine-private/mount-opt.h" 44 #include "../libsnap-confine-private/mountinfo.h" 45 #include "../libsnap-confine-private/snap.h" 46 #include "../libsnap-confine-private/string-utils.h" 47 #include "../libsnap-confine-private/tool.h" 48 #include "../libsnap-confine-private/utils.h" 49 #include "mount-support-nvidia.h" 50 51 #define MAX_BUF 1000 52 53 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode); 54 55 // TODO: simplify this, after all it is just a tmpfs 56 // TODO: fold this into bootstrap 57 static void setup_private_mount(const char *snap_name) 58 { 59 // Create a 0700 base directory. This is the "base" directory that is 60 // protected from other users. This directory name is NOT randomly 61 // generated. This has several properties: 62 // 63 // Users can relate to the name and can find the temporary directory as 64 // visible from within the snap. If this directory was random it would be 65 // harder to find because there may be situations in which multiple 66 // directories related to the same snap name would exist. 67 // 68 // Snapd can partially manage the directory. Specifically on snap remove 69 // snapd could remove the directory and everything in it, potentially 70 // avoiding runaway disk use on a machine that either never reboots or uses 71 // persistent /tmp directory. 72 // 73 // Underneath the base directory there is a "tmp" sub-directory that has 74 // mode 1777 and behaves as a typical /tmp directory would. That directory 75 // is used as a bind-mounted /tmp directory. 76 // 77 // Because the directories are reused across invocations by distinct users 78 // and because the directories are trivially guessable, each invocation 79 // unconditionally chowns/chmods them to appropriate values. 80 char base_dir[MAX_BUF] = { 0 }; 81 char tmp_dir[MAX_BUF] = { 0 }; 82 int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 83 int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 84 sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name); 85 sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir); 86 87 /* Switch to root group so that mkdir and open calls below create filesystem 88 * elements that are not owned by the user calling into snap-confine. */ 89 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 90 // Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want 91 // to reuse and we will open with O_NOFOLLOW, below. 92 if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) { 93 die("cannot create base directory %s", base_dir); 94 } 95 base_dir_fd = open(base_dir, 96 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 97 if (base_dir_fd < 0) { 98 die("cannot open base directory %s", base_dir); 99 } 100 /* This seems redundant on first read but it has the non-obvious 101 * property of changing existing directories that have already existed 102 * but had incorrect ownership or permission. This is possible due to 103 * earlier bugs in snap-confine and due to the fact that some systems 104 * use persistent /tmp directory and may not clean up leftover files 105 * for arbitrarily long. This comment applies the following two pairs 106 * of fchmod and fchown. */ 107 if (fchmod(base_dir_fd, 0700) < 0) { 108 die("cannot chmod base directory %s to 0700", base_dir); 109 } 110 if (fchown(base_dir_fd, 0, 0) < 0) { 111 die("cannot chown base directory %s to root.root", base_dir); 112 } 113 // Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we 114 // want to reuse and we will open with O_NOFOLLOW, below. 115 if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) { 116 die("cannot create private tmp directory %s/tmp", base_dir); 117 } 118 (void)sc_set_effective_identity(old); 119 tmp_dir_fd = openat(base_dir_fd, "tmp", 120 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 121 if (tmp_dir_fd < 0) { 122 die("cannot open private tmp directory %s/tmp", base_dir); 123 } 124 if (fchmod(tmp_dir_fd, 01777) < 0) { 125 die("cannot chmod private tmp directory %s/tmp to 01777", 126 base_dir); 127 } 128 if (fchown(tmp_dir_fd, 0, 0) < 0) { 129 die("cannot chown private tmp directory %s/tmp to root.root", 130 base_dir); 131 } 132 sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL); 133 sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL); 134 } 135 136 // TODO: fold this into bootstrap 137 static void setup_private_pts(void) 138 { 139 // See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt 140 // 141 // Ubuntu by default uses devpts 'single-instance' mode where 142 // /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change 143 // the startup scripts though, so we follow the instructions in point 144 // '4' of 'User-space changes' in the above doc. In other words, after 145 // unshare(CLONE_NEWNS), we mount devpts with -o 146 // newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto 147 // /dev/ptmx 148 149 struct stat st; 150 151 // Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode 152 // which doesn't provide the isolation we require. 153 if (stat("/dev/pts/ptmx", &st) != 0) { 154 die("cannot stat /dev/pts/ptmx"); 155 } 156 // Make sure /dev/ptmx exists so we can bind mount over it 157 if (stat("/dev/ptmx", &st) != 0) { 158 die("cannot stat /dev/ptmx"); 159 } 160 // Since multi-instance, use ptmxmode=0666. The other options are 161 // copied from /etc/default/devpts 162 sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, 163 "newinstance,ptmxmode=0666,mode=0620,gid=5"); 164 sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0); 165 } 166 167 struct sc_mount { 168 const char *path; 169 bool is_bidirectional; 170 // Alternate path defines the rbind mount "alternative" of path. 171 // It exists so that we can make /media on systems that use /run/media. 172 const char *altpath; 173 // Optional mount points are not processed unless the source and 174 // destination both exist. 175 bool is_optional; 176 }; 177 178 struct sc_mount_config { 179 const char *rootfs_dir; 180 // The struct is terminated with an entry with NULL path. 181 const struct sc_mount *mounts; 182 sc_distro distro; 183 bool normal_mode; 184 const char *base_snap_name; 185 }; 186 187 /** 188 * Bootstrap mount namespace. 189 * 190 * This is a chunk of tricky code that lets us have full control over the 191 * layout and direction of propagation of mount events. The documentation below 192 * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source 193 * tree. 194 * 195 * As a reminder two definitions are quoted below: 196 * 197 * A 'propagation event' is defined as event generated on a vfsmount 198 * that leads to mount or unmount actions in other vfsmounts. 199 * 200 * A 'peer group' is defined as a group of vfsmounts that propagate 201 * events to each other. 202 * 203 * (end of quote). 204 * 205 * The main idea is to setup a mount namespace that has a root filesystem with 206 * vfsmounts and peer groups that, depending on the location, either isolate 207 * or share with the rest of the system. 208 * 209 * The vast majority of the filesystem is shared in one direction. Events from 210 * the outside (from the main mount namespace) propagate inside (to namespaces 211 * of particular snaps) so things like new snap revisions, mounted drives, etc, 212 * just show up as expected but even if a snap is exploited or malicious in 213 * nature it cannot affect anything in another namespace where it might cause 214 * security or stability issues. 215 * 216 * Selected directories (today just /media) can be shared in both directions. 217 * This allows snaps with sufficient privileges to either create, through the 218 * mount system call, additional mount points that are visible by the rest of 219 * the system (both the main mount namespace and namespaces of individual 220 * snaps) or remove them, through the unmount system call. 221 **/ 222 static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config) 223 { 224 char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX"; 225 char src[PATH_MAX] = { 0 }; 226 char dst[PATH_MAX] = { 0 }; 227 if (mkdtemp(scratch_dir) == NULL) { 228 die("cannot create temporary directory for the root file system"); 229 } 230 // NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new 231 // mount namespace and have a private list of mounts. 232 debug("scratch directory for constructing namespace: %s", scratch_dir); 233 // Make the root filesystem recursively shared. This way propagation events 234 // will be shared with main mount namespace. 235 sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL); 236 // Bind mount the temporary scratch directory for root filesystem over 237 // itself so that it is a mount point. This is done so that it can become 238 // unbindable as explained below. 239 sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL); 240 // Make the scratch directory unbindable. 241 // 242 // This is necessary as otherwise a mount loop can occur and the kernel 243 // would crash. The term unbindable simply states that it cannot be bind 244 // mounted anywhere. When we construct recursive bind mounts below this 245 // guarantees that this directory will not be replicated anywhere. 246 sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL); 247 // Recursively bind mount desired root filesystem directory over the 248 // scratch directory. This puts the initial content into the scratch space 249 // and serves as a foundation for all subsequent operations below. 250 // 251 // The mount is recursive because it can either be applied to the root 252 // filesystem of a core system (aka all-snap) or the core snap on a classic 253 // system. In the former case we need recursive bind mounts to accurately 254 // replicate the state of the root filesystem into the scratch directory. 255 sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND, 256 NULL); 257 // Make the scratch directory recursively slave. Nothing done there will be 258 // shared with the initial mount namespace. This effectively detaches us, 259 // in one way, from the original namespace and coupled with pivot_root 260 // below serves as the foundation of the mount sandbox. 261 sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL); 262 // Bind mount certain directories from the host filesystem to the scratch 263 // directory. By default mount events will propagate in both into and out 264 // of the peer group. This way the running application can alter any global 265 // state visible on the host and in other snaps. This can be restricted by 266 // disabling the "is_bidirectional" flag as can be seen below. 267 for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL; 268 mnt++) { 269 270 if (mnt->is_bidirectional) { 271 sc_identity old = 272 sc_set_effective_identity(sc_root_group_identity()); 273 if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) { 274 die("cannot create %s", mnt->path); 275 } 276 (void)sc_set_effective_identity(old); 277 } 278 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 279 mnt->path); 280 if (mnt->is_optional) { 281 bool ok = sc_do_optional_mount(mnt->path, dst, NULL, 282 MS_REC | MS_BIND, NULL); 283 if (!ok) { 284 // If we cannot mount it, just continue. 285 continue; 286 } 287 } else { 288 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, 289 NULL); 290 } 291 if (!mnt->is_bidirectional) { 292 // Mount events will only propagate inwards to the namespace. This 293 // way the running application cannot alter any global state apart 294 // from that of its own snap. 295 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 296 } 297 if (mnt->altpath == NULL) { 298 continue; 299 } 300 // An alternate path of mnt->path is provided at another location. 301 // It should behave exactly the same as the original. 302 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 303 mnt->altpath); 304 struct stat stat_buf; 305 if (lstat(dst, &stat_buf) < 0) { 306 die("cannot lstat %s", dst); 307 } 308 if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) { 309 die("cannot bind mount alternate path over a symlink: %s", dst); 310 } 311 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL); 312 if (!mnt->is_bidirectional) { 313 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 314 } 315 } 316 if (config->normal_mode) { 317 // Since we mounted /etc from the host filesystem to the scratch directory, 318 // we may need to put certain directories from the desired root filesystem 319 // (e.g. the core snap) back. This way the behavior of running snaps is not 320 // affected by the alternatives directory from the host, if one exists. 321 // 322 // Fixes the following bugs: 323 // - https://bugs.launchpad.net/snap-confine/+bug/1580018 324 // - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568 325 const char *dirs_from_core[] = 326 { "/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf", 327 NULL 328 }; 329 for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) { 330 const char *dir = *dirs; 331 if (access(dir, F_OK) != 0) { 332 continue; 333 } 334 struct stat dst_stat; 335 struct stat src_stat; 336 sc_must_snprintf(src, sizeof src, "%s%s", 337 config->rootfs_dir, dir); 338 sc_must_snprintf(dst, sizeof dst, "%s%s", 339 scratch_dir, dir); 340 if (lstat(src, &src_stat) != 0) { 341 if (errno == ENOENT) { 342 continue; 343 } 344 die("cannot stat %s from desired rootfs", src); 345 } 346 if (!S_ISREG(src_stat.st_mode) 347 && !S_ISDIR(src_stat.st_mode)) { 348 debug 349 ("entry %s from the desired rootfs is not a file or directory, skipping mount", 350 src); 351 continue; 352 } 353 354 if (lstat(dst, &dst_stat) != 0) { 355 if (errno == ENOENT) { 356 continue; 357 } 358 die("cannot stat %s from host", src); 359 } 360 if (!S_ISREG(dst_stat.st_mode) 361 && !S_ISDIR(dst_stat.st_mode)) { 362 debug 363 ("entry %s from the host is not a file or directory, skipping mount", 364 src); 365 continue; 366 } 367 368 if ((dst_stat.st_mode & S_IFMT) != 369 (src_stat.st_mode & S_IFMT)) { 370 debug 371 ("entries %s and %s are of different types, skipping mount", 372 dst, src); 373 continue; 374 } 375 // both source and destination exist where both are either files 376 // or both are directories 377 sc_do_mount(src, dst, NULL, MS_BIND, NULL); 378 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 379 } 380 } 381 // The "core" base snap is special as it contains snapd and friends. 382 // Other base snaps do not, so whenever a base snap other than core is 383 // in use we need extra provisions for setting up internal tooling to 384 // be available. 385 // 386 // However on a core18 (and similar) system the core snap is not 387 // a special base anymore and we should map our own tooling in. 388 if (config->distro == SC_DISTRO_CORE_OTHER 389 || !sc_streq(config->base_snap_name, "core")) { 390 // when bases are used we need to bind-mount the libexecdir 391 // (that contains snap-exec) into /usr/lib/snapd of the 392 // base snap so that snap-exec is available for the snaps 393 // (base snaps do not ship snapd) 394 395 // dst is always /usr/lib/snapd as this is where snapd 396 // assumes to find snap-exec 397 sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd", 398 scratch_dir); 399 400 // bind mount the current $ROOT/usr/lib/snapd path, 401 // where $ROOT is either "/" or the "/snap/{core,snapd}/current" 402 // that we are re-execing from 403 char *src = NULL; 404 char self[PATH_MAX + 1] = { 0 }; 405 ssize_t nread; 406 nread = readlink("/proc/self/exe", self, sizeof self - 1); 407 if (nread < 0) { 408 die("cannot read /proc/self/exe"); 409 } 410 // Though we initialized self to NULs and passed one less to 411 // readlink, therefore guaranteeing that self is 412 // zero-terminated, perform an explicit assignment to make 413 // Coverity happy. 414 self[nread] = '\0'; 415 // this cannot happen except when the kernel is buggy 416 if (strstr(self, "/snap-confine") == NULL) { 417 die("cannot use result from readlink: %s", self); 418 } 419 src = dirname(self); 420 // dirname(path) might return '.' depending on path. 421 // /proc/self/exe should always point 422 // to an absolute path, but let's guarantee that. 423 if (src[0] != '/') { 424 die("cannot use the result of dirname(): %s", src); 425 } 426 427 sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL); 428 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 429 } 430 // Bind mount the directory where all snaps are mounted. The location of 431 // the this directory on the host filesystem may not match the location in 432 // the desired root filesystem. In the "core" and "ubuntu-core" snaps the 433 // directory is always /snap. On the host it is a build-time configuration 434 // option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not 435 // in normal mode), we don't need to do this because /snap is fixed and 436 // already contains the correct view of the mounted snaps. 437 if (config->normal_mode) { 438 sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir); 439 sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL); 440 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 441 } 442 // Create the hostfs directory if one is missing. This directory is a part 443 // of packaging now so perhaps this code can be removed later. 444 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 445 if (mkdir(SC_HOSTFS_DIR, 0755) < 0) { 446 if (errno != EEXIST) { 447 die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR); 448 } 449 } 450 (void)sc_set_effective_identity(old); 451 // Ensure that hostfs isgroup owned by root. We may have (now or earlier) 452 // created the directory as the user who first ran a snap on a given 453 // system and the group identity of that user is visilbe on disk. 454 // This was LP:#1665004 455 struct stat sb; 456 if (stat(SC_HOSTFS_DIR, &sb) < 0) { 457 die("cannot stat %s", SC_HOSTFS_DIR); 458 } 459 if (sb.st_uid != 0 || sb.st_gid != 0) { 460 if (chown(SC_HOSTFS_DIR, 0, 0) < 0) { 461 die("cannot change user/group owner of %s to root", 462 SC_HOSTFS_DIR); 463 } 464 } 465 // Make the upcoming "put_old" directory for pivot_root private so that 466 // mount events don't propagate to any peer group. In practice pivot root 467 // has a number of undocumented requirements and one of them is that the 468 // "put_old" directory (the second argument) cannot be shared in any way. 469 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR); 470 sc_do_mount(dst, dst, NULL, MS_BIND, NULL); 471 sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL); 472 // On classic mount the nvidia driver. Ideally this would be done in an 473 // uniform way after pivot_root but this is good enough and requires less 474 // code changes the nvidia code assumes it has access to the existing 475 // pre-pivot filesystem. 476 if (config->distro == SC_DISTRO_CLASSIC) { 477 sc_mount_nvidia_driver(scratch_dir); 478 } 479 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 480 // pivot_root 481 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 482 // Use pivot_root to "chroot" into the scratch directory. 483 // 484 // Q: Why are we using something as esoteric as pivot_root(2)? 485 // A: Because this makes apparmor handling easy. Using a normal chroot 486 // makes all apparmor rules conditional. We are either running on an 487 // all-snap system where this would-be chroot didn't happen and all the 488 // rules see / as the root file system _OR_ we are running on top of a 489 // classic distribution and this chroot has now moved all paths to 490 // /tmp/snap.rootfs_*. 491 // 492 // Because we are using unshare(2) with CLONE_NEWNS we can essentially use 493 // pivot_root just like chroot but this makes apparmor unaware of the old 494 // root so everything works okay. 495 // 496 // HINT: If you are debugging this and are trying to see why pivot_root 497 // happens to return EINVAL with any changes you may be making, please 498 // consider applying 499 // misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree 500 // kernel. 501 debug("performing operation: pivot_root %s %s", scratch_dir, dst); 502 if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) { 503 die("cannot perform operation: pivot_root %s %s", scratch_dir, 504 dst); 505 } 506 // Unmount the self-bind mount over the scratch directory created earlier 507 // in the original root filesystem (which is now mounted on SC_HOSTFS_DIR). 508 // This way we can remove the temporary directory we created and "clean up" 509 // after ourselves nicely. 510 sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir); 511 sc_do_umount(dst, UMOUNT_NOFOLLOW); 512 // Remove the scratch directory. Note that we are using the path that is 513 // based on the old root filesystem as after pivot_root we cannot guarantee 514 // what is present at the same location normally. (It is probably an empty 515 // /tmp directory that is populated in another place). 516 debug("performing operation: rmdir %s", dst); 517 if (rmdir(scratch_dir) < 0) { 518 die("cannot perform operation: rmdir %s", dst); 519 }; 520 // Make the old root filesystem recursively slave. This way operations 521 // performed in this mount namespace will not propagate to the peer group. 522 // This is another essential part of the confinement system. 523 sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL); 524 // Detach the redundant hostfs version of sysfs since it shows up in the 525 // mount table and software inspecting the mount table may become confused 526 // (eg, docker and LP:# 162601). 527 sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR); 528 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 529 // Detach the redundant hostfs version of /dev since it shows up in the 530 // mount table and software inspecting the mount table may become confused. 531 sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR); 532 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 533 // Detach the redundant hostfs version of /proc since it shows up in the 534 // mount table and software inspecting the mount table may become confused. 535 sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR); 536 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 537 // Detach both views of /writable: the one from hostfs and the one directly 538 // visible in /writable. Interfaces don't grant access to this directory 539 // and it has a large duplicated view of many mount points. Note that this 540 // is only applicable to ubuntu-core systems. 541 sc_detach_views_of_writable(config->distro, config->normal_mode); 542 } 543 544 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode) 545 { 546 // Note that prior to detaching either mount point we switch the 547 // propagation to private to both limit the change to just this view and to 548 // prevent otherwise occurring event propagation from self-conflicting and 549 // returning EBUSY. A similar approach is used by snap-update-ns and is 550 // documented in umount(2). 551 const char *writable_dir = "/writable"; 552 const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable"; 553 554 // Writable only exists on ubuntu-core. 555 if (distro == SC_DISTRO_CLASSIC) { 556 return; 557 } 558 // On all core distributions we see /var/lib/snapd/hostfs/writable that 559 // exposes writable, with a structure specific to ubuntu-core. 560 debug("detaching %s", hostfs_writable_dir); 561 sc_do_mount("none", hostfs_writable_dir, NULL, 562 MS_REC | MS_PRIVATE, NULL); 563 sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 564 565 // On ubuntu-core 16, when the executed snap uses core as base we also see 566 // the /writable that we directly inherited from the initial mount 567 // namespace. 568 if (distro == SC_DISTRO_CORE16 && !normal_mode) { 569 debug("detaching %s", writable_dir); 570 sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE, 571 NULL); 572 sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 573 } 574 } 575 576 /** 577 * @path: a pathname where / replaced with '\0'. 578 * @offsetp: pointer to int showing which path segment was last seen. 579 * Updated on return to reflect the next segment. 580 * @fulllen: full original path length. 581 * Returns a pointer to the next path segment, or NULL if done. 582 */ 583 static char * __attribute__((used)) 584 get_nextpath(char *path, size_t *offsetp, size_t fulllen) 585 { 586 size_t offset = *offsetp; 587 588 if (offset >= fulllen) 589 return NULL; 590 591 while (offset < fulllen && path[offset] != '\0') 592 offset++; 593 while (offset < fulllen && path[offset] == '\0') 594 offset++; 595 596 *offsetp = offset; 597 return (offset < fulllen) ? &path[offset] : NULL; 598 } 599 600 /** 601 * Check that @subdir is a subdir of @dir. 602 **/ 603 static bool __attribute__((used)) 604 is_subdir(const char *subdir, const char *dir) 605 { 606 size_t dirlen = strlen(dir); 607 size_t subdirlen = strlen(subdir); 608 609 // @dir has to be at least as long as @subdir 610 if (subdirlen < dirlen) 611 return false; 612 // @dir has to be a prefix of @subdir 613 if (strncmp(subdir, dir, dirlen) != 0) 614 return false; 615 // @dir can look like "path/" (that is, end with the directory separator). 616 // When that is the case then given the test above we can be sure @subdir 617 // is a real subdirectory. 618 if (dirlen > 0 && dir[dirlen - 1] == '/') 619 return true; 620 // @subdir can look like "path/stuff" and when the directory separator 621 // is exactly at the spot where @dir ends (that is, it was not caught 622 // by the test above) then @subdir is a real subdirectory. 623 if (subdir[dirlen] == '/' && dirlen > 0) 624 return true; 625 // If both @dir and @subdir have identical length then given that the 626 // prefix check above @subdir is a real subdirectory. 627 if (subdirlen == dirlen) 628 return true; 629 return false; 630 } 631 632 void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd, 633 const sc_invocation * inv, const gid_t real_gid, 634 const gid_t saved_gid) 635 { 636 // Classify the current distribution, as claimed by /etc/os-release. 637 sc_distro distro = sc_classify_distro(); 638 639 // Check which mode we should run in, normal or legacy. 640 if (inv->is_normal_mode) { 641 // In normal mode we use the base snap as / and set up several bind mounts. 642 const struct sc_mount mounts[] = { 643 {"/dev"}, // because it contains devices on host OS 644 {"/etc"}, // because that's where /etc/resolv.conf lives, perhaps a bad idea 645 {"/home"}, // to support /home/*/snap and home interface 646 {"/root"}, // because that is $HOME for services 647 {"/proc"}, // fundamental filesystem 648 {"/sys"}, // fundamental filesystem 649 {"/tmp"}, // to get writable tmp 650 {"/var/snap"}, // to get access to global snap data 651 {"/var/lib/snapd"}, // to get access to snapd state and seccomp profiles 652 {"/var/tmp"}, // to get access to the other temporary directory 653 {"/run"}, // to get /run with sockets and what not 654 {"/lib/modules",.is_optional = true}, // access to the modules of the running kernel 655 {"/lib/firmware",.is_optional = true}, // access to the firmware of the running kernel 656 {"/usr/src"}, // FIXME: move to SecurityMounts in system-trace interface 657 {"/var/log"}, // FIXME: move to SecurityMounts in log-observe interface 658 #ifdef MERGED_USR 659 {"/run/media", true, "/media"}, // access to the users removable devices 660 #else 661 {"/media", true}, // access to the users removable devices 662 #endif // MERGED_USR 663 {"/run/netns", true}, // access to the 'ip netns' network namespaces 664 // The /mnt directory is optional in base snaps to ensure backwards 665 // compatibility with the first version of base snaps that was 666 // released. 667 {"/mnt",.is_optional = true}, // to support the removable-media interface 668 {"/var/lib/extrausers",.is_optional = true}, // access to UID/GID of extrausers (if available) 669 {}, 670 }; 671 struct sc_mount_config normal_config = { 672 .rootfs_dir = inv->rootfs_dir, 673 .mounts = mounts, 674 .distro = distro, 675 .normal_mode = true, 676 .base_snap_name = inv->base_snap_name, 677 }; 678 sc_bootstrap_mount_namespace(&normal_config); 679 } else { 680 // In legacy mode we don't pivot and instead just arrange bi- 681 // directional mount propagation for two directories. 682 const struct sc_mount mounts[] = { 683 {"/media", true}, 684 {"/run/netns", true}, 685 {}, 686 }; 687 struct sc_mount_config legacy_config = { 688 .rootfs_dir = "/", 689 .mounts = mounts, 690 .distro = distro, 691 .normal_mode = false, 692 .base_snap_name = inv->base_snap_name, 693 }; 694 sc_bootstrap_mount_namespace(&legacy_config); 695 } 696 697 // TODO: rename this and fold it into bootstrap 698 setup_private_mount(inv->snap_instance); 699 // set up private /dev/pts 700 // TODO: fold this into bootstrap 701 setup_private_pts(); 702 703 // setup the security backend bind mounts 704 sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor); 705 } 706 707 static bool is_mounted_with_shared_option(const char *dir) 708 __attribute__((nonnull(1))); 709 710 static bool is_mounted_with_shared_option(const char *dir) 711 { 712 sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 713 sm = sc_parse_mountinfo(NULL); 714 if (sm == NULL) { 715 die("cannot parse /proc/self/mountinfo"); 716 } 717 sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm); 718 while (entry != NULL) { 719 const char *mount_dir = entry->mount_dir; 720 if (sc_streq(mount_dir, dir)) { 721 const char *optional_fields = entry->optional_fields; 722 if (strstr(optional_fields, "shared:") != NULL) { 723 return true; 724 } 725 } 726 entry = sc_next_mountinfo_entry(entry); 727 } 728 return false; 729 } 730 731 void sc_ensure_shared_snap_mount(void) 732 { 733 if (!is_mounted_with_shared_option("/") 734 && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) { 735 // TODO: We could be more aggressive and refuse to function but since 736 // we have no data on actual environments that happen to limp along in 737 // this configuration let's not do that yet. This code should be 738 // removed once we have a measurement and feedback mechanism that lets 739 // us decide based on measurable data. 740 sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none", 741 MS_BIND | MS_REC, 0); 742 sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC, 743 NULL); 744 } 745 } 746 747 void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd, 748 const char *snap_name) 749 { 750 debug("%s: %s", __FUNCTION__, snap_name); 751 752 char profile_path[PATH_MAX]; 753 struct stat st; 754 755 sc_must_snprintf(profile_path, sizeof(profile_path), 756 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name); 757 if (stat(profile_path, &st) != 0) { 758 // It is ok for the user fstab to not exist. 759 return; 760 } 761 762 // In our new mount namespace, recursively change all mounts 763 // to slave mode, so we see changes from the parent namespace 764 // but don't propagate our own changes. 765 sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL); 766 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 767 sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor); 768 (void)sc_set_effective_identity(old); 769 } 770 771 void sc_ensure_snap_dir_shared_mounts(void) 772 { 773 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 774 for (int i = 0; dirs[i] != NULL; i++) { 775 const char *dir = dirs[i]; 776 if (!is_mounted_with_shared_option(dir)) { 777 /* Since this directory isn't yet shared (but it should be), 778 * recursively bind mount it, then recursively share it so that 779 * changes to the host are seen in the snap and vice-versa. This 780 * allows us to fine-tune propagation events elsewhere for this new 781 * mountpoint. 782 * 783 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR, 784 * since snaps are already mounted, and it's not needed for 785 * /var/snap. 786 */ 787 sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, 0); 788 sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED, 789 NULL); 790 } 791 } 792 } 793 794 void sc_setup_parallel_instance_classic_mounts(const char *snap_name, 795 const char *snap_instance_name) 796 { 797 char src[PATH_MAX] = { 0 }; 798 char dst[PATH_MAX] = { 0 }; 799 800 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 801 for (int i = 0; dirs[i] != NULL; i++) { 802 const char *dir = dirs[i]; 803 sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL); 804 } 805 806 /* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */ 807 sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR, 808 snap_instance_name); 809 sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name); 810 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0); 811 812 /* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */ 813 sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name); 814 sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name); 815 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0); 816 }