github.com/chipaca/snappy@v0.0.0-20210104084008-1f06296fe8ad/cmd/snap-confine/mount-support.c (about) 1 /* 2 * Copyright (C) 2015 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 #ifdef HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "mount-support.h" 22 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libgen.h> 26 #include <limits.h> 27 #include <mntent.h> 28 #include <sched.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/mount.h> 33 #include <sys/stat.h> 34 #include <sys/syscall.h> 35 #include <sys/types.h> 36 #include <sys/wait.h> 37 #include <unistd.h> 38 39 #include "../libsnap-confine-private/apparmor-support.h" 40 #include "../libsnap-confine-private/classic.h" 41 #include "../libsnap-confine-private/cleanup-funcs.h" 42 #include "../libsnap-confine-private/mount-opt.h" 43 #include "../libsnap-confine-private/mountinfo.h" 44 #include "../libsnap-confine-private/snap.h" 45 #include "../libsnap-confine-private/string-utils.h" 46 #include "../libsnap-confine-private/tool.h" 47 #include "../libsnap-confine-private/utils.h" 48 #include "mount-support-nvidia.h" 49 50 #define MAX_BUF 1000 51 52 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode); 53 54 // TODO: simplify this, after all it is just a tmpfs 55 // TODO: fold this into bootstrap 56 static void setup_private_mount(const char *snap_name) 57 { 58 // Create a 0700 base directory. This is the "base" directory that is 59 // protected from other users. This directory name is NOT randomly 60 // generated. This has several properties: 61 // 62 // Users can relate to the name and can find the temporary directory as 63 // visible from within the snap. If this directory was random it would be 64 // harder to find because there may be situations in which multiple 65 // directories related to the same snap name would exist. 66 // 67 // Snapd can partially manage the directory. Specifically on snap remove 68 // snapd could remove the directory and everything in it, potentially 69 // avoiding runaway disk use on a machine that either never reboots or uses 70 // persistent /tmp directory. 71 // 72 // Underneath the base directory there is a "tmp" sub-directory that has 73 // mode 1777 and behaves as a typical /tmp directory would. That directory 74 // is used as a bind-mounted /tmp directory. 75 // 76 // Because the directories are reused across invocations by distinct users 77 // and because the directories are trivially guessable, each invocation 78 // unconditionally chowns/chmods them to appropriate values. 79 char base_dir[MAX_BUF] = { 0 }; 80 char tmp_dir[MAX_BUF] = { 0 }; 81 int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 82 int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 83 sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name); 84 sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir); 85 86 /* Switch to root group so that mkdir and open calls below create filesystem 87 * elements that are not owned by the user calling into snap-confine. */ 88 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 89 // Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want 90 // to reuse and we will open with O_NOFOLLOW, below. 91 if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) { 92 die("cannot create base directory %s", base_dir); 93 } 94 base_dir_fd = open(base_dir, 95 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 96 if (base_dir_fd < 0) { 97 die("cannot open base directory %s", base_dir); 98 } 99 /* This seems redundant on first read but it has the non-obvious 100 * property of changing existing directories that have already existed 101 * but had incorrect ownership or permission. This is possible due to 102 * earlier bugs in snap-confine and due to the fact that some systems 103 * use persistent /tmp directory and may not clean up leftover files 104 * for arbitrarily long. This comment applies the following two pairs 105 * of fchmod and fchown. */ 106 if (fchmod(base_dir_fd, 0700) < 0) { 107 die("cannot chmod base directory %s to 0700", base_dir); 108 } 109 if (fchown(base_dir_fd, 0, 0) < 0) { 110 die("cannot chown base directory %s to root.root", base_dir); 111 } 112 // Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we 113 // want to reuse and we will open with O_NOFOLLOW, below. 114 if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) { 115 die("cannot create private tmp directory %s/tmp", base_dir); 116 } 117 (void)sc_set_effective_identity(old); 118 tmp_dir_fd = openat(base_dir_fd, "tmp", 119 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 120 if (tmp_dir_fd < 0) { 121 die("cannot open private tmp directory %s/tmp", base_dir); 122 } 123 if (fchmod(tmp_dir_fd, 01777) < 0) { 124 die("cannot chmod private tmp directory %s/tmp to 01777", 125 base_dir); 126 } 127 if (fchown(tmp_dir_fd, 0, 0) < 0) { 128 die("cannot chown private tmp directory %s/tmp to root.root", 129 base_dir); 130 } 131 sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL); 132 sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL); 133 } 134 135 // TODO: fold this into bootstrap 136 static void setup_private_pts(void) 137 { 138 // See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt 139 // 140 // Ubuntu by default uses devpts 'single-instance' mode where 141 // /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change 142 // the startup scripts though, so we follow the instructions in point 143 // '4' of 'User-space changes' in the above doc. In other words, after 144 // unshare(CLONE_NEWNS), we mount devpts with -o 145 // newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto 146 // /dev/ptmx 147 148 struct stat st; 149 150 // Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode 151 // which doesn't provide the isolation we require. 152 if (stat("/dev/pts/ptmx", &st) != 0) { 153 die("cannot stat /dev/pts/ptmx"); 154 } 155 // Make sure /dev/ptmx exists so we can bind mount over it 156 if (stat("/dev/ptmx", &st) != 0) { 157 die("cannot stat /dev/ptmx"); 158 } 159 // Since multi-instance, use ptmxmode=0666. The other options are 160 // copied from /etc/default/devpts 161 sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, 162 "newinstance,ptmxmode=0666,mode=0620,gid=5"); 163 sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0); 164 } 165 166 struct sc_mount { 167 const char *path; 168 bool is_bidirectional; 169 // Alternate path defines the rbind mount "alternative" of path. 170 // It exists so that we can make /media on systems that use /run/media. 171 const char *altpath; 172 // Optional mount points are not processed unless the source and 173 // destination both exist. 174 bool is_optional; 175 }; 176 177 struct sc_mount_config { 178 const char *rootfs_dir; 179 // The struct is terminated with an entry with NULL path. 180 const struct sc_mount *mounts; 181 sc_distro distro; 182 bool normal_mode; 183 const char *base_snap_name; 184 }; 185 186 /** 187 * Bootstrap mount namespace. 188 * 189 * This is a chunk of tricky code that lets us have full control over the 190 * layout and direction of propagation of mount events. The documentation below 191 * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source 192 * tree. 193 * 194 * As a reminder two definitions are quoted below: 195 * 196 * A 'propagation event' is defined as event generated on a vfsmount 197 * that leads to mount or unmount actions in other vfsmounts. 198 * 199 * A 'peer group' is defined as a group of vfsmounts that propagate 200 * events to each other. 201 * 202 * (end of quote). 203 * 204 * The main idea is to setup a mount namespace that has a root filesystem with 205 * vfsmounts and peer groups that, depending on the location, either isolate 206 * or share with the rest of the system. 207 * 208 * The vast majority of the filesystem is shared in one direction. Events from 209 * the outside (from the main mount namespace) propagate inside (to namespaces 210 * of particular snaps) so things like new snap revisions, mounted drives, etc, 211 * just show up as expected but even if a snap is exploited or malicious in 212 * nature it cannot affect anything in another namespace where it might cause 213 * security or stability issues. 214 * 215 * Selected directories (today just /media) can be shared in both directions. 216 * This allows snaps with sufficient privileges to either create, through the 217 * mount system call, additional mount points that are visible by the rest of 218 * the system (both the main mount namespace and namespaces of individual 219 * snaps) or remove them, through the unmount system call. 220 **/ 221 static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config) 222 { 223 char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX"; 224 char src[PATH_MAX] = { 0 }; 225 char dst[PATH_MAX] = { 0 }; 226 if (mkdtemp(scratch_dir) == NULL) { 227 die("cannot create temporary directory for the root file system"); 228 } 229 // NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new 230 // mount namespace and have a private list of mounts. 231 debug("scratch directory for constructing namespace: %s", scratch_dir); 232 // Make the root filesystem recursively shared. This way propagation events 233 // will be shared with main mount namespace. 234 sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL); 235 // Bind mount the temporary scratch directory for root filesystem over 236 // itself so that it is a mount point. This is done so that it can become 237 // unbindable as explained below. 238 sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL); 239 // Make the scratch directory unbindable. 240 // 241 // This is necessary as otherwise a mount loop can occur and the kernel 242 // would crash. The term unbindable simply states that it cannot be bind 243 // mounted anywhere. When we construct recursive bind mounts below this 244 // guarantees that this directory will not be replicated anywhere. 245 sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL); 246 // Recursively bind mount desired root filesystem directory over the 247 // scratch directory. This puts the initial content into the scratch space 248 // and serves as a foundation for all subsequent operations below. 249 // 250 // The mount is recursive because it can either be applied to the root 251 // filesystem of a core system (aka all-snap) or the core snap on a classic 252 // system. In the former case we need recursive bind mounts to accurately 253 // replicate the state of the root filesystem into the scratch directory. 254 sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND, 255 NULL); 256 // Make the scratch directory recursively slave. Nothing done there will be 257 // shared with the initial mount namespace. This effectively detaches us, 258 // in one way, from the original namespace and coupled with pivot_root 259 // below serves as the foundation of the mount sandbox. 260 sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL); 261 // Bind mount certain directories from the host filesystem to the scratch 262 // directory. By default mount events will propagate in both into and out 263 // of the peer group. This way the running application can alter any global 264 // state visible on the host and in other snaps. This can be restricted by 265 // disabling the "is_bidirectional" flag as can be seen below. 266 for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL; 267 mnt++) { 268 269 if (mnt->is_bidirectional) { 270 sc_identity old = 271 sc_set_effective_identity(sc_root_group_identity()); 272 if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) { 273 die("cannot create %s", mnt->path); 274 } 275 (void)sc_set_effective_identity(old); 276 } 277 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 278 mnt->path); 279 if (mnt->is_optional) { 280 bool ok = sc_do_optional_mount(mnt->path, dst, NULL, 281 MS_REC | MS_BIND, NULL); 282 if (!ok) { 283 // If we cannot mount it, just continue. 284 continue; 285 } 286 } else { 287 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, 288 NULL); 289 } 290 if (!mnt->is_bidirectional) { 291 // Mount events will only propagate inwards to the namespace. This 292 // way the running application cannot alter any global state apart 293 // from that of its own snap. 294 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 295 } 296 if (mnt->altpath == NULL) { 297 continue; 298 } 299 // An alternate path of mnt->path is provided at another location. 300 // It should behave exactly the same as the original. 301 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 302 mnt->altpath); 303 struct stat stat_buf; 304 if (lstat(dst, &stat_buf) < 0) { 305 die("cannot lstat %s", dst); 306 } 307 if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) { 308 die("cannot bind mount alternate path over a symlink: %s", dst); 309 } 310 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL); 311 if (!mnt->is_bidirectional) { 312 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 313 } 314 } 315 if (config->normal_mode) { 316 // Since we mounted /etc from the host filesystem to the scratch directory, 317 // we may need to put certain directories from the desired root filesystem 318 // (e.g. the core snap) back. This way the behavior of running snaps is not 319 // affected by the alternatives directory from the host, if one exists. 320 // 321 // Fixes the following bugs: 322 // - https://bugs.launchpad.net/snap-confine/+bug/1580018 323 // - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568 324 const char *dirs_from_core[] = { 325 "/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf", 326 // Some specifc and privileged interfaces (e.g docker-support) give 327 // access to apparmor_parser from the base snap which at a minimum 328 // needs to use matching configuration from the base snap instead 329 // of from the users host system. 330 "/etc/apparmor", "/etc/apparmor.d", 331 NULL 332 }; 333 for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) { 334 const char *dir = *dirs; 335 if (access(dir, F_OK) != 0) { 336 continue; 337 } 338 struct stat dst_stat; 339 struct stat src_stat; 340 sc_must_snprintf(src, sizeof src, "%s%s", 341 config->rootfs_dir, dir); 342 sc_must_snprintf(dst, sizeof dst, "%s%s", 343 scratch_dir, dir); 344 if (lstat(src, &src_stat) != 0) { 345 if (errno == ENOENT) { 346 continue; 347 } 348 die("cannot stat %s from desired rootfs", src); 349 } 350 if (!S_ISREG(src_stat.st_mode) 351 && !S_ISDIR(src_stat.st_mode)) { 352 debug 353 ("entry %s from the desired rootfs is not a file or directory, skipping mount", 354 src); 355 continue; 356 } 357 358 if (lstat(dst, &dst_stat) != 0) { 359 if (errno == ENOENT) { 360 continue; 361 } 362 die("cannot stat %s from host", src); 363 } 364 if (!S_ISREG(dst_stat.st_mode) 365 && !S_ISDIR(dst_stat.st_mode)) { 366 debug 367 ("entry %s from the host is not a file or directory, skipping mount", 368 src); 369 continue; 370 } 371 372 if ((dst_stat.st_mode & S_IFMT) != 373 (src_stat.st_mode & S_IFMT)) { 374 debug 375 ("entries %s and %s are of different types, skipping mount", 376 dst, src); 377 continue; 378 } 379 // both source and destination exist where both are either files 380 // or both are directories 381 sc_do_mount(src, dst, NULL, MS_BIND, NULL); 382 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 383 } 384 } 385 // The "core" base snap is special as it contains snapd and friends. 386 // Other base snaps do not, so whenever a base snap other than core is 387 // in use we need extra provisions for setting up internal tooling to 388 // be available. 389 // 390 // However on a core18 (and similar) system the core snap is not 391 // a special base anymore and we should map our own tooling in. 392 if (config->distro == SC_DISTRO_CORE_OTHER 393 || !sc_streq(config->base_snap_name, "core")) { 394 // when bases are used we need to bind-mount the libexecdir 395 // (that contains snap-exec) into /usr/lib/snapd of the 396 // base snap so that snap-exec is available for the snaps 397 // (base snaps do not ship snapd) 398 399 // dst is always /usr/lib/snapd as this is where snapd 400 // assumes to find snap-exec 401 sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd", 402 scratch_dir); 403 404 // bind mount the current $ROOT/usr/lib/snapd path, 405 // where $ROOT is either "/" or the "/snap/{core,snapd}/current" 406 // that we are re-execing from 407 char *src = NULL; 408 char self[PATH_MAX + 1] = { 0 }; 409 ssize_t nread; 410 nread = readlink("/proc/self/exe", self, sizeof self - 1); 411 if (nread < 0) { 412 die("cannot read /proc/self/exe"); 413 } 414 // Though we initialized self to NULs and passed one less to 415 // readlink, therefore guaranteeing that self is 416 // zero-terminated, perform an explicit assignment to make 417 // Coverity happy. 418 self[nread] = '\0'; 419 // this cannot happen except when the kernel is buggy 420 if (strstr(self, "/snap-confine") == NULL) { 421 die("cannot use result from readlink: %s", self); 422 } 423 src = dirname(self); 424 // dirname(path) might return '.' depending on path. 425 // /proc/self/exe should always point 426 // to an absolute path, but let's guarantee that. 427 if (src[0] != '/') { 428 die("cannot use the result of dirname(): %s", src); 429 } 430 431 sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL); 432 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 433 } 434 // Bind mount the directory where all snaps are mounted. The location of 435 // the this directory on the host filesystem may not match the location in 436 // the desired root filesystem. In the "core" and "ubuntu-core" snaps the 437 // directory is always /snap. On the host it is a build-time configuration 438 // option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not 439 // in normal mode), we don't need to do this because /snap is fixed and 440 // already contains the correct view of the mounted snaps. 441 if (config->normal_mode) { 442 sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir); 443 sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL); 444 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 445 } 446 // Create the hostfs directory if one is missing. This directory is a part 447 // of packaging now so perhaps this code can be removed later. 448 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 449 if (mkdir(SC_HOSTFS_DIR, 0755) < 0) { 450 if (errno != EEXIST) { 451 die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR); 452 } 453 } 454 (void)sc_set_effective_identity(old); 455 // Ensure that hostfs isgroup owned by root. We may have (now or earlier) 456 // created the directory as the user who first ran a snap on a given 457 // system and the group identity of that user is visilbe on disk. 458 // This was LP:#1665004 459 struct stat sb; 460 if (stat(SC_HOSTFS_DIR, &sb) < 0) { 461 die("cannot stat %s", SC_HOSTFS_DIR); 462 } 463 if (sb.st_uid != 0 || sb.st_gid != 0) { 464 if (chown(SC_HOSTFS_DIR, 0, 0) < 0) { 465 die("cannot change user/group owner of %s to root", 466 SC_HOSTFS_DIR); 467 } 468 } 469 // Make the upcoming "put_old" directory for pivot_root private so that 470 // mount events don't propagate to any peer group. In practice pivot root 471 // has a number of undocumented requirements and one of them is that the 472 // "put_old" directory (the second argument) cannot be shared in any way. 473 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR); 474 sc_do_mount(dst, dst, NULL, MS_BIND, NULL); 475 sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL); 476 // On classic mount the nvidia driver. Ideally this would be done in an 477 // uniform way after pivot_root but this is good enough and requires less 478 // code changes the nvidia code assumes it has access to the existing 479 // pre-pivot filesystem. 480 if (config->distro == SC_DISTRO_CLASSIC) { 481 sc_mount_nvidia_driver(scratch_dir); 482 } 483 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 484 // pivot_root 485 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 486 // Use pivot_root to "chroot" into the scratch directory. 487 // 488 // Q: Why are we using something as esoteric as pivot_root(2)? 489 // A: Because this makes apparmor handling easy. Using a normal chroot 490 // makes all apparmor rules conditional. We are either running on an 491 // all-snap system where this would-be chroot didn't happen and all the 492 // rules see / as the root file system _OR_ we are running on top of a 493 // classic distribution and this chroot has now moved all paths to 494 // /tmp/snap.rootfs_*. 495 // 496 // Because we are using unshare(2) with CLONE_NEWNS we can essentially use 497 // pivot_root just like chroot but this makes apparmor unaware of the old 498 // root so everything works okay. 499 // 500 // HINT: If you are debugging this and are trying to see why pivot_root 501 // happens to return EINVAL with any changes you may be making, please 502 // consider applying 503 // misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree 504 // kernel. 505 debug("performing operation: pivot_root %s %s", scratch_dir, dst); 506 if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) { 507 die("cannot perform operation: pivot_root %s %s", scratch_dir, 508 dst); 509 } 510 // Unmount the self-bind mount over the scratch directory created earlier 511 // in the original root filesystem (which is now mounted on SC_HOSTFS_DIR). 512 // This way we can remove the temporary directory we created and "clean up" 513 // after ourselves nicely. 514 sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir); 515 sc_do_umount(dst, UMOUNT_NOFOLLOW); 516 // Remove the scratch directory. Note that we are using the path that is 517 // based on the old root filesystem as after pivot_root we cannot guarantee 518 // what is present at the same location normally. (It is probably an empty 519 // /tmp directory that is populated in another place). 520 debug("performing operation: rmdir %s", dst); 521 if (rmdir(scratch_dir) < 0) { 522 die("cannot perform operation: rmdir %s", dst); 523 }; 524 // Make the old root filesystem recursively slave. This way operations 525 // performed in this mount namespace will not propagate to the peer group. 526 // This is another essential part of the confinement system. 527 sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL); 528 // Detach the redundant hostfs version of sysfs since it shows up in the 529 // mount table and software inspecting the mount table may become confused 530 // (eg, docker and LP:# 162601). 531 sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR); 532 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 533 // Detach the redundant hostfs version of /dev since it shows up in the 534 // mount table and software inspecting the mount table may become confused. 535 sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR); 536 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 537 // Detach the redundant hostfs version of /proc since it shows up in the 538 // mount table and software inspecting the mount table may become confused. 539 sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR); 540 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 541 // Detach both views of /writable: the one from hostfs and the one directly 542 // visible in /writable. Interfaces don't grant access to this directory 543 // and it has a large duplicated view of many mount points. Note that this 544 // is only applicable to ubuntu-core systems. 545 sc_detach_views_of_writable(config->distro, config->normal_mode); 546 } 547 548 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode) 549 { 550 // Note that prior to detaching either mount point we switch the 551 // propagation to private to both limit the change to just this view and to 552 // prevent otherwise occurring event propagation from self-conflicting and 553 // returning EBUSY. A similar approach is used by snap-update-ns and is 554 // documented in umount(2). 555 const char *writable_dir = "/writable"; 556 const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable"; 557 558 // Writable only exists on ubuntu-core. 559 if (distro == SC_DISTRO_CLASSIC) { 560 return; 561 } 562 // On all core distributions we see /var/lib/snapd/hostfs/writable that 563 // exposes writable, with a structure specific to ubuntu-core. 564 debug("detaching %s", hostfs_writable_dir); 565 sc_do_mount("none", hostfs_writable_dir, NULL, 566 MS_REC | MS_PRIVATE, NULL); 567 sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 568 569 // On ubuntu-core 16, when the executed snap uses core as base we also see 570 // the /writable that we directly inherited from the initial mount 571 // namespace. 572 if (distro == SC_DISTRO_CORE16 && !normal_mode) { 573 debug("detaching %s", writable_dir); 574 sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE, 575 NULL); 576 sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 577 } 578 } 579 580 /** 581 * @path: a pathname where / replaced with '\0'. 582 * @offsetp: pointer to int showing which path segment was last seen. 583 * Updated on return to reflect the next segment. 584 * @fulllen: full original path length. 585 * Returns a pointer to the next path segment, or NULL if done. 586 */ 587 static char * __attribute__((used)) 588 get_nextpath(char *path, size_t *offsetp, size_t fulllen) 589 { 590 size_t offset = *offsetp; 591 592 if (offset >= fulllen) 593 return NULL; 594 595 while (offset < fulllen && path[offset] != '\0') 596 offset++; 597 while (offset < fulllen && path[offset] == '\0') 598 offset++; 599 600 *offsetp = offset; 601 return (offset < fulllen) ? &path[offset] : NULL; 602 } 603 604 /** 605 * Check that @subdir is a subdir of @dir. 606 **/ 607 static bool __attribute__((used)) 608 is_subdir(const char *subdir, const char *dir) 609 { 610 size_t dirlen = strlen(dir); 611 size_t subdirlen = strlen(subdir); 612 613 // @dir has to be at least as long as @subdir 614 if (subdirlen < dirlen) 615 return false; 616 // @dir has to be a prefix of @subdir 617 if (strncmp(subdir, dir, dirlen) != 0) 618 return false; 619 // @dir can look like "path/" (that is, end with the directory separator). 620 // When that is the case then given the test above we can be sure @subdir 621 // is a real subdirectory. 622 if (dirlen > 0 && dir[dirlen - 1] == '/') 623 return true; 624 // @subdir can look like "path/stuff" and when the directory separator 625 // is exactly at the spot where @dir ends (that is, it was not caught 626 // by the test above) then @subdir is a real subdirectory. 627 if (subdir[dirlen] == '/' && dirlen > 0) 628 return true; 629 // If both @dir and @subdir have identical length then given that the 630 // prefix check above @subdir is a real subdirectory. 631 if (subdirlen == dirlen) 632 return true; 633 return false; 634 } 635 636 void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd, 637 const sc_invocation * inv, const gid_t real_gid, 638 const gid_t saved_gid) 639 { 640 // Classify the current distribution, as claimed by /etc/os-release. 641 sc_distro distro = sc_classify_distro(); 642 643 // Check which mode we should run in, normal or legacy. 644 if (inv->is_normal_mode) { 645 // In normal mode we use the base snap as / and set up several bind mounts. 646 const struct sc_mount mounts[] = { 647 {"/dev"}, // because it contains devices on host OS 648 {"/etc"}, // because that's where /etc/resolv.conf lives, perhaps a bad idea 649 {"/home"}, // to support /home/*/snap and home interface 650 {"/root"}, // because that is $HOME for services 651 {"/proc"}, // fundamental filesystem 652 {"/sys"}, // fundamental filesystem 653 {"/tmp"}, // to get writable tmp 654 {"/var/snap"}, // to get access to global snap data 655 {"/var/lib/snapd"}, // to get access to snapd state and seccomp profiles 656 {"/var/tmp"}, // to get access to the other temporary directory 657 {"/run"}, // to get /run with sockets and what not 658 {"/lib/modules",.is_optional = true}, // access to the modules of the running kernel 659 {"/lib/firmware",.is_optional = true}, // access to the firmware of the running kernel 660 {"/usr/src"}, // FIXME: move to SecurityMounts in system-trace interface 661 {"/var/log"}, // FIXME: move to SecurityMounts in log-observe interface 662 #ifdef MERGED_USR 663 {"/run/media", true, "/media"}, // access to the users removable devices 664 #else 665 {"/media", true}, // access to the users removable devices 666 #endif // MERGED_USR 667 {"/run/netns", true}, // access to the 'ip netns' network namespaces 668 // The /mnt directory is optional in base snaps to ensure backwards 669 // compatibility with the first version of base snaps that was 670 // released. 671 {"/mnt",.is_optional = true}, // to support the removable-media interface 672 {"/var/lib/extrausers",.is_optional = true}, // access to UID/GID of extrausers (if available) 673 {}, 674 }; 675 struct sc_mount_config normal_config = { 676 .rootfs_dir = inv->rootfs_dir, 677 .mounts = mounts, 678 .distro = distro, 679 .normal_mode = true, 680 .base_snap_name = inv->base_snap_name, 681 }; 682 sc_bootstrap_mount_namespace(&normal_config); 683 } else { 684 // In legacy mode we don't pivot and instead just arrange bi- 685 // directional mount propagation for two directories. 686 const struct sc_mount mounts[] = { 687 {"/media", true}, 688 {"/run/netns", true}, 689 {}, 690 }; 691 struct sc_mount_config legacy_config = { 692 .rootfs_dir = "/", 693 .mounts = mounts, 694 .distro = distro, 695 .normal_mode = false, 696 .base_snap_name = inv->base_snap_name, 697 }; 698 sc_bootstrap_mount_namespace(&legacy_config); 699 } 700 701 // TODO: rename this and fold it into bootstrap 702 setup_private_mount(inv->snap_instance); 703 // set up private /dev/pts 704 // TODO: fold this into bootstrap 705 setup_private_pts(); 706 707 // setup the security backend bind mounts 708 sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor); 709 } 710 711 static bool is_mounted_with_shared_option(const char *dir) 712 __attribute__((nonnull(1))); 713 714 static bool is_mounted_with_shared_option(const char *dir) 715 { 716 sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 717 sm = sc_parse_mountinfo(NULL); 718 if (sm == NULL) { 719 die("cannot parse /proc/self/mountinfo"); 720 } 721 sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm); 722 while (entry != NULL) { 723 const char *mount_dir = entry->mount_dir; 724 if (sc_streq(mount_dir, dir)) { 725 const char *optional_fields = entry->optional_fields; 726 if (strstr(optional_fields, "shared:") != NULL) { 727 return true; 728 } 729 } 730 entry = sc_next_mountinfo_entry(entry); 731 } 732 return false; 733 } 734 735 void sc_ensure_shared_snap_mount(void) 736 { 737 if (!is_mounted_with_shared_option("/") 738 && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) { 739 // TODO: We could be more aggressive and refuse to function but since 740 // we have no data on actual environments that happen to limp along in 741 // this configuration let's not do that yet. This code should be 742 // removed once we have a measurement and feedback mechanism that lets 743 // us decide based on measurable data. 744 sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none", 745 MS_BIND | MS_REC, 0); 746 sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC, 747 NULL); 748 } 749 } 750 751 void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd, 752 const char *snap_name) 753 { 754 debug("%s: %s", __FUNCTION__, snap_name); 755 756 char profile_path[PATH_MAX]; 757 struct stat st; 758 759 sc_must_snprintf(profile_path, sizeof(profile_path), 760 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name); 761 if (stat(profile_path, &st) != 0) { 762 // It is ok for the user fstab to not exist. 763 return; 764 } 765 766 // In our new mount namespace, recursively change all mounts 767 // to slave mode, so we see changes from the parent namespace 768 // but don't propagate our own changes. 769 sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL); 770 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 771 sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor); 772 (void)sc_set_effective_identity(old); 773 } 774 775 void sc_ensure_snap_dir_shared_mounts(void) 776 { 777 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 778 for (int i = 0; dirs[i] != NULL; i++) { 779 const char *dir = dirs[i]; 780 if (!is_mounted_with_shared_option(dir)) { 781 /* Since this directory isn't yet shared (but it should be), 782 * recursively bind mount it, then recursively share it so that 783 * changes to the host are seen in the snap and vice-versa. This 784 * allows us to fine-tune propagation events elsewhere for this new 785 * mountpoint. 786 * 787 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR, 788 * since snaps are already mounted, and it's not needed for 789 * /var/snap. 790 */ 791 sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, 0); 792 sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED, 793 NULL); 794 } 795 } 796 } 797 798 void sc_setup_parallel_instance_classic_mounts(const char *snap_name, 799 const char *snap_instance_name) 800 { 801 char src[PATH_MAX] = { 0 }; 802 char dst[PATH_MAX] = { 0 }; 803 804 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 805 for (int i = 0; dirs[i] != NULL; i++) { 806 const char *dir = dirs[i]; 807 sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL); 808 } 809 810 /* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */ 811 sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR, 812 snap_instance_name); 813 sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name); 814 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0); 815 816 /* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */ 817 sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name); 818 sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name); 819 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, 0); 820 }