github.com/Lephar/snapd@v0.0.0-20210825215435-c7fba9cef4d2/cmd/snap-confine/mount-support.c (about) 1 /* 2 * Copyright (C) 2015 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 #ifdef HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "mount-support.h" 22 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libgen.h> 26 #include <limits.h> 27 #include <mntent.h> 28 #include <sched.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/mount.h> 33 #include <sys/stat.h> 34 #include <sys/syscall.h> 35 #include <sys/types.h> 36 #include <sys/wait.h> 37 #include <unistd.h> 38 39 #include "../libsnap-confine-private/apparmor-support.h" 40 #include "../libsnap-confine-private/classic.h" 41 #include "../libsnap-confine-private/cleanup-funcs.h" 42 #include "../libsnap-confine-private/mount-opt.h" 43 #include "../libsnap-confine-private/mountinfo.h" 44 #include "../libsnap-confine-private/snap.h" 45 #include "../libsnap-confine-private/string-utils.h" 46 #include "../libsnap-confine-private/tool.h" 47 #include "../libsnap-confine-private/utils.h" 48 #include "mount-support-nvidia.h" 49 50 #define MAX_BUF 1000 51 52 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode); 53 54 // TODO: simplify this, after all it is just a tmpfs 55 // TODO: fold this into bootstrap 56 static void setup_private_mount(const char *snap_name) 57 { 58 // Create a 0700 base directory. This is the "base" directory that is 59 // protected from other users. This directory name is NOT randomly 60 // generated. This has several properties: 61 // 62 // Users can relate to the name and can find the temporary directory as 63 // visible from within the snap. If this directory was random it would be 64 // harder to find because there may be situations in which multiple 65 // directories related to the same snap name would exist. 66 // 67 // Snapd can partially manage the directory. Specifically on snap remove 68 // snapd could remove the directory and everything in it, potentially 69 // avoiding runaway disk use on a machine that either never reboots or uses 70 // persistent /tmp directory. 71 // 72 // Underneath the base directory there is a "tmp" sub-directory that has 73 // mode 1777 and behaves as a typical /tmp directory would. That directory 74 // is used as a bind-mounted /tmp directory. 75 // 76 // Because the directories are reused across invocations by distinct users 77 // and because the directories are trivially guessable, each invocation 78 // unconditionally chowns/chmods them to appropriate values. 79 char base_dir[MAX_BUF] = { 0 }; 80 char tmp_dir[MAX_BUF] = { 0 }; 81 int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 82 int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 83 sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name); 84 sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir); 85 86 /* Switch to root group so that mkdir and open calls below create filesystem 87 * elements that are not owned by the user calling into snap-confine. */ 88 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 89 // Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want 90 // to reuse and we will open with O_NOFOLLOW, below. 91 if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) { 92 die("cannot create base directory %s", base_dir); 93 } 94 base_dir_fd = open(base_dir, 95 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 96 if (base_dir_fd < 0) { 97 die("cannot open base directory %s", base_dir); 98 } 99 /* This seems redundant on first read but it has the non-obvious 100 * property of changing existing directories that have already existed 101 * but had incorrect ownership or permission. This is possible due to 102 * earlier bugs in snap-confine and due to the fact that some systems 103 * use persistent /tmp directory and may not clean up leftover files 104 * for arbitrarily long. This comment applies the following two pairs 105 * of fchmod and fchown. */ 106 if (fchmod(base_dir_fd, 0700) < 0) { 107 die("cannot chmod base directory %s to 0700", base_dir); 108 } 109 if (fchown(base_dir_fd, 0, 0) < 0) { 110 die("cannot chown base directory %s to root.root", base_dir); 111 } 112 // Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we 113 // want to reuse and we will open with O_NOFOLLOW, below. 114 if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) { 115 die("cannot create private tmp directory %s/tmp", base_dir); 116 } 117 (void)sc_set_effective_identity(old); 118 tmp_dir_fd = openat(base_dir_fd, "tmp", 119 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 120 if (tmp_dir_fd < 0) { 121 die("cannot open private tmp directory %s/tmp", base_dir); 122 } 123 if (fchmod(tmp_dir_fd, 01777) < 0) { 124 die("cannot chmod private tmp directory %s/tmp to 01777", 125 base_dir); 126 } 127 if (fchown(tmp_dir_fd, 0, 0) < 0) { 128 die("cannot chown private tmp directory %s/tmp to root.root", 129 base_dir); 130 } 131 sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL); 132 sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL); 133 } 134 135 // TODO: fold this into bootstrap 136 static void setup_private_pts(void) 137 { 138 // See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt 139 // 140 // Ubuntu by default uses devpts 'single-instance' mode where 141 // /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change 142 // the startup scripts though, so we follow the instructions in point 143 // '4' of 'User-space changes' in the above doc. In other words, after 144 // unshare(CLONE_NEWNS), we mount devpts with -o 145 // newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto 146 // /dev/ptmx 147 148 struct stat st; 149 150 // Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode 151 // which doesn't provide the isolation we require. 152 if (stat("/dev/pts/ptmx", &st) != 0) { 153 die("cannot stat /dev/pts/ptmx"); 154 } 155 // Make sure /dev/ptmx exists so we can bind mount over it 156 if (stat("/dev/ptmx", &st) != 0) { 157 die("cannot stat /dev/ptmx"); 158 } 159 // Since multi-instance, use ptmxmode=0666. The other options are 160 // copied from /etc/default/devpts 161 sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, 162 "newinstance,ptmxmode=0666,mode=0620,gid=5"); 163 sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, NULL); 164 } 165 166 struct sc_mount { 167 const char *path; 168 bool is_bidirectional; 169 // Alternate path defines the rbind mount "alternative" of path. 170 // It exists so that we can make /media on systems that use /run/media. 171 const char *altpath; 172 // Optional mount points are not processed unless the source and 173 // destination both exist. 174 bool is_optional; 175 }; 176 177 struct sc_mount_config { 178 const char *rootfs_dir; 179 // The struct is terminated with an entry with NULL path. 180 const struct sc_mount *mounts; 181 sc_distro distro; 182 bool normal_mode; 183 const char *base_snap_name; 184 }; 185 186 /** 187 * Bootstrap mount namespace. 188 * 189 * This is a chunk of tricky code that lets us have full control over the 190 * layout and direction of propagation of mount events. The documentation below 191 * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source 192 * tree. 193 * 194 * As a reminder two definitions are quoted below: 195 * 196 * A 'propagation event' is defined as event generated on a vfsmount 197 * that leads to mount or unmount actions in other vfsmounts. 198 * 199 * A 'peer group' is defined as a group of vfsmounts that propagate 200 * events to each other. 201 * 202 * (end of quote). 203 * 204 * The main idea is to setup a mount namespace that has a root filesystem with 205 * vfsmounts and peer groups that, depending on the location, either isolate 206 * or share with the rest of the system. 207 * 208 * The vast majority of the filesystem is shared in one direction. Events from 209 * the outside (from the main mount namespace) propagate inside (to namespaces 210 * of particular snaps) so things like new snap revisions, mounted drives, etc, 211 * just show up as expected but even if a snap is exploited or malicious in 212 * nature it cannot affect anything in another namespace where it might cause 213 * security or stability issues. 214 * 215 * Selected directories (today just /media) can be shared in both directions. 216 * This allows snaps with sufficient privileges to either create, through the 217 * mount system call, additional mount points that are visible by the rest of 218 * the system (both the main mount namespace and namespaces of individual 219 * snaps) or remove them, through the unmount system call. 220 **/ 221 static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config) 222 { 223 char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX"; 224 char src[PATH_MAX] = { 0 }; 225 char dst[PATH_MAX] = { 0 }; 226 if (mkdtemp(scratch_dir) == NULL) { 227 die("cannot create temporary directory for the root file system"); 228 } 229 // NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new 230 // mount namespace and have a private list of mounts. 231 debug("scratch directory for constructing namespace: %s", scratch_dir); 232 // Make the root filesystem recursively shared. This way propagation events 233 // will be shared with main mount namespace. 234 sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL); 235 // Bind mount the temporary scratch directory for root filesystem over 236 // itself so that it is a mount point. This is done so that it can become 237 // unbindable as explained below. 238 sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL); 239 // Make the scratch directory unbindable. 240 // 241 // This is necessary as otherwise a mount loop can occur and the kernel 242 // would crash. The term unbindable simply states that it cannot be bind 243 // mounted anywhere. When we construct recursive bind mounts below this 244 // guarantees that this directory will not be replicated anywhere. 245 sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL); 246 // Recursively bind mount desired root filesystem directory over the 247 // scratch directory. This puts the initial content into the scratch space 248 // and serves as a foundation for all subsequent operations below. 249 // 250 // The mount is recursive because it can either be applied to the root 251 // filesystem of a core system (aka all-snap) or the core snap on a classic 252 // system. In the former case we need recursive bind mounts to accurately 253 // replicate the state of the root filesystem into the scratch directory. 254 sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND, 255 NULL); 256 // Make the scratch directory recursively slave. Nothing done there will be 257 // shared with the initial mount namespace. This effectively detaches us, 258 // in one way, from the original namespace and coupled with pivot_root 259 // below serves as the foundation of the mount sandbox. 260 sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL); 261 // Bind mount certain directories from the host filesystem to the scratch 262 // directory. By default mount events will propagate in both into and out 263 // of the peer group. This way the running application can alter any global 264 // state visible on the host and in other snaps. This can be restricted by 265 // disabling the "is_bidirectional" flag as can be seen below. 266 for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL; 267 mnt++) { 268 269 if (mnt->is_bidirectional) { 270 sc_identity old = 271 sc_set_effective_identity(sc_root_group_identity()); 272 if (mkdir(mnt->path, 0755) < 0 && errno != EEXIST) { 273 die("cannot create %s", mnt->path); 274 } 275 (void)sc_set_effective_identity(old); 276 } 277 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 278 mnt->path); 279 if (mnt->is_optional) { 280 bool ok = sc_do_optional_mount(mnt->path, dst, NULL, 281 MS_REC | MS_BIND, NULL); 282 if (!ok) { 283 // If we cannot mount it, just continue. 284 continue; 285 } 286 } else { 287 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, 288 NULL); 289 } 290 if (!mnt->is_bidirectional) { 291 // Mount events will only propagate inwards to the namespace. This 292 // way the running application cannot alter any global state apart 293 // from that of its own snap. 294 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 295 } 296 if (mnt->altpath == NULL) { 297 continue; 298 } 299 // An alternate path of mnt->path is provided at another location. 300 // It should behave exactly the same as the original. 301 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 302 mnt->altpath); 303 struct stat stat_buf; 304 if (lstat(dst, &stat_buf) < 0) { 305 die("cannot lstat %s", dst); 306 } 307 if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) { 308 die("cannot bind mount alternate path over a symlink: %s", dst); 309 } 310 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL); 311 if (!mnt->is_bidirectional) { 312 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 313 } 314 } 315 if (config->normal_mode) { 316 // Since we mounted /etc from the host filesystem to the scratch directory, 317 // we may need to put certain directories from the desired root filesystem 318 // (e.g. the core snap) back. This way the behavior of running snaps is not 319 // affected by the alternatives directory from the host, if one exists. 320 // 321 // Fixes the following bugs: 322 // - https://bugs.launchpad.net/snap-confine/+bug/1580018 323 // - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568 324 const char *dirs_from_core[] = { 325 "/etc/alternatives", "/etc/nsswitch.conf", 326 // Some specific and privileged interfaces (e.g docker-support) give 327 // access to apparmor_parser from the base snap which at a minimum 328 // needs to use matching configuration from the base snap instead 329 // of from the users host system. 330 "/etc/apparmor", "/etc/apparmor.d", 331 // Use ssl certs from the base by default unless 332 // using Debian/Ubuntu classic (see below) 333 "/etc/ssl", 334 NULL 335 }; 336 337 for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) { 338 const char *dir = *dirs; 339 340 // Special case for ubuntu/debian based 341 // classic distros that use the core* snap: 342 // here we use the host /etc/ssl 343 // to support custom ca-cert setups 344 if (sc_streq(dir, "/etc/ssl") && 345 config->distro == SC_DISTRO_CLASSIC && 346 sc_is_debian_like() && 347 sc_startswith(config->base_snap_name, "core")) { 348 continue; 349 } 350 351 if (access(dir, F_OK) != 0) { 352 continue; 353 } 354 struct stat dst_stat; 355 struct stat src_stat; 356 sc_must_snprintf(src, sizeof src, "%s%s", 357 config->rootfs_dir, dir); 358 sc_must_snprintf(dst, sizeof dst, "%s%s", 359 scratch_dir, dir); 360 if (lstat(src, &src_stat) != 0) { 361 if (errno == ENOENT) { 362 continue; 363 } 364 die("cannot stat %s from desired rootfs", src); 365 } 366 if (!S_ISREG(src_stat.st_mode) 367 && !S_ISDIR(src_stat.st_mode)) { 368 debug 369 ("entry %s from the desired rootfs is not a file or directory, skipping mount", 370 src); 371 continue; 372 } 373 374 if (lstat(dst, &dst_stat) != 0) { 375 if (errno == ENOENT) { 376 continue; 377 } 378 die("cannot stat %s from host", src); 379 } 380 if (!S_ISREG(dst_stat.st_mode) 381 && !S_ISDIR(dst_stat.st_mode)) { 382 debug 383 ("entry %s from the host is not a file or directory, skipping mount", 384 src); 385 continue; 386 } 387 388 if ((dst_stat.st_mode & S_IFMT) != 389 (src_stat.st_mode & S_IFMT)) { 390 debug 391 ("entries %s and %s are of different types, skipping mount", 392 dst, src); 393 continue; 394 } 395 // both source and destination exist where both are either files 396 // or both are directories 397 sc_do_mount(src, dst, NULL, MS_BIND, NULL); 398 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 399 } 400 } 401 // The "core" base snap is special as it contains snapd and friends. 402 // Other base snaps do not, so whenever a base snap other than core is 403 // in use we need extra provisions for setting up internal tooling to 404 // be available. 405 // 406 // However on a core18 (and similar) system the core snap is not 407 // a special base anymore and we should map our own tooling in. 408 if (config->distro == SC_DISTRO_CORE_OTHER 409 || !sc_streq(config->base_snap_name, "core")) { 410 // when bases are used we need to bind-mount the libexecdir 411 // (that contains snap-exec) into /usr/lib/snapd of the 412 // base snap so that snap-exec is available for the snaps 413 // (base snaps do not ship snapd) 414 415 // dst is always /usr/lib/snapd as this is where snapd 416 // assumes to find snap-exec 417 sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd", 418 scratch_dir); 419 420 // bind mount the current $ROOT/usr/lib/snapd path, 421 // where $ROOT is either "/" or the "/snap/{core,snapd}/current" 422 // that we are re-execing from 423 char *src = NULL; 424 char self[PATH_MAX + 1] = { 0 }; 425 ssize_t nread; 426 nread = readlink("/proc/self/exe", self, sizeof self - 1); 427 if (nread < 0) { 428 die("cannot read /proc/self/exe"); 429 } 430 // Though we initialized self to NULs and passed one less to 431 // readlink, therefore guaranteeing that self is 432 // zero-terminated, perform an explicit assignment to make 433 // Coverity happy. 434 self[nread] = '\0'; 435 // this cannot happen except when the kernel is buggy 436 if (strstr(self, "/snap-confine") == NULL) { 437 die("cannot use result from readlink: %s", self); 438 } 439 src = dirname(self); 440 // dirname(path) might return '.' depending on path. 441 // /proc/self/exe should always point 442 // to an absolute path, but let's guarantee that. 443 if (src[0] != '/') { 444 die("cannot use the result of dirname(): %s", src); 445 } 446 447 sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL); 448 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 449 } 450 // Bind mount the directory where all snaps are mounted. The location of 451 // the this directory on the host filesystem may not match the location in 452 // the desired root filesystem. In the "core" and "ubuntu-core" snaps the 453 // directory is always /snap. On the host it is a build-time configuration 454 // option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not 455 // in normal mode), we don't need to do this because /snap is fixed and 456 // already contains the correct view of the mounted snaps. 457 if (config->normal_mode) { 458 sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir); 459 sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL); 460 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 461 } 462 // Create the hostfs directory if one is missing. This directory is a part 463 // of packaging now so perhaps this code can be removed later. 464 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 465 if (mkdir(SC_HOSTFS_DIR, 0755) < 0) { 466 if (errno != EEXIST) { 467 die("cannot perform operation: mkdir %s", SC_HOSTFS_DIR); 468 } 469 } 470 (void)sc_set_effective_identity(old); 471 // Ensure that hostfs isgroup owned by root. We may have (now or earlier) 472 // created the directory as the user who first ran a snap on a given 473 // system and the group identity of that user is visilbe on disk. 474 // This was LP:#1665004 475 struct stat sb; 476 if (stat(SC_HOSTFS_DIR, &sb) < 0) { 477 die("cannot stat %s", SC_HOSTFS_DIR); 478 } 479 if (sb.st_uid != 0 || sb.st_gid != 0) { 480 if (chown(SC_HOSTFS_DIR, 0, 0) < 0) { 481 die("cannot change user/group owner of %s to root", 482 SC_HOSTFS_DIR); 483 } 484 } 485 // Make the upcoming "put_old" directory for pivot_root private so that 486 // mount events don't propagate to any peer group. In practice pivot root 487 // has a number of undocumented requirements and one of them is that the 488 // "put_old" directory (the second argument) cannot be shared in any way. 489 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR); 490 sc_do_mount(dst, dst, NULL, MS_BIND, NULL); 491 sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL); 492 // On classic mount the nvidia driver. Ideally this would be done in an 493 // uniform way after pivot_root but this is good enough and requires less 494 // code changes the nvidia code assumes it has access to the existing 495 // pre-pivot filesystem. 496 if (config->distro == SC_DISTRO_CLASSIC) { 497 sc_mount_nvidia_driver(scratch_dir); 498 } 499 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 500 // pivot_root 501 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 502 // Use pivot_root to "chroot" into the scratch directory. 503 // 504 // Q: Why are we using something as esoteric as pivot_root(2)? 505 // A: Because this makes apparmor handling easy. Using a normal chroot 506 // makes all apparmor rules conditional. We are either running on an 507 // all-snap system where this would-be chroot didn't happen and all the 508 // rules see / as the root file system _OR_ we are running on top of a 509 // classic distribution and this chroot has now moved all paths to 510 // /tmp/snap.rootfs_*. 511 // 512 // Because we are using unshare(2) with CLONE_NEWNS we can essentially use 513 // pivot_root just like chroot but this makes apparmor unaware of the old 514 // root so everything works okay. 515 // 516 // HINT: If you are debugging this and are trying to see why pivot_root 517 // happens to return EINVAL with any changes you may be making, please 518 // consider applying 519 // misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree 520 // kernel. 521 debug("performing operation: pivot_root %s %s", scratch_dir, dst); 522 if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) { 523 die("cannot perform operation: pivot_root %s %s", scratch_dir, 524 dst); 525 } 526 // Unmount the self-bind mount over the scratch directory created earlier 527 // in the original root filesystem (which is now mounted on SC_HOSTFS_DIR). 528 // This way we can remove the temporary directory we created and "clean up" 529 // after ourselves nicely. 530 sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir); 531 sc_do_umount(dst, UMOUNT_NOFOLLOW); 532 // Remove the scratch directory. Note that we are using the path that is 533 // based on the old root filesystem as after pivot_root we cannot guarantee 534 // what is present at the same location normally. (It is probably an empty 535 // /tmp directory that is populated in another place). 536 debug("performing operation: rmdir %s", dst); 537 if (rmdir(scratch_dir) < 0) { 538 die("cannot perform operation: rmdir %s", dst); 539 }; 540 // Make the old root filesystem recursively slave. This way operations 541 // performed in this mount namespace will not propagate to the peer group. 542 // This is another essential part of the confinement system. 543 sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL); 544 // Detach the redundant hostfs version of sysfs since it shows up in the 545 // mount table and software inspecting the mount table may become confused 546 // (eg, docker and LP:# 162601). 547 sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR); 548 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 549 // Detach the redundant hostfs version of /dev since it shows up in the 550 // mount table and software inspecting the mount table may become confused. 551 sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR); 552 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 553 // Detach the redundant hostfs version of /proc since it shows up in the 554 // mount table and software inspecting the mount table may become confused. 555 sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR); 556 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 557 // Detach both views of /writable: the one from hostfs and the one directly 558 // visible in /writable. Interfaces don't grant access to this directory 559 // and it has a large duplicated view of many mount points. Note that this 560 // is only applicable to ubuntu-core systems. 561 sc_detach_views_of_writable(config->distro, config->normal_mode); 562 } 563 564 static void sc_detach_views_of_writable(sc_distro distro, bool normal_mode) 565 { 566 // Note that prior to detaching either mount point we switch the 567 // propagation to private to both limit the change to just this view and to 568 // prevent otherwise occurring event propagation from self-conflicting and 569 // returning EBUSY. A similar approach is used by snap-update-ns and is 570 // documented in umount(2). 571 const char *writable_dir = "/writable"; 572 const char *hostfs_writable_dir = "/var/lib/snapd/hostfs/writable"; 573 574 // Writable only exists on ubuntu-core. 575 if (distro == SC_DISTRO_CLASSIC) { 576 return; 577 } 578 // On all core distributions we see /var/lib/snapd/hostfs/writable that 579 // exposes writable, with a structure specific to ubuntu-core. 580 debug("detaching %s", hostfs_writable_dir); 581 sc_do_mount("none", hostfs_writable_dir, NULL, 582 MS_REC | MS_PRIVATE, NULL); 583 sc_do_umount(hostfs_writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 584 585 // On ubuntu-core 16, when the executed snap uses core as base we also see 586 // the /writable that we directly inherited from the initial mount 587 // namespace. 588 if (distro == SC_DISTRO_CORE16 && !normal_mode) { 589 debug("detaching %s", writable_dir); 590 sc_do_mount("none", writable_dir, NULL, MS_REC | MS_PRIVATE, 591 NULL); 592 sc_do_umount(writable_dir, UMOUNT_NOFOLLOW | MNT_DETACH); 593 } 594 } 595 596 /** 597 * @path: a pathname where / replaced with '\0'. 598 * @offsetp: pointer to int showing which path segment was last seen. 599 * Updated on return to reflect the next segment. 600 * @fulllen: full original path length. 601 * Returns a pointer to the next path segment, or NULL if done. 602 */ 603 static char * __attribute__((used)) 604 get_nextpath(char *path, size_t *offsetp, size_t fulllen) 605 { 606 size_t offset = *offsetp; 607 608 if (offset >= fulllen) 609 return NULL; 610 611 while (offset < fulllen && path[offset] != '\0') 612 offset++; 613 while (offset < fulllen && path[offset] == '\0') 614 offset++; 615 616 *offsetp = offset; 617 return (offset < fulllen) ? &path[offset] : NULL; 618 } 619 620 /** 621 * Check that @subdir is a subdir of @dir. 622 **/ 623 static bool __attribute__((used)) 624 is_subdir(const char *subdir, const char *dir) 625 { 626 size_t dirlen = strlen(dir); 627 size_t subdirlen = strlen(subdir); 628 629 // @dir has to be at least as long as @subdir 630 if (subdirlen < dirlen) 631 return false; 632 // @dir has to be a prefix of @subdir 633 if (strncmp(subdir, dir, dirlen) != 0) 634 return false; 635 // @dir can look like "path/" (that is, end with the directory separator). 636 // When that is the case then given the test above we can be sure @subdir 637 // is a real subdirectory. 638 if (dirlen > 0 && dir[dirlen - 1] == '/') 639 return true; 640 // @subdir can look like "path/stuff" and when the directory separator 641 // is exactly at the spot where @dir ends (that is, it was not caught 642 // by the test above) then @subdir is a real subdirectory. 643 if (subdir[dirlen] == '/' && dirlen > 0) 644 return true; 645 // If both @dir and @subdir have identical length then given that the 646 // prefix check above @subdir is a real subdirectory. 647 if (subdirlen == dirlen) 648 return true; 649 return false; 650 } 651 652 void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd, 653 const sc_invocation * inv, const gid_t real_gid, 654 const gid_t saved_gid) 655 { 656 // Classify the current distribution, as claimed by /etc/os-release. 657 sc_distro distro = sc_classify_distro(); 658 659 // Check which mode we should run in, normal or legacy. 660 if (inv->is_normal_mode) { 661 // In normal mode we use the base snap as / and set up several bind mounts. 662 const struct sc_mount mounts[] = { 663 {"/dev"}, // because it contains devices on host OS 664 {"/etc"}, // because that's where /etc/resolv.conf lives, perhaps a bad idea 665 {"/home"}, // to support /home/*/snap and home interface 666 {"/root"}, // because that is $HOME for services 667 {"/proc"}, // fundamental filesystem 668 {"/sys"}, // fundamental filesystem 669 {"/tmp"}, // to get writable tmp 670 {"/var/snap"}, // to get access to global snap data 671 {"/var/lib/snapd"}, // to get access to snapd state and seccomp profiles 672 {"/var/tmp"}, // to get access to the other temporary directory 673 {"/run"}, // to get /run with sockets and what not 674 {"/lib/modules",.is_optional = true}, // access to the modules of the running kernel 675 {"/lib/firmware",.is_optional = true}, // access to the firmware of the running kernel 676 {"/usr/src"}, // FIXME: move to SecurityMounts in system-trace interface 677 {"/var/log"}, // FIXME: move to SecurityMounts in log-observe interface 678 #ifdef MERGED_USR 679 {"/run/media", true, "/media"}, // access to the users removable devices 680 #else 681 {"/media", true}, // access to the users removable devices 682 #endif // MERGED_USR 683 {"/run/netns", true}, // access to the 'ip netns' network namespaces 684 // The /mnt directory is optional in base snaps to ensure backwards 685 // compatibility with the first version of base snaps that was 686 // released. 687 {"/mnt",.is_optional = true}, // to support the removable-media interface 688 {"/var/lib/extrausers",.is_optional = true}, // access to UID/GID of extrausers (if available) 689 {}, 690 }; 691 struct sc_mount_config normal_config = { 692 .rootfs_dir = inv->rootfs_dir, 693 .mounts = mounts, 694 .distro = distro, 695 .normal_mode = true, 696 .base_snap_name = inv->base_snap_name, 697 }; 698 sc_bootstrap_mount_namespace(&normal_config); 699 } else { 700 // In legacy mode we don't pivot and instead just arrange bi- 701 // directional mount propagation for two directories. 702 const struct sc_mount mounts[] = { 703 {"/media", true}, 704 {"/run/netns", true}, 705 {}, 706 }; 707 struct sc_mount_config legacy_config = { 708 .rootfs_dir = "/", 709 .mounts = mounts, 710 .distro = distro, 711 .normal_mode = false, 712 .base_snap_name = inv->base_snap_name, 713 }; 714 sc_bootstrap_mount_namespace(&legacy_config); 715 } 716 717 // TODO: rename this and fold it into bootstrap 718 setup_private_mount(inv->snap_instance); 719 // set up private /dev/pts 720 // TODO: fold this into bootstrap 721 setup_private_pts(); 722 723 // setup the security backend bind mounts 724 sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor); 725 } 726 727 static bool is_mounted_with_shared_option(const char *dir) 728 __attribute__((nonnull(1))); 729 730 static bool is_mounted_with_shared_option(const char *dir) 731 { 732 sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 733 sm = sc_parse_mountinfo(NULL); 734 if (sm == NULL) { 735 die("cannot parse /proc/self/mountinfo"); 736 } 737 sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm); 738 while (entry != NULL) { 739 const char *mount_dir = entry->mount_dir; 740 if (sc_streq(mount_dir, dir)) { 741 const char *optional_fields = entry->optional_fields; 742 if (strstr(optional_fields, "shared:") != NULL) { 743 return true; 744 } 745 } 746 entry = sc_next_mountinfo_entry(entry); 747 } 748 return false; 749 } 750 751 void sc_ensure_shared_snap_mount(void) 752 { 753 if (!is_mounted_with_shared_option("/") 754 && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) { 755 // TODO: We could be more aggressive and refuse to function but since 756 // we have no data on actual environments that happen to limp along in 757 // this configuration let's not do that yet. This code should be 758 // removed once we have a measurement and feedback mechanism that lets 759 // us decide based on measurable data. 760 sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none", 761 MS_BIND | MS_REC, NULL); 762 sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC, 763 NULL); 764 } 765 } 766 767 void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd, 768 const char *snap_name) 769 { 770 debug("%s: %s", __FUNCTION__, snap_name); 771 772 char profile_path[PATH_MAX]; 773 struct stat st; 774 775 sc_must_snprintf(profile_path, sizeof(profile_path), 776 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name); 777 if (stat(profile_path, &st) != 0) { 778 // It is ok for the user fstab to not exist. 779 return; 780 } 781 782 // In our new mount namespace, recursively change all mounts 783 // to slave mode, so we see changes from the parent namespace 784 // but don't propagate our own changes. 785 sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL); 786 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 787 sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor); 788 (void)sc_set_effective_identity(old); 789 } 790 791 void sc_ensure_snap_dir_shared_mounts(void) 792 { 793 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 794 for (int i = 0; dirs[i] != NULL; i++) { 795 const char *dir = dirs[i]; 796 if (!is_mounted_with_shared_option(dir)) { 797 /* Since this directory isn't yet shared (but it should be), 798 * recursively bind mount it, then recursively share it so that 799 * changes to the host are seen in the snap and vice-versa. This 800 * allows us to fine-tune propagation events elsewhere for this new 801 * mountpoint. 802 * 803 * Not using MS_SLAVE because it's too late for SNAP_MOUNT_DIR, 804 * since snaps are already mounted, and it's not needed for 805 * /var/snap. 806 */ 807 sc_do_mount(dir, dir, "none", MS_BIND | MS_REC, NULL); 808 sc_do_mount("none", dir, NULL, MS_REC | MS_SHARED, 809 NULL); 810 } 811 } 812 } 813 814 void sc_setup_parallel_instance_classic_mounts(const char *snap_name, 815 const char *snap_instance_name) 816 { 817 char src[PATH_MAX] = { 0 }; 818 char dst[PATH_MAX] = { 0 }; 819 820 const char *dirs[] = { SNAP_MOUNT_DIR, "/var/snap", NULL }; 821 for (int i = 0; dirs[i] != NULL; i++) { 822 const char *dir = dirs[i]; 823 sc_do_mount("none", dir, NULL, MS_REC | MS_SLAVE, NULL); 824 } 825 826 /* Mount SNAP_MOUNT_DIR/<snap>_<key> on SNAP_MOUNT_DIR/<snap> */ 827 sc_must_snprintf(src, sizeof src, "%s/%s", SNAP_MOUNT_DIR, 828 snap_instance_name); 829 sc_must_snprintf(dst, sizeof dst, "%s/%s", SNAP_MOUNT_DIR, snap_name); 830 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, NULL); 831 832 /* Mount /var/snap/<snap>_<key> on /var/snap/<snap> */ 833 sc_must_snprintf(src, sizeof src, "/var/snap/%s", snap_instance_name); 834 sc_must_snprintf(dst, sizeof dst, "/var/snap/%s", snap_name); 835 sc_do_mount(src, dst, "none", MS_BIND | MS_REC, NULL); 836 }