github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/mount-support.c (about) 1 /* 2 * Copyright (C) 2015 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 #ifdef HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "mount-support.h" 22 23 #include <errno.h> 24 #include <fcntl.h> 25 #include <libgen.h> 26 #include <limits.h> 27 #include <mntent.h> 28 #include <sched.h> 29 #include <stdio.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <sys/mount.h> 33 #include <sys/stat.h> 34 #include <sys/syscall.h> 35 #include <sys/types.h> 36 #include <sys/types.h> 37 #include <sys/wait.h> 38 #include <unistd.h> 39 40 #include "../libsnap-confine-private/apparmor-support.h" 41 #include "../libsnap-confine-private/classic.h" 42 #include "../libsnap-confine-private/cleanup-funcs.h" 43 #include "../libsnap-confine-private/mount-opt.h" 44 #include "../libsnap-confine-private/mountinfo.h" 45 #include "../libsnap-confine-private/snap.h" 46 #include "../libsnap-confine-private/string-utils.h" 47 #include "../libsnap-confine-private/tool.h" 48 #include "../libsnap-confine-private/utils.h" 49 #include "mount-support-nvidia.h" 50 51 #define MAX_BUF 1000 52 53 // TODO: simplify this, after all it is just a tmpfs 54 // TODO: fold this into bootstrap 55 static void setup_private_mount(const char *snap_name) 56 { 57 // Create a 0700 base directory. This is the "base" directory that is 58 // protected from other users. This directory name is NOT randomly 59 // generated. This has several properties: 60 // 61 // Users can relate to the name and can find the temporary directory as 62 // visible from within the snap. If this directory was random it would be 63 // harder to find because there may be situations in which multiple 64 // directories related to the same snap name would exist. 65 // 66 // Snapd can partially manage the directory. Specifically on snap remove 67 // snapd could remove the directory and everything in it, potentially 68 // avoiding runaway disk use on a machine that either never reboots or uses 69 // persistent /tmp directory. 70 // 71 // Underneath the base directory there is a "tmp" sub-directory that has 72 // mode 1777 and behaves as a typical /tmp directory would. That directory 73 // is used as a bind-mounted /tmp directory. 74 // 75 // Because the directories are reused across invocations by distinct users 76 // and because the directories are trivially guessable, each invocation 77 // unconditionally chowns/chmods them to appropriate values. 78 char base_dir[MAX_BUF] = { 0 }; 79 char tmp_dir[MAX_BUF] = { 0 }; 80 int base_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 81 int tmp_dir_fd SC_CLEANUP(sc_cleanup_close) = -1; 82 sc_must_snprintf(base_dir, sizeof(base_dir), "/tmp/snap.%s", snap_name); 83 sc_must_snprintf(tmp_dir, sizeof(tmp_dir), "%s/tmp", base_dir); 84 85 // Create /tmp/snap.$SNAP_NAME/ 0700 root.root. Ignore EEXIST since we want 86 // to reuse and we will open with O_NOFOLLOW, below. 87 if (mkdir(base_dir, 0700) < 0 && errno != EEXIST) { 88 die("cannot create base directory %s", base_dir); 89 } 90 base_dir_fd = open(base_dir, 91 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 92 if (base_dir_fd < 0) { 93 die("cannot open base directory %s", base_dir); 94 } 95 if (fchmod(base_dir_fd, 0700) < 0) { 96 die("cannot chmod base directory %s to 0700", base_dir); 97 } 98 if (fchown(base_dir_fd, 0, 0) < 0) { 99 die("cannot chown base directory %s to root.root", base_dir); 100 } 101 // Create /tmp/snap.$SNAP_NAME/tmp 01777 root.root Ignore EEXIST since we 102 // want to reuse and we will open with O_NOFOLLOW, below. 103 if (mkdirat(base_dir_fd, "tmp", 01777) < 0 && errno != EEXIST) { 104 die("cannot create private tmp directory %s/tmp", base_dir); 105 } 106 tmp_dir_fd = openat(base_dir_fd, "tmp", 107 O_RDONLY | O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); 108 if (tmp_dir_fd < 0) { 109 die("cannot open private tmp directory %s/tmp", base_dir); 110 } 111 if (fchmod(tmp_dir_fd, 01777) < 0) { 112 die("cannot chmod private tmp directory %s/tmp to 01777", 113 base_dir); 114 } 115 if (fchown(tmp_dir_fd, 0, 0) < 0) { 116 die("cannot chown private tmp directory %s/tmp to root.root", 117 base_dir); 118 } 119 sc_do_mount(tmp_dir, "/tmp", NULL, MS_BIND, NULL); 120 sc_do_mount("none", "/tmp", NULL, MS_PRIVATE, NULL); 121 } 122 123 // TODO: fold this into bootstrap 124 static void setup_private_pts(void) 125 { 126 // See https://www.kernel.org/doc/Documentation/filesystems/devpts.txt 127 // 128 // Ubuntu by default uses devpts 'single-instance' mode where 129 // /dev/pts/ptmx is mounted with ptmxmode=0000. We don't want to change 130 // the startup scripts though, so we follow the instructions in point 131 // '4' of 'User-space changes' in the above doc. In other words, after 132 // unshare(CLONE_NEWNS), we mount devpts with -o 133 // newinstance,ptmxmode=0666 and then bind mount /dev/pts/ptmx onto 134 // /dev/ptmx 135 136 struct stat st; 137 138 // Make sure /dev/pts/ptmx exists, otherwise we are in legacy mode 139 // which doesn't provide the isolation we require. 140 if (stat("/dev/pts/ptmx", &st) != 0) { 141 die("cannot stat /dev/pts/ptmx"); 142 } 143 // Make sure /dev/ptmx exists so we can bind mount over it 144 if (stat("/dev/ptmx", &st) != 0) { 145 die("cannot stat /dev/ptmx"); 146 } 147 // Since multi-instance, use ptmxmode=0666. The other options are 148 // copied from /etc/default/devpts 149 sc_do_mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, 150 "newinstance,ptmxmode=0666,mode=0620,gid=5"); 151 sc_do_mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0); 152 } 153 154 struct sc_mount { 155 const char *path; 156 bool is_bidirectional; 157 // Alternate path defines the rbind mount "alternative" of path. 158 // It exists so that we can make /media on systems that use /run/media. 159 const char *altpath; 160 // Optional mount points are not processed unless the source and 161 // destination both exist. 162 bool is_optional; 163 }; 164 165 struct sc_mount_config { 166 const char *rootfs_dir; 167 // The struct is terminated with an entry with NULL path. 168 const struct sc_mount *mounts; 169 sc_distro distro; 170 bool normal_mode; 171 const char *base_snap_name; 172 }; 173 174 /** 175 * Bootstrap mount namespace. 176 * 177 * This is a chunk of tricky code that lets us have full control over the 178 * layout and direction of propagation of mount events. The documentation below 179 * assumes knowledge of the 'sharedsubtree.txt' document from the kernel source 180 * tree. 181 * 182 * As a reminder two definitions are quoted below: 183 * 184 * A 'propagation event' is defined as event generated on a vfsmount 185 * that leads to mount or unmount actions in other vfsmounts. 186 * 187 * A 'peer group' is defined as a group of vfsmounts that propagate 188 * events to each other. 189 * 190 * (end of quote). 191 * 192 * The main idea is to setup a mount namespace that has a root filesystem with 193 * vfsmounts and peer groups that, depending on the location, either isolate 194 * or share with the rest of the system. 195 * 196 * The vast majority of the filesystem is shared in one direction. Events from 197 * the outside (from the main mount namespace) propagate inside (to namespaces 198 * of particular snaps) so things like new snap revisions, mounted drives, etc, 199 * just show up as expected but even if a snap is exploited or malicious in 200 * nature it cannot affect anything in another namespace where it might cause 201 * security or stability issues. 202 * 203 * Selected directories (today just /media) can be shared in both directions. 204 * This allows snaps with sufficient privileges to either create, through the 205 * mount system call, additional mount points that are visible by the rest of 206 * the system (both the main mount namespace and namespaces of individual 207 * snaps) or remove them, through the unmount system call. 208 **/ 209 static void sc_bootstrap_mount_namespace(const struct sc_mount_config *config) 210 { 211 char scratch_dir[] = "/tmp/snap.rootfs_XXXXXX"; 212 char src[PATH_MAX] = { 0 }; 213 char dst[PATH_MAX] = { 0 }; 214 if (mkdtemp(scratch_dir) == NULL) { 215 die("cannot create temporary directory for the root file system"); 216 } 217 // NOTE: at this stage we just called unshare(CLONE_NEWNS). We are in a new 218 // mount namespace and have a private list of mounts. 219 debug("scratch directory for constructing namespace: %s", scratch_dir); 220 // Make the root filesystem recursively shared. This way propagation events 221 // will be shared with main mount namespace. 222 sc_do_mount("none", "/", NULL, MS_REC | MS_SHARED, NULL); 223 // Bind mount the temporary scratch directory for root filesystem over 224 // itself so that it is a mount point. This is done so that it can become 225 // unbindable as explained below. 226 sc_do_mount(scratch_dir, scratch_dir, NULL, MS_BIND, NULL); 227 // Make the scratch directory unbindable. 228 // 229 // This is necessary as otherwise a mount loop can occur and the kernel 230 // would crash. The term unbindable simply states that it cannot be bind 231 // mounted anywhere. When we construct recursive bind mounts below this 232 // guarantees that this directory will not be replicated anywhere. 233 sc_do_mount("none", scratch_dir, NULL, MS_UNBINDABLE, NULL); 234 // Recursively bind mount desired root filesystem directory over the 235 // scratch directory. This puts the initial content into the scratch space 236 // and serves as a foundation for all subsequent operations below. 237 // 238 // The mount is recursive because it can either be applied to the root 239 // filesystem of a core system (aka all-snap) or the core snap on a classic 240 // system. In the former case we need recursive bind mounts to accurately 241 // replicate the state of the root filesystem into the scratch directory. 242 sc_do_mount(config->rootfs_dir, scratch_dir, NULL, MS_REC | MS_BIND, 243 NULL); 244 // Make the scratch directory recursively slave. Nothing done there will be 245 // shared with the initial mount namespace. This effectively detaches us, 246 // in one way, from the original namespace and coupled with pivot_root 247 // below serves as the foundation of the mount sandbox. 248 sc_do_mount("none", scratch_dir, NULL, MS_REC | MS_SLAVE, NULL); 249 // Bind mount certain directories from the host filesystem to the scratch 250 // directory. By default mount events will propagate in both into and out 251 // of the peer group. This way the running application can alter any global 252 // state visible on the host and in other snaps. This can be restricted by 253 // disabling the "is_bidirectional" flag as can be seen below. 254 for (const struct sc_mount * mnt = config->mounts; mnt->path != NULL; 255 mnt++) { 256 if (mnt->is_bidirectional && mkdir(mnt->path, 0755) < 0 && 257 errno != EEXIST) { 258 die("cannot create %s", mnt->path); 259 } 260 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 261 mnt->path); 262 if (mnt->is_optional) { 263 bool ok = sc_do_optional_mount(mnt->path, dst, NULL, 264 MS_REC | MS_BIND, NULL); 265 if (!ok) { 266 // If we cannot mount it, just continue. 267 continue; 268 } 269 } else { 270 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, 271 NULL); 272 } 273 if (!mnt->is_bidirectional) { 274 // Mount events will only propagate inwards to the namespace. This 275 // way the running application cannot alter any global state apart 276 // from that of its own snap. 277 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 278 } 279 if (mnt->altpath == NULL) { 280 continue; 281 } 282 // An alternate path of mnt->path is provided at another location. 283 // It should behave exactly the same as the original. 284 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, 285 mnt->altpath); 286 struct stat stat_buf; 287 if (lstat(dst, &stat_buf) < 0) { 288 die("cannot lstat %s", dst); 289 } 290 if ((stat_buf.st_mode & S_IFMT) == S_IFLNK) { 291 die("cannot bind mount alternate path over a symlink: %s", dst); 292 } 293 sc_do_mount(mnt->path, dst, NULL, MS_REC | MS_BIND, NULL); 294 if (!mnt->is_bidirectional) { 295 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 296 } 297 } 298 if (config->normal_mode) { 299 // Since we mounted /etc from the host filesystem to the scratch directory, 300 // we may need to put certain directories from the desired root filesystem 301 // (e.g. the core snap) back. This way the behavior of running snaps is not 302 // affected by the alternatives directory from the host, if one exists. 303 // 304 // Fixes the following bugs: 305 // - https://bugs.launchpad.net/snap-confine/+bug/1580018 306 // - https://bugzilla.opensuse.org/show_bug.cgi?id=1028568 307 const char *dirs_from_core[] = 308 { "/etc/alternatives", "/etc/ssl", "/etc/nsswitch.conf", 309 NULL 310 }; 311 for (const char **dirs = dirs_from_core; *dirs != NULL; dirs++) { 312 const char *dir = *dirs; 313 if (access(dir, F_OK) != 0) { 314 continue; 315 } 316 struct stat dst_stat; 317 struct stat src_stat; 318 sc_must_snprintf(src, sizeof src, "%s%s", 319 config->rootfs_dir, dir); 320 sc_must_snprintf(dst, sizeof dst, "%s%s", 321 scratch_dir, dir); 322 if (lstat(src, &src_stat) != 0) { 323 if (errno == ENOENT) { 324 continue; 325 } 326 die("cannot stat %s from desired rootfs", src); 327 } 328 if (!S_ISREG(src_stat.st_mode) 329 && !S_ISDIR(src_stat.st_mode)) { 330 debug 331 ("entry %s from the desired rootfs is not a file or directory, skipping mount", 332 src); 333 continue; 334 } 335 336 if (lstat(dst, &dst_stat) != 0) { 337 if (errno == ENOENT) { 338 continue; 339 } 340 die("cannot stat %s from host", src); 341 } 342 if (!S_ISREG(dst_stat.st_mode) 343 && !S_ISDIR(dst_stat.st_mode)) { 344 debug 345 ("entry %s from the host is not a file or directory, skipping mount", 346 src); 347 continue; 348 } 349 350 if ((dst_stat.st_mode & S_IFMT) != 351 (src_stat.st_mode & S_IFMT)) { 352 debug 353 ("entries %s and %s are of different types, skipping mount", 354 dst, src); 355 continue; 356 } 357 // both source and destination exist where both are either files 358 // or both are directories 359 sc_do_mount(src, dst, NULL, MS_BIND, NULL); 360 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 361 } 362 } 363 // The "core" base snap is special as it contains snapd and friends. 364 // Other base snaps do not, so whenever a base snap other than core is 365 // in use we need extra provisions for setting up internal tooling to 366 // be available. 367 // 368 // However on a core18 (and similar) system the core snap is not 369 // a special base anymore and we should map our own tooling in. 370 if (config->distro == SC_DISTRO_CORE_OTHER 371 || !sc_streq(config->base_snap_name, "core")) { 372 // when bases are used we need to bind-mount the libexecdir 373 // (that contains snap-exec) into /usr/lib/snapd of the 374 // base snap so that snap-exec is available for the snaps 375 // (base snaps do not ship snapd) 376 377 // dst is always /usr/lib/snapd as this is where snapd 378 // assumes to find snap-exec 379 sc_must_snprintf(dst, sizeof dst, "%s/usr/lib/snapd", 380 scratch_dir); 381 382 // bind mount the current $ROOT/usr/lib/snapd path, 383 // where $ROOT is either "/" or the "/snap/{core,snapd}/current" 384 // that we are re-execing from 385 char *src = NULL; 386 char self[PATH_MAX + 1] = { 0 }; 387 if (readlink("/proc/self/exe", self, sizeof(self) - 1) < 0) { 388 die("cannot read /proc/self/exe"); 389 } 390 // this cannot happen except when the kernel is buggy 391 if (strstr(self, "/snap-confine") == NULL) { 392 die("cannot use result from readlink: %s", self); 393 } 394 src = dirname(self); 395 // dirname(path) might return '.' depending on path. 396 // /proc/self/exe should always point 397 // to an absolute path, but let's guarantee that. 398 if (src[0] != '/') { 399 die("cannot use the result of dirname(): %s", src); 400 } 401 402 sc_do_mount(src, dst, NULL, MS_BIND | MS_RDONLY, NULL); 403 sc_do_mount("none", dst, NULL, MS_SLAVE, NULL); 404 } 405 // Bind mount the directory where all snaps are mounted. The location of 406 // the this directory on the host filesystem may not match the location in 407 // the desired root filesystem. In the "core" and "ubuntu-core" snaps the 408 // directory is always /snap. On the host it is a build-time configuration 409 // option stored in SNAP_MOUNT_DIR. In legacy mode (or in other words, not 410 // in normal mode), we don't need to do this because /snap is fixed and 411 // already contains the correct view of the mounted snaps. 412 if (config->normal_mode) { 413 sc_must_snprintf(dst, sizeof dst, "%s/snap", scratch_dir); 414 sc_do_mount(SNAP_MOUNT_DIR, dst, NULL, MS_BIND | MS_REC, NULL); 415 sc_do_mount("none", dst, NULL, MS_REC | MS_SLAVE, NULL); 416 } 417 // Create the hostfs directory if one is missing. This directory is a part 418 // of packaging now so perhaps this code can be removed later. 419 if (access(SC_HOSTFS_DIR, F_OK) != 0) { 420 debug("creating missing hostfs directory"); 421 if (mkdir(SC_HOSTFS_DIR, 0755) != 0) { 422 die("cannot perform operation: mkdir %s", 423 SC_HOSTFS_DIR); 424 } 425 } 426 // Ensure that hostfs isgroup owned by root. We may have (now or earlier) 427 // created the directory as the user who first ran a snap on a given 428 // system and the group identity of that user is visilbe on disk. 429 // This was LP:#1665004 430 struct stat sb; 431 if (stat(SC_HOSTFS_DIR, &sb) < 0) { 432 die("cannot stat %s", SC_HOSTFS_DIR); 433 } 434 if (sb.st_uid != 0 || sb.st_gid != 0) { 435 if (chown(SC_HOSTFS_DIR, 0, 0) < 0) { 436 die("cannot change user/group owner of %s to root", 437 SC_HOSTFS_DIR); 438 } 439 } 440 // Make the upcoming "put_old" directory for pivot_root private so that 441 // mount events don't propagate to any peer group. In practice pivot root 442 // has a number of undocumented requirements and one of them is that the 443 // "put_old" directory (the second argument) cannot be shared in any way. 444 sc_must_snprintf(dst, sizeof dst, "%s/%s", scratch_dir, SC_HOSTFS_DIR); 445 sc_do_mount(dst, dst, NULL, MS_BIND, NULL); 446 sc_do_mount("none", dst, NULL, MS_PRIVATE, NULL); 447 // On classic mount the nvidia driver. Ideally this would be done in an 448 // uniform way after pivot_root but this is good enough and requires less 449 // code changes the nvidia code assumes it has access to the existing 450 // pre-pivot filesystem. 451 if (config->distro == SC_DISTRO_CLASSIC) { 452 sc_mount_nvidia_driver(scratch_dir); 453 } 454 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 455 // pivot_root 456 // XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX 457 // Use pivot_root to "chroot" into the scratch directory. 458 // 459 // Q: Why are we using something as esoteric as pivot_root(2)? 460 // A: Because this makes apparmor handling easy. Using a normal chroot 461 // makes all apparmor rules conditional. We are either running on an 462 // all-snap system where this would-be chroot didn't happen and all the 463 // rules see / as the root file system _OR_ we are running on top of a 464 // classic distribution and this chroot has now moved all paths to 465 // /tmp/snap.rootfs_*. 466 // 467 // Because we are using unshare(2) with CLONE_NEWNS we can essentially use 468 // pivot_root just like chroot but this makes apparmor unaware of the old 469 // root so everything works okay. 470 // 471 // HINT: If you are debugging this and are trying to see why pivot_root 472 // happens to return EINVAL with any changes you may be making, please 473 // consider applying 474 // misc/0001-Add-printk-based-debugging-to-pivot_root.patch to your tree 475 // kernel. 476 debug("performing operation: pivot_root %s %s", scratch_dir, dst); 477 if (syscall(SYS_pivot_root, scratch_dir, dst) < 0) { 478 die("cannot perform operation: pivot_root %s %s", scratch_dir, 479 dst); 480 } 481 // Unmount the self-bind mount over the scratch directory created earlier 482 // in the original root filesystem (which is now mounted on SC_HOSTFS_DIR). 483 // This way we can remove the temporary directory we created and "clean up" 484 // after ourselves nicely. 485 sc_must_snprintf(dst, sizeof dst, "%s/%s", SC_HOSTFS_DIR, scratch_dir); 486 sc_do_umount(dst, UMOUNT_NOFOLLOW); 487 // Remove the scratch directory. Note that we are using the path that is 488 // based on the old root filesystem as after pivot_root we cannot guarantee 489 // what is present at the same location normally. (It is probably an empty 490 // /tmp directory that is populated in another place). 491 debug("performing operation: rmdir %s", dst); 492 if (rmdir(scratch_dir) < 0) { 493 die("cannot perform operation: rmdir %s", dst); 494 }; 495 // Make the old root filesystem recursively slave. This way operations 496 // performed in this mount namespace will not propagate to the peer group. 497 // This is another essential part of the confinement system. 498 sc_do_mount("none", SC_HOSTFS_DIR, NULL, MS_REC | MS_SLAVE, NULL); 499 // Detach the redundant hostfs version of sysfs since it shows up in the 500 // mount table and software inspecting the mount table may become confused 501 // (eg, docker and LP:# 162601). 502 sc_must_snprintf(src, sizeof src, "%s/sys", SC_HOSTFS_DIR); 503 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 504 // Detach the redundant hostfs version of /dev since it shows up in the 505 // mount table and software inspecting the mount table may become confused. 506 sc_must_snprintf(src, sizeof src, "%s/dev", SC_HOSTFS_DIR); 507 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 508 // Detach the redundant hostfs version of /proc since it shows up in the 509 // mount table and software inspecting the mount table may become confused. 510 sc_must_snprintf(src, sizeof src, "%s/proc", SC_HOSTFS_DIR); 511 sc_do_umount(src, UMOUNT_NOFOLLOW | MNT_DETACH); 512 } 513 514 /** 515 * @path: a pathname where / replaced with '\0'. 516 * @offsetp: pointer to int showing which path segment was last seen. 517 * Updated on return to reflect the next segment. 518 * @fulllen: full original path length. 519 * Returns a pointer to the next path segment, or NULL if done. 520 */ 521 static char * __attribute__((used)) 522 get_nextpath(char *path, size_t *offsetp, size_t fulllen) 523 { 524 size_t offset = *offsetp; 525 526 if (offset >= fulllen) 527 return NULL; 528 529 while (offset < fulllen && path[offset] != '\0') 530 offset++; 531 while (offset < fulllen && path[offset] == '\0') 532 offset++; 533 534 *offsetp = offset; 535 return (offset < fulllen) ? &path[offset] : NULL; 536 } 537 538 /** 539 * Check that @subdir is a subdir of @dir. 540 **/ 541 static bool __attribute__((used)) 542 is_subdir(const char *subdir, const char *dir) 543 { 544 size_t dirlen = strlen(dir); 545 size_t subdirlen = strlen(subdir); 546 547 // @dir has to be at least as long as @subdir 548 if (subdirlen < dirlen) 549 return false; 550 // @dir has to be a prefix of @subdir 551 if (strncmp(subdir, dir, dirlen) != 0) 552 return false; 553 // @dir can look like "path/" (that is, end with the directory separator). 554 // When that is the case then given the test above we can be sure @subdir 555 // is a real subdirectory. 556 if (dirlen > 0 && dir[dirlen - 1] == '/') 557 return true; 558 // @subdir can look like "path/stuff" and when the directory separator 559 // is exactly at the spot where @dir ends (that is, it was not caught 560 // by the test above) then @subdir is a real subdirectory. 561 if (subdir[dirlen] == '/' && dirlen > 0) 562 return true; 563 // If both @dir and @subdir have identical length then given that the 564 // prefix check above @subdir is a real subdirectory. 565 if (subdirlen == dirlen) 566 return true; 567 return false; 568 } 569 570 void sc_populate_mount_ns(struct sc_apparmor *apparmor, int snap_update_ns_fd, 571 const sc_invocation * inv) 572 { 573 // Classify the current distribution, as claimed by /etc/os-release. 574 sc_distro distro = sc_classify_distro(); 575 576 // Check which mode we should run in, normal or legacy. 577 if (inv->is_normal_mode) { 578 // In normal mode we use the base snap as / and set up several bind mounts. 579 const struct sc_mount mounts[] = { 580 {"/dev"}, // because it contains devices on host OS 581 {"/etc"}, // because that's where /etc/resolv.conf lives, perhaps a bad idea 582 {"/home"}, // to support /home/*/snap and home interface 583 {"/root"}, // because that is $HOME for services 584 {"/proc"}, // fundamental filesystem 585 {"/sys"}, // fundamental filesystem 586 {"/tmp"}, // to get writable tmp 587 {"/var/snap"}, // to get access to global snap data 588 {"/var/lib/snapd"}, // to get access to snapd state and seccomp profiles 589 {"/var/tmp"}, // to get access to the other temporary directory 590 {"/run"}, // to get /run with sockets and what not 591 {"/lib/modules",.is_optional = true}, // access to the modules of the running kernel 592 {"/lib/firmware",.is_optional = true}, // access to the firmware of the running kernel 593 {"/usr/src"}, // FIXME: move to SecurityMounts in system-trace interface 594 {"/var/log"}, // FIXME: move to SecurityMounts in log-observe interface 595 #ifdef MERGED_USR 596 {"/run/media", true, "/media"}, // access to the users removable devices 597 #else 598 {"/media", true}, // access to the users removable devices 599 #endif // MERGED_USR 600 {"/run/netns", true}, // access to the 'ip netns' network namespaces 601 // The /mnt directory is optional in base snaps to ensure backwards 602 // compatibility with the first version of base snaps that was 603 // released. 604 {"/mnt",.is_optional = true}, // to support the removable-media interface 605 {"/var/lib/extrausers",.is_optional = true}, // access to UID/GID of extrausers (if available) 606 {}, 607 }; 608 struct sc_mount_config normal_config = { 609 .rootfs_dir = inv->rootfs_dir, 610 .mounts = mounts, 611 .distro = distro, 612 .normal_mode = true, 613 .base_snap_name = inv->base_snap_name, 614 }; 615 sc_bootstrap_mount_namespace(&normal_config); 616 } else { 617 // In legacy mode we don't pivot and instead just arrange bi- 618 // directional mount propagation for two directories. 619 const struct sc_mount mounts[] = { 620 {"/media", true}, 621 {"/run/netns", true}, 622 {}, 623 }; 624 struct sc_mount_config legacy_config = { 625 .rootfs_dir = "/", 626 .mounts = mounts, 627 .distro = distro, 628 .normal_mode = false, 629 .base_snap_name = inv->base_snap_name, 630 }; 631 sc_bootstrap_mount_namespace(&legacy_config); 632 } 633 634 // set up private mounts 635 // TODO: rename this and fold it into bootstrap 636 setup_private_mount(inv->snap_instance); 637 638 // set up private /dev/pts 639 // TODO: fold this into bootstrap 640 setup_private_pts(); 641 642 // setup the security backend bind mounts 643 sc_call_snap_update_ns(snap_update_ns_fd, inv->snap_instance, apparmor); 644 } 645 646 static bool is_mounted_with_shared_option(const char *dir) 647 __attribute__((nonnull(1))); 648 649 static bool is_mounted_with_shared_option(const char *dir) 650 { 651 sc_mountinfo *sm SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 652 sm = sc_parse_mountinfo(NULL); 653 if (sm == NULL) { 654 die("cannot parse /proc/self/mountinfo"); 655 } 656 sc_mountinfo_entry *entry = sc_first_mountinfo_entry(sm); 657 while (entry != NULL) { 658 const char *mount_dir = entry->mount_dir; 659 if (sc_streq(mount_dir, dir)) { 660 const char *optional_fields = entry->optional_fields; 661 if (strstr(optional_fields, "shared:") != NULL) { 662 return true; 663 } 664 } 665 entry = sc_next_mountinfo_entry(entry); 666 } 667 return false; 668 } 669 670 void sc_ensure_shared_snap_mount(void) 671 { 672 if (!is_mounted_with_shared_option("/") 673 && !is_mounted_with_shared_option(SNAP_MOUNT_DIR)) { 674 // TODO: We could be more aggressive and refuse to function but since 675 // we have no data on actual environments that happen to limp along in 676 // this configuration let's not do that yet. This code should be 677 // removed once we have a measurement and feedback mechanism that lets 678 // us decide based on measurable data. 679 sc_do_mount(SNAP_MOUNT_DIR, SNAP_MOUNT_DIR, "none", 680 MS_BIND | MS_REC, 0); 681 sc_do_mount("none", SNAP_MOUNT_DIR, NULL, MS_SHARED | MS_REC, 682 NULL); 683 } 684 } 685 686 void sc_setup_user_mounts(struct sc_apparmor *apparmor, int snap_update_ns_fd, 687 const char *snap_name) 688 { 689 debug("%s: %s", __FUNCTION__, snap_name); 690 691 char profile_path[PATH_MAX]; 692 struct stat st; 693 694 sc_must_snprintf(profile_path, sizeof(profile_path), 695 "/var/lib/snapd/mount/snap.%s.user-fstab", snap_name); 696 if (stat(profile_path, &st) != 0) { 697 // It is ok for the user fstab to not exist. 698 return; 699 } 700 701 // In our new mount namespace, recursively change all mounts 702 // to slave mode, so we see changes from the parent namespace 703 // but don't propagate our own changes. 704 sc_do_mount("none", "/", NULL, MS_REC | MS_SLAVE, NULL); 705 sc_call_snap_update_ns_as_user(snap_update_ns_fd, snap_name, apparmor); 706 }