github.com/tompreston/snapd@v0.0.0-20210817193607-954edfcb9611/cmd/snap-confine/ns-support.c (about) 1 /* 2 * Copyright (C) 2016 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 18 #include "ns-support.h" 19 20 #ifdef HAVE_CONFIG_H 21 #include "config.h" 22 #endif 23 24 #include <errno.h> 25 #include <fcntl.h> 26 #include <linux/magic.h> 27 #include <sched.h> 28 #include <signal.h> 29 #include <string.h> 30 #include <sys/eventfd.h> 31 #include <sys/file.h> 32 #include <sys/mount.h> 33 #include <sys/prctl.h> 34 #include <sys/stat.h> 35 #include <sys/sysmacros.h> 36 #include <sys/types.h> 37 #include <sys/vfs.h> 38 #include <sys/wait.h> 39 #include <unistd.h> 40 41 #include "../libsnap-confine-private/cgroup-freezer-support.h" 42 #include "../libsnap-confine-private/cgroup-support.h" 43 #include "../libsnap-confine-private/classic.h" 44 #include "../libsnap-confine-private/cleanup-funcs.h" 45 #include "../libsnap-confine-private/feature.h" 46 #include "../libsnap-confine-private/infofile.h" 47 #include "../libsnap-confine-private/locking.h" 48 #include "../libsnap-confine-private/mountinfo.h" 49 #include "../libsnap-confine-private/string-utils.h" 50 #include "../libsnap-confine-private/tool.h" 51 #include "../libsnap-confine-private/utils.h" 52 #include "user-support.h" 53 #include "mount-support.h" 54 55 /** 56 * Directory where snap-confine keeps namespace files. 57 **/ 58 #define SC_NS_DIR "/run/snapd/ns" 59 60 /** 61 * Effective value of SC_NS_DIR. 62 * 63 * We use 'const char *' so we can update sc_ns_dir in the testsuite 64 **/ 65 static const char *sc_ns_dir = SC_NS_DIR; 66 67 enum { 68 HELPER_CMD_EXIT, 69 HELPER_CMD_CAPTURE_MOUNT_NS, 70 HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS, 71 }; 72 73 void sc_reassociate_with_pid1_mount_ns(void) 74 { 75 int init_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 76 int self_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 77 const char *path_pid_1 = "/proc/1/ns/mnt"; 78 const char *path_pid_self = "/proc/self/ns/mnt"; 79 80 init_mnt_fd = open(path_pid_1, 81 O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 82 if (init_mnt_fd < 0) { 83 die("cannot open path %s", path_pid_1); 84 } 85 self_mnt_fd = open(path_pid_self, 86 O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 87 if (self_mnt_fd < 0) { 88 die("cannot open path %s", path_pid_1); 89 } 90 char init_buf[128] = { 0 }; 91 char self_buf[128] = { 0 }; 92 memset(init_buf, 0, sizeof init_buf); 93 if (readlinkat(init_mnt_fd, "", init_buf, sizeof init_buf) < 0) { 94 if (errno == ENOENT) { 95 // According to namespaces(7) on a pre 3.8 kernel the namespace 96 // files are hardlinks, not symlinks. If that happens readlinkat 97 // fails with ENOENT. As a quick workaround for this special-case 98 // functionality, just bail out and do nothing without raising an 99 // error. 100 return; 101 } 102 die("cannot read mount namespace identifier of pid 1"); 103 } 104 memset(self_buf, 0, sizeof self_buf); 105 if (readlinkat(self_mnt_fd, "", self_buf, sizeof self_buf) < 0) { 106 die("cannot read mount namespace identifier of the current process"); 107 } 108 if (memcmp(init_buf, self_buf, sizeof init_buf) != 0) { 109 debug("moving to mount namespace of pid 1"); 110 // We cannot use O_NOFOLLOW here because that file will always be a 111 // symbolic link. We actually want to open it this way. 112 int init_mnt_fd_real SC_CLEANUP(sc_cleanup_close) = -1; 113 init_mnt_fd_real = open(path_pid_1, O_RDONLY | O_CLOEXEC); 114 if (init_mnt_fd_real < 0) { 115 die("cannot open %s", path_pid_1); 116 } 117 if (setns(init_mnt_fd_real, CLONE_NEWNS) < 0) { 118 die("cannot join mount namespace of pid 1"); 119 } 120 } 121 } 122 123 void sc_initialize_mount_ns(unsigned int experimental_features) 124 { 125 debug("unsharing snap namespace directory"); 126 127 /* Ensure that /run/snapd/ns is a directory. */ 128 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 129 if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) { 130 die("cannot create directory %s", sc_ns_dir); 131 } 132 (void)sc_set_effective_identity(old); 133 134 /* Read and analyze the mount table. We need to see whether /run/snapd/ns 135 * is a mount point with private event propagation. */ 136 sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 137 info = sc_parse_mountinfo(NULL); 138 if (info == NULL) { 139 die("cannot parse /proc/self/mountinfo"); 140 } 141 142 bool is_mnt = false; 143 bool is_private = false; 144 for (sc_mountinfo_entry * entry = sc_first_mountinfo_entry(info); 145 entry != NULL; entry = sc_next_mountinfo_entry(entry)) { 146 /* Find /run/snapd/ns */ 147 if (!sc_streq(entry->mount_dir, sc_ns_dir)) { 148 continue; 149 } 150 is_mnt = true; 151 if (strstr(entry->optional_fields, "shared:") == NULL) { 152 /* Mount event propagation is not set to shared, good. */ 153 is_private = true; 154 } 155 break; 156 } 157 158 if (!is_mnt) { 159 if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) < 160 0) { 161 die("cannot self-bind mount %s", sc_ns_dir); 162 } 163 } 164 165 if (!is_private) { 166 if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) { 167 die("cannot change propagation type to MS_PRIVATE in %s", sc_ns_dir); 168 } 169 } 170 171 /* code that follows is experimental */ 172 if (experimental_features & SC_FEATURE_PARALLEL_INSTANCES) { 173 // Ensure that SNAP_MOUNT_DIR and /var/snap are shared mount points 174 debug 175 ("(experimental) ensuring snap mount and data directories are mount points"); 176 sc_ensure_snap_dir_shared_mounts(); 177 } 178 } 179 180 struct sc_mount_ns { 181 // Name of the namespace group ($SNAP_NAME). 182 char *name; 183 // Descriptor to the namespace group control directory. This descriptor is 184 // opened with O_PATH|O_DIRECTORY so it's only used for openat() calls. 185 int dir_fd; 186 // Pair of descriptors for a pair for a pipe file descriptors (read end, 187 // write end) that snap-confine uses to send messages to the helper 188 // process and back. 189 int pipe_helper[2]; 190 int pipe_master[2]; 191 // Identifier of the child process that is used during the one-time (per 192 // group) initialization and capture process. 193 pid_t child; 194 }; 195 196 static struct sc_mount_ns *sc_alloc_mount_ns(void) 197 { 198 struct sc_mount_ns *group = calloc(1, sizeof *group); 199 if (group == NULL) { 200 die("cannot allocate memory for sc_mount_ns"); 201 } 202 group->dir_fd = -1; 203 group->pipe_helper[0] = -1; 204 group->pipe_helper[1] = -1; 205 group->pipe_master[0] = -1; 206 group->pipe_master[1] = -1; 207 // Redundant with calloc but some functions check for the non-zero value so 208 // I'd like to keep this explicit in the code. 209 group->child = 0; 210 return group; 211 } 212 213 struct sc_mount_ns *sc_open_mount_ns(const char *group_name) 214 { 215 struct sc_mount_ns *group = sc_alloc_mount_ns(); 216 group->dir_fd = open(sc_ns_dir, 217 O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); 218 if (group->dir_fd < 0) { 219 die("cannot open directory %s", sc_ns_dir); 220 } 221 group->name = sc_strdup(group_name); 222 return group; 223 } 224 225 void sc_close_mount_ns(struct sc_mount_ns *group) 226 { 227 if (group->child != 0) { 228 sc_wait_for_helper(group); 229 } 230 sc_cleanup_close(&group->dir_fd); 231 sc_cleanup_close(&group->pipe_master[0]); 232 sc_cleanup_close(&group->pipe_master[1]); 233 sc_cleanup_close(&group->pipe_helper[0]); 234 sc_cleanup_close(&group->pipe_helper[1]); 235 free(group->name); 236 free(group); 237 } 238 239 static dev_t find_base_snap_device(const char *base_snap_name, 240 const char *base_snap_rev) 241 { 242 // Find the backing device of the base snap. 243 // TODO: add support for "try mode" base snaps that also need 244 // consideration of the mie->root component. 245 dev_t base_snap_dev = 0; 246 char base_squashfs_path[PATH_MAX]; 247 sc_must_snprintf(base_squashfs_path, 248 sizeof base_squashfs_path, "%s/%s/%s", 249 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev); 250 sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 251 mi = sc_parse_mountinfo(NULL); 252 if (mi == NULL) { 253 die("cannot parse mountinfo of the current process"); 254 } 255 bool found = false; 256 for (sc_mountinfo_entry * mie = 257 sc_first_mountinfo_entry(mi); mie != NULL; 258 mie = sc_next_mountinfo_entry(mie)) { 259 if (sc_streq(mie->mount_dir, base_squashfs_path)) { 260 base_snap_dev = makedev(mie->dev_major, mie->dev_minor); 261 debug("block device of snap %s, revision %s is %d:%d", 262 base_snap_name, base_snap_rev, mie->dev_major, 263 mie->dev_minor); 264 // Don't break when found, we are interested in the last 265 // entry as this is the "effective" one. 266 found = true; 267 } 268 } 269 if (!found) { 270 die("cannot find mount entry for snap %s revision %s", 271 base_snap_name, base_snap_rev); 272 } 273 return base_snap_dev; 274 } 275 276 static bool should_discard_current_ns(dev_t base_snap_dev) 277 { 278 // Inspect the namespace and check if we should discard it. 279 // 280 // The namespace may become "stale" when the rootfs is not the same 281 // device we found above. This will happen whenever the base snap is 282 // refreshed since the namespace was first created. 283 sc_mountinfo_entry *mie; 284 sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 285 286 mi = sc_parse_mountinfo(NULL); 287 if (mi == NULL) { 288 die("cannot parse mountinfo of the current process"); 289 } 290 for (mie = sc_first_mountinfo_entry(mi); mie != NULL; 291 mie = sc_next_mountinfo_entry(mie)) { 292 if (!sc_streq(mie->mount_dir, "/")) { 293 continue; 294 } 295 // NOTE: we want the initial rootfs just in case overmount 296 // was used to do something weird. The initial rootfs was 297 // set up by snap-confine and that is the one we want to 298 // measure. 299 debug("block device of the root filesystem is %d:%d", 300 mie->dev_major, mie->dev_minor); 301 return base_snap_dev != makedev(mie->dev_major, mie->dev_minor); 302 } 303 die("cannot find mount entry of the root filesystem"); 304 } 305 306 enum sc_discard_vote { 307 /** 308 * SC_DISCARD_NO denotes that the mount namespace doesn't have to be 309 * discarded. This happens when the base snap has not changed. 310 **/ 311 SC_DISCARD_NO = 1, 312 /** 313 * SC_DISCARD_SHOULD indicates that the mount namespace should be discarded 314 * but may be reused if it is still inhabited by processes. This only 315 * happens when the base snap revision changes but the name of the base 316 * snap is the same as before. 317 **/ 318 SC_DISCARD_SHOULD = 2, 319 /** 320 * SC_DISCARD_MUST indicates that the mount namespace must be discarded 321 * even if it still inhabited by processes. This only happens when the name 322 * of the base snap changes. 323 **/ 324 SC_DISCARD_MUST = 3, 325 }; 326 327 /** 328 * is_base_transition returns true if a base transition is occurring. 329 * 330 * The function inspects /run/snapd/ns/snap.$SNAP_INSTANCE_NAME.info as well 331 * as the invocation parameters of snap-confine. If the base snap name, as 332 * encoded in the info file and as described by the invocation parameters 333 * differ then a base transition is occurring. If the info file is absent or 334 * does not record the name of the base snap then transition cannot be 335 * detected. 336 **/ 337 static bool is_base_transition(const sc_invocation * inv) 338 { 339 char info_path[PATH_MAX] = { 0 }; 340 sc_must_snprintf(info_path, 341 sizeof info_path, 342 "/run/snapd/ns/snap.%s.info", inv->snap_instance); 343 344 FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL; 345 stream = fopen(info_path, "r"); 346 if (stream == NULL && errno == ENOENT) { 347 // If the info file is absent then we cannot decide if a transition had 348 // occurred. For people upgrading from snap-confine without the info 349 // file, that is the best we can do. 350 return false; 351 } 352 if (stream == NULL) { 353 die("cannot open %s", info_path); 354 } 355 356 char *base_snap_name SC_CLEANUP(sc_cleanup_string) = NULL; 357 sc_error *err = NULL; 358 if (sc_infofile_get_key 359 (stream, "base-snap-name", &base_snap_name, &err) < 0) { 360 sc_die_on_error(err); 361 } 362 363 if (base_snap_name == NULL) { 364 // If the info file doesn't record the name of the base snap then, 365 // again, we cannot decide if a transition had occurred. 366 return false; 367 } 368 369 return !sc_streq(inv->orig_base_snap_name, base_snap_name); 370 } 371 372 static bool sc_is_mount_ns_in_use(const char *snap_instance); 373 374 // The namespace may be stale. To check this we must actually switch into it 375 // but then we use up our setns call (the kernel misbehaves if we setns twice). 376 // To work around this we'll fork a child and use it to probe. The child will 377 // inspect the namespace and send information back via eventfd and then exit 378 // unconditionally. 379 static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd, 380 const sc_invocation * inv, 381 int snap_discard_ns_fd) 382 { 383 char base_snap_rev[PATH_MAX] = { 0 }; 384 dev_t base_snap_dev; 385 int event_fd SC_CLEANUP(sc_cleanup_close) = -1; 386 387 // Read the revision of the base snap by looking at the current symlink. 388 if (readlink(inv->rootfs_dir, base_snap_rev, sizeof base_snap_rev) < 0) { 389 die("cannot read current revision of snap %s", 390 inv->snap_instance); 391 } 392 if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') { 393 die("cannot read current revision of snap %s: value too long", 394 inv->snap_instance); 395 } 396 // Find the device that is backing the current revision of the base snap. 397 base_snap_dev = 398 find_base_snap_device(inv->base_snap_name, base_snap_rev); 399 400 // Store the PID of this process. This is done instead of calls to 401 // getppid() below because then we can reliably track the PID of the 402 // parent even if the child process is re-parented. 403 pid_t parent = getpid(); 404 405 // Create an eventfd for the communication with the child. 406 event_fd = eventfd(0, EFD_CLOEXEC); 407 if (event_fd < 0) { 408 die("cannot create eventfd"); 409 } 410 // Fork a child, it will do the inspection for us. 411 pid_t child = fork(); 412 if (child < 0) { 413 die("cannot fork support process"); 414 } 415 416 if (child == 0) { 417 // This is the child process which will inspect the mount namespace. 418 // 419 // Configure the child to die as soon as the parent dies. In an odd 420 // case where the parent is killed then we don't want to complete our 421 // task or wait for anything. 422 if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { 423 die("cannot set parent process death notification signal to SIGINT"); 424 } 425 // Check that parent process is still alive. If this is the case then 426 // we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake 427 // us up from eventfd_read() below. In the rare case that the PID 428 // numbers overflow and the now-dead parent PID is recycled we will 429 // still hang forever on the read from eventfd below. 430 if (kill(parent, 0) < 0) { 431 switch (errno) { 432 case ESRCH: 433 debug("parent process has terminated"); 434 abort(); 435 default: 436 die("cannot confirm that parent process is alive"); 437 break; 438 } 439 } 440 441 debug("joining preserved mount namespace for inspection"); 442 // Move to the mount namespace of the snap we're trying to inspect. 443 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 444 die("cannot join preserved mount namespace"); 445 } 446 // Check if the namespace needs to be discarded. 447 eventfd_t value = SC_DISCARD_NO; 448 const char *value_str = "no"; 449 450 // TODO: enable this for core distributions. This is complex because on 451 // core the rootfs is mounted in initrd and is _not_ changed (no 452 // pivot_root) and the base snap is again mounted (2nd time) by 453 // systemd. This makes us end up in a situation where the outer base 454 // snap will never match the rootfs inside the mount namespace. 455 if (inv->is_normal_mode 456 && should_discard_current_ns(base_snap_dev)) { 457 value = SC_DISCARD_SHOULD; 458 value_str = "should"; 459 } 460 // If the base snap changed, we must discard the mount namespace and 461 // start over to allow the newly started process to see the requested 462 // base snap. Due to the TODO above always perform explicit transition 463 // check to protect against LP:#1819875 and LP:#1861901 464 if (is_base_transition(inv)) { 465 // The base snap has changed. We must discard ... 466 value = SC_DISCARD_MUST; 467 value_str = "must"; 468 } 469 // Send this back to the parent: 3 - force discard 2 - prefer discard, 1 - keep. 470 // Note that we cannot just use 0 and 1 because of the semantics of eventfd(2). 471 if (eventfd_write(event_fd, value) < 0) { 472 die("cannot send information to %s preserved mount namespace", value_str); 473 } 474 // Exit, we're done. 475 exit(0); 476 } 477 // This is back in the parent process. 478 // 479 // Enable a sanity timeout in case the read blocks for unbound amount of 480 // time. This will ensure we will not hang around while holding the lock. 481 // Next, read the value written by the child process. 482 sc_enable_sanity_timeout(); 483 eventfd_t value = 0; 484 if (eventfd_read(event_fd, &value) < 0) { 485 die("cannot read from eventfd"); 486 } 487 sc_disable_sanity_timeout(); 488 489 // Wait for the child process to exit and collect its exit status. 490 errno = 0; 491 int status = 0; 492 if (waitpid(child, &status, 0) < 0) { 493 die("cannot wait for the support process for mount namespace inspection"); 494 } 495 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 496 die("support process for mount namespace inspection exited abnormally"); 497 } 498 // If the namespace is up-to-date then we are done. 499 switch (value) { 500 case SC_DISCARD_NO: 501 debug("preserved mount is not stale, reusing"); 502 return 0; 503 case SC_DISCARD_SHOULD: 504 if (sc_is_mount_ns_in_use(inv->snap_instance)) { 505 // Some processes are still using the namespace so we cannot discard it 506 // as that would fracture the view that the set of processes inside 507 // have on what is mounted. 508 debug 509 ("preserved mount namespace is stale but occupied, reusing"); 510 return 0; 511 } 512 break; 513 case SC_DISCARD_MUST: 514 debug 515 ("preserved mount namespace is stale and base snap has changed, discarding"); 516 break; 517 } 518 sc_call_snap_discard_ns(snap_discard_ns_fd, inv->snap_instance); 519 return EAGAIN; 520 } 521 522 static void helper_fork(struct sc_mount_ns *group, 523 struct sc_apparmor *apparmor); 524 static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor, 525 pid_t parent); 526 static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent); 527 static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent); 528 529 int sc_join_preserved_ns(struct sc_mount_ns *group, struct sc_apparmor 530 *apparmor, const sc_invocation * inv, 531 int snap_discard_ns_fd) 532 { 533 // Open the mount namespace file. 534 char mnt_fname[PATH_MAX] = { 0 }; 535 sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.mnt", group->name); 536 int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 537 // NOTE: There is no O_EXCL here because the file can be around but 538 // doesn't have to be a mounted namespace. 539 mnt_fd = openat(group->dir_fd, mnt_fname, 540 O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); 541 if (mnt_fd < 0 && errno == ENOENT) { 542 return ESRCH; 543 } 544 if (mnt_fd < 0) { 545 die("cannot open preserved mount namespace %s", group->name); 546 } 547 // Check if we got an nsfs-based or procfs file or a regular file. This can 548 // be reliably tested because nsfs has an unique filesystem type 549 // NSFS_MAGIC. On older kernels that don't support nsfs yet we can look 550 // for PROC_SUPER_MAGIC instead. 551 // We can just ensure that this is the case thanks to fstatfs. 552 struct statfs ns_statfs_buf; 553 if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) { 554 die("cannot inspect filesystem of preserved mount namespace file"); 555 } 556 // Stat the mount namespace as well, this is later used to check if the 557 // namespace is used by other processes if we are considering discarding a 558 // stale namespace. 559 struct stat ns_stat_buf; 560 if (fstat(mnt_fd, &ns_stat_buf) < 0) { 561 die("cannot inspect preserved mount namespace file"); 562 } 563 #ifndef NSFS_MAGIC 564 // Account for kernel headers old enough to not know about NSFS_MAGIC. 565 #define NSFS_MAGIC 0x6e736673 566 #endif 567 if (ns_statfs_buf.f_type == NSFS_MAGIC 568 || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) { 569 570 // Inspect and perhaps discard the preserved mount namespace. 571 if (sc_inspect_and_maybe_discard_stale_ns 572 (mnt_fd, inv, snap_discard_ns_fd) == EAGAIN) { 573 return ESRCH; 574 } 575 // Move to the mount namespace of the snap we're trying to start. 576 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 577 die("cannot join preserved mount namespace %s", 578 group->name); 579 } 580 debug("joined preserved mount namespace %s", group->name); 581 return 0; 582 } 583 return ESRCH; 584 } 585 586 int sc_join_preserved_per_user_ns(struct sc_mount_ns *group, 587 const char *snap_name) 588 { 589 uid_t uid = getuid(); 590 char mnt_fname[PATH_MAX] = { 0 }; 591 sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.%d.mnt", group->name, 592 (int)uid); 593 594 int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 595 mnt_fd = openat(group->dir_fd, mnt_fname, 596 O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); 597 if (mnt_fd < 0 && errno == ENOENT) { 598 return ESRCH; 599 } 600 if (mnt_fd < 0) { 601 die("cannot open preserved mount namespace %s", group->name); 602 } 603 struct statfs ns_statfs_buf; 604 if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) { 605 die("cannot inspect filesystem of preserved mount namespace file"); 606 } 607 struct stat ns_stat_buf; 608 if (fstat(mnt_fd, &ns_stat_buf) < 0) { 609 die("cannot inspect preserved mount namespace file"); 610 } 611 #ifndef NSFS_MAGIC 612 /* Define NSFS_MAGIC for Ubuntu 14.04 and other older systems. */ 613 #define NSFS_MAGIC 0x6e736673 614 #endif 615 if (ns_statfs_buf.f_type == NSFS_MAGIC 616 || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) { 617 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 618 die("cannot join preserved per-user mount namespace %s", 619 group->name); 620 } 621 debug("joined preserved mount namespace %s", group->name); 622 return 0; 623 } 624 return ESRCH; 625 } 626 627 static void setup_signals_for_helper(void) 628 { 629 /* Ignore the SIGPIPE signal so that we get EPIPE on the read / write 630 * operations attempting to work with a closed pipe. This ensures that we 631 * are not killed by the default disposition (terminate) and can return a 632 * non-signal-death return code to the program invoking snap-confine. */ 633 if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { 634 die("cannot install ignore handler for SIGPIPE"); 635 } 636 } 637 638 static void teardown_signals_for_helper(void) 639 { 640 /* Undo operations done by setup_signals_for_helper. */ 641 if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) { 642 die("cannot restore default handler for SIGPIPE"); 643 } 644 } 645 646 static void helper_fork(struct sc_mount_ns *group, struct sc_apparmor *apparmor) 647 { 648 // Create a pipe for sending commands to the helper process. 649 if (pipe2(group->pipe_master, O_CLOEXEC | O_DIRECT) < 0) { 650 die("cannot create pipes for commanding the helper process"); 651 } 652 if (pipe2(group->pipe_helper, O_CLOEXEC | O_DIRECT) < 0) { 653 die("cannot create pipes for responding to master process"); 654 } 655 // Store the PID of the "parent" process. This done instead of calls to 656 // getppid() because then we can reliably track the PID of the parent even 657 // if the child process is re-parented. 658 pid_t parent = getpid(); 659 660 // For rationale of forking see this: 661 // https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html 662 pid_t pid = fork(); 663 if (pid < 0) { 664 die("cannot fork helper process for mount namespace capture"); 665 } 666 if (pid == 0) { 667 /* helper */ 668 sc_cleanup_close(&group->pipe_master[1]); 669 sc_cleanup_close(&group->pipe_helper[0]); 670 helper_main(group, apparmor, parent); 671 } else { 672 setup_signals_for_helper(); 673 674 /* master */ 675 sc_cleanup_close(&group->pipe_master[0]); 676 sc_cleanup_close(&group->pipe_helper[1]); 677 678 // Glibc defines pid as a signed 32bit integer. There's no standard way to 679 // print pid's portably so this is the best we can do. 680 debug("forked support process %d", (int)pid); 681 group->child = pid; 682 } 683 } 684 685 static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor, 686 pid_t parent) 687 { 688 // This is the child process which will capture the mount namespace. 689 // 690 // It will do so by bind-mounting the .mnt after the parent process calls 691 // unshare() and finishes setting up the namespace completely. Change the 692 // hat to a sub-profile that has limited permissions necessary to 693 // accomplish the capture of the mount namespace. 694 sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0); 695 // Configure the child to die as soon as the parent dies. In an odd 696 // case where the parent is killed then we don't want to complete our 697 // task or wait for anything. 698 if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { 699 die("cannot set parent process death notification signal to SIGINT"); 700 } 701 // Check that parent process is still alive. If this is the case then we 702 // can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake us up 703 // from read(2) below. In the rare case that the PID numbers overflow and 704 // the now-dead parent PID is recycled we will still hang forever on the 705 // read from the pipe below. 706 if (kill(parent, 0) < 0) { 707 switch (errno) { 708 case ESRCH: 709 // When snap-confine executes it will fork a helper process. That 710 // process establishes an elaborate dance to ensure both itself and 711 // the parent are operating exactly as specified, so that no 712 // processes are left behind for unbound amount of time. As a part 713 // of that dance the child pings the parent to ensure it is still 714 // alive after establishing a notification signal to be sent in 715 // case the parent dies. This is a race avoidance mechanism, we set 716 // up the notification and then check if the parent is alive by the 717 // time we are done. 718 // 719 // In the case when the parent does go away we used to call 720 // abort(). On some distributions this would trigger an unclean 721 // process termination error report to be sent. One such example is 722 // the Ubuntu error tracker. Since the parent process can be 723 // legitimately interrupted and killed, this should not generate an 724 // error report. As such, perform clean exit in this specific case. 725 debug("parent process has terminated"); 726 exit(0); 727 default: 728 die("cannot confirm that parent process is alive"); 729 break; 730 } 731 } 732 if (fchdir(group->dir_fd) < 0) { 733 die("cannot move to directory with preserved namespaces"); 734 } 735 int command = -1; 736 int run = 1; 737 while (run) { 738 debug("helper process waiting for command"); 739 sc_enable_sanity_timeout(); 740 if (read(group->pipe_master[0], &command, sizeof command) < 0) { 741 int saved_errno = errno; 742 // This will ensure we get the correct error message 743 // if there is a read error because the timeout 744 // expired. 745 sc_disable_sanity_timeout(); 746 errno = saved_errno; 747 die("cannot read command from the pipe"); 748 } 749 sc_disable_sanity_timeout(); 750 debug("helper process received command %d", command); 751 switch (command) { 752 case HELPER_CMD_EXIT: 753 run = 0; 754 break; 755 case HELPER_CMD_CAPTURE_MOUNT_NS: 756 helper_capture_ns(group, parent); 757 break; 758 case HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS: 759 helper_capture_per_user_ns(group, parent); 760 break; 761 } 762 if (write(group->pipe_helper[1], &command, sizeof command) < 0) { 763 die("cannot write ack"); 764 } 765 } 766 debug("helper process exiting"); 767 exit(0); 768 } 769 770 static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent) 771 { 772 char src[PATH_MAX] = { 0 }; 773 char dst[PATH_MAX] = { 0 }; 774 775 debug("capturing per-snap mount namespace"); 776 sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); 777 sc_must_snprintf(dst, sizeof dst, "%s.mnt", group->name); 778 779 /* Ensure the bind mount destination exists. */ 780 int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600); 781 if (fd < 0) { 782 die("cannot create file %s", dst); 783 } 784 close(fd); 785 786 if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { 787 die("cannot preserve mount namespace of process %d as %s", 788 (int)parent, dst); 789 } 790 debug("mount namespace of process %d preserved as %s", 791 (int)parent, dst); 792 } 793 794 static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent) 795 { 796 char src[PATH_MAX] = { 0 }; 797 char dst[PATH_MAX] = { 0 }; 798 uid_t uid = getuid(); 799 800 debug("capturing per-snap, per-user mount namespace"); 801 sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); 802 sc_must_snprintf(dst, sizeof dst, "%s.%d.mnt", group->name, (int)uid); 803 804 /* Ensure the bind mount destination exists. */ 805 int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600); 806 if (fd < 0) { 807 die("cannot create file %s", dst); 808 } 809 close(fd); 810 811 if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { 812 die("cannot preserve per-user mount namespace of process %d as %s", (int)parent, dst); 813 } 814 debug("per-user mount namespace of process %d preserved as %s", 815 (int)parent, dst); 816 } 817 818 static void sc_message_capture_helper(struct sc_mount_ns *group, int command_id) 819 { 820 int ack; 821 if (group->child == 0) { 822 die("precondition failed: we don't have a helper process"); 823 } 824 if (group->pipe_master[1] < 0) { 825 die("precondition failed: we don't have a pipe"); 826 } 827 if (group->pipe_helper[0] < 0) { 828 die("precondition failed: we don't have a pipe"); 829 } 830 debug("sending command %d to helper process (pid: %d)", 831 command_id, group->child); 832 if (write(group->pipe_master[1], &command_id, sizeof command_id) < 0) { 833 die("cannot send command %d to helper process", command_id); 834 } 835 debug("waiting for response from helper"); 836 int read_n = read(group->pipe_helper[0], &ack, sizeof ack); 837 if (read_n < 0) { 838 die("cannot receive ack from helper process"); 839 } 840 if (read_n == 0) { 841 die("unexpected eof from helper process"); 842 } 843 } 844 845 static void sc_wait_for_capture_helper(struct sc_mount_ns *group) 846 { 847 if (group->child == 0) { 848 die("precondition failed: we don't have a helper process"); 849 } 850 debug("waiting for the helper process to exit"); 851 int status = 0; 852 errno = 0; 853 if (waitpid(group->child, &status, 0) < 0) { 854 die("cannot wait for the helper process"); 855 } 856 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 857 die("helper process exited abnormally"); 858 } 859 debug("helper process exited normally"); 860 group->child = 0; 861 teardown_signals_for_helper(); 862 } 863 864 void sc_fork_helper(struct sc_mount_ns *group, struct sc_apparmor *apparmor) 865 { 866 helper_fork(group, apparmor); 867 } 868 869 void sc_preserve_populated_mount_ns(struct sc_mount_ns *group) 870 { 871 sc_message_capture_helper(group, HELPER_CMD_CAPTURE_MOUNT_NS); 872 } 873 874 void sc_preserve_populated_per_user_mount_ns(struct sc_mount_ns *group) 875 { 876 sc_message_capture_helper(group, HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS); 877 } 878 879 void sc_wait_for_helper(struct sc_mount_ns *group) 880 { 881 sc_message_capture_helper(group, HELPER_CMD_EXIT); 882 sc_wait_for_capture_helper(group); 883 } 884 885 void sc_store_ns_info(const sc_invocation * inv) 886 { 887 FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL; 888 char info_path[PATH_MAX] = { 0 }; 889 sc_must_snprintf(info_path, sizeof info_path, 890 "/run/snapd/ns/snap.%s.info", inv->snap_instance); 891 int fd = -1; 892 fd = open(info_path, 893 O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW, 0644); 894 if (fd < 0) { 895 die("cannot open %s", info_path); 896 } 897 if (fchown(fd, 0, 0) < 0) { 898 die("cannot chown %s to root.root", info_path); 899 } 900 // The stream now owns the file descriptor. 901 stream = fdopen(fd, "w"); 902 if (stream == NULL) { 903 die("cannot get stream from file descriptor"); 904 } 905 fprintf(stream, "base-snap-name=%s\n", inv->orig_base_snap_name); 906 if (ferror(stream) != 0) { 907 die("I/O error when writing to %s", info_path); 908 } 909 if (fflush(stream) == EOF) { 910 die("cannot flush %s", info_path); 911 } 912 debug("saved mount namespace meta-data to %s", info_path); 913 } 914 915 bool sc_is_mount_ns_in_use(const char *snap_instance) 916 { 917 // perform an indirect check of whether the mount namespace is occupied, 918 // with cgroups v1, each snap process is attached to a group under the 919 // freezer controller, however with cgroups v2, we must check for any groups 920 // tracking the snap 921 bool occupied = false; 922 if (sc_cgroup_is_v2()) { 923 // cgroup v2 must consult the tracking groups 924 occupied = sc_cgroup_v2_is_tracking_snap(snap_instance); 925 } else { 926 occupied = sc_cgroup_freezer_occupied(snap_instance); 927 } 928 return occupied; 929 }