github.com/rigado/snapd@v2.42.5-go-mod+incompatible/cmd/snap-confine/ns-support.c (about) 1 /* 2 * Copyright (C) 2016 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 18 #include "ns-support.h" 19 20 #ifdef HAVE_CONFIG_H 21 #include "config.h" 22 #endif 23 24 #include <errno.h> 25 #include <fcntl.h> 26 #include <linux/magic.h> 27 #include <sched.h> 28 #include <signal.h> 29 #include <string.h> 30 #include <sys/eventfd.h> 31 #include <sys/file.h> 32 #include <sys/mount.h> 33 #include <sys/prctl.h> 34 #include <sys/stat.h> 35 #include <sys/sysmacros.h> 36 #include <sys/types.h> 37 #include <sys/vfs.h> 38 #include <sys/wait.h> 39 #include <unistd.h> 40 41 #include "../libsnap-confine-private/cgroup-freezer-support.h" 42 #include "../libsnap-confine-private/cgroup-support.h" 43 #include "../libsnap-confine-private/classic.h" 44 #include "../libsnap-confine-private/cleanup-funcs.h" 45 #include "../libsnap-confine-private/infofile.h" 46 #include "../libsnap-confine-private/locking.h" 47 #include "../libsnap-confine-private/mountinfo.h" 48 #include "../libsnap-confine-private/string-utils.h" 49 #include "../libsnap-confine-private/tool.h" 50 #include "../libsnap-confine-private/utils.h" 51 #include "user-support.h" 52 53 /** 54 * Directory where snap-confine keeps namespace files. 55 **/ 56 #define SC_NS_DIR "/run/snapd/ns" 57 58 /** 59 * Effective value of SC_NS_DIR. 60 * 61 * We use 'const char *' so we can update sc_ns_dir in the testsuite 62 **/ 63 static const char *sc_ns_dir = SC_NS_DIR; 64 65 enum { 66 HELPER_CMD_EXIT, 67 HELPER_CMD_CAPTURE_MOUNT_NS, 68 HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS, 69 }; 70 71 void sc_reassociate_with_pid1_mount_ns(void) 72 { 73 int init_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 74 int self_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 75 const char *path_pid_1 = "/proc/1/ns/mnt"; 76 const char *path_pid_self = "/proc/self/ns/mnt"; 77 78 init_mnt_fd = open(path_pid_1, 79 O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 80 if (init_mnt_fd < 0) { 81 die("cannot open path %s", path_pid_1); 82 } 83 self_mnt_fd = open(path_pid_self, 84 O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 85 if (self_mnt_fd < 0) { 86 die("cannot open path %s", path_pid_1); 87 } 88 char init_buf[128] = { 0 }; 89 char self_buf[128] = { 0 }; 90 memset(init_buf, 0, sizeof init_buf); 91 if (readlinkat(init_mnt_fd, "", init_buf, sizeof init_buf) < 0) { 92 if (errno == ENOENT) { 93 // According to namespaces(7) on a pre 3.8 kernel the namespace 94 // files are hardlinks, not sylinks. If that happens readlinkat 95 // fails with ENOENT. As a quick workaround for this special-case 96 // functionality, just bail out and do nothing without raising an 97 // error. 98 return; 99 } 100 die("cannot read mount namespace identifier of pid 1"); 101 } 102 memset(self_buf, 0, sizeof self_buf); 103 if (readlinkat(self_mnt_fd, "", self_buf, sizeof self_buf) < 0) { 104 die("cannot read mount namespace identifier of the current process"); 105 } 106 if (memcmp(init_buf, self_buf, sizeof init_buf) != 0) { 107 debug("moving to mount namespace of pid 1"); 108 // We cannot use O_NOFOLLOW here because that file will always be a 109 // symbolic link. We actually want to open it this way. 110 int init_mnt_fd_real SC_CLEANUP(sc_cleanup_close) = -1; 111 init_mnt_fd_real = open(path_pid_1, O_RDONLY | O_CLOEXEC); 112 if (init_mnt_fd_real < 0) { 113 die("cannot open %s", path_pid_1); 114 } 115 if (setns(init_mnt_fd_real, CLONE_NEWNS) < 0) { 116 die("cannot join mount namespace of pid 1"); 117 } 118 } 119 } 120 121 void sc_initialize_mount_ns(void) 122 { 123 /* Ensure that /run/snapd/ns is a directory. */ 124 if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) { 125 die("cannot create directory %s", sc_ns_dir); 126 } 127 128 /* Read and analyze the mount table. We need to see whether /run/snapd/ns 129 * is a mount point with private event propagation. */ 130 sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 131 info = sc_parse_mountinfo(NULL); 132 if (info == NULL) { 133 die("cannot parse /proc/self/mountinfo"); 134 } 135 136 bool is_mnt = false; 137 bool is_private = false; 138 for (sc_mountinfo_entry * entry = sc_first_mountinfo_entry(info); 139 entry != NULL; entry = sc_next_mountinfo_entry(entry)) { 140 /* Find /run/snapd/ns */ 141 if (!sc_streq(entry->mount_dir, sc_ns_dir)) { 142 continue; 143 } 144 is_mnt = true; 145 if (strstr(entry->optional_fields, "shared:") == NULL) { 146 /* Mount event propagation is not set to shared, good. */ 147 is_private = true; 148 } 149 break; 150 } 151 152 if (!is_mnt) { 153 if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) < 154 0) { 155 die("cannot self-bind mount %s", sc_ns_dir); 156 } 157 } 158 159 if (!is_private) { 160 if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) { 161 die("cannot change propagation type to MS_PRIVATE in %s", sc_ns_dir); 162 } 163 } 164 } 165 166 struct sc_mount_ns { 167 // Name of the namespace group ($SNAP_NAME). 168 char *name; 169 // Descriptor to the namespace group control directory. This descriptor is 170 // opened with O_PATH|O_DIRECTORY so it's only used for openat() calls. 171 int dir_fd; 172 // Pair of descriptors for a pair for a pipe file descriptors (read end, 173 // write end) that snap-confine uses to send messages to the helper 174 // process and back. 175 int pipe_helper[2]; 176 int pipe_master[2]; 177 // Identifier of the child process that is used during the one-time (per 178 // group) initialization and capture process. 179 pid_t child; 180 }; 181 182 static struct sc_mount_ns *sc_alloc_mount_ns(void) 183 { 184 struct sc_mount_ns *group = calloc(1, sizeof *group); 185 if (group == NULL) { 186 die("cannot allocate memory for sc_mount_ns"); 187 } 188 group->dir_fd = -1; 189 group->pipe_helper[0] = -1; 190 group->pipe_helper[1] = -1; 191 group->pipe_master[0] = -1; 192 group->pipe_master[1] = -1; 193 // Redundant with calloc but some functions check for the non-zero value so 194 // I'd like to keep this explicit in the code. 195 group->child = 0; 196 return group; 197 } 198 199 struct sc_mount_ns *sc_open_mount_ns(const char *group_name) 200 { 201 struct sc_mount_ns *group = sc_alloc_mount_ns(); 202 group->dir_fd = open(sc_ns_dir, 203 O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); 204 if (group->dir_fd < 0) { 205 die("cannot open directory %s", sc_ns_dir); 206 } 207 group->name = sc_strdup(group_name); 208 return group; 209 } 210 211 void sc_close_mount_ns(struct sc_mount_ns *group) 212 { 213 if (group->child != 0) { 214 sc_wait_for_helper(group); 215 } 216 sc_cleanup_close(&group->dir_fd); 217 sc_cleanup_close(&group->pipe_master[0]); 218 sc_cleanup_close(&group->pipe_master[1]); 219 sc_cleanup_close(&group->pipe_helper[0]); 220 sc_cleanup_close(&group->pipe_helper[1]); 221 free(group->name); 222 free(group); 223 } 224 225 static dev_t find_base_snap_device(const char *base_snap_name, 226 const char *base_snap_rev) 227 { 228 // Find the backing device of the base snap. 229 // TODO: add support for "try mode" base snaps that also need 230 // consideration of the mie->root component. 231 dev_t base_snap_dev = 0; 232 char base_squashfs_path[PATH_MAX]; 233 sc_must_snprintf(base_squashfs_path, 234 sizeof base_squashfs_path, "%s/%s/%s", 235 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev); 236 sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 237 mi = sc_parse_mountinfo(NULL); 238 if (mi == NULL) { 239 die("cannot parse mountinfo of the current process"); 240 } 241 bool found = false; 242 for (sc_mountinfo_entry * mie = 243 sc_first_mountinfo_entry(mi); mie != NULL; 244 mie = sc_next_mountinfo_entry(mie)) { 245 if (sc_streq(mie->mount_dir, base_squashfs_path)) { 246 base_snap_dev = makedev(mie->dev_major, mie->dev_minor); 247 debug("block device of snap %s, revision %s is %d:%d", 248 base_snap_name, base_snap_rev, mie->dev_major, 249 mie->dev_minor); 250 // Don't break when found, we are interested in the last 251 // entry as this is the "effective" one. 252 found = true; 253 } 254 } 255 if (!found) { 256 die("cannot find mount entry for snap %s revision %s", 257 base_snap_name, base_snap_rev); 258 } 259 return base_snap_dev; 260 } 261 262 static bool should_discard_current_ns(dev_t base_snap_dev) 263 { 264 // Inspect the namespace and check if we should discard it. 265 // 266 // The namespace may become "stale" when the rootfs is not the same 267 // device we found above. This will happen whenever the base snap is 268 // refreshed since the namespace was first created. 269 sc_mountinfo_entry *mie; 270 sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 271 272 mi = sc_parse_mountinfo(NULL); 273 if (mi == NULL) { 274 die("cannot parse mountinfo of the current process"); 275 } 276 for (mie = sc_first_mountinfo_entry(mi); mie != NULL; 277 mie = sc_next_mountinfo_entry(mie)) { 278 if (!sc_streq(mie->mount_dir, "/")) { 279 continue; 280 } 281 // NOTE: we want the initial rootfs just in case overmount 282 // was used to do something weird. The initial rootfs was 283 // set up by snap-confine and that is the one we want to 284 // measure. 285 debug("block device of the root filesystem is %d:%d", 286 mie->dev_major, mie->dev_minor); 287 return base_snap_dev != makedev(mie->dev_major, mie->dev_minor); 288 } 289 die("cannot find mount entry of the root filesystem"); 290 } 291 292 enum sc_discard_vote { 293 /** 294 * SC_DISCARD_NO denotes that the mount namespace doesn't have to be 295 * discarded. This happens when the base snap has not changed. 296 **/ 297 SC_DISCARD_NO = 1, 298 /** 299 * SC_DISCARD_SHOULD indicates that the mount namespace should be discarded 300 * but may be reused if it is still inhabited by processes. This only 301 * happens when the base snap revision changes but the name of the base 302 * snap is the same as before. 303 **/ 304 SC_DISCARD_SHOULD = 2, 305 /** 306 * SC_DISCARD_MUST indicates that the mount namespace must be discarded 307 * even if it still inhabited by processes. This only happens when the name 308 * of the base snap changes. 309 **/ 310 SC_DISCARD_MUST = 3, 311 }; 312 313 /** 314 * is_base_transition returns true if a base transition is occurring. 315 * 316 * The function inspects /run/snapd/ns/snap.$SNAP_INSTANCE_NAME.info as well 317 * as the invocation parameters of snap-confine. If the base snap name, as 318 * encoded in the info file and as described by the invocation parameters 319 * differ then a base transition is occurring. If the info file is absent or 320 * does not record the name of the base snap then transition cannot be 321 * detected. 322 **/ 323 static bool is_base_transition(const sc_invocation * inv) 324 { 325 char info_path[PATH_MAX] = { 0 }; 326 sc_must_snprintf(info_path, 327 sizeof info_path, 328 "/run/snapd/ns/snap.%s.info", inv->snap_instance); 329 330 FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL; 331 stream = fopen(info_path, "r"); 332 if (stream == NULL && errno == ENOENT) { 333 // If the info file is absent then we cannot decide if a transition had 334 // occurred. For people upgrading from snap-confine without the info 335 // file, that is the best we can do. 336 return false; 337 } 338 if (stream == NULL) { 339 die("cannot open %s", info_path); 340 } 341 342 char *base_snap_name SC_CLEANUP(sc_cleanup_string) = NULL; 343 sc_error *err = NULL; 344 if (sc_infofile_get_key 345 (stream, "base-snap-name", &base_snap_name, &err) < 0) { 346 sc_die_on_error(err); 347 } 348 349 if (base_snap_name == NULL) { 350 // If the info file doesn't record the name of the base snap then, 351 // again, we cannot decide if a transition had occurred. 352 return false; 353 } 354 355 return !sc_streq(inv->orig_base_snap_name, base_snap_name); 356 } 357 358 // The namespace may be stale. To check this we must actually switch into it 359 // but then we use up our setns call (the kernel misbehaves if we setns twice). 360 // To work around this we'll fork a child and use it to probe. The child will 361 // inspect the namespace and send information back via eventfd and then exit 362 // unconditionally. 363 static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd, 364 const sc_invocation * inv, 365 int snap_discard_ns_fd) 366 { 367 char base_snap_rev[PATH_MAX] = { 0 }; 368 dev_t base_snap_dev; 369 int event_fd SC_CLEANUP(sc_cleanup_close) = -1; 370 371 // Read the revision of the base snap by looking at the current symlink. 372 if (readlink(inv->rootfs_dir, base_snap_rev, sizeof base_snap_rev) < 0) { 373 die("cannot read current revision of snap %s", 374 inv->snap_instance); 375 } 376 if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') { 377 die("cannot read current revision of snap %s: value too long", 378 inv->snap_instance); 379 } 380 // Find the device that is backing the current revision of the base snap. 381 base_snap_dev = 382 find_base_snap_device(inv->base_snap_name, base_snap_rev); 383 384 // Store the PID of this process. This is done instead of calls to 385 // getppid() below because then we can reliably track the PID of the 386 // parent even if the child process is re-parented. 387 pid_t parent = getpid(); 388 389 // Create an eventfd for the communication with the child. 390 event_fd = eventfd(0, EFD_CLOEXEC); 391 if (event_fd < 0) { 392 die("cannot create eventfd"); 393 } 394 // Fork a child, it will do the inspection for us. 395 pid_t child = fork(); 396 if (child < 0) { 397 die("cannot fork support process"); 398 } 399 400 if (child == 0) { 401 // This is the child process which will inspect the mount namespace. 402 // 403 // Configure the child to die as soon as the parent dies. In an odd 404 // case where the parent is killed then we don't want to complete our 405 // task or wait for anything. 406 if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { 407 die("cannot set parent process death notification signal to SIGINT"); 408 } 409 // Check that parent process is still alive. If this is the case then 410 // we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake 411 // us up from eventfd_read() below. In the rare case that the PID 412 // numbers overflow and the now-dead parent PID is recycled we will 413 // still hang forever on the read from eventfd below. 414 if (kill(parent, 0) < 0) { 415 switch (errno) { 416 case ESRCH: 417 debug("parent process has terminated"); 418 abort(); 419 default: 420 die("cannot confirm that parent process is alive"); 421 break; 422 } 423 } 424 425 debug("joining preserved mount namespace for inspection"); 426 // Move to the mount namespace of the snap we're trying to inspect. 427 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 428 die("cannot join preserved mount namespace"); 429 } 430 // Check if the namespace needs to be discarded. 431 eventfd_t value = SC_DISCARD_NO; 432 const char *value_str = "no"; 433 434 // TODO: enable this for core distributions. This is complex because on 435 // core the rootfs is mounted in initrd and is _not_ changed (no 436 // pivot_root) and the base snap is again mounted (2nd time) by 437 // systemd. This makes us end up in a situation where the outer base 438 // snap will never match the rootfs inside the mount namespace. 439 if (inv->is_normal_mode 440 && should_discard_current_ns(base_snap_dev)) { 441 value = SC_DISCARD_SHOULD; 442 value_str = "should"; 443 444 // The namespace is stale so also check if we must discard it due to the 445 // base snap changing. If the base snap changed, we must discard since even 446 // though currently running processes from this snap will continue to see 447 // the old base, we want new processes to use the new base. See LP: 448 // #1819875 for details. 449 if (is_base_transition(inv)) { 450 // The base snap has changed. We must discard ... 451 value = SC_DISCARD_MUST; 452 value_str = "must"; 453 } 454 } 455 // Send this back to the parent: 3 - force discard 2 - prefer discard, 1 - keep. 456 // Note that we cannot just use 0 and 1 because of the semantics of eventfd(2). 457 if (eventfd_write(event_fd, value) < 0) { 458 die("cannot send information to %s preserved mount namespace", value_str); 459 } 460 // Exit, we're done. 461 exit(0); 462 } 463 // This is back in the parent process. 464 // 465 // Enable a sanity timeout in case the read blocks for unbound amount of 466 // time. This will ensure we will not hang around while holding the lock. 467 // Next, read the value written by the child process. 468 sc_enable_sanity_timeout(); 469 eventfd_t value = 0; 470 if (eventfd_read(event_fd, &value) < 0) { 471 die("cannot read from eventfd"); 472 } 473 sc_disable_sanity_timeout(); 474 475 // Wait for the child process to exit and collect its exit status. 476 errno = 0; 477 int status = 0; 478 if (waitpid(child, &status, 0) < 0) { 479 die("cannot wait for the support process for mount namespace inspection"); 480 } 481 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 482 die("support process for mount namespace inspection exited abnormally"); 483 } 484 // If the namespace is up-to-date then we are done. 485 switch (value) { 486 case SC_DISCARD_NO: 487 debug("preserved mount is not stale, reusing"); 488 return 0; 489 case SC_DISCARD_SHOULD: 490 if (sc_cgroup_is_v2()) { 491 debug 492 ("WARNING: cgroup v2 detected, preserved mount namespace process presence check unsupported, discarding"); 493 break; 494 } 495 if (sc_cgroup_freezer_occupied(inv->snap_instance)) { 496 // Some processes are still using the namespace so we cannot discard it 497 // as that would fracture the view that the set of processes inside 498 // have on what is mounted. 499 debug 500 ("preserved mount namespace is stale but occupied, reusing"); 501 return 0; 502 } 503 break; 504 case SC_DISCARD_MUST: 505 debug 506 ("preserved mount namespace is stale and base snap has changed, discarding"); 507 break; 508 } 509 sc_call_snap_discard_ns(snap_discard_ns_fd, inv->snap_instance); 510 return EAGAIN; 511 } 512 513 static void helper_fork(struct sc_mount_ns *group, 514 struct sc_apparmor *apparmor); 515 static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor, 516 pid_t parent); 517 static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent); 518 static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent); 519 520 int sc_join_preserved_ns(struct sc_mount_ns *group, struct sc_apparmor 521 *apparmor, const sc_invocation * inv, 522 int snap_discard_ns_fd) 523 { 524 // Open the mount namespace file. 525 char mnt_fname[PATH_MAX] = { 0 }; 526 sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.mnt", group->name); 527 int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 528 // NOTE: There is no O_EXCL here because the file can be around but 529 // doesn't have to be a mounted namespace. 530 mnt_fd = openat(group->dir_fd, mnt_fname, 531 O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); 532 if (mnt_fd < 0 && errno == ENOENT) { 533 return ESRCH; 534 } 535 if (mnt_fd < 0) { 536 die("cannot open preserved mount namespace %s", group->name); 537 } 538 // Check if we got an nsfs-based or procfs file or a regular file. This can 539 // be reliably tested because nsfs has an unique filesystem type 540 // NSFS_MAGIC. On older kernels that don't support nsfs yet we can look 541 // for PROC_SUPER_MAGIC instead. 542 // We can just ensure that this is the case thanks to fstatfs. 543 struct statfs ns_statfs_buf; 544 if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) { 545 die("cannot inspect filesystem of preserved mount namespace file"); 546 } 547 // Stat the mount namespace as well, this is later used to check if the 548 // namespace is used by other processes if we are considering discarding a 549 // stale namespace. 550 struct stat ns_stat_buf; 551 if (fstat(mnt_fd, &ns_stat_buf) < 0) { 552 die("cannot inspect preserved mount namespace file"); 553 } 554 #ifndef NSFS_MAGIC 555 // Account for kernel headers old enough to not know about NSFS_MAGIC. 556 #define NSFS_MAGIC 0x6e736673 557 #endif 558 if (ns_statfs_buf.f_type == NSFS_MAGIC 559 || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) { 560 561 // Inspect and perhaps discard the preserved mount namespace. 562 if (sc_inspect_and_maybe_discard_stale_ns 563 (mnt_fd, inv, snap_discard_ns_fd) == EAGAIN) { 564 return ESRCH; 565 } 566 // Move to the mount namespace of the snap we're trying to start. 567 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 568 die("cannot join preserved mount namespace %s", 569 group->name); 570 } 571 debug("joined preserved mount namespace %s", group->name); 572 return 0; 573 } 574 return ESRCH; 575 } 576 577 int sc_join_preserved_per_user_ns(struct sc_mount_ns *group, 578 const char *snap_name) 579 { 580 uid_t uid = getuid(); 581 char mnt_fname[PATH_MAX] = { 0 }; 582 sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.%d.mnt", group->name, 583 (int)uid); 584 585 int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 586 mnt_fd = openat(group->dir_fd, mnt_fname, 587 O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); 588 if (mnt_fd < 0 && errno == ENOENT) { 589 return ESRCH; 590 } 591 if (mnt_fd < 0) { 592 die("cannot open preserved mount namespace %s", group->name); 593 } 594 struct statfs ns_statfs_buf; 595 if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) { 596 die("cannot inspect filesystem of preserved mount namespace file"); 597 } 598 struct stat ns_stat_buf; 599 if (fstat(mnt_fd, &ns_stat_buf) < 0) { 600 die("cannot inspect preserved mount namespace file"); 601 } 602 #ifndef NSFS_MAGIC 603 /* Define NSFS_MAGIC for Ubuntu 14.04 and other older systems. */ 604 #define NSFS_MAGIC 0x6e736673 605 #endif 606 if (ns_statfs_buf.f_type == NSFS_MAGIC 607 || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) { 608 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 609 die("cannot join preserved per-user mount namespace %s", 610 group->name); 611 } 612 debug("joined preserved mount namespace %s", group->name); 613 return 0; 614 } 615 return ESRCH; 616 } 617 618 static void setup_signals_for_helper(void) 619 { 620 /* Ignore the SIGPIPE signal so that we get EPIPE on the read / write 621 * operations attempting to work with a closed pipe. This ensures that we 622 * are not killed by the default disposition (terminate) and can return a 623 * non-signal-death return code to the program invoking snap-confine. */ 624 if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { 625 die("cannot install ignore handler for SIGPIPE"); 626 } 627 } 628 629 static void teardown_signals_for_helper(void) 630 { 631 /* Undo operations done by setup_signals_for_helper. */ 632 if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) { 633 die("cannot restore default handler for SIGPIPE"); 634 } 635 } 636 637 static void helper_fork(struct sc_mount_ns *group, struct sc_apparmor *apparmor) 638 { 639 // Create a pipe for sending commands to the helper process. 640 if (pipe2(group->pipe_master, O_CLOEXEC | O_DIRECT) < 0) { 641 die("cannot create pipes for commanding the helper process"); 642 } 643 if (pipe2(group->pipe_helper, O_CLOEXEC | O_DIRECT) < 0) { 644 die("cannot create pipes for responding to master process"); 645 } 646 // Store the PID of the "parent" process. This done instead of calls to 647 // getppid() because then we can reliably track the PID of the parent even 648 // if the child process is re-parented. 649 pid_t parent = getpid(); 650 651 // For rationale of forking see this: 652 // https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html 653 pid_t pid = fork(); 654 if (pid < 0) { 655 die("cannot fork helper process for mount namespace capture"); 656 } 657 if (pid == 0) { 658 /* helper */ 659 sc_cleanup_close(&group->pipe_master[1]); 660 sc_cleanup_close(&group->pipe_helper[0]); 661 helper_main(group, apparmor, parent); 662 } else { 663 setup_signals_for_helper(); 664 665 /* master */ 666 sc_cleanup_close(&group->pipe_master[0]); 667 sc_cleanup_close(&group->pipe_helper[1]); 668 669 // Glibc defines pid as a signed 32bit integer. There's no standard way to 670 // print pid's portably so this is the best we can do. 671 debug("forked support process %d", (int)pid); 672 group->child = pid; 673 } 674 } 675 676 static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor, 677 pid_t parent) 678 { 679 // This is the child process which will capture the mount namespace. 680 // 681 // It will do so by bind-mounting the .mnt after the parent process calls 682 // unshare() and finishes setting up the namespace completely. Change the 683 // hat to a sub-profile that has limited permissions necessary to 684 // accomplish the capture of the mount namespace. 685 sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0); 686 // Configure the child to die as soon as the parent dies. In an odd 687 // case where the parent is killed then we don't want to complete our 688 // task or wait for anything. 689 if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { 690 die("cannot set parent process death notification signal to SIGINT"); 691 } 692 // Check that parent process is still alive. If this is the case then we 693 // can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake us up 694 // from read(2) below. In the rare case that the PID numbers overflow and 695 // the now-dead parent PID is recycled we will still hang forever on the 696 // read from the pipe below. 697 if (kill(parent, 0) < 0) { 698 switch (errno) { 699 case ESRCH: 700 // When snap-confine executes it will fork a helper process. That 701 // process establishes an elaborate dance to ensure both itself and 702 // the parent are operating exactly as specified, so that no 703 // processes are left behind for unbound amount of time. As a part 704 // of that dance the child pings the parent to ensure it is still 705 // alive after establishing a notification signal to be sent in 706 // case the parent dies. This is a race avoidance mechanism, we set 707 // up the notification and then check if the parent is alive by the 708 // time we are done. 709 // 710 // In the case when the parent does go away we used to call 711 // abort(). On some distributions this would trigger an unclean 712 // process termination error report to be sent. One such example is 713 // the Ubuntu error tracker. Since the parent process can be 714 // legitimately interrupted and killed, this should not generate an 715 // error report. As such, perform clean exit in this specific case. 716 debug("parent process has terminated"); 717 exit(0); 718 default: 719 die("cannot confirm that parent process is alive"); 720 break; 721 } 722 } 723 if (fchdir(group->dir_fd) < 0) { 724 die("cannot move to directory with preserved namespaces"); 725 } 726 int command = -1; 727 int run = 1; 728 while (run) { 729 debug("helper process waiting for command"); 730 sc_enable_sanity_timeout(); 731 if (read(group->pipe_master[0], &command, sizeof command) < 0) { 732 int saved_errno = errno; 733 // This will ensure we get the correct error message 734 // if there is a read error because the timeout 735 // expired. 736 sc_disable_sanity_timeout(); 737 errno = saved_errno; 738 die("cannot read command from the pipe"); 739 } 740 sc_disable_sanity_timeout(); 741 debug("helper process received command %d", command); 742 switch (command) { 743 case HELPER_CMD_EXIT: 744 run = 0; 745 break; 746 case HELPER_CMD_CAPTURE_MOUNT_NS: 747 helper_capture_ns(group, parent); 748 break; 749 case HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS: 750 helper_capture_per_user_ns(group, parent); 751 break; 752 } 753 if (write(group->pipe_helper[1], &command, sizeof command) < 0) { 754 die("cannot write ack"); 755 } 756 } 757 debug("helper process exiting"); 758 exit(0); 759 } 760 761 static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent) 762 { 763 char src[PATH_MAX] = { 0 }; 764 char dst[PATH_MAX] = { 0 }; 765 766 debug("capturing per-snap mount namespace"); 767 sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); 768 sc_must_snprintf(dst, sizeof dst, "%s.mnt", group->name); 769 770 /* Ensure the bind mount destination exists. */ 771 int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600); 772 if (fd < 0) { 773 die("cannot create file %s", dst); 774 } 775 close(fd); 776 777 if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { 778 die("cannot preserve mount namespace of process %d as %s", 779 (int)parent, dst); 780 } 781 debug("mount namespace of process %d preserved as %s", 782 (int)parent, dst); 783 } 784 785 static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent) 786 { 787 char src[PATH_MAX] = { 0 }; 788 char dst[PATH_MAX] = { 0 }; 789 uid_t uid = getuid(); 790 791 debug("capturing per-snap, per-user mount namespace"); 792 sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); 793 sc_must_snprintf(dst, sizeof dst, "%s.%d.mnt", group->name, (int)uid); 794 795 /* Ensure the bind mount destination exists. */ 796 int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600); 797 if (fd < 0) { 798 die("cannot create file %s", dst); 799 } 800 close(fd); 801 802 if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { 803 die("cannot preserve per-user mount namespace of process %d as %s", (int)parent, dst); 804 } 805 debug("per-user mount namespace of process %d preserved as %s", 806 (int)parent, dst); 807 } 808 809 static void sc_message_capture_helper(struct sc_mount_ns *group, int command_id) 810 { 811 int ack; 812 if (group->child == 0) { 813 die("precondition failed: we don't have a helper process"); 814 } 815 if (group->pipe_master[1] < 0) { 816 die("precondition failed: we don't have a pipe"); 817 } 818 if (group->pipe_helper[0] < 0) { 819 die("precondition failed: we don't have a pipe"); 820 } 821 debug("sending command %d to helper process (pid: %d)", 822 command_id, group->child); 823 if (write(group->pipe_master[1], &command_id, sizeof command_id) < 0) { 824 die("cannot send command %d to helper process", command_id); 825 } 826 debug("waiting for response from helper"); 827 int read_n = read(group->pipe_helper[0], &ack, sizeof ack); 828 if (read_n < 0) { 829 die("cannot receive ack from helper process"); 830 } 831 if (read_n == 0) { 832 die("unexpected eof from helper process"); 833 } 834 } 835 836 static void sc_wait_for_capture_helper(struct sc_mount_ns *group) 837 { 838 if (group->child == 0) { 839 die("precondition failed: we don't have a helper process"); 840 } 841 debug("waiting for the helper process to exit"); 842 int status = 0; 843 errno = 0; 844 if (waitpid(group->child, &status, 0) < 0) { 845 die("cannot wait for the helper process"); 846 } 847 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 848 die("helper process exited abnormally"); 849 } 850 debug("helper process exited normally"); 851 group->child = 0; 852 teardown_signals_for_helper(); 853 } 854 855 void sc_fork_helper(struct sc_mount_ns *group, struct sc_apparmor *apparmor) 856 { 857 helper_fork(group, apparmor); 858 } 859 860 void sc_preserve_populated_mount_ns(struct sc_mount_ns *group) 861 { 862 sc_message_capture_helper(group, HELPER_CMD_CAPTURE_MOUNT_NS); 863 } 864 865 void sc_preserve_populated_per_user_mount_ns(struct sc_mount_ns *group) 866 { 867 sc_message_capture_helper(group, HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS); 868 } 869 870 void sc_wait_for_helper(struct sc_mount_ns *group) 871 { 872 sc_message_capture_helper(group, HELPER_CMD_EXIT); 873 sc_wait_for_capture_helper(group); 874 } 875 876 void sc_store_ns_info(const sc_invocation * inv) 877 { 878 FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL; 879 char info_path[PATH_MAX] = { 0 }; 880 sc_must_snprintf(info_path, sizeof info_path, 881 "/run/snapd/ns/snap.%s.info", inv->snap_instance); 882 int fd = -1; 883 fd = open(info_path, 884 O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW, 0644); 885 if (fd < 0) { 886 die("cannot open %s", info_path); 887 } 888 if (fchown(fd, 0, 0) < 0) { 889 die("cannot chown %s to root.root", info_path); 890 } 891 // The stream now owns the file descriptor. 892 stream = fdopen(fd, "w"); 893 if (stream == NULL) { 894 die("cannot get stream from file descriptor"); 895 } 896 fprintf(stream, "base-snap-name=%s\n", inv->orig_base_snap_name); 897 if (ferror(stream) != 0) { 898 die("I/O error when writing to %s", info_path); 899 } 900 if (fflush(stream) == EOF) { 901 die("cannot flush %s", info_path); 902 } 903 debug("saved mount namespace meta-data to %s", info_path); 904 }