github.com/kubiko/snapd@v0.0.0-20201013125620-d4f3094d9ddf/cmd/snap-confine/ns-support.c (about) 1 /* 2 * Copyright (C) 2016 Canonical Ltd 3 * 4 * This program is free software: you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License version 3 as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * GNU General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public License 14 * along with this program. If not, see <http://www.gnu.org/licenses/>. 15 * 16 */ 17 18 #include "ns-support.h" 19 20 #ifdef HAVE_CONFIG_H 21 #include "config.h" 22 #endif 23 24 #include <errno.h> 25 #include <fcntl.h> 26 #include <linux/magic.h> 27 #include <sched.h> 28 #include <signal.h> 29 #include <string.h> 30 #include <sys/eventfd.h> 31 #include <sys/file.h> 32 #include <sys/mount.h> 33 #include <sys/prctl.h> 34 #include <sys/stat.h> 35 #include <sys/sysmacros.h> 36 #include <sys/types.h> 37 #include <sys/vfs.h> 38 #include <sys/wait.h> 39 #include <unistd.h> 40 41 #include "../libsnap-confine-private/cgroup-freezer-support.h" 42 #include "../libsnap-confine-private/cgroup-support.h" 43 #include "../libsnap-confine-private/classic.h" 44 #include "../libsnap-confine-private/cleanup-funcs.h" 45 #include "../libsnap-confine-private/feature.h" 46 #include "../libsnap-confine-private/infofile.h" 47 #include "../libsnap-confine-private/locking.h" 48 #include "../libsnap-confine-private/mountinfo.h" 49 #include "../libsnap-confine-private/string-utils.h" 50 #include "../libsnap-confine-private/tool.h" 51 #include "../libsnap-confine-private/utils.h" 52 #include "user-support.h" 53 #include "mount-support.h" 54 55 /** 56 * Directory where snap-confine keeps namespace files. 57 **/ 58 #define SC_NS_DIR "/run/snapd/ns" 59 60 /** 61 * Effective value of SC_NS_DIR. 62 * 63 * We use 'const char *' so we can update sc_ns_dir in the testsuite 64 **/ 65 static const char *sc_ns_dir = SC_NS_DIR; 66 67 enum { 68 HELPER_CMD_EXIT, 69 HELPER_CMD_CAPTURE_MOUNT_NS, 70 HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS, 71 }; 72 73 void sc_reassociate_with_pid1_mount_ns(void) 74 { 75 int init_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 76 int self_mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 77 const char *path_pid_1 = "/proc/1/ns/mnt"; 78 const char *path_pid_self = "/proc/self/ns/mnt"; 79 80 init_mnt_fd = open(path_pid_1, 81 O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 82 if (init_mnt_fd < 0) { 83 die("cannot open path %s", path_pid_1); 84 } 85 self_mnt_fd = open(path_pid_self, 86 O_RDONLY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 87 if (self_mnt_fd < 0) { 88 die("cannot open path %s", path_pid_1); 89 } 90 char init_buf[128] = { 0 }; 91 char self_buf[128] = { 0 }; 92 memset(init_buf, 0, sizeof init_buf); 93 if (readlinkat(init_mnt_fd, "", init_buf, sizeof init_buf) < 0) { 94 if (errno == ENOENT) { 95 // According to namespaces(7) on a pre 3.8 kernel the namespace 96 // files are hardlinks, not symlinks. If that happens readlinkat 97 // fails with ENOENT. As a quick workaround for this special-case 98 // functionality, just bail out and do nothing without raising an 99 // error. 100 return; 101 } 102 die("cannot read mount namespace identifier of pid 1"); 103 } 104 memset(self_buf, 0, sizeof self_buf); 105 if (readlinkat(self_mnt_fd, "", self_buf, sizeof self_buf) < 0) { 106 die("cannot read mount namespace identifier of the current process"); 107 } 108 if (memcmp(init_buf, self_buf, sizeof init_buf) != 0) { 109 debug("moving to mount namespace of pid 1"); 110 // We cannot use O_NOFOLLOW here because that file will always be a 111 // symbolic link. We actually want to open it this way. 112 int init_mnt_fd_real SC_CLEANUP(sc_cleanup_close) = -1; 113 init_mnt_fd_real = open(path_pid_1, O_RDONLY | O_CLOEXEC); 114 if (init_mnt_fd_real < 0) { 115 die("cannot open %s", path_pid_1); 116 } 117 if (setns(init_mnt_fd_real, CLONE_NEWNS) < 0) { 118 die("cannot join mount namespace of pid 1"); 119 } 120 } 121 } 122 123 void sc_initialize_mount_ns(unsigned int experimental_features) 124 { 125 debug("unsharing snap namespace directory"); 126 127 /* Ensure that /run/snapd/ns is a directory. */ 128 sc_identity old = sc_set_effective_identity(sc_root_group_identity()); 129 if (sc_nonfatal_mkpath(sc_ns_dir, 0755) < 0) { 130 die("cannot create directory %s", sc_ns_dir); 131 } 132 (void)sc_set_effective_identity(old); 133 134 /* Read and analyze the mount table. We need to see whether /run/snapd/ns 135 * is a mount point with private event propagation. */ 136 sc_mountinfo *info SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 137 info = sc_parse_mountinfo(NULL); 138 if (info == NULL) { 139 die("cannot parse /proc/self/mountinfo"); 140 } 141 142 bool is_mnt = false; 143 bool is_private = false; 144 for (sc_mountinfo_entry * entry = sc_first_mountinfo_entry(info); 145 entry != NULL; entry = sc_next_mountinfo_entry(entry)) { 146 /* Find /run/snapd/ns */ 147 if (!sc_streq(entry->mount_dir, sc_ns_dir)) { 148 continue; 149 } 150 is_mnt = true; 151 if (strstr(entry->optional_fields, "shared:") == NULL) { 152 /* Mount event propagation is not set to shared, good. */ 153 is_private = true; 154 } 155 break; 156 } 157 158 if (!is_mnt) { 159 if (mount(sc_ns_dir, sc_ns_dir, NULL, MS_BIND | MS_REC, NULL) < 160 0) { 161 die("cannot self-bind mount %s", sc_ns_dir); 162 } 163 } 164 165 if (!is_private) { 166 if (mount(NULL, sc_ns_dir, NULL, MS_PRIVATE, NULL) < 0) { 167 die("cannot change propagation type to MS_PRIVATE in %s", sc_ns_dir); 168 } 169 } 170 171 /* code that follows is experimental */ 172 if (experimental_features & SC_FEATURE_PARALLEL_INSTANCES) { 173 // Ensure that SNAP_MOUNT_DIR and /var/snap are shared mount points 174 debug 175 ("(experimental) ensuring snap mount and data directories are mount points"); 176 sc_ensure_snap_dir_shared_mounts(); 177 } 178 } 179 180 struct sc_mount_ns { 181 // Name of the namespace group ($SNAP_NAME). 182 char *name; 183 // Descriptor to the namespace group control directory. This descriptor is 184 // opened with O_PATH|O_DIRECTORY so it's only used for openat() calls. 185 int dir_fd; 186 // Pair of descriptors for a pair for a pipe file descriptors (read end, 187 // write end) that snap-confine uses to send messages to the helper 188 // process and back. 189 int pipe_helper[2]; 190 int pipe_master[2]; 191 // Identifier of the child process that is used during the one-time (per 192 // group) initialization and capture process. 193 pid_t child; 194 }; 195 196 static struct sc_mount_ns *sc_alloc_mount_ns(void) 197 { 198 struct sc_mount_ns *group = calloc(1, sizeof *group); 199 if (group == NULL) { 200 die("cannot allocate memory for sc_mount_ns"); 201 } 202 group->dir_fd = -1; 203 group->pipe_helper[0] = -1; 204 group->pipe_helper[1] = -1; 205 group->pipe_master[0] = -1; 206 group->pipe_master[1] = -1; 207 // Redundant with calloc but some functions check for the non-zero value so 208 // I'd like to keep this explicit in the code. 209 group->child = 0; 210 return group; 211 } 212 213 struct sc_mount_ns *sc_open_mount_ns(const char *group_name) 214 { 215 struct sc_mount_ns *group = sc_alloc_mount_ns(); 216 group->dir_fd = open(sc_ns_dir, 217 O_DIRECTORY | O_PATH | O_CLOEXEC | O_NOFOLLOW); 218 if (group->dir_fd < 0) { 219 die("cannot open directory %s", sc_ns_dir); 220 } 221 group->name = sc_strdup(group_name); 222 return group; 223 } 224 225 void sc_close_mount_ns(struct sc_mount_ns *group) 226 { 227 if (group->child != 0) { 228 sc_wait_for_helper(group); 229 } 230 sc_cleanup_close(&group->dir_fd); 231 sc_cleanup_close(&group->pipe_master[0]); 232 sc_cleanup_close(&group->pipe_master[1]); 233 sc_cleanup_close(&group->pipe_helper[0]); 234 sc_cleanup_close(&group->pipe_helper[1]); 235 free(group->name); 236 free(group); 237 } 238 239 static dev_t find_base_snap_device(const char *base_snap_name, 240 const char *base_snap_rev) 241 { 242 // Find the backing device of the base snap. 243 // TODO: add support for "try mode" base snaps that also need 244 // consideration of the mie->root component. 245 dev_t base_snap_dev = 0; 246 char base_squashfs_path[PATH_MAX]; 247 sc_must_snprintf(base_squashfs_path, 248 sizeof base_squashfs_path, "%s/%s/%s", 249 SNAP_MOUNT_DIR, base_snap_name, base_snap_rev); 250 sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 251 mi = sc_parse_mountinfo(NULL); 252 if (mi == NULL) { 253 die("cannot parse mountinfo of the current process"); 254 } 255 bool found = false; 256 for (sc_mountinfo_entry * mie = 257 sc_first_mountinfo_entry(mi); mie != NULL; 258 mie = sc_next_mountinfo_entry(mie)) { 259 if (sc_streq(mie->mount_dir, base_squashfs_path)) { 260 base_snap_dev = makedev(mie->dev_major, mie->dev_minor); 261 debug("block device of snap %s, revision %s is %d:%d", 262 base_snap_name, base_snap_rev, mie->dev_major, 263 mie->dev_minor); 264 // Don't break when found, we are interested in the last 265 // entry as this is the "effective" one. 266 found = true; 267 } 268 } 269 if (!found) { 270 die("cannot find mount entry for snap %s revision %s", 271 base_snap_name, base_snap_rev); 272 } 273 return base_snap_dev; 274 } 275 276 static bool should_discard_current_ns(dev_t base_snap_dev) 277 { 278 // Inspect the namespace and check if we should discard it. 279 // 280 // The namespace may become "stale" when the rootfs is not the same 281 // device we found above. This will happen whenever the base snap is 282 // refreshed since the namespace was first created. 283 sc_mountinfo_entry *mie; 284 sc_mountinfo *mi SC_CLEANUP(sc_cleanup_mountinfo) = NULL; 285 286 mi = sc_parse_mountinfo(NULL); 287 if (mi == NULL) { 288 die("cannot parse mountinfo of the current process"); 289 } 290 for (mie = sc_first_mountinfo_entry(mi); mie != NULL; 291 mie = sc_next_mountinfo_entry(mie)) { 292 if (!sc_streq(mie->mount_dir, "/")) { 293 continue; 294 } 295 // NOTE: we want the initial rootfs just in case overmount 296 // was used to do something weird. The initial rootfs was 297 // set up by snap-confine and that is the one we want to 298 // measure. 299 debug("block device of the root filesystem is %d:%d", 300 mie->dev_major, mie->dev_minor); 301 return base_snap_dev != makedev(mie->dev_major, mie->dev_minor); 302 } 303 die("cannot find mount entry of the root filesystem"); 304 } 305 306 enum sc_discard_vote { 307 /** 308 * SC_DISCARD_NO denotes that the mount namespace doesn't have to be 309 * discarded. This happens when the base snap has not changed. 310 **/ 311 SC_DISCARD_NO = 1, 312 /** 313 * SC_DISCARD_SHOULD indicates that the mount namespace should be discarded 314 * but may be reused if it is still inhabited by processes. This only 315 * happens when the base snap revision changes but the name of the base 316 * snap is the same as before. 317 **/ 318 SC_DISCARD_SHOULD = 2, 319 /** 320 * SC_DISCARD_MUST indicates that the mount namespace must be discarded 321 * even if it still inhabited by processes. This only happens when the name 322 * of the base snap changes. 323 **/ 324 SC_DISCARD_MUST = 3, 325 }; 326 327 /** 328 * is_base_transition returns true if a base transition is occurring. 329 * 330 * The function inspects /run/snapd/ns/snap.$SNAP_INSTANCE_NAME.info as well 331 * as the invocation parameters of snap-confine. If the base snap name, as 332 * encoded in the info file and as described by the invocation parameters 333 * differ then a base transition is occurring. If the info file is absent or 334 * does not record the name of the base snap then transition cannot be 335 * detected. 336 **/ 337 static bool is_base_transition(const sc_invocation * inv) 338 { 339 char info_path[PATH_MAX] = { 0 }; 340 sc_must_snprintf(info_path, 341 sizeof info_path, 342 "/run/snapd/ns/snap.%s.info", inv->snap_instance); 343 344 FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL; 345 stream = fopen(info_path, "r"); 346 if (stream == NULL && errno == ENOENT) { 347 // If the info file is absent then we cannot decide if a transition had 348 // occurred. For people upgrading from snap-confine without the info 349 // file, that is the best we can do. 350 return false; 351 } 352 if (stream == NULL) { 353 die("cannot open %s", info_path); 354 } 355 356 char *base_snap_name SC_CLEANUP(sc_cleanup_string) = NULL; 357 sc_error *err = NULL; 358 if (sc_infofile_get_key 359 (stream, "base-snap-name", &base_snap_name, &err) < 0) { 360 sc_die_on_error(err); 361 } 362 363 if (base_snap_name == NULL) { 364 // If the info file doesn't record the name of the base snap then, 365 // again, we cannot decide if a transition had occurred. 366 return false; 367 } 368 369 return !sc_streq(inv->orig_base_snap_name, base_snap_name); 370 } 371 372 // The namespace may be stale. To check this we must actually switch into it 373 // but then we use up our setns call (the kernel misbehaves if we setns twice). 374 // To work around this we'll fork a child and use it to probe. The child will 375 // inspect the namespace and send information back via eventfd and then exit 376 // unconditionally. 377 static int sc_inspect_and_maybe_discard_stale_ns(int mnt_fd, 378 const sc_invocation * inv, 379 int snap_discard_ns_fd) 380 { 381 char base_snap_rev[PATH_MAX] = { 0 }; 382 dev_t base_snap_dev; 383 int event_fd SC_CLEANUP(sc_cleanup_close) = -1; 384 385 // Read the revision of the base snap by looking at the current symlink. 386 if (readlink(inv->rootfs_dir, base_snap_rev, sizeof base_snap_rev) < 0) { 387 die("cannot read current revision of snap %s", 388 inv->snap_instance); 389 } 390 if (base_snap_rev[sizeof base_snap_rev - 1] != '\0') { 391 die("cannot read current revision of snap %s: value too long", 392 inv->snap_instance); 393 } 394 // Find the device that is backing the current revision of the base snap. 395 base_snap_dev = 396 find_base_snap_device(inv->base_snap_name, base_snap_rev); 397 398 // Store the PID of this process. This is done instead of calls to 399 // getppid() below because then we can reliably track the PID of the 400 // parent even if the child process is re-parented. 401 pid_t parent = getpid(); 402 403 // Create an eventfd for the communication with the child. 404 event_fd = eventfd(0, EFD_CLOEXEC); 405 if (event_fd < 0) { 406 die("cannot create eventfd"); 407 } 408 // Fork a child, it will do the inspection for us. 409 pid_t child = fork(); 410 if (child < 0) { 411 die("cannot fork support process"); 412 } 413 414 if (child == 0) { 415 // This is the child process which will inspect the mount namespace. 416 // 417 // Configure the child to die as soon as the parent dies. In an odd 418 // case where the parent is killed then we don't want to complete our 419 // task or wait for anything. 420 if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { 421 die("cannot set parent process death notification signal to SIGINT"); 422 } 423 // Check that parent process is still alive. If this is the case then 424 // we can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake 425 // us up from eventfd_read() below. In the rare case that the PID 426 // numbers overflow and the now-dead parent PID is recycled we will 427 // still hang forever on the read from eventfd below. 428 if (kill(parent, 0) < 0) { 429 switch (errno) { 430 case ESRCH: 431 debug("parent process has terminated"); 432 abort(); 433 default: 434 die("cannot confirm that parent process is alive"); 435 break; 436 } 437 } 438 439 debug("joining preserved mount namespace for inspection"); 440 // Move to the mount namespace of the snap we're trying to inspect. 441 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 442 die("cannot join preserved mount namespace"); 443 } 444 // Check if the namespace needs to be discarded. 445 eventfd_t value = SC_DISCARD_NO; 446 const char *value_str = "no"; 447 448 // TODO: enable this for core distributions. This is complex because on 449 // core the rootfs is mounted in initrd and is _not_ changed (no 450 // pivot_root) and the base snap is again mounted (2nd time) by 451 // systemd. This makes us end up in a situation where the outer base 452 // snap will never match the rootfs inside the mount namespace. 453 if (inv->is_normal_mode 454 && should_discard_current_ns(base_snap_dev)) { 455 value = SC_DISCARD_SHOULD; 456 value_str = "should"; 457 458 } 459 // If the base snap changed, we must discard the mount namespace and 460 // start over to allow the newly started process to see the requested 461 // base snap. Due to the TODO above always perform explicit transition 462 // check to protect against LP:#1819875 and LP:#1861901 463 if (is_base_transition(inv)) { 464 // The base snap has changed. We must discard ... 465 value = SC_DISCARD_MUST; 466 value_str = "must"; 467 } 468 // Send this back to the parent: 3 - force discard 2 - prefer discard, 1 - keep. 469 // Note that we cannot just use 0 and 1 because of the semantics of eventfd(2). 470 if (eventfd_write(event_fd, value) < 0) { 471 die("cannot send information to %s preserved mount namespace", value_str); 472 } 473 // Exit, we're done. 474 exit(0); 475 } 476 // This is back in the parent process. 477 // 478 // Enable a sanity timeout in case the read blocks for unbound amount of 479 // time. This will ensure we will not hang around while holding the lock. 480 // Next, read the value written by the child process. 481 sc_enable_sanity_timeout(); 482 eventfd_t value = 0; 483 if (eventfd_read(event_fd, &value) < 0) { 484 die("cannot read from eventfd"); 485 } 486 sc_disable_sanity_timeout(); 487 488 // Wait for the child process to exit and collect its exit status. 489 errno = 0; 490 int status = 0; 491 if (waitpid(child, &status, 0) < 0) { 492 die("cannot wait for the support process for mount namespace inspection"); 493 } 494 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 495 die("support process for mount namespace inspection exited abnormally"); 496 } 497 // If the namespace is up-to-date then we are done. 498 switch (value) { 499 case SC_DISCARD_NO: 500 debug("preserved mount is not stale, reusing"); 501 return 0; 502 case SC_DISCARD_SHOULD: 503 if (sc_cgroup_is_v2()) { 504 debug 505 ("WARNING: cgroup v2 detected, preserved mount namespace process presence check unsupported, discarding"); 506 break; 507 } 508 if (sc_cgroup_freezer_occupied(inv->snap_instance)) { 509 // Some processes are still using the namespace so we cannot discard it 510 // as that would fracture the view that the set of processes inside 511 // have on what is mounted. 512 debug 513 ("preserved mount namespace is stale but occupied, reusing"); 514 return 0; 515 } 516 break; 517 case SC_DISCARD_MUST: 518 debug 519 ("preserved mount namespace is stale and base snap has changed, discarding"); 520 break; 521 } 522 sc_call_snap_discard_ns(snap_discard_ns_fd, inv->snap_instance); 523 return EAGAIN; 524 } 525 526 static void helper_fork(struct sc_mount_ns *group, 527 struct sc_apparmor *apparmor); 528 static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor, 529 pid_t parent); 530 static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent); 531 static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent); 532 533 int sc_join_preserved_ns(struct sc_mount_ns *group, struct sc_apparmor 534 *apparmor, const sc_invocation * inv, 535 int snap_discard_ns_fd) 536 { 537 // Open the mount namespace file. 538 char mnt_fname[PATH_MAX] = { 0 }; 539 sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.mnt", group->name); 540 int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 541 // NOTE: There is no O_EXCL here because the file can be around but 542 // doesn't have to be a mounted namespace. 543 mnt_fd = openat(group->dir_fd, mnt_fname, 544 O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); 545 if (mnt_fd < 0 && errno == ENOENT) { 546 return ESRCH; 547 } 548 if (mnt_fd < 0) { 549 die("cannot open preserved mount namespace %s", group->name); 550 } 551 // Check if we got an nsfs-based or procfs file or a regular file. This can 552 // be reliably tested because nsfs has an unique filesystem type 553 // NSFS_MAGIC. On older kernels that don't support nsfs yet we can look 554 // for PROC_SUPER_MAGIC instead. 555 // We can just ensure that this is the case thanks to fstatfs. 556 struct statfs ns_statfs_buf; 557 if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) { 558 die("cannot inspect filesystem of preserved mount namespace file"); 559 } 560 // Stat the mount namespace as well, this is later used to check if the 561 // namespace is used by other processes if we are considering discarding a 562 // stale namespace. 563 struct stat ns_stat_buf; 564 if (fstat(mnt_fd, &ns_stat_buf) < 0) { 565 die("cannot inspect preserved mount namespace file"); 566 } 567 #ifndef NSFS_MAGIC 568 // Account for kernel headers old enough to not know about NSFS_MAGIC. 569 #define NSFS_MAGIC 0x6e736673 570 #endif 571 if (ns_statfs_buf.f_type == NSFS_MAGIC 572 || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) { 573 574 // Inspect and perhaps discard the preserved mount namespace. 575 if (sc_inspect_and_maybe_discard_stale_ns 576 (mnt_fd, inv, snap_discard_ns_fd) == EAGAIN) { 577 return ESRCH; 578 } 579 // Move to the mount namespace of the snap we're trying to start. 580 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 581 die("cannot join preserved mount namespace %s", 582 group->name); 583 } 584 debug("joined preserved mount namespace %s", group->name); 585 return 0; 586 } 587 return ESRCH; 588 } 589 590 int sc_join_preserved_per_user_ns(struct sc_mount_ns *group, 591 const char *snap_name) 592 { 593 uid_t uid = getuid(); 594 char mnt_fname[PATH_MAX] = { 0 }; 595 sc_must_snprintf(mnt_fname, sizeof mnt_fname, "%s.%d.mnt", group->name, 596 (int)uid); 597 598 int mnt_fd SC_CLEANUP(sc_cleanup_close) = -1; 599 mnt_fd = openat(group->dir_fd, mnt_fname, 600 O_RDONLY | O_CLOEXEC | O_NOFOLLOW, 0600); 601 if (mnt_fd < 0 && errno == ENOENT) { 602 return ESRCH; 603 } 604 if (mnt_fd < 0) { 605 die("cannot open preserved mount namespace %s", group->name); 606 } 607 struct statfs ns_statfs_buf; 608 if (fstatfs(mnt_fd, &ns_statfs_buf) < 0) { 609 die("cannot inspect filesystem of preserved mount namespace file"); 610 } 611 struct stat ns_stat_buf; 612 if (fstat(mnt_fd, &ns_stat_buf) < 0) { 613 die("cannot inspect preserved mount namespace file"); 614 } 615 #ifndef NSFS_MAGIC 616 /* Define NSFS_MAGIC for Ubuntu 14.04 and other older systems. */ 617 #define NSFS_MAGIC 0x6e736673 618 #endif 619 if (ns_statfs_buf.f_type == NSFS_MAGIC 620 || ns_statfs_buf.f_type == PROC_SUPER_MAGIC) { 621 if (setns(mnt_fd, CLONE_NEWNS) < 0) { 622 die("cannot join preserved per-user mount namespace %s", 623 group->name); 624 } 625 debug("joined preserved mount namespace %s", group->name); 626 return 0; 627 } 628 return ESRCH; 629 } 630 631 static void setup_signals_for_helper(void) 632 { 633 /* Ignore the SIGPIPE signal so that we get EPIPE on the read / write 634 * operations attempting to work with a closed pipe. This ensures that we 635 * are not killed by the default disposition (terminate) and can return a 636 * non-signal-death return code to the program invoking snap-confine. */ 637 if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { 638 die("cannot install ignore handler for SIGPIPE"); 639 } 640 } 641 642 static void teardown_signals_for_helper(void) 643 { 644 /* Undo operations done by setup_signals_for_helper. */ 645 if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) { 646 die("cannot restore default handler for SIGPIPE"); 647 } 648 } 649 650 static void helper_fork(struct sc_mount_ns *group, struct sc_apparmor *apparmor) 651 { 652 // Create a pipe for sending commands to the helper process. 653 if (pipe2(group->pipe_master, O_CLOEXEC | O_DIRECT) < 0) { 654 die("cannot create pipes for commanding the helper process"); 655 } 656 if (pipe2(group->pipe_helper, O_CLOEXEC | O_DIRECT) < 0) { 657 die("cannot create pipes for responding to master process"); 658 } 659 // Store the PID of the "parent" process. This done instead of calls to 660 // getppid() because then we can reliably track the PID of the parent even 661 // if the child process is re-parented. 662 pid_t parent = getpid(); 663 664 // For rationale of forking see this: 665 // https://lists.linuxfoundation.org/pipermail/containers/2013-August/033386.html 666 pid_t pid = fork(); 667 if (pid < 0) { 668 die("cannot fork helper process for mount namespace capture"); 669 } 670 if (pid == 0) { 671 /* helper */ 672 sc_cleanup_close(&group->pipe_master[1]); 673 sc_cleanup_close(&group->pipe_helper[0]); 674 helper_main(group, apparmor, parent); 675 } else { 676 setup_signals_for_helper(); 677 678 /* master */ 679 sc_cleanup_close(&group->pipe_master[0]); 680 sc_cleanup_close(&group->pipe_helper[1]); 681 682 // Glibc defines pid as a signed 32bit integer. There's no standard way to 683 // print pid's portably so this is the best we can do. 684 debug("forked support process %d", (int)pid); 685 group->child = pid; 686 } 687 } 688 689 static void helper_main(struct sc_mount_ns *group, struct sc_apparmor *apparmor, 690 pid_t parent) 691 { 692 // This is the child process which will capture the mount namespace. 693 // 694 // It will do so by bind-mounting the .mnt after the parent process calls 695 // unshare() and finishes setting up the namespace completely. Change the 696 // hat to a sub-profile that has limited permissions necessary to 697 // accomplish the capture of the mount namespace. 698 sc_maybe_aa_change_hat(apparmor, "mount-namespace-capture-helper", 0); 699 // Configure the child to die as soon as the parent dies. In an odd 700 // case where the parent is killed then we don't want to complete our 701 // task or wait for anything. 702 if (prctl(PR_SET_PDEATHSIG, SIGINT, 0, 0, 0) < 0) { 703 die("cannot set parent process death notification signal to SIGINT"); 704 } 705 // Check that parent process is still alive. If this is the case then we 706 // can *almost* reliably rely on the PR_SET_PDEATHSIG signal to wake us up 707 // from read(2) below. In the rare case that the PID numbers overflow and 708 // the now-dead parent PID is recycled we will still hang forever on the 709 // read from the pipe below. 710 if (kill(parent, 0) < 0) { 711 switch (errno) { 712 case ESRCH: 713 // When snap-confine executes it will fork a helper process. That 714 // process establishes an elaborate dance to ensure both itself and 715 // the parent are operating exactly as specified, so that no 716 // processes are left behind for unbound amount of time. As a part 717 // of that dance the child pings the parent to ensure it is still 718 // alive after establishing a notification signal to be sent in 719 // case the parent dies. This is a race avoidance mechanism, we set 720 // up the notification and then check if the parent is alive by the 721 // time we are done. 722 // 723 // In the case when the parent does go away we used to call 724 // abort(). On some distributions this would trigger an unclean 725 // process termination error report to be sent. One such example is 726 // the Ubuntu error tracker. Since the parent process can be 727 // legitimately interrupted and killed, this should not generate an 728 // error report. As such, perform clean exit in this specific case. 729 debug("parent process has terminated"); 730 exit(0); 731 default: 732 die("cannot confirm that parent process is alive"); 733 break; 734 } 735 } 736 if (fchdir(group->dir_fd) < 0) { 737 die("cannot move to directory with preserved namespaces"); 738 } 739 int command = -1; 740 int run = 1; 741 while (run) { 742 debug("helper process waiting for command"); 743 sc_enable_sanity_timeout(); 744 if (read(group->pipe_master[0], &command, sizeof command) < 0) { 745 int saved_errno = errno; 746 // This will ensure we get the correct error message 747 // if there is a read error because the timeout 748 // expired. 749 sc_disable_sanity_timeout(); 750 errno = saved_errno; 751 die("cannot read command from the pipe"); 752 } 753 sc_disable_sanity_timeout(); 754 debug("helper process received command %d", command); 755 switch (command) { 756 case HELPER_CMD_EXIT: 757 run = 0; 758 break; 759 case HELPER_CMD_CAPTURE_MOUNT_NS: 760 helper_capture_ns(group, parent); 761 break; 762 case HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS: 763 helper_capture_per_user_ns(group, parent); 764 break; 765 } 766 if (write(group->pipe_helper[1], &command, sizeof command) < 0) { 767 die("cannot write ack"); 768 } 769 } 770 debug("helper process exiting"); 771 exit(0); 772 } 773 774 static void helper_capture_ns(struct sc_mount_ns *group, pid_t parent) 775 { 776 char src[PATH_MAX] = { 0 }; 777 char dst[PATH_MAX] = { 0 }; 778 779 debug("capturing per-snap mount namespace"); 780 sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); 781 sc_must_snprintf(dst, sizeof dst, "%s.mnt", group->name); 782 783 /* Ensure the bind mount destination exists. */ 784 int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600); 785 if (fd < 0) { 786 die("cannot create file %s", dst); 787 } 788 close(fd); 789 790 if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { 791 die("cannot preserve mount namespace of process %d as %s", 792 (int)parent, dst); 793 } 794 debug("mount namespace of process %d preserved as %s", 795 (int)parent, dst); 796 } 797 798 static void helper_capture_per_user_ns(struct sc_mount_ns *group, pid_t parent) 799 { 800 char src[PATH_MAX] = { 0 }; 801 char dst[PATH_MAX] = { 0 }; 802 uid_t uid = getuid(); 803 804 debug("capturing per-snap, per-user mount namespace"); 805 sc_must_snprintf(src, sizeof src, "/proc/%d/ns/mnt", (int)parent); 806 sc_must_snprintf(dst, sizeof dst, "%s.%d.mnt", group->name, (int)uid); 807 808 /* Ensure the bind mount destination exists. */ 809 int fd = open(dst, O_CREAT | O_CLOEXEC | O_NOFOLLOW | O_RDONLY, 0600); 810 if (fd < 0) { 811 die("cannot create file %s", dst); 812 } 813 close(fd); 814 815 if (mount(src, dst, NULL, MS_BIND, NULL) < 0) { 816 die("cannot preserve per-user mount namespace of process %d as %s", (int)parent, dst); 817 } 818 debug("per-user mount namespace of process %d preserved as %s", 819 (int)parent, dst); 820 } 821 822 static void sc_message_capture_helper(struct sc_mount_ns *group, int command_id) 823 { 824 int ack; 825 if (group->child == 0) { 826 die("precondition failed: we don't have a helper process"); 827 } 828 if (group->pipe_master[1] < 0) { 829 die("precondition failed: we don't have a pipe"); 830 } 831 if (group->pipe_helper[0] < 0) { 832 die("precondition failed: we don't have a pipe"); 833 } 834 debug("sending command %d to helper process (pid: %d)", 835 command_id, group->child); 836 if (write(group->pipe_master[1], &command_id, sizeof command_id) < 0) { 837 die("cannot send command %d to helper process", command_id); 838 } 839 debug("waiting for response from helper"); 840 int read_n = read(group->pipe_helper[0], &ack, sizeof ack); 841 if (read_n < 0) { 842 die("cannot receive ack from helper process"); 843 } 844 if (read_n == 0) { 845 die("unexpected eof from helper process"); 846 } 847 } 848 849 static void sc_wait_for_capture_helper(struct sc_mount_ns *group) 850 { 851 if (group->child == 0) { 852 die("precondition failed: we don't have a helper process"); 853 } 854 debug("waiting for the helper process to exit"); 855 int status = 0; 856 errno = 0; 857 if (waitpid(group->child, &status, 0) < 0) { 858 die("cannot wait for the helper process"); 859 } 860 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { 861 die("helper process exited abnormally"); 862 } 863 debug("helper process exited normally"); 864 group->child = 0; 865 teardown_signals_for_helper(); 866 } 867 868 void sc_fork_helper(struct sc_mount_ns *group, struct sc_apparmor *apparmor) 869 { 870 helper_fork(group, apparmor); 871 } 872 873 void sc_preserve_populated_mount_ns(struct sc_mount_ns *group) 874 { 875 sc_message_capture_helper(group, HELPER_CMD_CAPTURE_MOUNT_NS); 876 } 877 878 void sc_preserve_populated_per_user_mount_ns(struct sc_mount_ns *group) 879 { 880 sc_message_capture_helper(group, HELPER_CMD_CAPTURE_PER_USER_MOUNT_NS); 881 } 882 883 void sc_wait_for_helper(struct sc_mount_ns *group) 884 { 885 sc_message_capture_helper(group, HELPER_CMD_EXIT); 886 sc_wait_for_capture_helper(group); 887 } 888 889 void sc_store_ns_info(const sc_invocation * inv) 890 { 891 FILE *stream SC_CLEANUP(sc_cleanup_file) = NULL; 892 char info_path[PATH_MAX] = { 0 }; 893 sc_must_snprintf(info_path, sizeof info_path, 894 "/run/snapd/ns/snap.%s.info", inv->snap_instance); 895 int fd = -1; 896 fd = open(info_path, 897 O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC | O_NOFOLLOW, 0644); 898 if (fd < 0) { 899 die("cannot open %s", info_path); 900 } 901 if (fchown(fd, 0, 0) < 0) { 902 die("cannot chown %s to root.root", info_path); 903 } 904 // The stream now owns the file descriptor. 905 stream = fdopen(fd, "w"); 906 if (stream == NULL) { 907 die("cannot get stream from file descriptor"); 908 } 909 fprintf(stream, "base-snap-name=%s\n", inv->orig_base_snap_name); 910 if (ferror(stream) != 0) { 911 die("I/O error when writing to %s", info_path); 912 } 913 if (fflush(stream) == EOF) { 914 die("cannot flush %s", info_path); 915 } 916 debug("saved mount namespace meta-data to %s", info_path); 917 }