github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/nsenter/nsexec.c (about) 1 2 #define _GNU_SOURCE 3 #include <endian.h> 4 #include <errno.h> 5 #include <fcntl.h> 6 #include <grp.h> 7 #include <sched.h> 8 #include <setjmp.h> 9 #include <signal.h> 10 #include <stdarg.h> 11 #include <stdbool.h> 12 #include <stdint.h> 13 #include <stdio.h> 14 #include <stdlib.h> 15 #include <stdbool.h> 16 #include <string.h> 17 #include <unistd.h> 18 19 #include <sys/ioctl.h> 20 #include <sys/prctl.h> 21 #include <sys/socket.h> 22 #include <sys/types.h> 23 #include <sys/wait.h> 24 25 #include <linux/limits.h> 26 #include <linux/netlink.h> 27 #include <linux/types.h> 28 29 #include "getenv.h" 30 #include "log.h" 31 /* Get all of the CLONE_NEW* flags. */ 32 #include "namespace.h" 33 34 /* Synchronisation values. */ 35 enum sync_t { 36 SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ 37 SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ 38 SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ 39 SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ 40 SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ 41 SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ 42 SYNC_TIMEOFFSETS_PLS = 0x46, /* Request parent to write timens offsets. */ 43 SYNC_TIMEOFFSETS_ACK = 0x47, /* Timens offsets were written. */ 44 }; 45 46 #define STAGE_SETUP -1 47 /* longjmp() arguments. */ 48 #define STAGE_PARENT 0 49 #define STAGE_CHILD 1 50 #define STAGE_INIT 2 51 52 /* Stores the current stage of nsexec. */ 53 int current_stage = STAGE_SETUP; 54 55 /* Assume the stack grows down, so arguments should be above it. */ 56 struct clone_t { 57 /* 58 * Reserve some space for clone() to locate arguments 59 * and retcode in this place 60 */ 61 char stack[4096] __attribute__((aligned(16))); 62 char stack_ptr[0]; 63 64 /* There's two children. This is used to execute the different code. */ 65 jmp_buf *env; 66 int jmpval; 67 }; 68 69 struct nlconfig_t { 70 char *data; 71 72 /* Process settings. */ 73 uint32_t cloneflags; 74 char *oom_score_adj; 75 size_t oom_score_adj_len; 76 77 /* User namespace settings. */ 78 char *uidmap; 79 size_t uidmap_len; 80 char *gidmap; 81 size_t gidmap_len; 82 char *namespaces; 83 size_t namespaces_len; 84 uint8_t is_setgroup; 85 86 /* Rootless container settings. */ 87 uint8_t is_rootless_euid; /* boolean */ 88 char *uidmappath; 89 size_t uidmappath_len; 90 char *gidmappath; 91 size_t gidmappath_len; 92 93 /* Time NS offsets. */ 94 char *timensoffset; 95 size_t timensoffset_len; 96 }; 97 98 /* 99 * List of netlink message types sent to us as part of bootstrapping the init. 100 * These constants are defined in libcontainer/message_linux.go. 101 */ 102 #define INIT_MSG 62000 103 #define CLONE_FLAGS_ATTR 27281 104 #define NS_PATHS_ATTR 27282 105 #define UIDMAP_ATTR 27283 106 #define GIDMAP_ATTR 27284 107 #define SETGROUP_ATTR 27285 108 #define OOM_SCORE_ADJ_ATTR 27286 109 #define ROOTLESS_EUID_ATTR 27287 110 #define UIDMAPPATH_ATTR 27288 111 #define GIDMAPPATH_ATTR 27289 112 #define TIMENSOFFSET_ATTR 27290 113 114 /* 115 * Use the raw syscall for versions of glibc which don't include a function for 116 * it, namely (glibc 2.12). 117 */ 118 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 119 # define _GNU_SOURCE 120 # include "syscall.h" 121 # if !defined(SYS_setns) && defined(__NR_setns) 122 # define SYS_setns __NR_setns 123 # endif 124 125 # ifndef SYS_setns 126 # error "setns(2) syscall not supported by glibc version" 127 # endif 128 129 int setns(int fd, int nstype) 130 { 131 return syscall(SYS_setns, fd, nstype); 132 } 133 #endif 134 135 /* XXX: This is ugly. */ 136 static int syncfd = -1; 137 138 static int write_file(char *data, size_t data_len, char *pathfmt, ...) 139 { 140 int fd, len, ret = 0; 141 char path[PATH_MAX]; 142 143 va_list ap; 144 va_start(ap, pathfmt); 145 len = vsnprintf(path, PATH_MAX, pathfmt, ap); 146 va_end(ap); 147 if (len < 0) 148 return -1; 149 150 fd = open(path, O_RDWR); 151 if (fd < 0) { 152 return -1; 153 } 154 155 len = write(fd, data, data_len); 156 if (len != data_len) { 157 ret = -1; 158 goto out; 159 } 160 161 out: 162 close(fd); 163 return ret; 164 } 165 166 enum policy_t { 167 SETGROUPS_DEFAULT = 0, 168 SETGROUPS_ALLOW, 169 SETGROUPS_DENY, 170 }; 171 172 /* This *must* be called before we touch gid_map. */ 173 static void update_setgroups(int pid, enum policy_t setgroup) 174 { 175 char *policy; 176 177 switch (setgroup) { 178 case SETGROUPS_ALLOW: 179 policy = "allow"; 180 break; 181 case SETGROUPS_DENY: 182 policy = "deny"; 183 break; 184 case SETGROUPS_DEFAULT: 185 default: 186 /* Nothing to do. */ 187 return; 188 } 189 190 if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { 191 /* 192 * If the kernel is too old to support /proc/pid/setgroups, 193 * open(2) or write(2) will return ENOENT. This is fine. 194 */ 195 if (errno != ENOENT) 196 bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); 197 } 198 } 199 200 static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) 201 { 202 int child; 203 204 /* 205 * If @app is NULL, execve will segfault. Just check it here and bail (if 206 * we're in this path, the caller is already getting desperate and there 207 * isn't a backup to this failing). This usually would be a configuration 208 * or programming issue. 209 */ 210 if (!app) 211 bail("mapping tool not present"); 212 213 child = fork(); 214 if (child < 0) 215 bail("failed to fork"); 216 217 if (!child) { 218 #define MAX_ARGV 20 219 char *argv[MAX_ARGV]; 220 char *envp[] = { NULL }; 221 char pid_fmt[16]; 222 int argc = 0; 223 char *next; 224 225 snprintf(pid_fmt, 16, "%d", pid); 226 227 argv[argc++] = (char *)app; 228 argv[argc++] = pid_fmt; 229 /* 230 * Convert the map string into a list of argument that 231 * newuidmap/newgidmap can understand. 232 */ 233 234 while (argc < MAX_ARGV) { 235 if (*map == '\0') { 236 argv[argc++] = NULL; 237 break; 238 } 239 argv[argc++] = map; 240 next = strpbrk(map, "\n "); 241 if (next == NULL) 242 break; 243 *next++ = '\0'; 244 map = next + strspn(next, "\n "); 245 } 246 247 execve(app, argv, envp); 248 bail("failed to execv"); 249 } else { 250 int status; 251 252 while (true) { 253 if (waitpid(child, &status, 0) < 0) { 254 if (errno == EINTR) 255 continue; 256 bail("failed to waitpid"); 257 } 258 if (WIFEXITED(status) || WIFSIGNALED(status)) 259 return WEXITSTATUS(status); 260 } 261 } 262 263 return -1; 264 } 265 266 static void update_uidmap(const char *path, int pid, char *map, size_t map_len) 267 { 268 if (map == NULL || map_len == 0) 269 return; 270 271 write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); 272 if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { 273 if (errno != EPERM) 274 bail("failed to update /proc/%d/uid_map", pid); 275 write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); 276 if (try_mapping_tool(path, pid, map, map_len)) 277 bail("failed to use newuid map on %d", pid); 278 } 279 } 280 281 static void update_gidmap(const char *path, int pid, char *map, size_t map_len) 282 { 283 if (map == NULL || map_len == 0) 284 return; 285 286 write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); 287 if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { 288 if (errno != EPERM) 289 bail("failed to update /proc/%d/gid_map", pid); 290 write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); 291 if (try_mapping_tool(path, pid, map, map_len)) 292 bail("failed to use newgid map on %d", pid); 293 } 294 } 295 296 static void update_oom_score_adj(char *data, size_t len) 297 { 298 if (data == NULL || len == 0) 299 return; 300 301 write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); 302 if (write_file(data, len, "/proc/self/oom_score_adj") < 0) 303 bail("failed to update /proc/self/oom_score_adj"); 304 } 305 306 /* A dummy function that just jumps to the given jumpval. */ 307 static int child_func(void *arg) __attribute__((noinline)); 308 static int child_func(void *arg) 309 { 310 struct clone_t *ca = (struct clone_t *)arg; 311 longjmp(*ca->env, ca->jmpval); 312 } 313 314 static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); 315 static int clone_parent(jmp_buf *env, int jmpval) 316 { 317 struct clone_t ca = { 318 .env = env, 319 .jmpval = jmpval, 320 }; 321 322 return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); 323 } 324 325 /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ 326 static int nsflag(char *name) 327 { 328 if (!strcmp(name, "cgroup")) 329 return CLONE_NEWCGROUP; 330 else if (!strcmp(name, "ipc")) 331 return CLONE_NEWIPC; 332 else if (!strcmp(name, "mnt")) 333 return CLONE_NEWNS; 334 else if (!strcmp(name, "net")) 335 return CLONE_NEWNET; 336 else if (!strcmp(name, "pid")) 337 return CLONE_NEWPID; 338 else if (!strcmp(name, "user")) 339 return CLONE_NEWUSER; 340 else if (!strcmp(name, "uts")) 341 return CLONE_NEWUTS; 342 else if (!strcmp(name, "time")) 343 return CLONE_NEWTIME; 344 345 /* If we don't recognise a name, fallback to 0. */ 346 return 0; 347 } 348 349 static uint32_t readint32(char *buf) 350 { 351 return *(uint32_t *) buf; 352 } 353 354 static uint8_t readint8(char *buf) 355 { 356 return *(uint8_t *) buf; 357 } 358 359 static void nl_parse(int fd, struct nlconfig_t *config) 360 { 361 size_t len, size; 362 struct nlmsghdr hdr; 363 char *data, *current; 364 365 /* Retrieve the netlink header. */ 366 len = read(fd, &hdr, NLMSG_HDRLEN); 367 if (len != NLMSG_HDRLEN) 368 bail("invalid netlink header length %zu", len); 369 370 if (hdr.nlmsg_type == NLMSG_ERROR) 371 bail("failed to read netlink message"); 372 373 if (hdr.nlmsg_type != INIT_MSG) 374 bail("unexpected msg type %d", hdr.nlmsg_type); 375 376 /* Retrieve data. */ 377 size = NLMSG_PAYLOAD(&hdr, 0); 378 current = data = malloc(size); 379 if (!data) 380 bail("failed to allocate %zu bytes of memory for nl_payload", size); 381 382 len = read(fd, data, size); 383 if (len != size) 384 bail("failed to read netlink payload, %zu != %zu", len, size); 385 386 /* Parse the netlink payload. */ 387 config->data = data; 388 while (current < data + size) { 389 struct nlattr *nlattr = (struct nlattr *)current; 390 size_t payload_len = nlattr->nla_len - NLA_HDRLEN; 391 392 /* Advance to payload. */ 393 current += NLA_HDRLEN; 394 395 /* Handle payload. */ 396 switch (nlattr->nla_type) { 397 case CLONE_FLAGS_ATTR: 398 config->cloneflags = readint32(current); 399 break; 400 case ROOTLESS_EUID_ATTR: 401 config->is_rootless_euid = readint8(current); /* boolean */ 402 break; 403 case OOM_SCORE_ADJ_ATTR: 404 config->oom_score_adj = current; 405 config->oom_score_adj_len = payload_len; 406 break; 407 case NS_PATHS_ATTR: 408 config->namespaces = current; 409 config->namespaces_len = payload_len; 410 break; 411 case UIDMAP_ATTR: 412 config->uidmap = current; 413 config->uidmap_len = payload_len; 414 break; 415 case GIDMAP_ATTR: 416 config->gidmap = current; 417 config->gidmap_len = payload_len; 418 break; 419 case UIDMAPPATH_ATTR: 420 config->uidmappath = current; 421 config->uidmappath_len = payload_len; 422 break; 423 case GIDMAPPATH_ATTR: 424 config->gidmappath = current; 425 config->gidmappath_len = payload_len; 426 break; 427 case SETGROUP_ATTR: 428 config->is_setgroup = readint8(current); 429 break; 430 case TIMENSOFFSET_ATTR: 431 config->timensoffset = current; 432 config->timensoffset_len = payload_len; 433 break; 434 default: 435 bail("unknown netlink message type %d", nlattr->nla_type); 436 } 437 438 current += NLA_ALIGN(payload_len); 439 } 440 } 441 442 void nl_free(struct nlconfig_t *config) 443 { 444 free(config->data); 445 } 446 447 void join_namespaces(char *nslist) 448 { 449 int num = 0, i; 450 char *saveptr = NULL; 451 char *namespace = strtok_r(nslist, ",", &saveptr); 452 struct namespace_t { 453 int fd; 454 char type[PATH_MAX]; 455 char path[PATH_MAX]; 456 } *namespaces = NULL; 457 458 if (!namespace || !strlen(namespace) || !strlen(nslist)) 459 bail("ns paths are empty"); 460 461 /* 462 * We have to open the file descriptors first, since after 463 * we join the mnt namespace we might no longer be able to 464 * access the paths. 465 */ 466 do { 467 int fd; 468 char *path; 469 struct namespace_t *ns; 470 471 /* Resize the namespace array. */ 472 namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); 473 if (!namespaces) 474 bail("failed to reallocate namespace array"); 475 ns = &namespaces[num - 1]; 476 477 /* Split 'ns:path'. */ 478 path = strstr(namespace, ":"); 479 if (!path) 480 bail("failed to parse %s", namespace); 481 *path++ = '\0'; 482 483 fd = open(path, O_RDONLY); 484 if (fd < 0) 485 bail("failed to open %s", path); 486 487 ns->fd = fd; 488 strncpy(ns->type, namespace, PATH_MAX - 1); 489 strncpy(ns->path, path, PATH_MAX - 1); 490 ns->path[PATH_MAX - 1] = '\0'; 491 } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); 492 493 /* 494 * The ordering in which we join namespaces is important. We should 495 * always join the user namespace *first*. This is all guaranteed 496 * from the container_linux.go side of this, so we're just going to 497 * follow the order given to us. 498 */ 499 500 for (i = 0; i < num; i++) { 501 struct namespace_t *ns = &namespaces[i]; 502 int flag = nsflag(ns->type); 503 504 write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); 505 if (setns(ns->fd, flag) < 0) 506 bail("failed to setns into %s namespace", ns->type); 507 508 close(ns->fd); 509 } 510 511 free(namespaces); 512 } 513 514 static inline int sane_kill(pid_t pid, int signum) 515 { 516 if (pid > 0) 517 return kill(pid, signum); 518 else 519 return 0; 520 } 521 522 void try_unshare(int flags, const char *msg) 523 { 524 write_log(DEBUG, "unshare %s", msg); 525 /* 526 * Kernels prior to v4.3 may return EINVAL on unshare when another process 527 * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this, 528 * retry on EINVAL a few times. 529 */ 530 int retries = 5; 531 for (; retries > 0; retries--) { 532 if (unshare(flags) == 0) { 533 return; 534 } 535 if (errno != EINVAL) 536 break; 537 } 538 bail("failed to unshare %s", msg); 539 } 540 541 static void update_timens_offsets(pid_t pid, char *map, size_t map_len) 542 { 543 if (map == NULL || map_len == 0) 544 return; 545 write_log(DEBUG, "update /proc/%d/timens_offsets to '%s'", pid, map); 546 if (write_file(map, map_len, "/proc/%d/timens_offsets", pid) < 0) 547 bail("failed to update /proc/%d/timens_offsets", pid); 548 } 549 550 void nsexec(void) 551 { 552 int pipenum; 553 jmp_buf env; 554 int sync_child_pipe[2], sync_grandchild_pipe[2]; 555 struct nlconfig_t config = { 0 }; 556 557 /* 558 * Setup a pipe to send logs to the parent. This should happen 559 * first, because bail will use that pipe. 560 */ 561 setup_logpipe(); 562 563 /* 564 * Get the init pipe fd from the environment. The init pipe is used to 565 * read the bootstrap data and tell the parent what the new pids are 566 * after the setup is done. 567 */ 568 pipenum = getenv_int("_LIBCONTAINER_INITPIPE"); 569 if (pipenum < 0) { 570 /* We are not a runc init. Just return to go runtime. */ 571 return; 572 } 573 574 /* 575 * Inform the parent we're past initial setup. 576 * For the other side of this, see initWaiter. 577 */ 578 if (write(pipenum, "", 1) != 1) 579 bail("could not inform the parent we are past initial setup"); 580 581 write_log(DEBUG, "=> nsexec container setup"); 582 583 /* Parse all of the netlink configuration. */ 584 nl_parse(pipenum, &config); 585 586 /* Set oom_score_adj. This has to be done before !dumpable because 587 * /proc/self/oom_score_adj is not writeable unless you're an privileged 588 * user (if !dumpable is set). All children inherit their parent's 589 * oom_score_adj value on fork(2) so this will always be propagated 590 * properly. 591 */ 592 update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); 593 594 /* 595 * Make the process non-dumpable, to avoid various race conditions that 596 * could cause processes in namespaces we're joining to access host 597 * resources (or potentially execute code). 598 * 599 * However, if the number of namespaces we are joining is 0, we are not 600 * going to be switching to a different security context. Thus setting 601 * ourselves to be non-dumpable only breaks things (like rootless 602 * containers), which is the recommendation from the kernel folks. 603 */ 604 if (config.namespaces) { 605 write_log(DEBUG, "set process as non-dumpable"); 606 if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) 607 bail("failed to set process as non-dumpable"); 608 } 609 610 /* Pipe so we can tell the child when we've finished setting up. */ 611 if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) 612 bail("failed to setup sync pipe between parent and child"); 613 614 /* 615 * We need a new socketpair to sync with grandchild so we don't have 616 * race condition with child. 617 */ 618 if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) 619 bail("failed to setup sync pipe between parent and grandchild"); 620 621 /* TODO: Currently we aren't dealing with child deaths properly. */ 622 623 /* 624 * Okay, so this is quite annoying. 625 * 626 * In order for this unsharing code to be more extensible we need to split 627 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case 628 * would be if we did clone(CLONE_NEWUSER) and the other namespaces 629 * separately, but because of SELinux issues we cannot really do that. But 630 * we cannot just dump the namespace flags into clone(...) because several 631 * usecases (such as rootless containers) require more granularity around 632 * the namespace setup. In addition, some older kernels had issues where 633 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot 634 * handle this while also dealing with SELinux so we choose SELinux support 635 * over broken kernel support). 636 * 637 * However, if we unshare(2) the user namespace *before* we clone(2), then 638 * all hell breaks loose. 639 * 640 * The parent no longer has permissions to do many things (unshare(2) drops 641 * all capabilities in your old namespace), and the container cannot be set 642 * up to have more than one {uid,gid} mapping. This is obviously less than 643 * ideal. In order to fix this, we have to first clone(2) and then unshare. 644 * 645 * Unfortunately, it's not as simple as that. We have to fork to enter the 646 * PID namespace (the PID namespace only applies to children). Since we'll 647 * have to double-fork, this clone_parent() call won't be able to get the 648 * PID of the _actual_ init process (without doing more synchronisation than 649 * I can deal with at the moment). So we'll just get the parent to send it 650 * for us, the only job of this process is to update 651 * /proc/pid/{setgroups,uid_map,gid_map}. 652 * 653 * And as a result of the above, we also need to setns(2) in the first child 654 * because if we join a PID namespace in the topmost parent then our child 655 * will be in that namespace (and it will not be able to give us a PID value 656 * that makes sense without resorting to sending things with cmsg). 657 * 658 * This also deals with an older issue caused by dumping cloneflags into 659 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so 660 * we have to unshare(2) before clone(2) in order to do this. This was fixed 661 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was 662 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're 663 * aware, the last mainline kernel which had this bug was Linux 3.12. 664 * However, we cannot comment on which kernels the broken patch was 665 * backported to. 666 * 667 * -- Aleksa "what has my life come to?" Sarai 668 */ 669 670 switch (setjmp(env)) { 671 /* 672 * Stage 0: We're in the parent. Our job is just to create a new child 673 * (stage 1: STAGE_CHILD) process and write its uid_map and 674 * gid_map. That process will go on to create a new process, then 675 * it will send us its PID which we will send to the bootstrap 676 * process. 677 */ 678 case STAGE_PARENT:{ 679 int len; 680 pid_t stage1_pid = -1, stage2_pid = -1; 681 bool stage1_complete, stage2_complete; 682 683 /* For debugging. */ 684 current_stage = STAGE_PARENT; 685 prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); 686 write_log(DEBUG, "~> nsexec stage-0"); 687 688 /* Start the process of getting a container. */ 689 write_log(DEBUG, "spawn stage-1"); 690 stage1_pid = clone_parent(&env, STAGE_CHILD); 691 if (stage1_pid < 0) 692 bail("unable to spawn stage-1"); 693 694 syncfd = sync_child_pipe[1]; 695 if (close(sync_child_pipe[0]) < 0) 696 bail("failed to close sync_child_pipe[0] fd"); 697 698 /* 699 * State machine for synchronisation with the children. We only 700 * return once both the child and grandchild are ready. 701 */ 702 write_log(DEBUG, "-> stage-1 synchronisation loop"); 703 stage1_complete = false; 704 while (!stage1_complete) { 705 enum sync_t s; 706 707 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 708 bail("failed to sync with stage-1: next state"); 709 710 switch (s) { 711 case SYNC_USERMAP_PLS: 712 write_log(DEBUG, "stage-1 requested userns mappings"); 713 714 /* 715 * Enable setgroups(2) if we've been asked to. But we also 716 * have to explicitly disable setgroups(2) if we're 717 * creating a rootless container for single-entry mapping. 718 * i.e. config.is_setgroup == false. 719 * (this is required since Linux 3.19). 720 * 721 * For rootless multi-entry mapping, config.is_setgroup shall be true and 722 * newuidmap/newgidmap shall be used. 723 */ 724 if (config.is_rootless_euid && !config.is_setgroup) 725 update_setgroups(stage1_pid, SETGROUPS_DENY); 726 727 /* Set up mappings. */ 728 update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); 729 update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); 730 731 s = SYNC_USERMAP_ACK; 732 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 733 sane_kill(stage1_pid, SIGKILL); 734 sane_kill(stage2_pid, SIGKILL); 735 bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); 736 } 737 break; 738 case SYNC_RECVPID_PLS: 739 write_log(DEBUG, "stage-1 requested pid to be forwarded"); 740 741 /* Get the stage-2 pid. */ 742 if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { 743 sane_kill(stage1_pid, SIGKILL); 744 bail("failed to sync with stage-1: read(stage2_pid)"); 745 } 746 747 /* Send ACK. */ 748 s = SYNC_RECVPID_ACK; 749 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 750 sane_kill(stage1_pid, SIGKILL); 751 sane_kill(stage2_pid, SIGKILL); 752 bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); 753 } 754 755 /* 756 * Send both the stage-1 and stage-2 pids back to runc. 757 * runc needs the stage-2 to continue process management, 758 * but because stage-1 was spawned with CLONE_PARENT we 759 * cannot reap it within stage-0 and thus we need to ask 760 * runc to reap the zombie for us. 761 */ 762 write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", 763 stage1_pid, stage2_pid); 764 len = 765 dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, 766 stage2_pid); 767 if (len < 0) { 768 sane_kill(stage1_pid, SIGKILL); 769 sane_kill(stage2_pid, SIGKILL); 770 bail("failed to sync with runc: write(pid-JSON)"); 771 } 772 break; 773 case SYNC_TIMEOFFSETS_PLS: 774 write_log(DEBUG, "stage-1 requested timens offsets to be configured"); 775 update_timens_offsets(stage1_pid, config.timensoffset, config.timensoffset_len); 776 s = SYNC_TIMEOFFSETS_ACK; 777 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 778 sane_kill(stage1_pid, SIGKILL); 779 bail("failed to sync with child: write(SYNC_TIMEOFFSETS_ACK)"); 780 } 781 break; 782 case SYNC_CHILD_FINISH: 783 write_log(DEBUG, "stage-1 complete"); 784 stage1_complete = true; 785 break; 786 default: 787 bail("unexpected sync value: %u", s); 788 } 789 } 790 write_log(DEBUG, "<- stage-1 synchronisation loop"); 791 792 /* Now sync with grandchild. */ 793 syncfd = sync_grandchild_pipe[1]; 794 if (close(sync_grandchild_pipe[0]) < 0) 795 bail("failed to close sync_grandchild_pipe[0] fd"); 796 797 write_log(DEBUG, "-> stage-2 synchronisation loop"); 798 stage2_complete = false; 799 while (!stage2_complete) { 800 enum sync_t s; 801 802 write_log(DEBUG, "signalling stage-2 to run"); 803 s = SYNC_GRANDCHILD; 804 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 805 sane_kill(stage2_pid, SIGKILL); 806 bail("failed to sync with child: write(SYNC_GRANDCHILD)"); 807 } 808 809 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 810 bail("failed to sync with child: next state"); 811 812 switch (s) { 813 case SYNC_CHILD_FINISH: 814 write_log(DEBUG, "stage-2 complete"); 815 stage2_complete = true; 816 break; 817 default: 818 bail("unexpected sync value: %u", s); 819 } 820 } 821 write_log(DEBUG, "<- stage-2 synchronisation loop"); 822 write_log(DEBUG, "<~ nsexec stage-0"); 823 exit(0); 824 } 825 break; 826 827 /* 828 * Stage 1: We're in the first child process. Our job is to join any 829 * provided namespaces in the netlink payload and unshare all of 830 * the requested namespaces. If we've been asked to CLONE_NEWUSER, 831 * we will ask our parent (stage 0) to set up our user mappings 832 * for us. Then, we create a new child (stage 2: STAGE_INIT) for 833 * PID namespace. We then send the child's PID to our parent 834 * (stage 0). 835 */ 836 case STAGE_CHILD:{ 837 pid_t stage2_pid = -1; 838 enum sync_t s; 839 840 /* For debugging. */ 841 current_stage = STAGE_CHILD; 842 843 /* We're in a child and thus need to tell the parent if we die. */ 844 syncfd = sync_child_pipe[0]; 845 if (close(sync_child_pipe[1]) < 0) 846 bail("failed to close sync_child_pipe[1] fd"); 847 848 /* For debugging. */ 849 prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); 850 write_log(DEBUG, "~> nsexec stage-1"); 851 852 /* 853 * We need to setns first. We cannot do this earlier (in stage 0) 854 * because of the fact that we forked to get here (the PID of 855 * [stage 2: STAGE_INIT]) would be meaningless). We could send it 856 * using cmsg(3) but that's just annoying. 857 */ 858 if (config.namespaces) 859 join_namespaces(config.namespaces); 860 861 /* 862 * Deal with user namespaces first. They are quite special, as they 863 * affect our ability to unshare other namespaces and are used as 864 * context for privilege checks. 865 * 866 * We don't unshare all namespaces in one go. The reason for this 867 * is that, while the kernel documentation may claim otherwise, 868 * there are certain cases where unsharing all namespaces at once 869 * will result in namespace objects being owned incorrectly. 870 * Ideally we should just fix these kernel bugs, but it's better to 871 * be safe than sorry, and fix them separately. 872 * 873 * A specific case of this is that the SELinux label of the 874 * internal kern-mount that mqueue uses will be incorrect if the 875 * UTS namespace is cloned before the USER namespace is mapped. 876 * I've also heard of similar problems with the network namespace 877 * in some scenarios. This also mirrors how LXC deals with this 878 * problem. 879 */ 880 if (config.cloneflags & CLONE_NEWUSER) { 881 try_unshare(CLONE_NEWUSER, "user namespace"); 882 config.cloneflags &= ~CLONE_NEWUSER; 883 884 /* 885 * We need to set ourselves as dumpable temporarily so that the 886 * parent process can write to our procfs files. 887 */ 888 if (config.namespaces) { 889 write_log(DEBUG, "temporarily set process as dumpable"); 890 if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) 891 bail("failed to temporarily set process as dumpable"); 892 } 893 894 /* 895 * We don't have the privileges to do any mapping here (see the 896 * clone_parent rant). So signal stage-0 to do the mapping for 897 * us. 898 */ 899 write_log(DEBUG, "request stage-0 to map user namespace"); 900 s = SYNC_USERMAP_PLS; 901 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) 902 bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); 903 904 /* ... wait for mapping ... */ 905 write_log(DEBUG, "request stage-0 to map user namespace"); 906 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 907 bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); 908 if (s != SYNC_USERMAP_ACK) 909 bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); 910 911 /* Revert temporary re-dumpable setting. */ 912 if (config.namespaces) { 913 write_log(DEBUG, "re-set process as non-dumpable"); 914 if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) 915 bail("failed to re-set process as non-dumpable"); 916 } 917 918 /* Become root in the namespace proper. */ 919 if (setresuid(0, 0, 0) < 0) 920 bail("failed to become root in user namespace"); 921 } 922 923 /* 924 * Unshare all of the namespaces. Now, it should be noted that this 925 * ordering might break in the future (especially with rootless 926 * containers). But for now, it's not possible to split this into 927 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. 928 * 929 * Note that we don't merge this with clone() because there were 930 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) 931 * was broken, so we'll just do it the long way anyway. 932 */ 933 try_unshare(config.cloneflags, "remaining namespaces"); 934 935 if (config.timensoffset) { 936 write_log(DEBUG, "request stage-0 to write timens offsets"); 937 938 s = SYNC_TIMEOFFSETS_PLS; 939 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) 940 bail("failed to sync with parent: write(SYNC_TIMEOFFSETS_PLS)"); 941 942 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 943 bail("failed to sync with parent: read(SYNC_TIMEOFFSETS_ACK)"); 944 if (s != SYNC_TIMEOFFSETS_ACK) 945 bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s); 946 } 947 948 /* 949 * TODO: What about non-namespace clone flags that we're dropping here? 950 * 951 * We fork again because of PID namespace, setns(2) or unshare(2) don't 952 * change the PID namespace of the calling process, because doing so 953 * would change the caller's idea of its own PID (as reported by getpid()), 954 * which would break many applications and libraries, so we must fork 955 * to actually enter the new PID namespace. 956 */ 957 write_log(DEBUG, "spawn stage-2"); 958 stage2_pid = clone_parent(&env, STAGE_INIT); 959 if (stage2_pid < 0) 960 bail("unable to spawn stage-2"); 961 962 /* Send the child to our parent, which knows what it's doing. */ 963 write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); 964 s = SYNC_RECVPID_PLS; 965 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 966 sane_kill(stage2_pid, SIGKILL); 967 bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); 968 } 969 if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { 970 sane_kill(stage2_pid, SIGKILL); 971 bail("failed to sync with parent: write(stage2_pid)"); 972 } 973 974 /* ... wait for parent to get the pid ... */ 975 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { 976 sane_kill(stage2_pid, SIGKILL); 977 bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); 978 } 979 if (s != SYNC_RECVPID_ACK) { 980 sane_kill(stage2_pid, SIGKILL); 981 bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); 982 } 983 984 write_log(DEBUG, "signal completion to stage-0"); 985 s = SYNC_CHILD_FINISH; 986 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 987 sane_kill(stage2_pid, SIGKILL); 988 bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); 989 } 990 991 /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ 992 write_log(DEBUG, "<~ nsexec stage-1"); 993 exit(0); 994 } 995 break; 996 997 /* 998 * Stage 2: We're the final child process, and the only process that will 999 * actually return to the Go runtime. Our job is to just do the 1000 * final cleanup steps and then return to the Go runtime to allow 1001 * init_linux.go to run. 1002 */ 1003 case STAGE_INIT:{ 1004 /* 1005 * We're inside the child now, having jumped from the 1006 * start_child() code after forking in the parent. 1007 */ 1008 enum sync_t s; 1009 1010 /* For debugging. */ 1011 current_stage = STAGE_INIT; 1012 1013 /* We're in a child and thus need to tell the parent if we die. */ 1014 syncfd = sync_grandchild_pipe[0]; 1015 if (close(sync_grandchild_pipe[1]) < 0) 1016 bail("failed to close sync_grandchild_pipe[1] fd"); 1017 1018 if (close(sync_child_pipe[0]) < 0) 1019 bail("failed to close sync_child_pipe[0] fd"); 1020 1021 /* For debugging. */ 1022 prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); 1023 write_log(DEBUG, "~> nsexec stage-2"); 1024 1025 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 1026 bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); 1027 if (s != SYNC_GRANDCHILD) 1028 bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); 1029 1030 if (setsid() < 0) 1031 bail("setsid failed"); 1032 1033 if (setuid(0) < 0) 1034 bail("setuid failed"); 1035 1036 if (setgid(0) < 0) 1037 bail("setgid failed"); 1038 1039 if (!config.is_rootless_euid && config.is_setgroup) { 1040 if (setgroups(0, NULL) < 0) 1041 bail("setgroups failed"); 1042 } 1043 1044 write_log(DEBUG, "signal completion to stage-0"); 1045 s = SYNC_CHILD_FINISH; 1046 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) 1047 bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); 1048 1049 /* Close sync pipes. */ 1050 if (close(sync_grandchild_pipe[0]) < 0) 1051 bail("failed to close sync_grandchild_pipe[0] fd"); 1052 1053 /* Free netlink data. */ 1054 nl_free(&config); 1055 1056 /* Finish executing, let the Go runtime take over. */ 1057 write_log(DEBUG, "<= nsexec container setup"); 1058 write_log(DEBUG, "booting up go runtime ..."); 1059 return; 1060 } 1061 break; 1062 default: 1063 bail("unexpected jump value"); 1064 } 1065 1066 /* Should never be reached. */ 1067 bail("should never be reached"); 1068 }