github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/runc/libcontainer/nsenter/nsexec.c (about) 1 #define _GNU_SOURCE 2 #include <endian.h> 3 #include <errno.h> 4 #include <fcntl.h> 5 #include <grp.h> 6 #include <sched.h> 7 #include <setjmp.h> 8 #include <signal.h> 9 #include <stdarg.h> 10 #include <stdbool.h> 11 #include <stdint.h> 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include <stdbool.h> 15 #include <string.h> 16 #include <unistd.h> 17 18 #include <sys/ioctl.h> 19 #include <sys/prctl.h> 20 #include <sys/socket.h> 21 #include <sys/types.h> 22 23 #include <linux/limits.h> 24 #include <linux/netlink.h> 25 #include <linux/types.h> 26 27 /* Get all of the CLONE_NEW* flags. */ 28 #include "namespace.h" 29 30 /* Synchronisation values. */ 31 enum sync_t { 32 SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ 33 SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ 34 SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ 35 SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ 36 37 /* XXX: This doesn't help with segfaults and other such issues. */ 38 SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ 39 }; 40 41 /* longjmp() arguments. */ 42 #define JUMP_PARENT 0x00 43 #define JUMP_CHILD 0xA0 44 #define JUMP_INIT 0xA1 45 46 /* JSON buffer. */ 47 #define JSON_MAX 4096 48 49 /* Assume the stack grows down, so arguments should be above it. */ 50 struct clone_t { 51 /* 52 * Reserve some space for clone() to locate arguments 53 * and retcode in this place 54 */ 55 char stack[4096] __attribute__ ((aligned(16))); 56 char stack_ptr[0]; 57 58 /* There's two children. This is used to execute the different code. */ 59 jmp_buf *env; 60 int jmpval; 61 }; 62 63 struct nlconfig_t { 64 char *data; 65 uint32_t cloneflags; 66 char *uidmap; 67 size_t uidmap_len; 68 char *gidmap; 69 size_t gidmap_len; 70 char *namespaces; 71 size_t namespaces_len; 72 uint8_t is_setgroup; 73 int consolefd; 74 }; 75 76 /* 77 * List of netlink message types sent to us as part of bootstrapping the init. 78 * These constants are defined in libcontainer/message_linux.go. 79 */ 80 #define INIT_MSG 62000 81 #define CLONE_FLAGS_ATTR 27281 82 #define CONSOLE_PATH_ATTR 27282 83 #define NS_PATHS_ATTR 27283 84 #define UIDMAP_ATTR 27284 85 #define GIDMAP_ATTR 27285 86 #define SETGROUP_ATTR 27286 87 88 /* 89 * Use the raw syscall for versions of glibc which don't include a function for 90 * it, namely (glibc 2.12). 91 */ 92 #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 93 # define _GNU_SOURCE 94 # include "syscall.h" 95 # if !defined(SYS_setns) && defined(__NR_setns) 96 # define SYS_setns __NR_setns 97 # endif 98 99 #ifndef SYS_setns 100 # error "setns(2) syscall not supported by glibc version" 101 #endif 102 103 int setns(int fd, int nstype) 104 { 105 return syscall(SYS_setns, fd, nstype); 106 } 107 #endif 108 109 /* XXX: This is ugly. */ 110 static int syncfd = -1; 111 112 /* TODO(cyphar): Fix this so it correctly deals with syncT. */ 113 #define bail(fmt, ...) \ 114 do { \ 115 int ret = __COUNTER__ + 1; \ 116 fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ 117 if (syncfd >= 0) { \ 118 enum sync_t s = SYNC_ERR; \ 119 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ 120 fprintf(stderr, "nsenter: failed: write(s)"); \ 121 if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ 122 fprintf(stderr, "nsenter: failed: write(ret)"); \ 123 } \ 124 exit(ret); \ 125 } while(0) 126 127 static int write_file(char *data, size_t data_len, char *pathfmt, ...) 128 { 129 int fd, len, ret = 0; 130 char path[PATH_MAX]; 131 132 va_list ap; 133 va_start(ap, pathfmt); 134 len = vsnprintf(path, PATH_MAX, pathfmt, ap); 135 va_end(ap); 136 if (len < 0) 137 return -1; 138 139 fd = open(path, O_RDWR); 140 if (fd < 0) { 141 ret = -1; 142 goto out; 143 } 144 145 len = write(fd, data, data_len); 146 if (len != data_len) { 147 ret = -1; 148 goto out; 149 } 150 151 out: 152 close(fd); 153 return ret; 154 } 155 156 enum policy_t { 157 SETGROUPS_DEFAULT = 0, 158 SETGROUPS_ALLOW, 159 SETGROUPS_DENY, 160 }; 161 162 /* This *must* be called before we touch gid_map. */ 163 static void update_setgroups(int pid, enum policy_t setgroup) 164 { 165 char *policy; 166 167 switch (setgroup) { 168 case SETGROUPS_ALLOW: 169 policy = "allow"; 170 break; 171 case SETGROUPS_DENY: 172 policy = "deny"; 173 break; 174 case SETGROUPS_DEFAULT: 175 /* Nothing to do. */ 176 return; 177 } 178 179 if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { 180 /* 181 * If the kernel is too old to support /proc/pid/setgroups, 182 * open(2) or write(2) will return ENOENT. This is fine. 183 */ 184 if (errno != ENOENT) 185 bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); 186 } 187 } 188 189 static void update_uidmap(int pid, char *map, int map_len) 190 { 191 if (map == NULL || map_len <= 0) 192 return; 193 194 if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) 195 bail("failed to update /proc/%d/uid_map", pid); 196 } 197 198 static void update_gidmap(int pid, char *map, int map_len) 199 { 200 if (map == NULL || map_len <= 0) 201 return; 202 203 if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) 204 bail("failed to update /proc/%d/gid_map", pid); 205 } 206 207 /* A dummy function that just jumps to the given jumpval. */ 208 static int child_func(void *arg) __attribute__ ((noinline)); 209 static int child_func(void *arg) 210 { 211 struct clone_t *ca = (struct clone_t *)arg; 212 longjmp(*ca->env, ca->jmpval); 213 } 214 215 static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); 216 static int clone_parent(jmp_buf *env, int jmpval) 217 { 218 struct clone_t ca = { 219 .env = env, 220 .jmpval = jmpval, 221 }; 222 223 return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); 224 } 225 226 /* 227 * Gets the init pipe fd from the environment, which is used to read the 228 * bootstrap data and tell the parent what the new pid is after we finish 229 * setting up the environment. 230 */ 231 static int initpipe(void) 232 { 233 int pipenum; 234 char *initpipe, *endptr; 235 236 initpipe = getenv("_LIBCONTAINER_INITPIPE"); 237 if (initpipe == NULL || *initpipe == '\0') 238 return -1; 239 240 pipenum = strtol(initpipe, &endptr, 10); 241 if (*endptr != '\0') 242 bail("unable to parse _LIBCONTAINER_INITPIPE"); 243 244 return pipenum; 245 } 246 247 /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ 248 static int nsflag(char *name) 249 { 250 if (!strcmp(name, "cgroup")) 251 return CLONE_NEWCGROUP; 252 else if (!strcmp(name, "ipc")) 253 return CLONE_NEWIPC; 254 else if (!strcmp(name, "mnt")) 255 return CLONE_NEWNS; 256 else if (!strcmp(name, "net")) 257 return CLONE_NEWNET; 258 else if (!strcmp(name, "pid")) 259 return CLONE_NEWPID; 260 else if (!strcmp(name, "user")) 261 return CLONE_NEWUSER; 262 else if (!strcmp(name, "uts")) 263 return CLONE_NEWUTS; 264 265 /* If we don't recognise a name, fallback to 0. */ 266 return 0; 267 } 268 269 static uint32_t readint32(char *buf) 270 { 271 return *(uint32_t *) buf; 272 } 273 274 static uint8_t readint8(char *buf) 275 { 276 return *(uint8_t *) buf; 277 } 278 279 static void nl_parse(int fd, struct nlconfig_t *config) 280 { 281 size_t len, size; 282 struct nlmsghdr hdr; 283 char *data, *current; 284 285 /* Retrieve the netlink header. */ 286 len = read(fd, &hdr, NLMSG_HDRLEN); 287 if (len != NLMSG_HDRLEN) 288 bail("invalid netlink header length %lu", len); 289 290 if (hdr.nlmsg_type == NLMSG_ERROR) 291 bail("failed to read netlink message"); 292 293 if (hdr.nlmsg_type != INIT_MSG) 294 bail("unexpected msg type %d", hdr.nlmsg_type); 295 296 /* Retrieve data. */ 297 size = NLMSG_PAYLOAD(&hdr, 0); 298 current = data = malloc(size); 299 if (!data) 300 bail("failed to allocate %zu bytes of memory for nl_payload", size); 301 302 len = read(fd, data, size); 303 if (len != size) 304 bail("failed to read netlink payload, %lu != %lu", len, size); 305 306 /* Parse the netlink payload. */ 307 config->data = data; 308 config->consolefd = -1; 309 while (current < data + size) { 310 struct nlattr *nlattr = (struct nlattr *)current; 311 size_t payload_len = nlattr->nla_len - NLA_HDRLEN; 312 313 /* Advance to payload. */ 314 current += NLA_HDRLEN; 315 316 /* Handle payload. */ 317 switch (nlattr->nla_type) { 318 case CLONE_FLAGS_ATTR: 319 config->cloneflags = readint32(current); 320 break; 321 case CONSOLE_PATH_ATTR: 322 /* 323 * We open the console here because we currently evaluate console 324 * paths from the *host* namespaces. 325 */ 326 config->consolefd = open(current, O_RDWR); 327 if (config->consolefd < 0) 328 bail("failed to open console %s", current); 329 break; 330 case NS_PATHS_ATTR: 331 config->namespaces = current; 332 config->namespaces_len = payload_len; 333 break; 334 case UIDMAP_ATTR: 335 config->uidmap = current; 336 config->uidmap_len = payload_len; 337 break; 338 case GIDMAP_ATTR: 339 config->gidmap = current; 340 config->gidmap_len = payload_len; 341 break; 342 case SETGROUP_ATTR: 343 config->is_setgroup = readint8(current); 344 break; 345 default: 346 bail("unknown netlink message type %d", nlattr->nla_type); 347 } 348 349 current += NLA_ALIGN(payload_len); 350 } 351 } 352 353 void nl_free(struct nlconfig_t *config) 354 { 355 free(config->data); 356 } 357 358 void join_namespaces(char *nslist) 359 { 360 int num = 0, i; 361 char *saveptr = NULL; 362 char *namespace = strtok_r(nslist, ",", &saveptr); 363 struct namespace_t { 364 int fd; 365 int ns; 366 char type[PATH_MAX]; 367 char path[PATH_MAX]; 368 } *namespaces = NULL; 369 370 if (!namespace || !strlen(namespace) || !strlen(nslist)) 371 bail("ns paths are empty"); 372 373 /* 374 * We have to open the file descriptors first, since after 375 * we join the mnt namespace we might no longer be able to 376 * access the paths. 377 */ 378 do { 379 int fd; 380 char *path; 381 struct namespace_t *ns; 382 383 /* Resize the namespace array. */ 384 namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); 385 if (!namespaces) 386 bail("failed to reallocate namespace array"); 387 ns = &namespaces[num - 1]; 388 389 /* Split 'ns:path'. */ 390 path = strstr(namespace, ":"); 391 if (!path) 392 bail("failed to parse %s", namespace); 393 *path++ = '\0'; 394 395 fd = open(path, O_RDONLY); 396 if (fd < 0) 397 bail("failed to open %s", namespace); 398 399 ns->fd = fd; 400 ns->ns = nsflag(namespace); 401 strncpy(ns->path, path, PATH_MAX); 402 } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); 403 404 /* 405 * The ordering in which we join namespaces is important. We should 406 * always join the user namespace *first*. This is all guaranteed 407 * from the container_linux.go side of this, so we're just going to 408 * follow the order given to us. 409 */ 410 411 for (i = 0; i < num; i++) { 412 struct namespace_t ns = namespaces[i]; 413 414 if (setns(ns.fd, ns.ns) < 0) 415 bail("failed to setns to %s", ns.path); 416 417 close(ns.fd); 418 } 419 420 free(namespaces); 421 } 422 423 void nsexec(void) 424 { 425 int pipenum; 426 jmp_buf env; 427 int syncpipe[2]; 428 struct nlconfig_t config = {0}; 429 430 /* 431 * If we don't have an init pipe, just return to the go routine. 432 * We'll only get an init pipe for start or exec. 433 */ 434 pipenum = initpipe(); 435 if (pipenum == -1) 436 return; 437 438 /* Parse all of the netlink configuration. */ 439 nl_parse(pipenum, &config); 440 441 /* Pipe so we can tell the child when we've finished setting up. */ 442 if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) 443 bail("failed to setup sync pipe between parent and child"); 444 445 /* TODO: Currently we aren't dealing with child deaths properly. */ 446 447 /* 448 * Okay, so this is quite annoying. 449 * 450 * In order for this unsharing code to be more extensible we need to split 451 * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case 452 * would be if we did clone(CLONE_NEWUSER) and the other namespaces 453 * separately, but because of SELinux issues we cannot really do that. But 454 * we cannot just dump the namespace flags into clone(...) because several 455 * usecases (such as rootless containers) require more granularity around 456 * the namespace setup. In addition, some older kernels had issues where 457 * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot 458 * handle this while also dealing with SELinux so we choose SELinux support 459 * over broken kernel support). 460 * 461 * However, if we unshare(2) the user namespace *before* we clone(2), then 462 * all hell breaks loose. 463 * 464 * The parent no longer has permissions to do many things (unshare(2) drops 465 * all capabilities in your old namespace), and the container cannot be set 466 * up to have more than one {uid,gid} mapping. This is obviously less than 467 * ideal. In order to fix this, we have to first clone(2) and then unshare. 468 * 469 * Unfortunately, it's not as simple as that. We have to fork to enter the 470 * PID namespace (the PID namespace only applies to children). Since we'll 471 * have to double-fork, this clone_parent() call won't be able to get the 472 * PID of the _actual_ init process (without doing more synchronisation than 473 * I can deal with at the moment). So we'll just get the parent to send it 474 * for us, the only job of this process is to update 475 * /proc/pid/{setgroups,uid_map,gid_map}. 476 * 477 * And as a result of the above, we also need to setns(2) in the first child 478 * because if we join a PID namespace in the topmost parent then our child 479 * will be in that namespace (and it will not be able to give us a PID value 480 * that makes sense without resorting to sending things with cmsg). 481 * 482 * This also deals with an older issue caused by dumping cloneflags into 483 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so 484 * we have to unshare(2) before clone(2) in order to do this. This was fixed 485 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was 486 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're 487 * aware, the last mainline kernel which had this bug was Linux 3.12. 488 * However, we cannot comment on which kernels the broken patch was 489 * backported to. 490 * 491 * -- Aleksa "what has my life come to?" Sarai 492 */ 493 494 switch (setjmp(env)) { 495 /* 496 * Stage 0: We're in the parent. Our job is just to create a new child 497 * (stage 1: JUMP_CHILD) process and write its uid_map and 498 * gid_map. That process will go on to create a new process, then 499 * it will send us its PID which we will send to the bootstrap 500 * process. 501 */ 502 case JUMP_PARENT: { 503 int len; 504 pid_t child; 505 char buf[JSON_MAX]; 506 507 /* For debugging. */ 508 prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); 509 510 /* Start the process of getting a container. */ 511 child = clone_parent(&env, JUMP_CHILD); 512 if (child < 0) 513 bail("unable to fork: child_func"); 514 515 /* State machine for synchronisation with the children. */ 516 while (true) { 517 enum sync_t s; 518 519 /* This doesn't need to be global, we're in the parent. */ 520 int syncfd = syncpipe[1]; 521 522 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 523 bail("failed to sync with child: next state"); 524 525 switch (s) { 526 case SYNC_ERR: { 527 /* We have to mirror the error code of the child. */ 528 int ret; 529 530 if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) 531 bail("failed to sync with child: read(error code)"); 532 533 exit(ret); 534 } 535 break; 536 case SYNC_USERMAP_PLS: 537 /* Enable setgroups(2) if we've been asked to. */ 538 if (config.is_setgroup) 539 update_setgroups(child, SETGROUPS_ALLOW); 540 541 /* Set up mappings. */ 542 update_uidmap(child, config.uidmap, config.uidmap_len); 543 update_gidmap(child, config.gidmap, config.gidmap_len); 544 545 s = SYNC_USERMAP_ACK; 546 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 547 kill(child, SIGKILL); 548 bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); 549 } 550 break; 551 case SYNC_USERMAP_ACK: 552 /* We should _never_ receive acks. */ 553 kill(child, SIGKILL); 554 bail("failed to sync with child: unexpected SYNC_USERMAP_ACK"); 555 break; 556 case SYNC_RECVPID_PLS: { 557 pid_t old = child; 558 559 /* Get the init_func pid. */ 560 if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { 561 kill(old, SIGKILL); 562 bail("failed to sync with child: read(childpid)"); 563 } 564 565 /* Send ACK. */ 566 s = SYNC_RECVPID_ACK; 567 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 568 kill(old, SIGKILL); 569 kill(child, SIGKILL); 570 bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); 571 } 572 } 573 574 /* Leave the loop. */ 575 goto out; 576 case SYNC_RECVPID_ACK: 577 /* We should _never_ receive acks. */ 578 kill(child, SIGKILL); 579 bail("failed to sync with child: unexpected SYNC_RECVPID_ACK"); 580 break; 581 } 582 } 583 584 out: 585 /* Send the init_func pid back to our parent. */ 586 len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); 587 if (len < 0) { 588 kill(child, SIGKILL); 589 bail("unable to generate JSON for child pid"); 590 } 591 if (write(pipenum, buf, len) != len) { 592 kill(child, SIGKILL); 593 bail("unable to send child pid to bootstrapper"); 594 } 595 596 exit(0); 597 } 598 599 /* 600 * Stage 1: We're in the first child process. Our job is to join any 601 * provided namespaces in the netlink payload and unshare all 602 * of the requested namespaces. If we've been asked to 603 * CLONE_NEWUSER, we will ask our parent (stage 0) to set up 604 * our user mappings for us. Then, we create a new child 605 * (stage 2: JUMP_INIT) for PID namespace. We then send the 606 * child's PID to our parent (stage 0). 607 */ 608 case JUMP_CHILD: { 609 pid_t child; 610 enum sync_t s; 611 612 /* We're in a child and thus need to tell the parent if we die. */ 613 syncfd = syncpipe[0]; 614 615 /* For debugging. */ 616 prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); 617 618 /* 619 * We need to setns first. We cannot do this earlier (in stage 0) 620 * because of the fact that we forked to get here (the PID of 621 * [stage 2: JUMP_INIT]) would be meaningless). We could send it 622 * using cmsg(3) but that's just annoying. 623 */ 624 if (config.namespaces) 625 join_namespaces(config.namespaces); 626 627 /* 628 * Unshare all of the namespaces. Now, it should be noted that this 629 * ordering might break in the future (especially with rootless 630 * containers). But for now, it's not possible to split this into 631 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. 632 * 633 * Note that we don't merge this with clone() because there were 634 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) 635 * was broken, so we'll just do it the long way anyway. 636 */ 637 if (unshare(config.cloneflags) < 0) 638 bail("failed to unshare namespaces"); 639 640 /* 641 * Deal with user namespaces first. They are quite special, as they 642 * affect our ability to unshare other namespaces and are used as 643 * context for privilege checks. 644 */ 645 if (config.cloneflags & CLONE_NEWUSER) { 646 /* 647 * We don't have the privileges to do any mapping here (see the 648 * clone_parent rant). So signal our parent to hook us up. 649 */ 650 651 s = SYNC_USERMAP_PLS; 652 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) 653 bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); 654 655 /* ... wait for mapping ... */ 656 657 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) 658 bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); 659 if (s != SYNC_USERMAP_ACK) 660 bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); 661 } 662 663 /* 664 * TODO: What about non-namespace clone flags that we're dropping here? 665 * 666 * We fork again because of PID namespace, setns(2) or unshare(2) don't 667 * change the PID namespace of the calling process, because doing so 668 * would change the caller's idea of its own PID (as reported by getpid()), 669 * which would break many applications and libraries, so we must fork 670 * to actually enter the new PID namespace. 671 */ 672 child = clone_parent(&env, JUMP_INIT); 673 if (child < 0) 674 bail("unable to fork: init_func"); 675 676 /* Send the child to our parent, which knows what it's doing. */ 677 s = SYNC_RECVPID_PLS; 678 if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { 679 kill(child, SIGKILL); 680 bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); 681 } 682 if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { 683 kill(child, SIGKILL); 684 bail("failed to sync with parent: write(childpid)"); 685 } 686 687 /* ... wait for parent to get the pid ... */ 688 689 if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { 690 kill(child, SIGKILL); 691 bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); 692 } 693 if (s != SYNC_RECVPID_ACK) { 694 kill(child, SIGKILL); 695 bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); 696 } 697 698 /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ 699 exit(0); 700 } 701 702 /* 703 * Stage 2: We're the final child process, and the only process that will 704 * actually return to the Go runtime. Our job is to just do the 705 * final cleanup steps and then return to the Go runtime to allow 706 * init_linux.go to run. 707 */ 708 case JUMP_INIT: { 709 /* 710 * We're inside the child now, having jumped from the 711 * start_child() code after forking in the parent. 712 */ 713 int consolefd = config.consolefd; 714 715 /* We're in a child and thus need to tell the parent if we die. */ 716 syncfd = syncpipe[0]; 717 718 /* For debugging. */ 719 prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); 720 721 if (setsid() < 0) 722 bail("setsid failed"); 723 724 if (setuid(0) < 0) 725 bail("setuid failed"); 726 727 if (setgid(0) < 0) 728 bail("setgid failed"); 729 730 if (setgroups(0, NULL) < 0) 731 bail("setgroups failed"); 732 733 if (consolefd != -1) { 734 if (ioctl(consolefd, TIOCSCTTY, 0) < 0) 735 bail("ioctl TIOCSCTTY failed"); 736 if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) 737 bail("failed to dup stdin"); 738 if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) 739 bail("failed to dup stdout"); 740 if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) 741 bail("failed to dup stderr"); 742 } 743 744 /* Close sync pipes. */ 745 close(syncpipe[0]); 746 close(syncpipe[1]); 747 748 /* Free netlink data. */ 749 nl_free(&config); 750 751 /* Finish executing, let the Go runtime take over. */ 752 return; 753 } 754 default: 755 bail("unexpected jump value"); 756 break; 757 } 758 759 /* Should never be reached. */ 760 bail("should never be reached"); 761 }