github.com/apptainer/singularity@v3.1.1+incompatible/cmd/starter/c/starter.c (about) 1 /* 2 Copyright (c) 2018-2019, Sylabs, Inc. All rights reserved. 3 4 This software is licensed under a 3-clause BSD license. Please 5 consult LICENSE.md file distributed with the sources of this project regarding 6 your rights to use or distribute this software. 7 */ 8 9 10 #define _GNU_SOURCE 11 #include <stdio.h> 12 #include <stdlib.h> 13 #include <stdarg.h> 14 #include <unistd.h> 15 #include <errno.h> 16 #include <ctype.h> 17 #include <string.h> 18 #include <fcntl.h> 19 #include <poll.h> 20 #include <grp.h> 21 #include <link.h> 22 #include <dirent.h> 23 #include <libgen.h> 24 #include <limits.h> 25 #include <sys/mman.h> 26 #include <sys/fsuid.h> 27 #include <sys/mount.h> 28 #include <sys/wait.h> 29 #include <sys/prctl.h> 30 #include <sys/socket.h> 31 #include <sys/stat.h> 32 #include <signal.h> 33 #include <sched.h> 34 #include <setjmp.h> 35 #include <sys/syscall.h> 36 #include <net/if.h> 37 #include <sys/eventfd.h> 38 39 #ifdef SINGULARITY_SECUREBITS 40 # include <linux/securebits.h> 41 #else 42 # include "include/securebits.h" 43 #endif /* SINGULARITY_SECUREBITS */ 44 45 #ifndef PR_SET_NO_NEW_PRIVS 46 #define PR_SET_NO_NEW_PRIVS 38 47 #endif 48 49 #ifndef PR_GET_NO_NEW_PRIVS 50 #define PR_GET_NO_NEW_PRIVS 39 51 #endif 52 53 #ifndef CLONE_NEWUSER 54 #define CLONE_NEWUSER 0x10000000 55 #endif 56 57 #ifndef CLONE_NEWCGROUP 58 #define CLONE_NEWCGROUP 0x02000000 59 #endif 60 61 #include "include/capability.h" 62 #include "include/message.h" 63 #include "include/starter.h" 64 65 #define CLONE_STACK_SIZE 1024*1024 66 #define BUFSIZE 512 67 68 /* C and JSON configuration */ 69 struct cConfig *config; 70 71 /* Socket process communication */ 72 int rpc_socket[2] = {-1, -1}; 73 int master_socket[2] = {-1, -1}; 74 75 #define STAGE1 1 76 #define STAGE2 2 77 #define MASTER 3 78 #define RPC_SERVER 4 79 80 unsigned char execute; 81 82 typedef struct fork_state_s { 83 sigjmp_buf env; 84 } fork_state_t; 85 86 /* copy paste from singularity code */ 87 static int clone_fn(void *data_ptr) { 88 fork_state_t *state = (fork_state_t *)data_ptr; 89 siglongjmp(state->env, 1); 90 } 91 92 static int fork_ns(unsigned int flags) { 93 fork_state_t state; 94 95 if ( sigsetjmp(state.env, 1) ) { 96 return 0; 97 } 98 99 int stack_size = CLONE_STACK_SIZE; 100 char *child_stack_ptr = malloc(stack_size); 101 if ( child_stack_ptr == 0 ) { 102 errno = ENOMEM; 103 return -1; 104 } 105 child_stack_ptr += stack_size; 106 107 int retval = clone(clone_fn, child_stack_ptr, (SIGCHLD|flags), &state); 108 return retval; 109 } 110 111 static void priv_escalate(void) { 112 verbosef("Get root privileges\n"); 113 if ( seteuid(0) < 0 ) { 114 fatalf("Failed to set effective UID to 0\n"); 115 } 116 } 117 118 static void set_parent_death_signal(int signo) { 119 debugf("Set parent death signal to %d\n", signo); 120 if ( prctl(PR_SET_PDEATHSIG, signo) < 0 ) { 121 fatalf("Failed to set parent death signal\n"); 122 } 123 } 124 125 static int prepare_stage(int stage, struct cConfig *config) { 126 uid_t uid = getuid(); 127 struct __user_cap_header_struct header; 128 struct __user_cap_data_struct data[2]; 129 130 set_parent_death_signal(SIGKILL); 131 132 debugf("Entering in stage %d\n", stage); 133 134 header.version = LINUX_CAPABILITY_VERSION; 135 header.pid = 0; 136 137 if ( capget(&header, data) < 0 ) { 138 fatalf("Failed to get processus capabilities\n"); 139 } 140 141 data[1].inheritable = (__u32)(config->capabilities.inheritable >> 32); 142 data[0].inheritable = (__u32)(config->capabilities.inheritable & 0xFFFFFFFF); 143 data[1].permitted = (__u32)(config->capabilities.permitted >> 32); 144 data[0].permitted = (__u32)(config->capabilities.permitted & 0xFFFFFFFF); 145 data[1].effective = (__u32)(config->capabilities.effective >> 32); 146 data[0].effective = (__u32)(config->capabilities.effective & 0xFFFFFFFF); 147 148 int last_cap; 149 for ( last_cap = CAPSET_MAX; ; last_cap-- ) { 150 if ( prctl(PR_CAPBSET_READ, last_cap) > 0 || last_cap == 0 ) { 151 break; 152 } 153 } 154 155 int caps_index; 156 for ( caps_index = 0; caps_index <= last_cap; caps_index++ ) { 157 if ( !(config->capabilities.bounding & (1ULL << caps_index)) ) { 158 if ( prctl(PR_CAPBSET_DROP, caps_index) < 0 ) { 159 fatalf("Failed to drop bounding capabilities set: %s\n", strerror(errno)); 160 } 161 } 162 } 163 164 if ( !(config->namespace.flags & CLONE_NEWUSER) ) { 165 /* apply target UID/GID for root user */ 166 if ( uid == 0 ) { 167 if ( config->container.numGID != 0 || config->container.targetUID != 0 ) { 168 if ( prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP|SECBIT_NO_SETUID_FIXUP_LOCKED) < 0 ) { 169 fatalf("Failed to set securebits: %s\n", strerror(errno)); 170 } 171 } 172 173 if ( config->container.numGID != 0 ) { 174 debugf("Clear additional group IDs\n"); 175 176 if ( setgroups(0, NULL) < 0 ) { 177 fatalf("Unable to clear additional group IDs: %s\n", strerror(errno)); 178 } 179 } 180 181 if ( config->container.numGID >= 2 ) { 182 debugf("Set additional group IDs\n"); 183 184 if ( setgroups(config->container.numGID-1, &config->container.targetGID[1]) < 0 ) { 185 fatalf("Failed to set additional groups: %s\n", strerror(errno)); 186 } 187 } 188 if ( config->container.numGID >= 1 ) { 189 gid_t targetGID = config->container.targetGID[0]; 190 191 debugf("Set main group ID\n"); 192 193 if ( setresgid(targetGID, targetGID, targetGID) < 0 ) { 194 fatalf("Failed to set GID %d: %s\n", targetGID, strerror(errno)); 195 } 196 } 197 if ( config->container.targetUID != 0 ) { 198 uid_t targetUID = config->container.targetUID; 199 200 debugf("Set user ID to %d\n", targetUID); 201 202 if ( setresuid(targetUID, targetUID, targetUID) < 0 ) { 203 fatalf("Failed to drop privileges: %s\n", strerror(errno)); 204 } 205 } 206 } else if ( config->container.isSuid ) { 207 if ( prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP|SECBIT_NO_SETUID_FIXUP_LOCKED) < 0 ) { 208 fatalf("Failed to set securebits: %s\n", strerror(errno)); 209 } 210 211 if ( setresuid(uid, uid, uid) < 0 ) { 212 fatalf("Failed to drop privileges: %s\n", strerror(errno)); 213 } 214 } 215 216 set_parent_death_signal(SIGKILL); 217 } 218 219 if ( config->container.noNewPrivs ) { 220 if ( prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0 ) { 221 fatalf("Failed to set no new privs flag: %s\n", strerror(errno)); 222 } 223 if ( prctl(PR_GET_NO_NEW_PRIVS, 0, 0 ,0, 0) != 1 ) { 224 fatalf("Aborting, failed to set no new privs flag: %s\n", strerror(errno)); 225 } 226 } 227 228 if ( capset(&header, data) < 0 ) { 229 fatalf("Failed to set process capabilities\n"); 230 } 231 232 #ifdef USER_CAPABILITIES 233 // set ambient capabilities if supported 234 for ( caps_index = 0; caps_index <= last_cap; caps_index++ ) { 235 if ( (config->capabilities.ambient & (1ULL << caps_index)) ) { 236 if ( prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, caps_index, 0, 0) < 0 ) { 237 fatalf("Failed to set ambient capability: %s\n", strerror(errno)); 238 } 239 } 240 } 241 #endif 242 243 return stage; 244 } 245 246 static int create_namespace(int nstype) { 247 switch(nstype) { 248 case CLONE_NEWNET: 249 #ifdef NS_CLONE_NEWNET 250 verbosef("Create network namespace\n"); 251 #else 252 warningf("Skipping network namespace creation, not supported\n"); 253 return(0); 254 #endif /* NS_CLONE_NEWNET */ 255 break; 256 case CLONE_NEWIPC: 257 #ifdef NS_CLONE_NEWIPC 258 verbosef("Create ipc namespace\n"); 259 #else 260 warningf("Skipping ipc namespace creation, not supported\n"); 261 return(0); 262 #endif /* NS_CLONE_NEWIPC */ 263 break; 264 case CLONE_NEWNS: 265 #ifdef NS_CLONE_NEWNS 266 verbosef("Create mount namespace\n"); 267 #else 268 warningf("Skipping mount namespace creation, not supported\n"); 269 return(0); 270 #endif /* NS_CLONE_NEWNS */ 271 break; 272 case CLONE_NEWUTS: 273 #ifdef NS_CLONE_NEWUTS 274 verbosef("Create uts namespace\n"); 275 #else 276 warningf("Skipping uts namespace creation, not supported\n"); 277 return(0); 278 #endif /* NS_CLONE_NEWUTS */ 279 break; 280 case CLONE_NEWUSER: 281 #ifdef NS_CLONE_NEWUSER 282 verbosef("Create user namespace\n"); 283 #else 284 warningf("Skipping user namespace creation, not supported\n"); 285 #endif /* NS_CLONE_NEWUSER */ 286 break; 287 #ifdef NS_CLONE_NEWCGROUP 288 case CLONE_NEWCGROUP: 289 verbosef("Create cgroup namespace\n"); 290 break; 291 #endif /* NS_CLONE_NEWCGROUP */ 292 default: 293 warningf("Skipping unknown namespace creation\n"); 294 errno = EINVAL; 295 return(-1); 296 } 297 return unshare(nstype); 298 } 299 300 static int enter_namespace(char *nspath, int nstype) { 301 int ns_fd; 302 303 switch(nstype) { 304 case CLONE_NEWPID: 305 verbosef("Entering in pid namespace\n"); 306 #ifndef NS_CLONE_NEWPID 307 errno = EINVAL; 308 return(-1); 309 #endif /* NS_CLONE_NEWPID */ 310 break; 311 case CLONE_NEWNET: 312 verbosef("Entering in network namespace\n"); 313 #ifndef NS_CLONE_NEWNET 314 errno = EINVAL; 315 return(-1); 316 #endif /* NS_CLONE_NEWNET */ 317 break; 318 case CLONE_NEWIPC: 319 verbosef("Entering in ipc namespace\n"); 320 #ifndef NS_CLONE_NEWIPC 321 errno = EINVAL; 322 return(-1); 323 #endif /* NS_CLONE_NEWIPC */ 324 break; 325 case CLONE_NEWNS: 326 verbosef("Entering in mount namespace\n"); 327 #ifndef NS_CLONE_NEWNS 328 errno = EINVAL; 329 return(-1); 330 #endif /* NS_CLONE_NEWNS */ 331 break; 332 case CLONE_NEWUTS: 333 verbosef("Entering in uts namespace\n"); 334 #ifndef NS_CLONE_NEWUTS 335 errno = EINVAL; 336 return(-1); 337 #endif /* NS_CLONE_NEWUTS */ 338 break; 339 case CLONE_NEWUSER: 340 verbosef("Entering in user namespace\n"); 341 #ifndef NS_CLONE_NEWUSER 342 errno = EINVAL; 343 return(-1); 344 #endif /* NS_CLONE_NEWUSER */ 345 break; 346 #ifdef NS_CLONE_NEWCGROUP 347 case CLONE_NEWCGROUP: 348 verbosef("Entering in cgroup namespace\n"); 349 break; 350 #endif /* NS_CLONE_NEWCGROUP */ 351 default: 352 verbosef("Entering in unknown namespace\n"); 353 errno = EINVAL; 354 return(-1); 355 } 356 357 debugf("Opening namespace file descriptor %s\n", nspath); 358 ns_fd = open(nspath, O_RDONLY); 359 if ( ns_fd < 0 ) { 360 return(-1); 361 } 362 363 if ( setns(ns_fd, nstype) < 0 ) { 364 int err = errno; 365 close(ns_fd); 366 errno = err; 367 return(-1); 368 } 369 370 close(ns_fd); 371 return(0); 372 } 373 374 static void setup_userns_mappings(struct cConfig *config, pid_t pid, const char *setgroup) { 375 FILE *map_fp; 376 int i; 377 struct idMapping *uidmap; 378 struct idMapping *gidmap; 379 char *path = (char *)malloc(PATH_MAX); 380 381 debugf("Write %s to set group file\n", setgroup); 382 memset(path, 0, PATH_MAX); 383 if ( snprintf(path, PATH_MAX-1, "/proc/%d/setgroups", pid) < 0 ) { 384 fatalf("Failed to write path /proc/%d/setgroups in buffer\n", pid); 385 } 386 387 map_fp = fopen(path, "w+"); // Flawfinder: ignore 388 if ( map_fp != NULL ) { 389 fprintf(map_fp, "%s\n", setgroup); 390 if ( fclose(map_fp) < 0 ) { 391 fatalf("Failed to write %s to setgroup file: %s\n", setgroup, strerror(errno)); 392 } 393 } else { 394 fatalf("Could not write info to setgroups: %s\n", strerror(errno)); 395 } 396 397 debugf("Write to GID map\n"); 398 memset(path, 0, PATH_MAX); 399 if ( snprintf(path, PATH_MAX-1, "/proc/%d/gid_map", pid) < 0 ) { 400 fatalf("Failed to write path /proc/%d/gid_map in buffer\n", pid); 401 } 402 403 map_fp = fopen(path, "w+"); // Flawfinder: ignore 404 if ( map_fp != NULL ) { 405 fprintf(map_fp, "%s", config->container.gidMap); 406 if ( fclose(map_fp) < 0 ) { 407 fatalf("Failed to write to GID map: %s\n", strerror(errno)); 408 } 409 } else { 410 fatalf("Could not write parent info to gid_map: %s\n", strerror(errno)); 411 } 412 413 debugf("Write to UID map\n"); 414 memset(path, 0, PATH_MAX); 415 if ( snprintf(path, PATH_MAX-1, "/proc/%d/uid_map", pid) < 0 ) { 416 fatalf("Failed to write path /proc/%d/uid_map in buffer\n", pid); 417 } 418 419 map_fp = fopen(path, "w+"); // Flawfinder: ignore 420 if ( map_fp != NULL ) { 421 fprintf(map_fp, "%s", config->container.uidMap); 422 if ( fclose(map_fp) < 0 ) { 423 fatalf("Failed to write to UID map: %s\n", strerror(errno)); 424 } 425 } else { 426 fatalf("Could not write parent info to uid_map: %s\n", strerror(errno)); 427 } 428 429 free(path); 430 } 431 432 static void setup_userns_identity(struct cConfig *config) { 433 uid_t uidMap = config->container.targetUID; 434 gid_t gidMap = config->container.targetGID[0]; 435 436 if ( setgroups(0, NULL) < 0 ) { 437 fatalf("Unabled to clear additional group IDs: %s\n", strerror(errno)); 438 } 439 if ( setresgid(gidMap, gidMap, gidMap) < 0 ) { 440 fatalf("Failed to change namespace group identity: %s\n", strerror(errno)); 441 } 442 if ( setresuid(uidMap, uidMap, uidMap) < 0 ) { 443 fatalf("Failed to change namespace user identity: %s\n", strerror(errno)); 444 } 445 } 446 447 static void user_namespace_init(struct cConfig *config, int *fork_flags) { 448 if ( (config->namespace.flags & CLONE_NEWUSER) == 0 && config->namespace.user[0] == 0 ) { 449 priv_escalate(); 450 } else { 451 if ( config->container.isSuid ) { 452 fatalf("Running setuid workflow with user namespace is not allowed\n"); 453 } 454 if ( config->namespace.user[0] != 0 ) { 455 if ( enter_namespace(config->namespace.user, CLONE_NEWUSER) < 0 ) { 456 fatalf("Failed to enter in user namespace: %s\n", strerror(errno)); 457 } 458 if ( !config->container.sharedMount ) { 459 setup_userns_identity(config); 460 } 461 } else if ( config->container.sharedMount ) { 462 verbosef("Create user namespace\n"); 463 464 if ( unshare(CLONE_NEWUSER) < 0 ) { 465 fatalf("Failed to create user namespace\n"); 466 } 467 468 setup_userns_mappings(config, getpid(), "deny"); 469 } else { 470 *fork_flags |= CLONE_NEWUSER; 471 priv_escalate(); 472 } 473 } 474 } 475 476 static char *shared_mount_namespace_init(struct cConfig *config) { 477 if ( config->namespace.mount[0] == 0 && config->container.sharedMount ) { 478 unsigned long propagation = config->container.mountPropagation; 479 480 if ( propagation == 0 ) { 481 propagation = MS_PRIVATE | MS_REC; 482 } 483 if ( unshare(CLONE_FS) < 0 ) { 484 fatalf("Failed to unshare root file system: %s\n", strerror(errno)); 485 } 486 if ( create_namespace(CLONE_NEWNS) < 0 ) { 487 fatalf("Failed to create mount namespace: %s\n", strerror(errno)); 488 } 489 if ( mount(NULL, "/", NULL, propagation, NULL) < 0 ) { 490 fatalf("Failed to set mount propagation: %s\n", strerror(errno)); 491 } 492 /* set shared mount propagation to share mount points between master and container process */ 493 if ( mount(NULL, "/", NULL, MS_SHARED|MS_REC, NULL) < 0 ) { 494 fatalf("Failed to propagate as SHARED: %s\n", strerror(errno)); 495 } 496 } 497 } 498 499 static void pid_namespace_init(struct cConfig *config, int *fork_flags) { 500 if ( config->namespace.pid[0] != 0 ) { 501 if ( enter_namespace(config->namespace.pid, CLONE_NEWPID) < 0 ) { 502 fatalf("Failed to enter in pid namespace: %s\n", strerror(errno)); 503 } 504 } else if ( config->namespace.flags & CLONE_NEWPID ) { 505 verbosef("Create pid namespace\n"); 506 *fork_flags |= CLONE_NEWPID; 507 } 508 } 509 510 static void network_namespace_init(struct cConfig *config) { 511 if ( config->namespace.network[0] != 0 ) { 512 if ( enter_namespace(config->namespace.network, CLONE_NEWNET) < 0 ) { 513 fatalf("Failed to enter in network namespace: %s\n", strerror(errno)); 514 } 515 } else if ( config->namespace.flags & CLONE_NEWNET ) { 516 if ( create_namespace(CLONE_NEWNET) < 0 ) { 517 fatalf("Failed to create network namespace: %s\n", strerror(errno)); 518 } 519 520 if ( config->container.bringLoopbackInterface ) { 521 struct ifreq req; 522 int sockfd = socket(AF_INET, SOCK_DGRAM, 0); 523 524 if ( sockfd < 0 ) { 525 fatalf("Unable to open AF_INET socket: %s\n", strerror(errno)); 526 } 527 528 memset(&req, 0, sizeof(req)); 529 strncpy(req.ifr_name, "lo", IFNAMSIZ); 530 531 req.ifr_flags |= IFF_UP; 532 533 debugf("Bringing up network loopback interface\n"); 534 if ( ioctl(sockfd, SIOCSIFFLAGS, &req) < 0 ) { 535 fatalf("Failed to set flags on interface: %s\n", strerror(errno)); 536 } 537 close(sockfd); 538 } 539 } 540 } 541 542 static void uts_namespace_init(struct cConfig *config) { 543 if ( config->namespace.uts[0] != 0 ) { 544 if ( enter_namespace(config->namespace.uts, CLONE_NEWUTS) < 0 ) { 545 fatalf("Failed to enter in uts namespace: %s\n", strerror(errno)); 546 } 547 } else if ( config->namespace.flags & CLONE_NEWUTS ) { 548 if ( create_namespace(CLONE_NEWUTS) < 0 ) { 549 fatalf("Failed to create uts namespace: %s\n", strerror(errno)); 550 } 551 } 552 } 553 554 static void ipc_namespace_init(struct cConfig *config) { 555 if ( config->namespace.ipc[0] != 0 ) { 556 if ( enter_namespace(config->namespace.ipc, CLONE_NEWIPC) < 0 ) { 557 fatalf("Failed to enter in ipc namespace: %s\n", strerror(errno)); 558 } 559 } else if ( config->namespace.flags & CLONE_NEWIPC ) { 560 if ( create_namespace(CLONE_NEWIPC) < 0 ) { 561 fatalf("Failed to create ipc namespace: %s\n", strerror(errno)); 562 } 563 } 564 } 565 566 static void cgroup_namespace_init(struct cConfig *config) { 567 if ( config->namespace.cgroup[0] != 0 ) { 568 if ( enter_namespace(config->namespace.cgroup, CLONE_NEWCGROUP) < 0 ) { 569 fatalf("Failed to enter in cgroup namespace: %s\n", strerror(errno)); 570 } 571 } else if ( config->namespace.flags & CLONE_NEWCGROUP ) { 572 if ( create_namespace(CLONE_NEWCGROUP) < 0 ) { 573 fatalf("Failed to create cgroup namespace: %s\n", strerror(errno)); 574 } 575 } 576 } 577 578 static void mount_namespace_init(struct cConfig *config) { 579 if ( config->namespace.mount[0] != 0 ) { 580 if ( enter_namespace(config->namespace.mount, CLONE_NEWNS) < 0 ) { 581 fatalf("Failed to enter in mount namespace: %s\n", strerror(errno)); 582 } 583 } else if ( config->namespace.flags & CLONE_NEWNS ) { 584 if ( !config->container.sharedMount ) { 585 unsigned long propagation = config->container.mountPropagation; 586 587 if ( unshare(CLONE_FS) < 0 ) { 588 fatalf("Failed to unshare root file system: %s\n", strerror(errno)); 589 } 590 if ( create_namespace(CLONE_NEWNS) < 0 ) { 591 fatalf("Failed to create mount namespace: %s\n", strerror(errno)); 592 } 593 if ( propagation && mount(NULL, "/", NULL, propagation, NULL) < 0 ) { 594 fatalf("Failed to set mount propagation: %s\n", strerror(errno)); 595 } 596 } else { 597 /* create a namespace for container process to separate master during pivot_root */ 598 if ( create_namespace(CLONE_NEWNS) < 0 ) { 599 fatalf("Failed to create mount namespace: %s\n", strerror(errno)); 600 } 601 602 /* set shared propagation to propagate few mount points to master */ 603 if ( mount(NULL, "/", NULL, MS_SHARED|MS_REC, NULL) < 0 ) { 604 fatalf("Failed to propagate as SHARED: %s\n", strerror(errno)); 605 } 606 } 607 } 608 } 609 610 static unsigned char is_suid(void) { 611 ElfW(auxv_t) *auxv; 612 unsigned char suid = 0; 613 char *buffer = (char *)malloc(4096); 614 int proc_auxv = open("/proc/self/auxv", O_RDONLY); 615 616 verbosef("Check if we are running as setuid\n"); 617 618 if ( proc_auxv < 0 ) { 619 fatalf("Can't open /proc/self/auxv: %s\n", strerror(errno)); 620 } 621 622 /* use auxiliary vectors to determine if running privileged */ 623 memset(buffer, 0, 4096); 624 if ( read(proc_auxv, buffer, 4088) < 0 ) { 625 fatalf("Can't read auxiliary vectors: %s\n", strerror(errno)); 626 } 627 628 auxv = (ElfW(auxv_t) *)buffer; 629 630 for (; auxv->a_type != AT_NULL; auxv++) { 631 if ( auxv->a_type == AT_SECURE ) { 632 suid = (int)auxv->a_un.a_val; 633 break; 634 } 635 } 636 637 free(buffer); 638 close(proc_auxv); 639 640 return suid; 641 } 642 643 static struct fdlist *list_fd(void) { 644 int i = 0; 645 int fd_proc; 646 DIR *dir; 647 struct dirent *dirent; 648 struct fdlist *fl = (struct fdlist *)malloc(sizeof(struct fdlist)); 649 650 if ( fl == NULL ) { 651 fatalf("Memory allocation failed: %s\n", strerror(errno)); 652 } 653 654 fl->fds = NULL; 655 fl->num = 0; 656 657 if ( ( fd_proc = open("/proc/self/fd", O_RDONLY) ) < 0 ) { 658 fatalf("Failed to open /proc/self/fd: %s\n", strerror(errno)); 659 } 660 661 if ( ( dir = fdopendir(fd_proc) ) == NULL ) { 662 fatalf("Failed to list /proc/self/fd directory: %s\n", strerror(errno)); 663 } 664 665 while ( ( dirent = readdir(dir ) ) ) { 666 if ( strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0 ) { 667 continue; 668 } 669 if ( atoi(dirent->d_name) == fd_proc ) { 670 continue; 671 } 672 fl->num++; 673 } 674 675 rewinddir(dir); 676 677 fl->fds = (int *)malloc(sizeof(int)*fl->num); 678 if ( fl->fds == NULL ) { 679 fatalf("Memory allocation failed: %s\n", strerror(errno)); 680 } 681 682 while ( ( dirent = readdir(dir ) ) ) { 683 int cv; 684 if ( strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0 ) { 685 continue; 686 } 687 688 cv = atoi(dirent->d_name); 689 if ( cv == fd_proc ) { 690 continue; 691 } 692 693 fl->fds[i++] = atoi(dirent->d_name); 694 } 695 696 closedir(dir); 697 close(fd_proc); 698 699 return fl; 700 } 701 702 static void cleanup_fd(struct fdlist *fd_before, struct fdlist *fd_after) { 703 int i, j; 704 char *source = (char *)malloc(PATH_MAX); 705 char *target = (char *)malloc(PATH_MAX); 706 707 if ( source == NULL || target == NULL ) { 708 fatalf("Memory allocation failed: %s", strerror(errno)); 709 } 710 711 /* 712 * close unattended file descriptors opened during stage 1 713 * execution, that may not be accurate depending of fs operations done 714 * in stage 1, but should work for most engines. 715 */ 716 for ( i = 0; i < fd_after->num; i++ ) { 717 struct stat st; 718 int found; 719 720 if ( fd_after->fds[i] == master_socket[0] || fd_after->fds[i] == master_socket[1] ) { 721 continue; 722 } 723 724 found = 0; 725 for ( j = 0; j < fd_before->num; j++ ) { 726 if ( fd_before->fds[j] == fd_after->fds[i] ) { 727 found = 1; 728 break; 729 } 730 } 731 if ( found == 1 ) { 732 continue; 733 } 734 735 memset(target, 0, PATH_MAX); 736 snprintf(source, PATH_MAX, "/proc/self/fd/%d", fd_after->fds[i]); 737 738 /* fd with link generating error are closed */ 739 if ( readlink(source, target, PATH_MAX) < 0 ) { 740 close(fd_after->fds[i]); 741 continue; 742 } 743 /* fd pointing to /dev/tty or anonymous inodes are closed */ 744 debugf("Check file descriptor %s pointing to %s\n", source, target); 745 if ( strcmp(target, "/dev/tty") == 0 || strncmp(target, "anon_", 5) == 0 ) { 746 debugf("Closing %s\n", source); 747 close(fd_after->fds[i]); 748 continue; 749 } 750 /* set force close on exec for remaining fd */ 751 if ( fcntl(fd_after->fds[i], F_SETFD, FD_CLOEXEC) < 0 ) { 752 debugf("Can't set FD_CLOEXEC on file descriptor %d: %s", fd_after->fds[i], strerror(errno)); 753 } 754 } 755 756 free(source); 757 free(target); 758 759 if ( fd_before->fds ) { 760 free(fd_before->fds); 761 } 762 if ( fd_after->fds ) { 763 free(fd_after->fds); 764 } 765 766 free(fd_before); 767 free(fd_after); 768 } 769 770 static void set_terminal_control(pid_t pid) { 771 pid_t tcpgrp = tcgetpgrp(STDIN_FILENO); 772 pid_t pgrp = getpgrp(); 773 774 if ( tcpgrp == pgrp ) { 775 debugf("Pass terminal control to child\n"); 776 777 if ( setpgid(pid, pid) < 0 ) { 778 fatalf("Failed to set child process group: %s\n", strerror(errno)); 779 } 780 if ( tcsetpgrp(STDIN_FILENO, pid) < 0 ) { 781 fatalf("Failed to set child as foreground process: %s\n", strerror(errno)); 782 } 783 } 784 } 785 786 static void event_stop(int fd) { 787 unsigned long long counter; 788 789 if ( read(fd, &counter, sizeof(counter)) != sizeof(counter) ) { 790 fatalf("Failed to receive sync signal: %s\n", strerror(errno)); 791 } 792 } 793 794 static void event_start(int fd) { 795 unsigned long long counter = 1; 796 797 if ( write(fd, &counter, sizeof(counter)) != sizeof(counter) ) { 798 fatalf("Failed to synchronize with master: %s\n", strerror(errno)); 799 } 800 } 801 802 static void fix_fsuid(uid_t uid) { 803 setfsuid(uid); 804 805 if ( setfsuid(uid) != uid ) { 806 fatalf("Failed to set filesystem uid to %d\n", uid); 807 } 808 } 809 810 static void fix_streams(void) { 811 struct stat st; 812 int i = 0; 813 int null = open("/dev/null", O_RDONLY); 814 815 if ( null <= 2 ) { 816 i = null; 817 } 818 819 for ( ; i <= 2; i++ ) { 820 if ( fstat(i, &st) < 0 && errno == EBADF ) { 821 if ( dup2(null, i) < 0 ) { 822 fatalf("Error while fixing IO streams: %s", strerror(errno)); 823 } 824 } 825 } 826 827 if ( null > 2 ) { 828 close(null); 829 } 830 } 831 832 static char *dupenv(const char *env) { 833 char *var = getenv(env); 834 835 if ( var != NULL ) { 836 return strdup(var); 837 } else { 838 fatalf("%s environment variable isn't set\n", env); 839 } 840 841 return NULL; 842 } 843 844 static void exit_with_status(const char *name, int status) { 845 if ( WIFEXITED(status) ) { 846 verbosef("%s exited with status %d\n", name, WEXITSTATUS(status)); 847 exit(WEXITSTATUS(status)); 848 } else if ( WIFSIGNALED(status) ) { 849 verbosef("%s interrupted by signal number %d\n", name, WTERMSIG(status)); 850 kill(getpid(), WTERMSIG(status)); 851 } 852 fatalf("%s exited with unknown status\n", name); 853 } 854 855 void do_exit(int sig) { 856 if ( sig == SIGUSR1 ) { 857 exit(0); 858 } 859 exit(1); 860 } 861 862 __attribute__((constructor)) static void init(void) { 863 uid_t uid = getuid(); 864 gid_t gid = getgid(); 865 sigset_t mask; 866 pid_t stage_pid; 867 char *loglevel; 868 char *pipe_fd_env; 869 int status; 870 int forkfd = -1; 871 int pipe_fd = -1; 872 int fork_flags = 0; 873 int join_chroot = 0; 874 int sync_pipe[2]; 875 struct pollfd fds[2]; 876 struct fdlist *fd_before; 877 struct fdlist *fd_after; 878 879 #ifndef SINGULARITY_NO_NEW_PRIVS 880 fatalf("Host kernel is outdated and does not support PR_SET_NO_NEW_PRIVS!\n"); 881 #endif 882 883 loglevel = dupenv("SINGULARITY_MESSAGELEVEL"); 884 885 pipe_fd_env = getenv("PIPE_EXEC_FD"); 886 if ( pipe_fd_env != NULL ) { 887 if ( sscanf(pipe_fd_env, "%d", &pipe_fd) != 1 ) { 888 fatalf("Failed to parse PIPE_EXEC_FD environment variable: %s\n", strerror(errno)); 889 } 890 debugf("PIPE_EXEC_FD value: %d\n", pipe_fd); 891 if ( pipe_fd < 0 || pipe_fd >= sysconf(_SC_OPEN_MAX) ) { 892 fatalf("Bad PIPE_EXEC_FD file descriptor value\n"); 893 } 894 } else { 895 fatalf("PIPE_EXEC_FD environment variable isn't set\n"); 896 } 897 898 verbosef("Container runtime\n"); 899 900 // initialize starter configuration and share it with child processes 901 config = (struct cConfig *)mmap(NULL, sizeof(struct cConfig), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); 902 if ( config == MAP_FAILED ) { 903 fatalf("Memory allocation failed: %s\n", strerror(errno)); 904 } 905 906 config->container.isSuid = is_suid(); 907 908 if ( config->container.isSuid || geteuid() == 0 ) { 909 /* force kernel to load overlay module to ease detection later */ 910 if ( mount("none", "/", "overlay", MS_SILENT, "") < 0 ) { 911 if ( errno != EINVAL ) { 912 debugf("Overlay seems not supported by kernel\n"); 913 } else { 914 debugf("Overlay seems supported by kernel\n"); 915 } 916 } 917 } 918 919 if ( config->container.isSuid ) { 920 debugf("Drop privileges\n"); 921 if ( setegid(gid) < 0 || seteuid(uid) < 0 ) { 922 fatalf("Failed to drop privileges: %s\n", strerror(errno)); 923 } 924 } 925 926 /* reset environment variables */ 927 clearenv(); 928 929 if ( loglevel != NULL ) { 930 setenv("SINGULARITY_MESSAGELEVEL", loglevel, 1); 931 free(loglevel); 932 } 933 934 /* read json configuration from stdin */ 935 debugf("Read json configuration from pipe\n"); 936 937 if ( ( config->json.size = read(pipe_fd, config->json.config, MAX_JSON_SIZE - 1) ) <= 0 ) { 938 fatalf("Read JSON configuration from pipe failed: %s\n", strerror(errno)); 939 } 940 close(pipe_fd); 941 942 fix_streams(); 943 944 fd_before = list_fd(); 945 946 /* block SIGCHLD signal handled later by stage 2/master */ 947 debugf("Set child signal mask\n"); 948 sigemptyset(&mask); 949 sigaddset(&mask, SIGCHLD); 950 if (sigprocmask(SIG_SETMASK, &mask, NULL) == -1) { 951 fatalf("Blocked signals error: %s\n", strerror(errno)); 952 } 953 954 /* 955 * use CLONE_FILES to share file descriptors opened during stage 1, 956 * this is a lazy implementation to avoid passing file descriptors 957 * between wrapper and stage 1 over unix socket. 958 * This is required so that all processes works with same files/directories 959 * to minimize race conditions 960 */ 961 stage_pid = fork_ns(CLONE_FILES|CLONE_FS); 962 if ( stage_pid == 0 ) { 963 /* 964 * stage1 is responsible for singularity configuration file parsing, handle user input, 965 * read capabilities, check what namespaces is required. 966 */ 967 if ( config->container.isSuid ) { 968 priv_escalate(); 969 execute = prepare_stage(STAGE1, config); 970 } else { 971 set_parent_death_signal(SIGKILL); 972 execute = STAGE1; 973 } 974 975 verbosef("Spawn stage 1\n"); 976 return; 977 } else if ( stage_pid < 0 ) { 978 fatalf("Failed to spawn stage 1\n"); 979 } 980 981 debugf("Wait completion of stage1\n"); 982 if ( wait(&status) != stage_pid ) { 983 fatalf("Can't wait child\n"); 984 } 985 986 if ( WIFEXITED(status) && WEXITSTATUS(status) != 0 ) { 987 verbosef("stage 1 exited with status %d\n", WEXITSTATUS(status)); 988 exit(WEXITSTATUS(status)); 989 } else if ( WIFSIGNALED(status) ) { 990 verbosef("stage 1 interrupted by signal number %d\n", WTERMSIG(status)); 991 kill(getpid(), WTERMSIG(status)); 992 } 993 994 debugf("Create socketpair for master communication channel\n"); 995 if ( socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, master_socket) < 0 ) { 996 fatalf("Failed to create communication socket: %s\n", strerror(errno)); 997 } 998 999 if ( config->container.isInstance ) { 1000 verbosef("Run as instance\n"); 1001 int forked = fork(); 1002 if ( forked == 0 ) { 1003 if ( setsid() < 0 ) { 1004 fatalf("Can't set session leader: %s\n", strerror(errno)); 1005 } 1006 umask(0); 1007 } else { 1008 sigset_t usrmask; 1009 static struct sigaction action; 1010 1011 action.sa_sigaction = (void *)&do_exit; 1012 action.sa_flags = SA_SIGINFO|SA_RESTART; 1013 1014 close(master_socket[0]); 1015 close(master_socket[1]); 1016 1017 sigemptyset(&usrmask); 1018 sigaddset(&usrmask, SIGUSR1); 1019 sigaddset(&usrmask, SIGUSR2); 1020 1021 if (sigprocmask(SIG_SETMASK, &usrmask, NULL) == -1) { 1022 fatalf("Blocked signals error: %s\n", strerror(errno)); 1023 } 1024 if (sigaction(SIGUSR2, &action, NULL) < 0) { 1025 fatalf("Failed to install signal handler for SIGUSR2\n"); 1026 } 1027 if (sigaction(SIGUSR1, &action, NULL) < 0) { 1028 fatalf("Failed to install signal handler for SIGUSR1\n"); 1029 } 1030 if (sigprocmask(SIG_UNBLOCK, &usrmask, NULL) == -1) { 1031 fatalf("Unblock signals error: %s\n", strerror(errno)); 1032 } 1033 while ( waitpid(forked, &status, 0) <= 0 ) { 1034 continue; 1035 } 1036 exit_with_status("instance", status); 1037 } 1038 } 1039 1040 /* relinquish CPU to apply current directory change for current thread */ 1041 sched_yield(); 1042 1043 fd_after = list_fd(); 1044 1045 cleanup_fd(fd_before, fd_after); 1046 1047 user_namespace_init(config, &fork_flags); 1048 1049 shared_mount_namespace_init(config); 1050 1051 if ( fork_flags == CLONE_NEWUSER ) { 1052 forkfd = eventfd(0, 0); 1053 if ( forkfd < 0 ) { 1054 fatalf("Failed to create fork sync pipe between master and child: %s\n", strerror(errno)); 1055 } 1056 } 1057 1058 if ( !config->container.joinMount ) { 1059 debugf("Create RPC socketpair for communication between stage 2 and RPC server\n"); 1060 if ( socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, rpc_socket) < 0 ) { 1061 fatalf("Failed to create communication socket: %s\n", strerror(errno)); 1062 } 1063 } 1064 1065 /* Use setfsuid to address issue about root_squash filesystems option */ 1066 if ( config->container.isSuid ) { 1067 fix_fsuid(uid); 1068 } 1069 1070 /* sync master and near child with an eventfd */ 1071 if ( pipe(sync_pipe) < 0 ) { 1072 fatalf("Failed to create sync pipe: %s\n", strerror(errno)); 1073 } 1074 1075 pid_namespace_init(config, &fork_flags); 1076 1077 stage_pid = fork_ns(fork_flags); 1078 1079 if ( stage_pid == 0 ) { 1080 /* at this stage we are PID 1 if PID namespace requested */ 1081 set_parent_death_signal(SIGKILL); 1082 1083 if ( forkfd >= 0 ) { 1084 // wait parent write user namespace mappings 1085 event_stop(forkfd); 1086 close(forkfd); 1087 1088 setup_userns_identity(config); 1089 } 1090 1091 close(master_socket[0]); 1092 1093 network_namespace_init(config); 1094 1095 uts_namespace_init(config); 1096 1097 ipc_namespace_init(config); 1098 1099 cgroup_namespace_init(config); 1100 1101 mount_namespace_init(config); 1102 1103 close(sync_pipe[0]); 1104 sync_pipe[0] = 0; 1105 if ( write(sync_pipe[1], &sync_pipe[0], sizeof(int)) < 0 ) { 1106 fatalf("Failed to send sync event: %s\n", strerror(errno)); 1107 } 1108 close(sync_pipe[1]); 1109 1110 if ( !config->container.joinMount ) { 1111 close(rpc_socket[0]); 1112 1113 /* 1114 * fork is a convenient way to apply capabilities and privileges drop 1115 * from single thread context before entering in stage 2 1116 */ 1117 int process = fork_ns(CLONE_FS); 1118 1119 if ( process == 0 ) { 1120 verbosef("Spawn RPC server\n"); 1121 execute = RPC_SERVER; 1122 } else if ( process > 0 ) { 1123 int status; 1124 1125 execute = prepare_stage(STAGE2, config); 1126 1127 if ( wait(&status) != process ) { 1128 fatalf("Error while waiting RPC server: %s\n", strerror(errno)); 1129 } 1130 if ( rpc_socket[1] != -1 ) { 1131 close(rpc_socket[1]); 1132 } 1133 } else { 1134 fatalf("Fork failed: %s\n", strerror(errno)); 1135 } 1136 } else { 1137 verbosef("Spawn stage 2\n"); 1138 verbosef("Don't execute RPC server, joining instance\n"); 1139 execute = prepare_stage(STAGE2, config); 1140 } 1141 return; 1142 } else if ( stage_pid > 0 ) { 1143 if ( config->namespace.pid[0] != 0 && config->namespace.flags & CLONE_NEWNS ) { 1144 if ( enter_namespace("/proc/self/ns/pid", CLONE_NEWPID) < 0 ) { 1145 fatalf("Failed to enter in pid namespace: %s\n", strerror(errno)); 1146 } 1147 } 1148 1149 if ( forkfd >= 0 ) { 1150 setup_userns_mappings(config, stage_pid, "allow"); 1151 1152 event_start(forkfd); 1153 close(forkfd); 1154 } 1155 1156 set_terminal_control(stage_pid); 1157 1158 config->container.pid = stage_pid; 1159 1160 verbosef("Spawn master process\n"); 1161 1162 close(master_socket[1]); 1163 1164 // wait child finish namespaces initialization 1165 close(sync_pipe[1]); 1166 sync_pipe[1] = -1; 1167 if ( read(sync_pipe[0], &sync_pipe[1], sizeof(int)) < 0 ) { 1168 fatalf("Failed to receive sync event: %s\n", strerror(errno)); 1169 } 1170 close(sync_pipe[0]); 1171 1172 // value not set, child has exited before sending data 1173 if ( sync_pipe[1] == -1 ) { 1174 waitpid(stage_pid, &status, 0); 1175 exit_with_status("stage 2", status); 1176 } 1177 1178 if ( config->container.joinMount ) { 1179 if ( config->container.isSuid && setresuid(uid, uid, uid) < 0 ) { 1180 fatalf("Failed to drop privileges permanently\n"); 1181 } 1182 debugf("Wait stage 2 child process\n"); 1183 waitpid(stage_pid, &status, 0); 1184 1185 pid_t pgrp = getpgrp(); 1186 pid_t tcpgrp = tcgetpgrp(STDIN_FILENO); 1187 1188 if ( tcpgrp > 0 && pgrp != tcpgrp ) { 1189 if ( signal(SIGTTOU, SIG_IGN) == SIG_ERR ) { 1190 fatalf("failed to ignore SIGTTOU signal: %s\n", strerror(errno)); 1191 } 1192 if ( tcsetpgrp(STDIN_FILENO, pgrp) < 0 ) { 1193 fatalf("Failed to set parent as foreground process: %s\n", strerror(errno)); 1194 } 1195 } 1196 exit_with_status("stage 2", status); 1197 } else { 1198 close(rpc_socket[1]); 1199 1200 if ( config->container.isSuid && setresuid(uid, uid, 0) < 0 ) { 1201 fatalf("Failed to drop privileges\n"); 1202 } 1203 execute = MASTER; 1204 return; 1205 } 1206 } 1207 fatalf("Failed to create container namespaces\n"); 1208 }