/* pkg/rootless/rootless_linux.c

   Helpers for running inside a user namespace.  The file provides a
   constructor that records which file descriptors were open at startup
   and, when possible, joins the namespaces of an already running pause
   process, plus the routines used to re-exec the current command inside
   a new or an existing user+mount namespace.  */

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/stat.h>
#include <limits.h>
#include <sys/types.h>
#include <signal.h>
#include <fcntl.h>
#include <sys/wait.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <dirent.h>
#include <sys/select.h>
#include <stdio.h>

/* Re-evaluate EXPRESSION for as long as it fails with EINTR.  */
#ifndef TEMP_FAILURE_RETRY
#define TEMP_FAILURE_RETRY(expression) \
  (__extension__                                                              \
    ({ long int __result;                                                     \
       do __result = (long int) (expression);                                 \
       while (__result == -1L && errno == EINTR);                             \
       __result; }))
#endif

/* Scope-exit helpers: a variable declared with one of these attributes is
   released automatically when it goes out of scope.  They do not run on
   _exit() or after a successful exec.  */
#define cleanup_free __attribute__ ((cleanup (cleanup_freep)))
#define cleanup_close __attribute__ ((cleanup (cleanup_closep)))
#define cleanup_dir __attribute__ ((cleanup (cleanup_dirp)))

/* cleanup handler: free the pointer stored at *P.  */
static inline void
cleanup_freep (void *p)
{
  void **pp = (void **) p;
  free (*pp);
}

/* cleanup handler: close the descriptor stored at *P if it is valid.

   close() is deliberately NOT retried on EINTR: on Linux the descriptor is
   released even when the call is interrupted, so a retry could close an
   unrelated descriptor that another thread opened in the meantime.  */
static inline void
cleanup_closep (void *p)
{
  int *pp = p;
  if (*pp >= 0)
    close (*pp);
}

/* cleanup handler: close the directory stream stored at *P if it is set.  */
static inline void
cleanup_dirp (DIR **p)
{
  DIR *dir = *p;
  if (dir)
    closedir (dir);
}

/* Rename OLDPATH (relative to OLDDIRFD) to NEWPATH (relative to NEWDIRFD)
   without replacing an existing NEWPATH.

   renameat2(RENAME_NOREPLACE) is used when the syscall number is known at
   build time.  If it reports EINVAL or ENOSYS, or if it is not available at
   build time, NEWPATH is first created exclusively and then overwritten by
   a regular rename.

   Returns 0 on success, a negative value with errno set on failure.  */
int
rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
{
  int ret;

# ifdef SYS_renameat2
#  ifndef RENAME_NOREPLACE
#   define RENAME_NOREPLACE (1 << 0)
#  endif

  ret = (int) syscall (SYS_renameat2, olddirfd, oldpath, newdirfd, newpath, RENAME_NOREPLACE);
  if (ret == 0 || (errno != EINVAL && errno != ENOSYS))
    return ret;

  /* Fallback in case of errno==EINVAL or errno==ENOSYS.  */
# endif

  /* This might be an issue if another process is trying to read the file
     while it is empty.  The directory descriptors are honored so that the
     fallback resolves the same paths as the renameat2 call above.  */
  ret = openat (newdirfd, newpath, O_EXCL|O_CREAT, 0700);
  if (ret < 0)
    return ret;
  close (ret);

  /* We are sure we created the file, let's overwrite it.  */
  return renameat (olddirfd, oldpath, newdirfd, newpath);
}

/* Files inspected by check_proc_sys_userns_file() after a failed clone.  */
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";

/* Highest descriptor number seen open by the constructor.  */
static int open_files_max_fd;
/* Array of fd_set; descriptor FD lives in element FD / FD_SETSIZE.  */
static fd_set *open_files_set;
/* Effective uid/gid before joining the user namespace in the constructor.  */
static uid_t rootless_uid_init;
static gid_t rootless_gid_init;
/* Socket-activation environment saved by the constructor.  */
static bool do_socket_activation = false;
static char *saved_systemd_listen_fds;
static char *saved_systemd_listen_pid;
static char *saved_systemd_listen_fdnames;

/* Invoke the raw setresuid syscall.  */
static int
syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
{
  return (int) syscall (__NR_setresuid, ruid, euid, suid);
}

/* Invoke the raw setresgid syscall.  */
static int
syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
{
  return (int) syscall (__NR_setresgid, rgid, egid, sgid);
}

/* Return the uid recorded by the constructor when it joined the pause
   process namespaces, or 0 if it did not.  */
uid_t
rootless_uid ()
{
  return rootless_uid_init;
}

/* Return the gid recorded by the constructor when it joined the pause
   process namespaces, or 0 if it did not.  The return type is kept as
   uid_t so that existing callers are not affected.  */
uid_t
rootless_gid ()
{
  return rootless_gid_init;
}

/* Turn the current process into a pause process: ignore a set of signals,
   try to exec catatonit and, if that is not possible, sleep forever.
   This function does not return.  */
static void
do_pause ()
{
  int i;
  struct sigaction act;
  int const sig[] =
    {
     SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGPOLL,
     SIGPROF, SIGVTALRM, SIGXCPU, SIGXFSZ, 0
    };

  /* sa_mask and sa_flags must not be left indeterminate.  */
  memset (&act, 0, sizeof (act));
  sigemptyset (&act.sa_mask);
  act.sa_handler = SIG_IGN;

  for (i = 0; sig[i]; i++)
    sigaction (sig[i], &act, NULL);

  /* Attempt to execv catatonit to keep the pause process alive.  */
  execl ("/usr/libexec/podman/catatonit", "catatonit", "-P", NULL);
  execl ("/usr/bin/catatonit", "catatonit", "-P", NULL);
  /* and if the catatonit executable could not be found, fallback here... */

  prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
  while (1)
    pause ();
}

/* Read /proc/self/cmdline and split it into a NULL-terminated argv.

   All the strings live in a single heap buffer whose address is argv[0];
   the caller must free both argv[0] and the returned array.  Returns NULL
   on any error or if the command line is empty.  */
static char **
get_cmd_line_args ()
{
  cleanup_free char *buffer = NULL;
  cleanup_close int fd = -1;
  size_t allocated;
  size_t used = 0;
  ssize_t ret;
  size_t i;
  int argc = 0;
  char **argv;

  fd = open ("/proc/self/cmdline", O_RDONLY);
  if (fd < 0)
    return NULL;

  allocated = 512;
  buffer = malloc (allocated);
  if (buffer == NULL)
    return NULL;
  for (;;)
    {
      ret = TEMP_FAILURE_RETRY (read (fd, buffer + used, allocated - used));
      if (ret < 0)
        return NULL;

      if (ret == 0)
        break;

      used += ret;
      if (allocated == used)
        {
          allocated += 512;
          char *tmp = realloc (buffer, allocated);
          if (tmp == NULL)
            return NULL;
          buffer = tmp;
        }
    }

  if (used == 0)
    return NULL;

  /* The splitting below relies on the data ending with a NUL byte.  The
     read loop grows the buffer whenever it becomes full, so there is always
     at least one spare byte here to add a missing terminator.  */
  if (buffer[used - 1] != '\0')
    buffer[used++] = '\0';

  /* One argument per NUL terminator.  */
  for (i = 0; i < used; i++)
    if (buffer[i] == '\0')
      argc++;

  argv = malloc (sizeof (*argv) * (argc + 1));
  if (argv == NULL)
    return NULL;

  argc = 0;

  argv[argc++] = buffer;
  /* Every NUL except the final one is followed by the next argument.  */
  for (i = 0; i < used - 1; i++)
    if (buffer[i] == '\0')
      argv[argc++] = buffer + i + 1;

  argv[argc] = NULL;

  /* Move ownership.  */
  buffer = NULL;

  return argv;
}

/* Decide whether the constructor may join the pause process namespaces
   directly instead of going through a re-exec.

   Returns false when built with DISABLE_JOIN_SHORTCUT, when the command
   line cannot be read, when argv[0] does not contain "podman", or when the
   command line contains one of the subcommands listed below.  */
static bool
can_use_shortcut ()
{
  cleanup_free char **argv = NULL;
  cleanup_free char *argv0 = NULL;
  bool ret = true;
  int argc;

#ifdef DISABLE_JOIN_SHORTCUT
  return false;
#endif

  argv = get_cmd_line_args ();
  if (argv == NULL)
    return false;

  /* argv[0] is the start of the buffer holding all the strings; track it
     so it is released together with the array.  */
  argv0 = argv[0];

  if (strstr (argv[0], "podman") == NULL)
    return false;

  for (argc = 0; argv[argc]; argc++)
    {
      /* Skip the program name and anything that looks like an option.  */
      if (argc == 0 || argv[argc][0] == '-')
        continue;

      if (strcmp (argv[argc], "mount") == 0
          || strcmp (argv[argc], "machine") == 0
          || strcmp (argv[argc], "search") == 0
          || (strcmp (argv[argc], "system") == 0 && argv[argc+1] && strcmp (argv[argc+1], "service") != 0))
        {
          ret = false;
          break;
        }

      if (argv[argc+1] != NULL && (strcmp (argv[argc], "container") == 0 ||
                                   strcmp (argv[argc], "image") == 0) &&
          (strcmp (argv[argc+1], "mount") == 0 || strcmp (argv[argc+1], "scp") == 0))
        {
          ret = false;
          break;
        }
    }

  return ret;
}

/* Open /proc/PID_TO_JOIN/ns/NS_FILE read-only with O_CLOEXEC.
   Returns the descriptor, or a negative value on failure.  */
static int
open_namespace (int pid_to_join, const char *ns_file)
{
  char ns_path[PATH_MAX];
  int ret;

  ret = snprintf (ns_path, PATH_MAX, "/proc/%d/ns/%s", pid_to_join, ns_file);
  /* snprintf returns the length it wanted to write: anything that does not
     fit in the buffer means the path was truncated.  */
  if (ret < 0 || ret >= PATH_MAX)
    {
      fprintf (stderr, "internal error: namespace path too long\n");
      return -1;
    }

  return open (ns_path, O_CLOEXEC | O_RDONLY);
}

/* Return 1 if FD was recorded as open by the constructor, 0 otherwise
   (including when FD is negative or above the highest recorded one).  */
int
is_fd_inherited(int fd)
{
  if (open_files_set == NULL || fd > open_files_max_fd || fd < 0)
    return 0;

  return FD_ISSET(fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE])) ? 1 : 0;
}

/* Constructor, runs before main.

   1. If _PODMAN_PAUSE is set to a non-empty value, become a pause process.
   2. Record every descriptor listed in /proc/self/fd.
   3. Save LISTEN_PID/LISTEN_FDS/LISTEN_FDNAMES when LISTEN_PID names this
      process.
   4. When not running with euid 0, XDG_RUNTIME_DIR is set and
      can_use_shortcut() allows it, read the pid stored in
      $XDG_RUNTIME_DIR/libpod/tmp/pause.pid and join that process' user and
      mount namespaces.  Failures before the user namespace is joined make
      the constructor return silently; failures afterwards are fatal.  */
static void __attribute__((constructor)) init()
{
  const char *xdg_runtime_dir;
  const char *pause_env;
  const char *listen_pid;
  const char *listen_fds;
  const char *listen_fdnames;
  cleanup_dir DIR *d = NULL;

  pause_env = getenv ("_PODMAN_PAUSE");
  if (pause_env && pause_env[0])
    {
      do_pause ();
      _exit (EXIT_FAILURE);
    }

  /* Store how many FDs were open before the Go runtime kicked in.  */
  d = opendir ("/proc/self/fd");
  if (d)
    {
      struct dirent *ent;
      size_t size = 0;

      for (ent = readdir (d); ent; ent = readdir (d))
        {
          int fd;

          if (ent->d_name[0] == '.')
            continue;

          fd = atoi (ent->d_name);
          /* Skip the descriptor used to read the directory itself, and
             anything that did not parse as a valid descriptor number.  */
          if (fd < 0 || fd == dirfd (d))
            continue;

          if ((size_t) fd >= size * FD_SETSIZE)
            {
              size_t i;
              size_t new_size;

              new_size = ((size_t) fd / FD_SETSIZE) + 1;
              /* The process exits on failure, so overwriting the pointer
                 cannot leak the previous allocation.  */
              open_files_set = realloc (open_files_set, new_size * sizeof (fd_set));
              if (open_files_set == NULL)
                _exit (EXIT_FAILURE);

              for (i = size; i < new_size; i++)
                FD_ZERO (&(open_files_set[i]));

              size = new_size;
            }

          if (fd > open_files_max_fd)
            open_files_max_fd = fd;

          FD_SET (fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE]));
        }
    }

  listen_pid = getenv("LISTEN_PID");
  listen_fds = getenv("LISTEN_FDS");
  listen_fdnames = getenv("LISTEN_FDNAMES");

  if (listen_pid != NULL && listen_fds != NULL && strtol(listen_pid, NULL, 10) == getpid())
    {
      // save systemd socket environment for rootless child
      do_socket_activation = true;
      saved_systemd_listen_pid = strdup(listen_pid);
      saved_systemd_listen_fds = strdup(listen_fds);
      if (listen_fdnames != NULL)
        saved_systemd_listen_fdnames = strdup(listen_fdnames);
      if (saved_systemd_listen_pid == NULL
          || saved_systemd_listen_fds == NULL)
        {
          fprintf (stderr, "save socket listen environments error: %m\n");
          _exit (EXIT_FAILURE);
        }
    }

  /* Shortcut.  If we are able to join the pause pid file, do it now so we don't
     need to re-exec.  */
  xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
  if (geteuid () != 0 && xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut ())
    {
      cleanup_free char *cwd = NULL;
      cleanup_close int userns_fd = -1;
      cleanup_close int mntns_fd = -1;
      cleanup_close int fd = -1;
      long pid;
      char buf[12];
      uid_t uid;
      gid_t gid;
      char path[PATH_MAX];
      const char *const suffix = "/libpod/tmp/pause.pid";
      char uid_fmt[16];
      char gid_fmt[16];
      size_t len;
      int r;

      cwd = getcwd (NULL, 0);
      if (cwd == NULL)
        {
          fprintf (stderr, "error getting current working directory: %m\n");
          _exit (EXIT_FAILURE);
        }

      len = snprintf (path, PATH_MAX, "%s%s", xdg_runtime_dir, suffix);
      if (len >= PATH_MAX)
        {
          errno = ENAMETOOLONG;
          fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %m\n");
          exit (EXIT_FAILURE);
        }

      fd = open (path, O_RDONLY);
      if (fd < 0)
        return;

      r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf) - 1));

      if (r < 0)
        return;
      buf[r] = '\0';

      pid = strtol (buf, NULL, 10);
      /* The value is narrowed to int by open_namespace: reject anything
         that is not a positive pid representable as an int.  */
      if (pid <= 0 || pid > INT_MAX)
        return;

      uid = geteuid ();
      gid = getegid ();

      userns_fd = open_namespace ((int) pid, "user");
      if (userns_fd < 0)
        return;

      mntns_fd = open_namespace ((int) pid, "mnt");
      if (mntns_fd < 0)
        return;

      if (setns (userns_fd, 0) < 0)
        return;

      /* The user namespace was joined, after this point errors are
         not recoverable anymore.  */

      if (setns (mntns_fd, 0) < 0)
        {
          fprintf (stderr, "cannot join mount namespace for %ld: %m\n", pid);
          exit (EXIT_FAILURE);
        }

      snprintf (uid_fmt, sizeof (uid_fmt), "%d", uid);
      snprintf (gid_fmt, sizeof (gid_fmt), "%d", gid);

      setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
      setenv ("_CONTAINERS_ROOTLESS_UID", uid_fmt, 1);
      setenv ("_CONTAINERS_ROOTLESS_GID", gid_fmt, 1);

      if (syscall_setresgid (0, 0, 0) < 0)
        {
          fprintf (stderr, "cannot setresgid: %m\n");
          _exit (EXIT_FAILURE);
        }

      if (syscall_setresuid (0, 0, 0) < 0)
        {
          fprintf (stderr, "cannot setresuid: %m\n");
          _exit (EXIT_FAILURE);
        }

      /* Go back to the directory we were in before joining the mount
         namespace.  */
      if (chdir (cwd) < 0)
        {
          fprintf (stderr, "cannot chdir to %s: %m\n", cwd);
          _exit (EXIT_FAILURE);
        }

      rootless_uid_init = uid;
      rootless_gid_init = gid;
    }
}

/* Invoke the raw clone syscall.  The order of the first two arguments is
   swapped on s390 and CRIS.  */
static int
syscall_clone (unsigned long flags, void *child_stack)
{
#if defined(__s390__) || defined(__CRIS__)
  return (int) syscall (__NR_clone, child_stack, flags);
#else
  return (int) syscall (__NR_clone, flags, child_stack);
#endif
}

/* Wait for PID to terminate.

   Returns its exit status if it exited normally, 128 + the signal number if
   it was killed by a signal, and -1 if waitpid failed or the status is of
   another kind.  OPTIONS is accepted for interface compatibility but is
   currently ignored: waitpid is always called with no options.  */
int
reexec_in_user_namespace_wait (int pid, int options)
{
  pid_t p;
  int status;

  (void) options;

  p = TEMP_FAILURE_RETRY (waitpid (pid, &status, 0));
  if (p < 0)
    return -1;

  if (WIFEXITED (status))
    return WEXITSTATUS (status);
  if (WIFSIGNALED (status))
    return 128 + WTERMSIG (status);
  return -1;
}

/* Start a detached pause process and store its pid in PAUSE_PID_FILE_PATH.

   The caller forks an intermediate child that calls setsid() and forks
   again.  The grandchild becomes the pause process (by re-executing ARGV[0]
   with _PODMAN_PAUSE=1, or by pausing in place if the exec fails).  The
   intermediate child writes the grandchild pid to a temporary file, moves
   it to PAUSE_PID_FILE_PATH without replacing an existing file, and reports
   success to the caller through a pipe.

   Returns 0 on success and -1 on failure.  */
static int
create_pause_process (const char *pause_pid_file_path, char **argv)
{
  pid_t pid;
  int p[2];

  if (pipe (p) < 0)
    return -1;

  pid = fork ();
  if (pid < 0)
    {
      close (p[0]);
      close (p[1]);
      return -1;
    }

  if (pid)
    {
      char b;
      int r;

      close (p[1]);
      /* Block until we write the pid file.  */
      r = TEMP_FAILURE_RETRY (read (p[0], &b, 1));
      close (p[0]);

      /* Reap the intermediate child.  */
      reexec_in_user_namespace_wait (pid, 0);

      return r == 1 && b == '0' ? 0 : -1;
    }
  else
    {
      int r, fd;

      close (p[0]);

      setsid ();
      pid = fork ();
      if (pid < 0)
        _exit (EXIT_FAILURE);

      if (pid)
        {
          char pid_str[12];
          char *tmp_file_path = NULL;

          snprintf (pid_str, sizeof (pid_str), "%d", pid);

          if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
            {
              fprintf (stderr, "unable to print to string\n");
              kill (pid, SIGKILL);
              _exit (EXIT_FAILURE);
            }

          if (tmp_file_path == NULL)
            {
              fprintf (stderr, "temporary file path is NULL\n");
              kill (pid, SIGKILL);
              _exit (EXIT_FAILURE);
            }

          fd = mkstemp (tmp_file_path);
          if (fd < 0)
            {
              fprintf (stderr, "error creating temporary file: %m\n");
              kill (pid, SIGKILL);
              _exit (EXIT_FAILURE);
            }

          r = TEMP_FAILURE_RETRY (write (fd, pid_str, strlen (pid_str)));
          if (r < 0)
            {
              fprintf (stderr, "cannot write to file descriptor: %m\n");
              /* Do not leave the temporary file behind.  */
              close (fd);
              unlink (tmp_file_path);
              kill (pid, SIGKILL);
              _exit (EXIT_FAILURE);
            }
          close (fd);

          /* There can be another process at this point trying to configure the user namespace and the pause
           process, do not override the pid file if it already exists. */
          if (rename_noreplace (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path) < 0)
            {
              unlink (tmp_file_path);
              kill (pid, SIGKILL);
              _exit (EXIT_FAILURE);
            }

          /* Tell the caller that the pid file is in place.  */
          r = TEMP_FAILURE_RETRY (write (p[1], "0", 1));
          if (r < 0)
            {
              fprintf (stderr, "cannot write to pipe: %m\n");
              _exit (EXIT_FAILURE);
            }
          close (p[1]);

          _exit (EXIT_SUCCESS);
        }
      else
        {
          int null;

          close (p[1]);

          /* Detach the standard streams from the caller.  */
          null = open ("/dev/null", O_RDWR);
          if (null >= 0)
            {
              dup2 (null, 0);
              dup2 (null, 1);
              dup2 (null, 2);
              close (null);
            }

          for (fd = 3; fd < open_files_max_fd + 16; fd++)
            close (fd);

          setenv ("_PODMAN_PAUSE", "1", 1);
          execlp (argv[0], argv[0], NULL);

          /* If the execve fails, then do the pause here.  */
          do_pause ();
          _exit (EXIT_FAILURE);
        }
    }
}

/* Join the namespace referred to by NS_FD; print an error mentioning NAME
   and exit the process on failure.  */
static void
join_namespace_or_die (const char *name, int ns_fd)
{
  if (setns (ns_fd, 0) < 0)
    {
      fprintf (stderr, "cannot set %s namespace\n", name);
      _exit (EXIT_FAILURE);
    }
}

/* Fork a child that joins the user and mount namespaces of PID_TO_JOIN and
   re-executes the current command line there.

   In the parent: the descriptors recorded by the constructor (from 3 up)
   are closed, the socket-activation variables are unset if they had been
   saved, and the value returned by fork is returned (so a negative value
   means the fork failed).  A negative value is also returned if the
   namespace files cannot be opened.

   In the child: most signals are blocked while setting up, the environment
   is prepared, the namespaces are joined, uid/gid are set to 0, the working
   directory is restored, a pause process is created when
   PAUSE_PID_FILE_PATH is non-empty, and finally the command is re-executed.
   The child never returns; it exits with EXIT_FAILURE on any error.  */
int
reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
{
  cleanup_close int userns_fd = -1;
  cleanup_close int mntns_fd = -1;
  cleanup_free char *cwd = NULL;
  char uid[16];
  char gid[16];
  cleanup_free char *argv0 = NULL;
  cleanup_free char **argv = NULL;
  int pid;
  sigset_t sigset, oldsigset;

  cwd = getcwd (NULL, 0);
  if (cwd == NULL)
    {
      fprintf (stderr, "error getting current working directory: %m\n");
      _exit (EXIT_FAILURE);
    }

  snprintf (uid, sizeof (uid), "%d", geteuid ());
  snprintf (gid, sizeof (gid), "%d", getegid ());

  argv = get_cmd_line_args ();
  if (argv == NULL)
    {
      fprintf (stderr, "cannot read argv: %m\n");
      _exit (EXIT_FAILURE);
    }

  /* Track the string buffer so it is released together with the array.  */
  argv0 = argv[0];

  userns_fd = open_namespace (pid_to_join, "user");
  if (userns_fd < 0)
    return userns_fd;
  mntns_fd = open_namespace (pid_to_join, "mnt");
  if (mntns_fd < 0)
    return mntns_fd;

  pid = fork ();
  if (pid < 0)
    fprintf (stderr, "cannot fork: %m\n");

  if (pid)
    {
      int f;

      for (f = 3; f <= open_files_max_fd; f++)
        if (is_fd_inherited (f))
          close (f);
      if (do_socket_activation)
        {
          unsetenv ("LISTEN_PID");
          unsetenv ("LISTEN_FDS");
          unsetenv ("LISTEN_FDNAMES");
        }

      return pid;
    }

  /* Child: block everything except SIGCHLD and SIGTERM during the setup.  */
  if (sigfillset (&sigset) < 0)
    {
      fprintf (stderr, "cannot fill sigset: %m\n");
      _exit (EXIT_FAILURE);
    }
  if (sigdelset (&sigset, SIGCHLD) < 0)
    {
      fprintf (stderr, "cannot sigdelset(SIGCHLD): %m\n");
      _exit (EXIT_FAILURE);
    }
  if (sigdelset (&sigset, SIGTERM) < 0)
    {
      fprintf (stderr, "cannot sigdelset(SIGTERM): %m\n");
      _exit (EXIT_FAILURE);
    }
  if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
    {
      fprintf (stderr, "cannot block signals: %m\n");
      _exit (EXIT_FAILURE);
    }

  if (do_socket_activation)
    {
      char s[32];
      /* LISTEN_PID must name the process that will use the sockets.  */
      snprintf (s, sizeof (s), "%d", getpid());
      setenv ("LISTEN_PID", s, true);
      setenv ("LISTEN_FDS", saved_systemd_listen_fds, true);
      // Setting fdnames is optional for systemd_socket_activation
      if (saved_systemd_listen_fdnames != NULL)
        setenv ("LISTEN_FDNAMES", saved_systemd_listen_fdnames, true);
    }

  setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
  setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
  setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);

  if (prctl (PR_SET_PDEATHSIG, SIGTERM, 0, 0, 0) < 0)
    {
      fprintf (stderr, "cannot prctl(PR_SET_PDEATHSIG): %m\n");
      _exit (EXIT_FAILURE);
    }

  join_namespace_or_die ("user", userns_fd);
  join_namespace_or_die ("mnt", mntns_fd);

  if (syscall_setresgid (0, 0, 0) < 0)
    {
      fprintf (stderr, "cannot setresgid: %m\n");
      _exit (EXIT_FAILURE);
    }

  if (syscall_setresuid (0, 0, 0) < 0)
    {
      fprintf (stderr, "cannot setresuid: %m\n");
      _exit (EXIT_FAILURE);
    }

  if (chdir (cwd) < 0)
    {
      fprintf (stderr, "cannot chdir to %s: %m\n", cwd);
      _exit (EXIT_FAILURE);
    }

  if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
    {
      /* We ignore errors here as we didn't create the namespace anyway.  */
      create_pause_process (pause_pid_file_path, argv);
    }
  if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
    {
      fprintf (stderr, "cannot restore signal mask: %m\n");
      _exit (EXIT_FAILURE);
    }

  execvp (argv[0], argv);

  _exit (EXIT_FAILURE);
}

/* If PATH can be read and its content parses as the number 0, warn on
   stderr that user namespaces are not enabled there.  */
static void
check_proc_sys_userns_file (const char *path)
{
  FILE *fp;
  fp = fopen (path, "r");
  if (fp)
    {
      char buf[32];
      size_t n_read = fread (buf, 1, sizeof(buf) - 1, fp);
      if (n_read > 0)
        {
          buf[n_read] = '\0';
          if (strtol (buf, NULL, 10) == 0)
            fprintf (stderr, "user namespaces are not enabled in %s\n", path);
        }
      fclose (fp);
    }
}

/* Copy the whole content of FILE_TO_READ to OUTFD.
   Returns 0 on success and a negative value if opening, reading or writing
   fails.  */
static int
copy_file_to_fd (const char *file_to_read, int outfd)
{
  char buf[512];
  cleanup_close int fd = -1;

  fd = open (file_to_read, O_RDONLY);
  if (fd < 0)
    return fd;

  for (;;)
    {
      ssize_t r, w, t = 0;

      r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof buf));
      if (r < 0)
        return r;

      if (r == 0)
        break;

      /* write may be partial: keep going until the chunk is flushed.  */
      while (t < r)
        {
          w = TEMP_FAILURE_RETRY (write (outfd, &buf[t], r - t));
          if (w < 0)
            return w;
          t += w;
        }
    }
  return 0;
}

/* Clone a child into a new user and mount namespace and re-execute the
   current command line there.

   In the parent: if the socket-activation variables had been saved, the
   inherited descriptors in the range [3, 3 + LISTEN_FDS) are closed and the
   variables unset; the value returned by clone is then returned.  When the
   clone fails, diagnostics are printed and the negative value is returned
   through that same path.

   In the child: most signals are blocked and the environment is prepared,
   then one byte is read from READY and the child goes on only if it is '0'.
   It then sets uid/gid to 0, restores the working directory and creates the
   pause process when PAUSE_PID_FILE_PATH is non-empty.  A status byte is
   written back to READY: '0' on success, '1' if setting the ids or the
   working directory failed, '2' if the pause process could not be created.
   Finally, if FILE_TO_READ is non-empty its content is copied to OUTPUTFD
   and the child exits; otherwise the command is re-executed.  The child
   never returns.  */
int
reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_read, int outputfd)
{
  cleanup_free char **argv = NULL;
  cleanup_free char *argv0 = NULL;
  cleanup_free char *cwd = NULL;
  sigset_t sigset, oldsigset;
  int ret;
  pid_t pid;
  char b;
  char uid[16];
  char gid[16];

  cwd = getcwd (NULL, 0);
  if (cwd == NULL)
    {
      fprintf (stderr, "error getting current working directory: %m\n");
      _exit (EXIT_FAILURE);
    }

  snprintf (uid, sizeof (uid), "%d", geteuid ());
  snprintf (gid, sizeof (gid), "%d", getegid ());

  pid = syscall_clone (CLONE_NEWUSER|CLONE_NEWNS|SIGCHLD, NULL);
  if (pid < 0)
    {
      fprintf (stderr, "cannot clone: %m\n");
      check_proc_sys_userns_file (_max_user_namespaces);
      check_proc_sys_userns_file (_unprivileged_user_namespaces);
    }
  if (pid)
    {
      if (do_socket_activation)
        {
          long num_fds;

          num_fds = strtol (saved_systemd_listen_fds, NULL, 10);
          if (num_fds != LONG_MIN && num_fds != LONG_MAX)
            {
              int f;

              for (f = 3; f < num_fds + 3; f++)
                if (is_fd_inherited (f))
                  close (f);
            }
          unsetenv ("LISTEN_PID");
          unsetenv ("LISTEN_FDS");
          unsetenv ("LISTEN_FDNAMES");
        }
      return pid;
    }

  /* Child: block everything except SIGCHLD and SIGTERM during the setup.  */
  if (sigfillset (&sigset) < 0)
    {
      fprintf (stderr, "cannot fill sigset: %m\n");
      _exit (EXIT_FAILURE);
    }
  if (sigdelset (&sigset, SIGCHLD) < 0)
    {
      fprintf (stderr, "cannot sigdelset(SIGCHLD): %m\n");
      _exit (EXIT_FAILURE);
    }
  if (sigdelset (&sigset, SIGTERM) < 0)
    {
      fprintf (stderr, "cannot sigdelset(SIGTERM): %m\n");
      _exit (EXIT_FAILURE);
    }
  if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
    {
      fprintf (stderr, "cannot block signals: %m\n");
      _exit (EXIT_FAILURE);
    }

  argv = get_cmd_line_args ();
  if (argv == NULL)
    {
      fprintf (stderr, "cannot read argv: %m\n");
      _exit (EXIT_FAILURE);
    }

  /* Track the string buffer so it is released together with the array.  */
  argv0 = argv[0];

  if (do_socket_activation)
    {
      char s[32];
      /* LISTEN_PID must name the process that will use the sockets.  */
      snprintf (s, sizeof (s), "%d", getpid());
      setenv ("LISTEN_PID", s, true);
      setenv ("LISTEN_FDS", saved_systemd_listen_fds, true);
      // Setting fdnames is optional for systemd_socket_activation
      if (saved_systemd_listen_fdnames != NULL)
        setenv ("LISTEN_FDNAMES", saved_systemd_listen_fdnames, true);
    }

  setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
  setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
  setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);

  /* Wait for the go-ahead from the other end of READY.  */
  ret = TEMP_FAILURE_RETRY (read (ready, &b, 1));
  if (ret < 0)
    {
      fprintf (stderr, "cannot read from sync pipe: %m\n");
      _exit (EXIT_FAILURE);
    }
  if (ret != 1 || b != '0')
    _exit (EXIT_FAILURE);

  if (syscall_setresgid (0, 0, 0) < 0)
    {
      fprintf (stderr, "cannot setresgid: %m\n");
      TEMP_FAILURE_RETRY (write (ready, "1", 1));
      _exit (EXIT_FAILURE);
    }

  if (syscall_setresuid (0, 0, 0) < 0)
    {
      fprintf (stderr, "cannot setresuid: %m\n");
      TEMP_FAILURE_RETRY (write (ready, "1", 1));
      _exit (EXIT_FAILURE);
    }

  if (chdir (cwd) < 0)
    {
      fprintf (stderr, "cannot chdir to %s: %m\n", cwd);
      TEMP_FAILURE_RETRY (write (ready, "1", 1));
      _exit (EXIT_FAILURE);
    }

  if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
    {
      if (create_pause_process (pause_pid_file_path, argv) < 0)
        {
          TEMP_FAILURE_RETRY (write (ready, "2", 1));
          _exit (EXIT_FAILURE);
        }
    }

  ret = TEMP_FAILURE_RETRY (write (ready, "0", 1));
  if (ret < 0)
    {
      fprintf (stderr, "cannot write to ready pipe: %m\n");
      _exit (EXIT_FAILURE);
    }
  close (ready);

  if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
    {
      fprintf (stderr, "cannot restore signal mask: %m\n");
      _exit (EXIT_FAILURE);
    }

  if (file_to_read && file_to_read[0])
    {
      ret = copy_file_to_fd (file_to_read, outputfd);
      close (outputfd);
      _exit (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
    }

  execvp (argv[0], argv);

  _exit (EXIT_FAILURE);
}