github.com/hanks177/podman/v4@v4.1.3-0.20220613032544-16d90015bc83/pkg/rootless/rootless_linux.c (about)

     1  #define _GNU_SOURCE
     2  #include <sched.h>
     3  #include <stdio.h>
     4  #include <unistd.h>
     5  #include <sys/syscall.h>
     6  #include <stdlib.h>
     7  #include <errno.h>
     8  #include <sys/stat.h>
     9  #include <limits.h>
    10  #include <sys/types.h>
    11  #include <signal.h>
    12  #include <fcntl.h>
    13  #include <sys/wait.h>
    14  #include <string.h>
    15  #include <stdbool.h>
    16  #include <sys/types.h>
    17  #include <sys/prctl.h>
    18  #include <dirent.h>
    19  #include <sys/select.h>
    20  #include <stdio.h>
    21  
/* Fallback for C libraries that do not provide TEMP_FAILURE_RETRY.
   Evaluates EXPRESSION repeatedly for as long as it yields -1 with errno
   set to EINTR, and produces the last result as a long int.  Uses the
   GNU statement-expression extension.  */
#ifndef TEMP_FAILURE_RETRY
#define TEMP_FAILURE_RETRY(expression) \
  (__extension__                                                              \
    ({ long int __result;                                                     \
       do __result = (long int) (expression);                                 \
       while (__result == -1L && errno == EINTR);                             \
       __result; }))
#endif
    30  
/* Variable attributes: when a variable declared with one of these goes out
   of scope, the named cleanup_*p function is called with the address of
   that variable (compiler `cleanup' attribute).  */
#define cleanup_free __attribute__ ((cleanup (cleanup_freep)))
#define cleanup_close __attribute__ ((cleanup (cleanup_closep)))
#define cleanup_dir __attribute__ ((cleanup (cleanup_dirp)))
    34  
    35  static inline void
    36  cleanup_freep (void *p)
    37  {
    38    void **pp = (void **) p;
    39    free (*pp);
    40  }
    41  
    42  static inline void
    43  cleanup_closep (void *p)
    44  {
    45    int *pp = p;
    46    if (*pp >= 0)
    47      TEMP_FAILURE_RETRY (close (*pp));
    48  }
    49  
    50  static inline void
    51  cleanup_dirp (DIR **p)
    52  {
    53    DIR *dir = *p;
    54    if (dir)
    55      closedir (dir);
    56  }
    57  
    58  int rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
    59  {
    60    int ret;
    61  
    62  # ifdef SYS_renameat2
    63  #  ifndef RENAME_NOREPLACE
    64  #   define RENAME_NOREPLACE	(1 << 0)
    65  #  endif
    66  
    67    ret = (int) syscall (SYS_renameat2, olddirfd, oldpath, newdirfd, newpath, RENAME_NOREPLACE);
    68    if (ret == 0 || errno != EINVAL)
    69      return ret;
    70  
    71    /* Fallback in case of errno==EINVAL.  */
    72  # endif
    73  
    74    /* This might be an issue if another process is trying to read the file while it is empty.  */
    75    ret = open (newpath, O_EXCL|O_CREAT, 0700);
    76    if (ret < 0)
    77      return ret;
    78    close (ret);
    79  
    80    /* We are sure we created the file, let's overwrite it.  */
    81    return rename (oldpath, newpath);
    82  }
    83  
/* procfs files inspected by check_proc_sys_userns_file () after a failed
   clone, to report that user namespaces are disabled.  */
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";

/* Highest file descriptor number recorded by init ().  */
static int open_files_max_fd;
/* Array of fd_set chunks filled by init (): chunk I covers descriptors
   I * FD_SETSIZE .. (I + 1) * FD_SETSIZE - 1.  NULL when nothing was
   recorded.  */
static fd_set *open_files_set;
/* Effective uid/gid captured by init () before it joined an existing user
   namespace; left at zero when that shortcut was not taken.  */
static uid_t rootless_uid_init;
static gid_t rootless_gid_init;
/* Set by init () when LISTEN_PID names this process and LISTEN_FDS is set.  */
static bool do_socket_activation = false;
/* Heap copies of the LISTEN_* variables taken by init ().  */
static char *saved_systemd_listen_fds;
static char *saved_systemd_listen_pid;
static char *saved_systemd_listen_fdnames;
    95  
    96  static int
    97  syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
    98  {
    99    return (int) syscall (__NR_setresuid, ruid, euid, suid);
   100  }
   101  
   102  static int
   103  syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
   104  {
   105    return (int) syscall (__NR_setresgid, rgid, egid, sgid);
   106  }
   107  
   108  uid_t
   109  rootless_uid ()
   110  {
   111    return rootless_uid_init;
   112  }
   113  
   114  uid_t
   115  rootless_gid ()
   116  {
   117    return rootless_gid_init;
   118  }
   119  
   120  static void
   121  do_pause ()
   122  {
   123    int i;
   124    struct sigaction act;
   125    int const sig[] =
   126      {
   127       SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGPOLL,
   128       SIGPROF, SIGVTALRM, SIGXCPU, SIGXFSZ, 0
   129      };
   130  
   131    act.sa_handler = SIG_IGN;
   132  
   133    for (i = 0; sig[i]; i++)
   134      sigaction (sig[i], &act, NULL);
   135  
   136    /* Attempt to execv catatonit to keep the pause process alive.  */
   137    execl ("/usr/libexec/podman/catatonit", "catatonit", "-P", NULL);
   138    execl ("/usr/bin/catatonit", "catatonit", "-P", NULL);
   139    /* and if the catatonit executable could not be found, fallback here... */
   140  
   141    prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
   142    while (1)
   143      pause ();
   144  }
   145  
   146  static char **
   147  get_cmd_line_args ()
   148  {
   149    cleanup_free char *buffer = NULL;
   150    cleanup_close int fd = -1;
   151    size_t allocated;
   152    size_t used = 0;
   153    int ret;
   154    int i, argc = 0;
   155    char **argv;
   156  
   157    fd = open ("/proc/self/cmdline", O_RDONLY);
   158    if (fd < 0)
   159      return NULL;
   160  
   161    allocated = 512;
   162    buffer = malloc (allocated);
   163    if (buffer == NULL)
   164      return NULL;
   165    for (;;)
   166      {
   167        ret = TEMP_FAILURE_RETRY (read (fd, buffer + used, allocated - used));
   168        if (ret < 0)
   169          return NULL;
   170  
   171        if (ret == 0)
   172          break;
   173  
   174        used += ret;
   175        if (allocated == used)
   176          {
   177            allocated += 512;
   178            char *tmp = realloc (buffer, allocated);
   179            if (tmp == NULL)
   180              return NULL;
   181  	  buffer = tmp;
   182          }
   183      }
   184  
   185    for (i = 0; i < used; i++)
   186      if (buffer[i] == '\0')
   187        argc++;
   188    if (argc == 0)
   189      return NULL;
   190  
   191    argv = malloc (sizeof (char *) * (argc + 1));
   192    if (argv == NULL)
   193      return NULL;
   194  
   195    argc = 0;
   196  
   197    argv[argc++] = buffer;
   198    for (i = 0; i < used - 1; i++)
   199      if (buffer[i] == '\0')
   200        argv[argc++] = buffer + i + 1;
   201  
   202    argv[argc] = NULL;
   203  
   204    /* Move ownership.  */
   205    buffer = NULL;
   206  
   207    return argv;
   208  }
   209  
   210  static bool
   211  can_use_shortcut ()
   212  {
   213    cleanup_free char **argv = NULL;
   214    cleanup_free char *argv0 = NULL;
   215    bool ret = true;
   216    int argc;
   217  
   218  #ifdef DISABLE_JOIN_SHORTCUT
   219    return false;
   220  #endif
   221  
   222    argv = get_cmd_line_args ();
   223    if (argv == NULL)
   224      return false;
   225  
   226    argv0 = argv[0];
   227  
   228    if (strstr (argv[0], "podman") == NULL)
   229      return false;
   230  
   231    for (argc = 0; argv[argc]; argc++)
   232      {
   233        if (argc == 0 || argv[argc][0] == '-')
   234          continue;
   235  
   236        if (strcmp (argv[argc], "mount") == 0
   237            || strcmp (argv[argc], "machine") == 0
   238            || strcmp (argv[argc], "search") == 0
   239            || (strcmp (argv[argc], "system") == 0 && argv[argc+1] && strcmp (argv[argc+1], "service") != 0))
   240          {
   241            ret = false;
   242            break;
   243          }
   244  
   245        if (argv[argc+1] != NULL && (strcmp (argv[argc], "container") == 0 ||
   246  	   strcmp (argv[argc], "image") == 0) &&
   247       (strcmp (argv[argc+1], "mount") == 0  || strcmp (argv[argc+1], "scp") == 0))
   248          {
   249            ret = false;
   250            break;
   251          }
   252      }
   253  
   254    return ret;
   255  }
   256  
   257  static int
   258  open_namespace (int pid_to_join, const char *ns_file)
   259  {
   260    char ns_path[PATH_MAX];
   261    int ret;
   262  
   263    ret = snprintf (ns_path, PATH_MAX, "/proc/%d/ns/%s", pid_to_join, ns_file);
   264    if (ret == PATH_MAX)
   265      {
   266        fprintf (stderr, "internal error: namespace path too long\n");
   267        return -1;
   268      }
   269  
   270    return open (ns_path, O_CLOEXEC | O_RDONLY);
   271  }
   272  
   273  int
   274  is_fd_inherited(int fd)
   275  {
   276    if (open_files_set == NULL || fd > open_files_max_fd || fd < 0)
   277      return 0;
   278  
   279    return FD_ISSET(fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE])) ? 1 : 0;
   280  }
   281  
   282  static void __attribute__((constructor)) init()
   283  {
   284    const char *xdg_runtime_dir;
   285    const char *pause;
   286    const char *listen_pid;
   287    const char *listen_fds;
   288    const char *listen_fdnames;
   289    cleanup_dir DIR *d = NULL;
   290  
   291    pause = getenv ("_PODMAN_PAUSE");
   292    if (pause && pause[0])
   293      {
   294        do_pause ();
   295        _exit (EXIT_FAILURE);
   296      }
   297  
   298    /* Store how many FDs were open before the Go runtime kicked in.  */
   299    d = opendir ("/proc/self/fd");
   300    if (d)
   301      {
   302        struct dirent *ent;
   303        size_t size = 0;
   304  
   305        for (ent = readdir (d); ent; ent = readdir (d))
   306          {
   307            int fd;
   308  
   309            if (ent->d_name[0] == '.')
   310              continue;
   311  
   312            fd = atoi (ent->d_name);
   313            if (fd == dirfd (d))
   314              continue;
   315  
   316            if (fd >= size * FD_SETSIZE)
   317              {
   318                int i;
   319                size_t new_size;
   320  
   321                new_size = (fd / FD_SETSIZE) + 1;
   322                open_files_set = realloc (open_files_set, new_size * sizeof (fd_set));
   323                if (open_files_set == NULL)
   324                  _exit (EXIT_FAILURE);
   325  
   326                for (i = size; i < new_size; i++)
   327                  FD_ZERO (&(open_files_set[i]));
   328  
   329                size = new_size;
   330              }
   331  
   332            if (fd > open_files_max_fd)
   333              open_files_max_fd = fd;
   334  
   335            FD_SET (fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE]));
   336          }
   337      }
   338  
   339      listen_pid = getenv("LISTEN_PID");
   340      listen_fds = getenv("LISTEN_FDS");
   341      listen_fdnames = getenv("LISTEN_FDNAMES");
   342  
   343      if (listen_pid != NULL && listen_fds != NULL && strtol(listen_pid, NULL, 10) == getpid())
   344        {
   345          // save systemd socket environment for rootless child
   346          do_socket_activation = true;
   347          saved_systemd_listen_pid = strdup(listen_pid);
   348          saved_systemd_listen_fds = strdup(listen_fds);
   349          if (listen_fdnames != NULL)
   350            saved_systemd_listen_fdnames = strdup(listen_fdnames);
   351          if (saved_systemd_listen_pid == NULL
   352                  || saved_systemd_listen_fds == NULL)
   353            {
   354              fprintf (stderr, "save socket listen environments error: %m\n");
   355              _exit (EXIT_FAILURE);
   356            }
   357        }
   358  
   359    /* Shortcut.  If we are able to join the pause pid file, do it now so we don't
   360       need to re-exec.  */
   361    xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
   362    if (geteuid () != 0 && xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut ())
   363      {
   364        cleanup_free char *cwd = NULL;
   365        cleanup_close int userns_fd = -1;
   366        cleanup_close int mntns_fd = -1;
   367        cleanup_close int fd = -1;
   368        long pid;
   369        char buf[12];
   370        uid_t uid;
   371        gid_t gid;
   372        char path[PATH_MAX];
   373        const char *const suffix = "/libpod/tmp/pause.pid";
   374        char uid_fmt[16];
   375        char gid_fmt[16];
   376        size_t len;
   377        int r;
   378  
   379        cwd = getcwd (NULL, 0);
   380        if (cwd == NULL)
   381          {
   382            fprintf (stderr, "error getting current working directory: %m\n");
   383            _exit (EXIT_FAILURE);
   384          }
   385  
   386        len = snprintf (path, PATH_MAX, "%s%s", xdg_runtime_dir, suffix);
   387        if (len >= PATH_MAX)
   388          {
   389            errno = ENAMETOOLONG;
   390            fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %m");
   391            exit (EXIT_FAILURE);
   392          }
   393  
   394        fd = open (path, O_RDONLY);
   395        if (fd < 0)
   396          return;
   397  
   398        r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf) - 1));
   399  
   400        if (r < 0)
   401          return;
   402        buf[r] = '\0';
   403  
   404        pid = strtol (buf, NULL, 10);
   405        if (pid == LONG_MAX)
   406          return;
   407  
   408        uid = geteuid ();
   409        gid = getegid ();
   410  
   411        userns_fd = open_namespace (pid, "user");
   412        if (userns_fd < 0)
   413          return;
   414  
   415        mntns_fd = open_namespace (pid, "mnt");
   416        if (mntns_fd < 0)
   417          return;
   418  
   419        if (setns (userns_fd, 0) < 0)
   420          return;
   421  
   422        /* The user namespace was joined, after this point errors are
   423           not recoverable anymore.  */
   424  
   425        if (setns (mntns_fd, 0) < 0)
   426          {
   427            fprintf (stderr, "cannot join mount namespace for %ld: %m", pid);
   428            exit (EXIT_FAILURE);
   429          }
   430  
   431        sprintf (uid_fmt, "%d", uid);
   432        sprintf (gid_fmt, "%d", gid);
   433  
   434        setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   435        setenv ("_CONTAINERS_ROOTLESS_UID", uid_fmt, 1);
   436        setenv ("_CONTAINERS_ROOTLESS_GID", gid_fmt, 1);
   437  
   438        if (syscall_setresgid (0, 0, 0) < 0)
   439          {
   440            fprintf (stderr, "cannot setresgid: %m\n");
   441            _exit (EXIT_FAILURE);
   442          }
   443  
   444        if (syscall_setresuid (0, 0, 0) < 0)
   445          {
   446            fprintf (stderr, "cannot setresuid: %m\n");
   447            _exit (EXIT_FAILURE);
   448          }
   449  
   450        if (chdir (cwd) < 0)
   451          {
   452            fprintf (stderr, "cannot chdir to %s: %m\n", cwd);
   453            _exit (EXIT_FAILURE);
   454          }
   455  
   456        rootless_uid_init = uid;
   457        rootless_gid_init = gid;
   458      }
   459  }
   460  
   461  static int
   462  syscall_clone (unsigned long flags, void *child_stack)
   463  {
   464  #if defined(__s390__) || defined(__CRIS__)
   465    return (int) syscall (__NR_clone, child_stack, flags);
   466  #else
   467    return (int) syscall (__NR_clone, flags, child_stack);
   468  #endif
   469  }
   470  
   471  int
   472  reexec_in_user_namespace_wait (int pid, int options)
   473  {
   474    pid_t p;
   475    int status;
   476  
   477    p = TEMP_FAILURE_RETRY (waitpid (pid, &status, 0));
   478    if (p < 0)
   479      return -1;
   480  
   481    if (WIFEXITED (status))
   482      return WEXITSTATUS (status);
   483    if (WIFSIGNALED (status))
   484      return 128 + WTERMSIG (status);
   485    return -1;
   486  }
   487  
   488  static int
   489  create_pause_process (const char *pause_pid_file_path, char **argv)
   490  {
   491    pid_t pid;
   492    int p[2];
   493  
   494    if (pipe (p) < 0)
   495      return -1;
   496  
   497    pid = fork ();
   498    if (pid < 0)
   499      {
   500        close (p[0]);
   501        close (p[1]);
   502        return -1;
   503      }
   504  
   505    if (pid)
   506      {
   507        char b;
   508        int r;
   509  
   510        close (p[1]);
   511        /* Block until we write the pid file.  */
   512        r = TEMP_FAILURE_RETRY (read (p[0], &b, 1));
   513        close (p[0]);
   514  
   515        reexec_in_user_namespace_wait (pid, 0);
   516  
   517        return r == 1 && b == '0' ? 0 : -1;
   518      }
   519    else
   520      {
   521        int r, fd;
   522  
   523        close (p[0]);
   524  
   525        setsid ();
   526        pid = fork ();
   527        if (pid < 0)
   528          _exit (EXIT_FAILURE);
   529  
   530        if (pid)
   531          {
   532            char pid_str[12];
   533            char *tmp_file_path = NULL;
   534  
   535            sprintf (pid_str, "%d", pid);
   536  
   537            if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
   538              {
   539                fprintf (stderr, "unable to print to string\n");
   540                kill (pid, SIGKILL);
   541                _exit (EXIT_FAILURE);
   542              }
   543  
   544            if (tmp_file_path == NULL)
   545              {
   546                fprintf (stderr, "temporary file path is NULL\n");
   547                kill (pid, SIGKILL);
   548                _exit (EXIT_FAILURE);
   549              }
   550  
   551            fd = mkstemp (tmp_file_path);
   552            if (fd < 0)
   553              {
   554                fprintf (stderr, "error creating temporary file: %m\n");
   555                kill (pid, SIGKILL);
   556                _exit (EXIT_FAILURE);
   557              }
   558  
   559            r = TEMP_FAILURE_RETRY (write (fd, pid_str, strlen (pid_str)));
   560            if (r < 0)
   561              {
   562                fprintf (stderr, "cannot write to file descriptor: %m\n");
   563                kill (pid, SIGKILL);
   564                _exit (EXIT_FAILURE);
   565              }
   566            close (fd);
   567  
   568            /* There can be another process at this point trying to configure the user namespace and the pause
   569             process, do not override the pid file if it already exists. */
   570            if (rename_noreplace (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path) < 0)
   571              {
   572                unlink (tmp_file_path);
   573                kill (pid, SIGKILL);
   574                _exit (EXIT_FAILURE);
   575              }
   576  
   577            r = TEMP_FAILURE_RETRY (write (p[1], "0", 1));
   578            if (r < 0)
   579              {
   580                fprintf (stderr, "cannot write to pipe: %m\n");
   581                _exit (EXIT_FAILURE);
   582              }
   583            close (p[1]);
   584  
   585            _exit (EXIT_SUCCESS);
   586          }
   587        else
   588          {
   589            int null;
   590  
   591            close (p[1]);
   592  
   593            null = open ("/dev/null", O_RDWR);
   594            if (null >= 0)
   595              {
   596                dup2 (null, 0);
   597                dup2 (null, 1);
   598                dup2 (null, 2);
   599                close (null);
   600              }
   601  
   602            for (fd = 3; fd < open_files_max_fd + 16; fd++)
   603              close (fd);
   604  
   605            setenv ("_PODMAN_PAUSE", "1", 1);
   606            execlp (argv[0], argv[0], NULL);
   607  
   608            /* If the execve fails, then do the pause here.  */
   609            do_pause ();
   610            _exit (EXIT_FAILURE);
   611          }
   612      }
   613  }
   614  
   615  static void
   616  join_namespace_or_die (const char *name, int ns_fd)
   617  {
   618    if (setns (ns_fd, 0) < 0)
   619      {
   620        fprintf (stderr, "cannot set %s namespace\n", name);
   621        _exit (EXIT_FAILURE);
   622      }
   623  }
   624  
   625  int
   626  reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
   627  {
   628    cleanup_close int userns_fd = -1;
   629    cleanup_close int mntns_fd = -1;
   630    cleanup_free char *cwd = NULL;
   631    char uid[16];
   632    char gid[16];
   633    cleanup_free char *argv0 = NULL;
   634    cleanup_free char **argv = NULL;
   635    int pid;
   636    sigset_t sigset, oldsigset;
   637  
   638    cwd = getcwd (NULL, 0);
   639    if (cwd == NULL)
   640      {
   641        fprintf (stderr, "error getting current working directory: %m\n");
   642        _exit (EXIT_FAILURE);
   643      }
   644  
   645    sprintf (uid, "%d", geteuid ());
   646    sprintf (gid, "%d", getegid ());
   647  
   648    argv = get_cmd_line_args ();
   649    if (argv == NULL)
   650      {
   651        fprintf (stderr, "cannot read argv: %m\n");
   652        _exit (EXIT_FAILURE);
   653      }
   654  
   655    argv0 = argv[0];
   656  
   657    userns_fd = open_namespace (pid_to_join, "user");
   658    if (userns_fd < 0)
   659      return userns_fd;
   660    mntns_fd = open_namespace (pid_to_join, "mnt");
   661    if (mntns_fd < 0)
   662      return mntns_fd;
   663  
   664    pid = fork ();
   665    if (pid < 0)
   666      fprintf (stderr, "cannot fork: %m\n");
   667  
   668    if (pid)
   669      {
   670        int f;
   671  
   672        for (f = 3; f <= open_files_max_fd; f++)
   673          if (is_fd_inherited (f))
   674            close (f);
   675        if (do_socket_activation)
   676          {
   677            unsetenv ("LISTEN_PID");
   678            unsetenv ("LISTEN_FDS");
   679            unsetenv ("LISTEN_FDNAMES");
   680          }
   681  
   682        return pid;
   683      }
   684  
   685    if (sigfillset (&sigset) < 0)
   686      {
   687        fprintf (stderr, "cannot fill sigset: %m\n");
   688        _exit (EXIT_FAILURE);
   689      }
   690    if (sigdelset (&sigset, SIGCHLD) < 0)
   691      {
   692        fprintf (stderr, "cannot sigdelset(SIGCHLD): %m\n");
   693        _exit (EXIT_FAILURE);
   694      }
   695    if (sigdelset (&sigset, SIGTERM) < 0)
   696      {
   697        fprintf (stderr, "cannot sigdelset(SIGTERM): %m\n");
   698        _exit (EXIT_FAILURE);
   699      }
   700    if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
   701      {
   702        fprintf (stderr, "cannot block signals: %m\n");
   703        _exit (EXIT_FAILURE);
   704      }
   705  
   706    if (do_socket_activation)
   707      {
   708        char s[32];
   709        sprintf (s, "%d", getpid());
   710        setenv ("LISTEN_PID", s, true);
   711        setenv ("LISTEN_FDS", saved_systemd_listen_fds, true);
   712        // Setting fdnames is optional for systemd_socket_activation
   713        if (saved_systemd_listen_fdnames != NULL)
   714          setenv ("LISTEN_FDNAMES", saved_systemd_listen_fdnames, true);
   715      }
   716  
   717    setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   718    setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
   719    setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);
   720  
   721    if (prctl (PR_SET_PDEATHSIG, SIGTERM, 0, 0, 0) < 0)
   722      {
   723        fprintf (stderr, "cannot prctl(PR_SET_PDEATHSIG): %m\n");
   724        _exit (EXIT_FAILURE);
   725      }
   726  
   727    join_namespace_or_die ("user", userns_fd);
   728    join_namespace_or_die ("mnt", mntns_fd);
   729  
   730    if (syscall_setresgid (0, 0, 0) < 0)
   731      {
   732        fprintf (stderr, "cannot setresgid: %m\n");
   733        _exit (EXIT_FAILURE);
   734      }
   735  
   736    if (syscall_setresuid (0, 0, 0) < 0)
   737      {
   738        fprintf (stderr, "cannot setresuid: %m\n");
   739        _exit (EXIT_FAILURE);
   740      }
   741  
   742    if (chdir (cwd) < 0)
   743      {
   744        fprintf (stderr, "cannot chdir to %s: %m\n", cwd);
   745        _exit (EXIT_FAILURE);
   746      }
   747  
   748    if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
   749      {
   750        /* We ignore errors here as we didn't create the namespace anyway.  */
   751        create_pause_process (pause_pid_file_path, argv);
   752      }
   753    if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
   754      {
   755        fprintf (stderr, "cannot block signals: %m\n");
   756        _exit (EXIT_FAILURE);
   757      }
   758  
   759    execvp (argv[0], argv);
   760  
   761    _exit (EXIT_FAILURE);
   762  }
   763  
   764  static void
   765  check_proc_sys_userns_file (const char *path)
   766  {
   767    FILE *fp;
   768    fp = fopen (path, "r");
   769    if (fp)
   770      {
   771        char buf[32];
   772        size_t n_read = fread (buf, 1, sizeof(buf) - 1, fp);
   773        if (n_read > 0)
   774          {
   775            buf[n_read] = '\0';
   776            if (strtol (buf, NULL, 10) == 0)
   777              fprintf (stderr, "user namespaces are not enabled in %s\n", path);
   778          }
   779        fclose (fp);
   780      }
   781  }
   782  
   783  static int
   784  copy_file_to_fd (const char *file_to_read, int outfd)
   785  {
   786    char buf[512];
   787    cleanup_close int fd = -1;
   788  
   789    fd = open (file_to_read, O_RDONLY);
   790    if (fd < 0)
   791      return fd;
   792  
   793    for (;;)
   794      {
   795        ssize_t r, w, t = 0;
   796  
   797        r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof buf));
   798        if (r < 0)
   799          return r;
   800  
   801        if (r == 0)
   802          break;
   803  
   804        while (t < r)
   805          {
   806            w = TEMP_FAILURE_RETRY (write (outfd, &buf[t], r - t));
   807            if (w < 0)
   808              return w;
   809            t += w;
   810          }
   811      }
   812    return 0;
   813  }
   814  
   815  int
   816  reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_read, int outputfd)
   817  {
   818    cleanup_free char **argv = NULL;
   819    cleanup_free char *argv0 = NULL;
   820    cleanup_free char *cwd = NULL;
   821    sigset_t sigset, oldsigset;
   822    int ret;
   823    pid_t pid;
   824    char b;
   825    char uid[16];
   826    char gid[16];
   827  
   828    cwd = getcwd (NULL, 0);
   829    if (cwd == NULL)
   830      {
   831        fprintf (stderr, "error getting current working directory: %m\n");
   832        _exit (EXIT_FAILURE);
   833      }
   834  
   835    sprintf (uid, "%d", geteuid ());
   836    sprintf (gid, "%d", getegid ());
   837  
   838    pid = syscall_clone (CLONE_NEWUSER|CLONE_NEWNS|SIGCHLD, NULL);
   839    if (pid < 0)
   840      {
   841        fprintf (stderr, "cannot clone: %m\n");
   842        check_proc_sys_userns_file (_max_user_namespaces);
   843        check_proc_sys_userns_file (_unprivileged_user_namespaces);
   844      }
   845    if (pid)
   846      {
   847        if (do_socket_activation)
   848          {
   849            long num_fds;
   850  
   851            num_fds = strtol (saved_systemd_listen_fds, NULL, 10);
   852            if (num_fds != LONG_MIN && num_fds != LONG_MAX)
   853              {
   854                int f;
   855  
   856                for (f = 3; f < num_fds + 3; f++)
   857                  if (is_fd_inherited (f))
   858                    close (f);
   859              }
   860            unsetenv ("LISTEN_PID");
   861            unsetenv ("LISTEN_FDS");
   862            unsetenv ("LISTEN_FDNAMES");
   863          }
   864        return pid;
   865      }
   866  
   867    if (sigfillset (&sigset) < 0)
   868      {
   869        fprintf (stderr, "cannot fill sigset: %m\n");
   870        _exit (EXIT_FAILURE);
   871      }
   872    if (sigdelset (&sigset, SIGCHLD) < 0)
   873      {
   874        fprintf (stderr, "cannot sigdelset(SIGCHLD): %m\n");
   875        _exit (EXIT_FAILURE);
   876      }
   877    if (sigdelset (&sigset, SIGTERM) < 0)
   878      {
   879        fprintf (stderr, "cannot sigdelset(SIGTERM): %m\n");
   880        _exit (EXIT_FAILURE);
   881      }
   882    if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
   883      {
   884        fprintf (stderr, "cannot block signals: %m\n");
   885        _exit (EXIT_FAILURE);
   886      }
   887  
   888    argv = get_cmd_line_args ();
   889    if (argv == NULL)
   890      {
   891        fprintf (stderr, "cannot read argv: %m\n");
   892        _exit (EXIT_FAILURE);
   893      }
   894  
   895    argv0 = argv[0];
   896  
   897    if (do_socket_activation)
   898      {
   899        char s[32];
   900        sprintf (s, "%d", getpid());
   901        setenv ("LISTEN_PID", s, true);
   902        setenv ("LISTEN_FDS", saved_systemd_listen_fds, true);
   903        // Setting fdnames is optional for systemd_socket_activation
   904        if (saved_systemd_listen_fdnames != NULL)
   905          setenv ("LISTEN_FDNAMES", saved_systemd_listen_fdnames, true);
   906      }
   907  
   908    setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   909    setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
   910    setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);
   911  
   912    ret = TEMP_FAILURE_RETRY (read (ready, &b, 1));
   913    if (ret < 0)
   914      {
   915        fprintf (stderr, "cannot read from sync pipe: %m\n");
   916        _exit (EXIT_FAILURE);
   917      }
   918    if (ret != 1 || b != '0')
   919      _exit (EXIT_FAILURE);
   920  
   921    if (syscall_setresgid (0, 0, 0) < 0)
   922      {
   923        fprintf (stderr, "cannot setresgid: %m\n");
   924        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   925        _exit (EXIT_FAILURE);
   926      }
   927  
   928    if (syscall_setresuid (0, 0, 0) < 0)
   929      {
   930        fprintf (stderr, "cannot setresuid: %m\n");
   931        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   932        _exit (EXIT_FAILURE);
   933      }
   934  
   935    if (chdir (cwd) < 0)
   936      {
   937        fprintf (stderr, "cannot chdir to %s: %m\n", cwd);
   938        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   939        _exit (EXIT_FAILURE);
   940      }
   941  
   942    if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
   943      {
   944        if (create_pause_process (pause_pid_file_path, argv) < 0)
   945          {
   946            TEMP_FAILURE_RETRY (write (ready, "2", 1));
   947            _exit (EXIT_FAILURE);
   948          }
   949      }
   950  
   951    ret = TEMP_FAILURE_RETRY (write (ready, "0", 1));
   952    if (ret < 0)
   953    {
   954      fprintf (stderr, "cannot write to ready pipe: %m\n");
   955      _exit (EXIT_FAILURE);
   956    }
   957    close (ready);
   958  
   959    if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
   960      {
   961        fprintf (stderr, "cannot block signals: %m\n");
   962        _exit (EXIT_FAILURE);
   963      }
   964  
   965    if (file_to_read && file_to_read[0])
   966      {
   967        ret = copy_file_to_fd (file_to_read, outputfd);
   968        close (outputfd);
   969        _exit (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
   970      }
   971  
   972    execvp (argv[0], argv);
   973  
   974    _exit (EXIT_FAILURE);
   975  }