github.com/containers/libpod@v1.9.4-0.20220419124438-4284fd425507/pkg/rootless/rootless_linux.c (about)

     1  #define _GNU_SOURCE
     2  #include <sched.h>
     3  #include <stdio.h>
     4  #include <unistd.h>
     5  #include <sys/syscall.h>
     6  #include <stdlib.h>
     7  #include <errno.h>
     8  #include <sys/stat.h>
     9  #include <limits.h>
    10  #include <sys/types.h>
    11  #include <signal.h>
    12  #include <fcntl.h>
    13  #include <sys/wait.h>
    14  #include <string.h>
    15  #include <stdbool.h>
    16  #include <sys/types.h>
    17  #include <sys/prctl.h>
    18  #include <dirent.h>
    19  #include <sys/select.h>
    20  #include <stdio.h>
    21  
    22  int rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
    23  {
    24    int ret;
    25  
    26  # ifdef SYS_renameat2
    27  #  ifndef RENAME_NOREPLACE
    28  #   define RENAME_NOREPLACE	(1 << 0)
    29  #  endif
    30  
    31    ret = (int) syscall (SYS_renameat2, olddirfd, oldpath, newdirfd, newpath, RENAME_NOREPLACE);
    32    if (ret == 0 || errno != EINVAL)
    33      return ret;
    34  
    35    /* Fallback in case of errno==EINVAL.  */
    36  # endif
    37  
    38    /* This might be an issue if another process is trying to read the file while it is empty.  */
    39    ret = open (newpath, O_EXCL|O_CREAT, 0700);
    40    if (ret < 0)
    41      return ret;
    42    close (ret);
    43  
    44    /* We are sure we created the file, let's overwrite it.  */
    45    return rename (oldpath, newpath);
    46  }
    47  
    48  #ifndef TEMP_FAILURE_RETRY
    49  #define TEMP_FAILURE_RETRY(expression) \
    50    (__extension__                                                              \
    51      ({ long int __result;                                                     \
    52         do __result = (long int) (expression);                                 \
    53         while (__result == -1L && errno == EINTR);                             \
    54         __result; }))
    55  #endif
    56  
    57  static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
    58  static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";
    59  
    60  static int open_files_max_fd;
    61  static fd_set *open_files_set;
    62  static uid_t rootless_uid_init;
    63  static gid_t rootless_gid_init;
    64  
    65  static int
    66  syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
    67  {
    68    return (int) syscall (__NR_setresuid, ruid, euid, suid);
    69  }
    70  
    71  static int
    72  syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
    73  {
    74    return (int) syscall (__NR_setresgid, rgid, egid, sgid);
    75  }
    76  
    77  uid_t
    78  rootless_uid ()
    79  {
    80    return rootless_uid_init;
    81  }
    82  
    83  uid_t
    84  rootless_gid ()
    85  {
    86    return rootless_gid_init;
    87  }
    88  
    89  static void
    90  do_pause ()
    91  {
    92    int i;
    93    struct sigaction act;
    94    int const sig[] =
    95      {
    96       SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGPOLL,
    97       SIGPROF, SIGVTALRM, SIGXCPU, SIGXFSZ, 0
    98      };
    99  
   100    act.sa_handler = SIG_IGN;
   101  
   102    for (i = 0; sig[i]; i++)
   103      sigaction (sig[i], &act, NULL);
   104  
   105    prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
   106    while (1)
   107      pause ();
   108  }
   109  
   110  static char **
   111  get_cmd_line_args ()
   112  {
   113    int fd;
   114    char *buffer;
   115    size_t allocated;
   116    size_t used = 0;
   117    int ret;
   118    int i, argc = 0;
   119    char **argv;
   120  
   121    fd = open ("/proc/self/cmdline", O_RDONLY);
   122    if (fd < 0)
   123      return NULL;
   124  
   125    allocated = 512;
   126    buffer = malloc (allocated);
   127    if (buffer == NULL)
   128      return NULL;
   129    for (;;)
   130      {
   131        ret = TEMP_FAILURE_RETRY (read (fd, buffer + used, allocated - used));
   132        if (ret < 0)
   133          {
   134            free (buffer);
   135            return NULL;
   136          }
   137  
   138        if (ret == 0)
   139          break;
   140  
   141        used += ret;
   142        if (allocated == used)
   143          {
   144            allocated += 512;
   145            char *tmp = realloc (buffer, allocated);
   146            if (tmp == NULL)
   147              {
   148                free (buffer);
   149                return NULL;
   150              }
   151  	  buffer = tmp;
   152          }
   153      }
   154    close (fd);
   155  
   156    for (i = 0; i < used; i++)
   157      if (buffer[i] == '\0')
   158        argc++;
   159    if (argc == 0)
   160      {
   161        free (buffer);
   162        return NULL;
   163      }
   164  
   165    argv = malloc (sizeof (char *) * (argc + 1));
   166    if (argv == NULL)
   167      {
   168        free (buffer);
   169        return NULL;
   170      }
   171    argc = 0;
   172  
   173    argv[argc++] = buffer;
   174    for (i = 0; i < used - 1; i++)
   175      if (buffer[i] == '\0')
   176        argv[argc++] = buffer + i + 1;
   177  
   178    argv[argc] = NULL;
   179  
   180    return argv;
   181  }
   182  
   183  static bool
   184  can_use_shortcut ()
   185  {
   186    int argc;
   187    char **argv;
   188    bool ret = true;
   189  
   190  #ifdef DISABLE_JOIN_SHORTCUT
   191    return false;
   192  #endif
   193  
   194    argv = get_cmd_line_args ();
   195    if (argv == NULL)
   196      return false;
   197  
   198    if (strstr (argv[0], "podman") == NULL)
   199      return false;
   200  
   201    for (argc = 0; argv[argc]; argc++)
   202      {
   203        if (argc == 0 || argv[argc][0] == '-')
   204          continue;
   205  
   206        if (strcmp (argv[argc], "mount") == 0
   207            || strcmp (argv[argc], "search") == 0
   208            || strcmp (argv[argc], "system") == 0)
   209          {
   210            ret = false;
   211            break;
   212          }
   213      }
   214  
   215    free (argv[0]);
   216    free (argv);
   217    return ret;
   218  }
   219  
   220  static void __attribute__((constructor)) init()
   221  {
   222    const char *xdg_runtime_dir;
   223    const char *pause;
   224    DIR *d;
   225  
   226    pause = getenv ("_PODMAN_PAUSE");
   227    if (pause && pause[0])
   228      {
   229        do_pause ();
   230        _exit (EXIT_FAILURE);
   231      }
   232  
   233    /* Store how many FDs were open before the Go runtime kicked in.  */
   234    d = opendir ("/proc/self/fd");
   235    if (d)
   236      {
   237        struct dirent *ent;
   238        size_t size = 0;
   239  
   240        for (ent = readdir (d); ent; ent = readdir (d))
   241          {
   242            int fd;
   243  
   244            if (ent->d_name[0] == '.')
   245              continue;
   246  
   247            fd = atoi (ent->d_name);
   248            if (fd == dirfd (d))
   249              continue;
   250  
   251            if (fd >= size * FD_SETSIZE)
   252              {
   253                int i;
   254                size_t new_size;
   255  
   256                new_size = (fd / FD_SETSIZE) + 1;
   257                open_files_set = realloc (open_files_set, new_size * sizeof (fd_set));
   258                if (open_files_set == NULL)
   259                  _exit (EXIT_FAILURE);
   260  
   261                for (i = size; i < new_size; i++)
   262                  FD_ZERO (&(open_files_set[i]));
   263  
   264                size = new_size;
   265              }
   266  
   267            if (fd > open_files_max_fd)
   268              open_files_max_fd = fd;
   269  
   270            FD_SET (fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE]));
   271          }
   272        closedir (d);
   273      }
   274  
   275    /* Shortcut.  If we are able to join the pause pid file, do it now so we don't
   276       need to re-exec.  */
   277    xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
   278    if (geteuid () != 0 && xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut ())
   279      {
   280        int r;
   281        int fd;
   282        long pid;
   283        char buf[12];
   284        uid_t uid;
   285        gid_t gid;
   286        char path[PATH_MAX];
   287        const char *const suffix = "/libpod/pause.pid";
   288        char *cwd = getcwd (NULL, 0);
   289        char uid_fmt[16];
   290        char gid_fmt[16];
   291        size_t len;
   292  
   293        if (cwd == NULL)
   294          {
   295            fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
   296            _exit (EXIT_FAILURE);
   297          }
   298  
   299        len = snprintf (path, PATH_MAX, "%s%s", xdg_runtime_dir, suffix);
   300        if (len >= PATH_MAX)
   301          {
   302            fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %s", strerror (ENAMETOOLONG));
   303            exit (EXIT_FAILURE);
   304          }
   305  
   306        fd = open (path, O_RDONLY);
   307        if (fd < 0)
   308          {
   309            free (cwd);
   310            return;
   311          }
   312  
   313        r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf) - 1));
   314        close (fd);
   315        if (r < 0)
   316          {
   317            free (cwd);
   318            return;
   319          }
   320        buf[r] = '\0';
   321  
   322        pid = strtol (buf, NULL, 10);
   323        if (pid == LONG_MAX)
   324          {
   325            free (cwd);
   326            return;
   327          }
   328  
   329        uid = geteuid ();
   330        gid = getegid ();
   331  
   332        sprintf (path, "/proc/%ld/ns/user", pid);
   333        fd = open (path, O_RDONLY);
   334        if (fd < 0 || setns (fd, 0) < 0)
   335          {
   336            free (cwd);
   337            return;
   338          }
   339        close (fd);
   340  
   341        /* Errors here cannot be ignored as we already joined a ns.  */
   342        sprintf (path, "/proc/%ld/ns/mnt", pid);
   343        fd = open (path, O_RDONLY);
   344        if (fd < 0)
   345          {
   346            fprintf (stderr, "cannot open %s: %s", path, strerror (errno));
   347            exit (EXIT_FAILURE);
   348          }
   349  
   350        sprintf (uid_fmt, "%d", uid);
   351        sprintf (gid_fmt, "%d", gid);
   352  
   353        setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   354        setenv ("_CONTAINERS_ROOTLESS_UID", uid_fmt, 1);
   355        setenv ("_CONTAINERS_ROOTLESS_GID", gid_fmt, 1);
   356  
   357        r = setns (fd, 0);
   358        if (r < 0)
   359          {
   360            fprintf (stderr, "cannot join mount namespace for %ld: %s", pid, strerror (errno));
   361            exit (EXIT_FAILURE);
   362          }
   363        close (fd);
   364  
   365        if (syscall_setresgid (0, 0, 0) < 0)
   366          {
   367            fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
   368            _exit (EXIT_FAILURE);
   369          }
   370  
   371        if (syscall_setresuid (0, 0, 0) < 0)
   372          {
   373            fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
   374            _exit (EXIT_FAILURE);
   375          }
   376  
   377        if (chdir (cwd) < 0)
   378          {
   379            fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
   380            _exit (EXIT_FAILURE);
   381          }
   382  
   383        free (cwd);
   384        rootless_uid_init = uid;
   385        rootless_gid_init = gid;
   386      }
   387  }
   388  
   389  static int
   390  syscall_clone (unsigned long flags, void *child_stack)
   391  {
   392  #if defined(__s390__) || defined(__CRIS__)
   393    return (int) syscall (__NR_clone, child_stack, flags);
   394  #else
   395    return (int) syscall (__NR_clone, flags, child_stack);
   396  #endif
   397  }
   398  
   399  int
   400  reexec_in_user_namespace_wait (int pid, int options)
   401  {
   402    pid_t p;
   403    int status;
   404  
   405    p = TEMP_FAILURE_RETRY (waitpid (pid, &status, 0));
   406    if (p < 0)
   407      return -1;
   408  
   409    if (WIFEXITED (status))
   410      return WEXITSTATUS (status);
   411    if (WIFSIGNALED (status))
   412      return 128 + WTERMSIG (status);
   413    return -1;
   414  }
   415  
   416  static int
   417  create_pause_process (const char *pause_pid_file_path, char **argv)
   418  {
   419    int r, p[2];
   420  
   421    if (pipe (p) < 0)
   422      _exit (EXIT_FAILURE);
   423  
   424    r = fork ();
   425    if (r < 0)
   426      _exit (EXIT_FAILURE);
   427  
   428    if (r)
   429      {
   430        char b;
   431  
   432        close (p[1]);
   433        /* Block until we write the pid file.  */
   434        r = TEMP_FAILURE_RETRY (read (p[0], &b, 1));
   435        close (p[0]);
   436  
   437        reexec_in_user_namespace_wait (r, 0);
   438  
   439        return r == 1 && b == '0' ? 0 : -1;
   440      }
   441    else
   442      {
   443        int fd;
   444        pid_t pid;
   445  
   446        close (p[0]);
   447  
   448        setsid ();
   449        pid = fork ();
   450        if (r < 0)
   451          _exit (EXIT_FAILURE);
   452  
   453        if (pid)
   454          {
   455            char pid_str[12];
   456            char *tmp_file_path = NULL;
   457  
   458            sprintf (pid_str, "%d", pid);
   459  
   460            if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
   461              {
   462                fprintf (stderr, "unable to print to string\n");
   463                kill (pid, SIGKILL);
   464                _exit (EXIT_FAILURE);
   465              }
   466  
   467            if (tmp_file_path == NULL)
   468              {
   469                fprintf (stderr, "temporary file path is NULL\n");
   470                kill (pid, SIGKILL);
   471                _exit (EXIT_FAILURE);
   472              }
   473  
   474            fd = mkstemp (tmp_file_path);
   475            if (fd < 0)
   476              {
   477                fprintf (stderr, "error creating temporary file: %s\n", strerror (errno));
   478                kill (pid, SIGKILL);
   479                _exit (EXIT_FAILURE);
   480              }
   481  
   482            r = TEMP_FAILURE_RETRY (write (fd, pid_str, strlen (pid_str)));
   483            if (r < 0)
   484              {
   485                fprintf (stderr, "cannot write to file descriptor: %s\n", strerror (errno));
   486                kill (pid, SIGKILL);
   487                _exit (EXIT_FAILURE);
   488              }
   489            close (fd);
   490  
   491            /* There can be another process at this point trying to configure the user namespace and the pause
   492             process, do not override the pid file if it already exists. */
   493            if (rename_noreplace (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path) < 0)
   494              {
   495                unlink (tmp_file_path);
   496                kill (pid, SIGKILL);
   497                _exit (EXIT_FAILURE);
   498              }
   499  
   500            r = TEMP_FAILURE_RETRY (write (p[1], "0", 1));
   501            if (r < 0)
   502              {
   503                fprintf (stderr, "cannot write to pipe: %s\n", strerror (errno));
   504                _exit (EXIT_FAILURE);
   505              }
   506            close (p[1]);
   507  
   508            _exit (EXIT_SUCCESS);
   509          }
   510        else
   511          {
   512            int null;
   513  
   514            close (p[1]);
   515  
   516            null = open ("/dev/null", O_RDWR);
   517            if (null >= 0)
   518              {
   519                dup2 (null, 0);
   520                dup2 (null, 1);
   521                dup2 (null, 2);
   522                close (null);
   523              }
   524  
   525            for (fd = 3; fd < open_files_max_fd + 16; fd++)
   526              close (fd);
   527  
   528            setenv ("_PODMAN_PAUSE", "1", 1);
   529            execlp (argv[0], argv[0], NULL);
   530  
   531            /* If the execve fails, then do the pause here.  */
   532            do_pause ();
   533            _exit (EXIT_FAILURE);
   534          }
   535      }
   536  }
   537  
   538  static int
   539  open_namespace (int pid_to_join, const char *ns_file)
   540  {
   541    char ns_path[PATH_MAX];
   542    int ret;
   543  
   544    ret = snprintf (ns_path, PATH_MAX, "/proc/%d/ns/%s", pid_to_join, ns_file);
   545    if (ret == PATH_MAX)
   546      {
   547        fprintf (stderr, "internal error: namespace path too long\n");
   548        return -1;
   549      }
   550  
   551    return open (ns_path, O_CLOEXEC | O_RDONLY);
   552  }
   553  
   554  static void
   555  join_namespace_or_die (const char *name, int ns_fd)
   556  {
   557    if (setns (ns_fd, 0) < 0)
   558      {
   559        fprintf (stderr, "cannot set %s namespace\n", name);
   560        _exit (EXIT_FAILURE);
   561      }
   562  }
   563  
   564  int
   565  reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
   566  {
   567    char uid[16];
   568    char gid[16];
   569    char **argv;
   570    int pid;
   571    int mnt_ns = -1;
   572    int user_ns = -1;
   573    char *cwd = getcwd (NULL, 0);
   574    sigset_t sigset, oldsigset;
   575  
   576    if (cwd == NULL)
   577      {
   578        fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
   579        _exit (EXIT_FAILURE);
   580      }
   581  
   582    sprintf (uid, "%d", geteuid ());
   583    sprintf (gid, "%d", getegid ());
   584  
   585    argv = get_cmd_line_args ();
   586    if (argv == NULL)
   587      {
   588        fprintf (stderr, "cannot read argv: %s\n", strerror (errno));
   589        _exit (EXIT_FAILURE);
   590      }
   591  
   592    user_ns = open_namespace (pid_to_join, "user");
   593    if (user_ns < 0)
   594      return user_ns;
   595    mnt_ns = open_namespace (pid_to_join, "mnt");
   596    if (mnt_ns < 0)
   597      {
   598        close (user_ns);
   599        return mnt_ns;
   600      }
   601  
   602    pid = fork ();
   603    if (pid < 0)
   604      fprintf (stderr, "cannot fork: %s\n", strerror (errno));
   605  
   606    if (pid)
   607      {
   608        int f;
   609  
   610        /* We passed down these fds, close them.  */
   611        close (user_ns);
   612        close (mnt_ns);
   613  
   614        for (f = 3; f < open_files_max_fd; f++)
   615          if (open_files_set == NULL || FD_ISSET (f % FD_SETSIZE, &(open_files_set[f / FD_SETSIZE])))
   616            close (f);
   617        return pid;
   618      }
   619  
   620    if (sigfillset (&sigset) < 0)
   621      {
   622        fprintf (stderr, "cannot fill sigset: %s\n", strerror (errno));
   623        _exit (EXIT_FAILURE);
   624      }
   625    if (sigdelset (&sigset, SIGCHLD) < 0)
   626      {
   627        fprintf (stderr, "cannot sigdelset(SIGCHLD): %s\n", strerror (errno));
   628        _exit (EXIT_FAILURE);
   629      }
   630    if (sigdelset (&sigset, SIGTERM) < 0)
   631      {
   632        fprintf (stderr, "cannot sigdelset(SIGTERM): %s\n", strerror (errno));
   633        _exit (EXIT_FAILURE);
   634      }
   635    if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
   636      {
   637        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   638        _exit (EXIT_FAILURE);
   639      }
   640  
   641    setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   642    setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
   643    setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);
   644  
   645    if (prctl (PR_SET_PDEATHSIG, SIGTERM, 0, 0, 0) < 0)
   646      {
   647        fprintf (stderr, "cannot prctl(PR_SET_PDEATHSIG): %s\n", strerror (errno));
   648        _exit (EXIT_FAILURE);
   649      }
   650  
   651    join_namespace_or_die ("user", user_ns);
   652    join_namespace_or_die ("mnt", mnt_ns);
   653    close (user_ns);
   654    close (mnt_ns);
   655  
   656    if (syscall_setresgid (0, 0, 0) < 0)
   657      {
   658        fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
   659        _exit (EXIT_FAILURE);
   660      }
   661  
   662    if (syscall_setresuid (0, 0, 0) < 0)
   663      {
   664        fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
   665        _exit (EXIT_FAILURE);
   666      }
   667  
   668    if (chdir (cwd) < 0)
   669      {
   670        fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
   671        _exit (EXIT_FAILURE);
   672      }
   673    free (cwd);
   674  
   675    if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
   676      {
   677        /* We ignore errors here as we didn't create the namespace anyway.  */
   678        create_pause_process (pause_pid_file_path, argv);
   679      }
   680    if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
   681      {
   682        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   683        _exit (EXIT_FAILURE);
   684      }
   685  
   686    execvp (argv[0], argv);
   687  
   688    _exit (EXIT_FAILURE);
   689  }
   690  
   691  static void
   692  check_proc_sys_userns_file (const char *path)
   693  {
   694    FILE *fp;
   695    fp = fopen (path, "r");
   696    if (fp)
   697      {
   698        char buf[32];
   699        size_t n_read = fread (buf, 1, sizeof(buf) - 1, fp);
   700        if (n_read > 0)
   701          {
   702            buf[n_read] = '\0';
   703            if (strtol (buf, NULL, 10) == 0)
   704              fprintf (stderr, "user namespaces are not enabled in %s\n", path);
   705          }
   706        fclose (fp);
   707      }
   708  }
   709  
   710  static int
   711  copy_file_to_fd (const char *file_to_read, int outfd)
   712  {
   713    char buf[512];
   714    int fd;
   715  
   716    fd = open (file_to_read, O_RDONLY);
   717    if (fd < 0)
   718      return fd;
   719  
   720    for (;;)
   721      {
   722        ssize_t r, w, t = 0;
   723  
   724        r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof buf));
   725        if (r < 0)
   726          {
   727            close (fd);
   728            return r;
   729          }
   730  
   731        if (r == 0)
   732          break;
   733  
   734        while (t < r)
   735          {
   736            w = TEMP_FAILURE_RETRY (write (outfd, &buf[t], r - t));
   737            if (w < 0)
   738              {
   739                close (fd);
   740                return w;
   741              }
   742            t += w;
   743          }
   744      }
   745    close (fd);
   746    return 0;
   747  }
   748  
   749  int
   750  reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_read, int outputfd)
   751  {
   752    int ret;
   753    pid_t pid;
   754    char b;
   755    char **argv;
   756    char uid[16];
   757    char gid[16];
   758    char *listen_fds = NULL;
   759    char *listen_pid = NULL;
   760    bool do_socket_activation = false;
   761    char *cwd = getcwd (NULL, 0);
   762    sigset_t sigset, oldsigset;
   763  
   764    if (cwd == NULL)
   765      {
   766        fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
   767        _exit (EXIT_FAILURE);
   768      }
   769  
   770    listen_pid = getenv("LISTEN_PID");
   771    listen_fds = getenv("LISTEN_FDS");
   772  
   773    if (listen_pid != NULL && listen_fds != NULL)
   774      {
   775        if (strtol(listen_pid, NULL, 10) == getpid())
   776          do_socket_activation = true;
   777      }
   778  
   779    sprintf (uid, "%d", geteuid ());
   780    sprintf (gid, "%d", getegid ());
   781  
   782    pid = syscall_clone (CLONE_NEWUSER|CLONE_NEWNS|SIGCHLD, NULL);
   783    if (pid < 0)
   784      {
   785        fprintf (stderr, "cannot clone: %s\n", strerror (errno));
   786        check_proc_sys_userns_file (_max_user_namespaces);
   787        check_proc_sys_userns_file (_unprivileged_user_namespaces);
   788      }
   789    if (pid)
   790      {
   791        if (do_socket_activation)
   792          {
   793            long num_fds;
   794            num_fds = strtol (listen_fds, NULL, 10);
   795            if (num_fds != LONG_MIN && num_fds != LONG_MAX)
   796              {
   797                int f;
   798  
   799                for (f = 3; f < num_fds + 3; f++)
   800                  if (open_files_set == NULL || FD_ISSET (f % FD_SETSIZE, &(open_files_set[f / FD_SETSIZE])))
   801                    close (f);
   802              }
   803            unsetenv ("LISTEN_PID");
   804            unsetenv ("LISTEN_FDS");
   805            unsetenv ("LISTEN_FDNAMES");
   806          }
   807        return pid;
   808      }
   809  
   810    if (sigfillset (&sigset) < 0)
   811      {
   812        fprintf (stderr, "cannot fill sigset: %s\n", strerror (errno));
   813        _exit (EXIT_FAILURE);
   814      }
   815    if (sigdelset (&sigset, SIGCHLD) < 0)
   816      {
   817        fprintf (stderr, "cannot sigdelset(SIGCHLD): %s\n", strerror (errno));
   818        _exit (EXIT_FAILURE);
   819      }
   820    if (sigdelset (&sigset, SIGTERM) < 0)
   821      {
   822        fprintf (stderr, "cannot sigdelset(SIGTERM): %s\n", strerror (errno));
   823        _exit (EXIT_FAILURE);
   824      }
   825    if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
   826      {
   827        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   828        _exit (EXIT_FAILURE);
   829      }
   830  
   831    argv = get_cmd_line_args ();
   832    if (argv == NULL)
   833      {
   834        fprintf (stderr, "cannot read argv: %s\n", strerror (errno));
   835        _exit (EXIT_FAILURE);
   836      }
   837  
   838    if (do_socket_activation)
   839      {
   840        char s[32];
   841        sprintf (s, "%d", getpid());
   842        setenv ("LISTEN_PID", s, true);
   843      }
   844  
   845    setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   846    setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
   847    setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);
   848  
   849    ret = TEMP_FAILURE_RETRY (read (ready, &b, 1));
   850    if (ret < 0)
   851      {
   852        fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
   853        _exit (EXIT_FAILURE);
   854      }
   855    if (b != '0')
   856      _exit (EXIT_FAILURE);
   857  
   858    if (syscall_setresgid (0, 0, 0) < 0)
   859      {
   860        fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
   861        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   862        _exit (EXIT_FAILURE);
   863      }
   864  
   865    if (syscall_setresuid (0, 0, 0) < 0)
   866      {
   867        fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
   868        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   869        _exit (EXIT_FAILURE);
   870      }
   871  
   872    if (chdir (cwd) < 0)
   873      {
   874        fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
   875        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   876        _exit (EXIT_FAILURE);
   877      }
   878    free (cwd);
   879  
   880    if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
   881      {
   882        if (create_pause_process (pause_pid_file_path, argv) < 0)
   883          {
   884            TEMP_FAILURE_RETRY (write (ready, "2", 1));
   885            _exit (EXIT_FAILURE);
   886          }
   887      }
   888  
   889    ret = TEMP_FAILURE_RETRY (write (ready, "0", 1));
   890    if (ret < 0)
   891    {
   892  	  fprintf (stderr, "cannot write to ready pipe: %s\n", strerror (errno));
   893  	  _exit (EXIT_FAILURE);
   894    }
   895    close (ready);
   896  
   897    if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
   898      {
   899        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   900        _exit (EXIT_FAILURE);
   901      }
   902  
   903    if (file_to_read && file_to_read[0])
   904      {
   905        ret = copy_file_to_fd (file_to_read, outputfd);
   906        close (outputfd);
   907        _exit (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
   908      }
   909  
   910    execvp (argv[0], argv);
   911  
   912    _exit (EXIT_FAILURE);
   913  }