github.com/containers/podman/v2@v2.2.2-0.20210501105131-c1e07d070c4c/pkg/rootless/rootless_linux.c (about)

     1  #define _GNU_SOURCE
     2  #include <sched.h>
     3  #include <stdio.h>
     4  #include <unistd.h>
     5  #include <sys/syscall.h>
     6  #include <stdlib.h>
     7  #include <errno.h>
     8  #include <sys/stat.h>
     9  #include <limits.h>
    10  #include <sys/types.h>
    11  #include <signal.h>
    12  #include <fcntl.h>
    13  #include <sys/wait.h>
    14  #include <string.h>
    15  #include <stdbool.h>
    16  #include <sys/types.h>
    17  #include <sys/prctl.h>
    18  #include <dirent.h>
    19  #include <sys/select.h>
    20  #include <stdio.h>
    21  
    22  int rename_noreplace (int olddirfd, const char *oldpath, int newdirfd, const char *newpath)
    23  {
    24    int ret;
    25  
    26  # ifdef SYS_renameat2
    27  #  ifndef RENAME_NOREPLACE
    28  #   define RENAME_NOREPLACE	(1 << 0)
    29  #  endif
    30  
    31    ret = (int) syscall (SYS_renameat2, olddirfd, oldpath, newdirfd, newpath, RENAME_NOREPLACE);
    32    if (ret == 0 || errno != EINVAL)
    33      return ret;
    34  
    35    /* Fallback in case of errno==EINVAL.  */
    36  # endif
    37  
    38    /* This might be an issue if another process is trying to read the file while it is empty.  */
    39    ret = open (newpath, O_EXCL|O_CREAT, 0700);
    40    if (ret < 0)
    41      return ret;
    42    close (ret);
    43  
    44    /* We are sure we created the file, let's overwrite it.  */
    45    return rename (oldpath, newpath);
    46  }
    47  
    48  #ifndef TEMP_FAILURE_RETRY
    49  #define TEMP_FAILURE_RETRY(expression) \
    50    (__extension__                                                              \
    51      ({ long int __result;                                                     \
    52         do __result = (long int) (expression);                                 \
    53         while (__result == -1L && errno == EINTR);                             \
    54         __result; }))
    55  #endif
    56  
/* procfs files inspected by check_proc_sys_userns_file() after a failed
   clone, to hint at why a user namespace could not be created.  */
static const char *_max_user_namespaces = "/proc/sys/user/max_user_namespaces";
static const char *_unprivileged_user_namespaces = "/proc/sys/kernel/unprivileged_userns_clone";

/* Highest file descriptor number seen in /proc/self/fd by init().  */
static int open_files_max_fd;
/* Array of fd_set bitmaps, indexed by fd / FD_SETSIZE, marking the
   descriptors seen by init(); NULL if none were recorded.  */
static fd_set *open_files_set;
/* Effective uid/gid captured by init() before it joined an existing user
   namespace; they keep their zero initial value when the shortcut is not
   taken.  */
static uid_t rootless_uid_init;
static gid_t rootless_gid_init;
    64  
    65  static int
    66  syscall_setresuid (uid_t ruid, uid_t euid, uid_t suid)
    67  {
    68    return (int) syscall (__NR_setresuid, ruid, euid, suid);
    69  }
    70  
    71  static int
    72  syscall_setresgid (gid_t rgid, gid_t egid, gid_t sgid)
    73  {
    74    return (int) syscall (__NR_setresgid, rgid, egid, sgid);
    75  }
    76  
    77  uid_t
    78  rootless_uid ()
    79  {
    80    return rootless_uid_init;
    81  }
    82  
    83  uid_t
    84  rootless_gid ()
    85  {
    86    return rootless_gid_init;
    87  }
    88  
    89  static void
    90  do_pause ()
    91  {
    92    int i;
    93    struct sigaction act;
    94    int const sig[] =
    95      {
    96       SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGPOLL,
    97       SIGPROF, SIGVTALRM, SIGXCPU, SIGXFSZ, 0
    98      };
    99  
   100    act.sa_handler = SIG_IGN;
   101  
   102    for (i = 0; sig[i]; i++)
   103      sigaction (sig[i], &act, NULL);
   104  
   105    prctl (PR_SET_NAME, "podman pause", NULL, NULL, NULL);
   106    while (1)
   107      pause ();
   108  }
   109  
   110  static char **
   111  get_cmd_line_args ()
   112  {
   113    int fd;
   114    char *buffer;
   115    size_t allocated;
   116    size_t used = 0;
   117    int ret;
   118    int i, argc = 0;
   119    char **argv;
   120  
   121    fd = open ("/proc/self/cmdline", O_RDONLY);
   122    if (fd < 0)
   123      return NULL;
   124  
   125    allocated = 512;
   126    buffer = malloc (allocated);
   127    if (buffer == NULL)
   128      return NULL;
   129    for (;;)
   130      {
   131        ret = TEMP_FAILURE_RETRY (read (fd, buffer + used, allocated - used));
   132        if (ret < 0)
   133          {
   134            free (buffer);
   135            return NULL;
   136          }
   137  
   138        if (ret == 0)
   139          break;
   140  
   141        used += ret;
   142        if (allocated == used)
   143          {
   144            allocated += 512;
   145            char *tmp = realloc (buffer, allocated);
   146            if (tmp == NULL)
   147              {
   148                free (buffer);
   149                return NULL;
   150              }
   151  	  buffer = tmp;
   152          }
   153      }
   154    close (fd);
   155  
   156    for (i = 0; i < used; i++)
   157      if (buffer[i] == '\0')
   158        argc++;
   159    if (argc == 0)
   160      {
   161        free (buffer);
   162        return NULL;
   163      }
   164  
   165    argv = malloc (sizeof (char *) * (argc + 1));
   166    if (argv == NULL)
   167      {
   168        free (buffer);
   169        return NULL;
   170      }
   171    argc = 0;
   172  
   173    argv[argc++] = buffer;
   174    for (i = 0; i < used - 1; i++)
   175      if (buffer[i] == '\0')
   176        argv[argc++] = buffer + i + 1;
   177  
   178    argv[argc] = NULL;
   179  
   180    return argv;
   181  }
   182  
   183  static bool
   184  can_use_shortcut ()
   185  {
   186    int argc;
   187    char **argv;
   188    bool ret = true;
   189  
   190  #ifdef DISABLE_JOIN_SHORTCUT
   191    return false;
   192  #endif
   193  
   194    argv = get_cmd_line_args ();
   195    if (argv == NULL)
   196      return false;
   197  
   198    if (strstr (argv[0], "podman") == NULL)
   199      return false;
   200  
   201    for (argc = 0; argv[argc]; argc++)
   202      {
   203        if (argc == 0 || argv[argc][0] == '-')
   204          continue;
   205  
   206        if (strcmp (argv[argc], "mount") == 0
   207            || strcmp (argv[argc], "search") == 0
   208            || (strcmp (argv[argc], "system") == 0 && argv[argc+1] && strcmp (argv[argc+1], "service") != 0))
   209          {
   210            ret = false;
   211            break;
   212          }
   213  
   214        if (argv[argc+1] != NULL && (strcmp (argv[argc], "container") == 0 ||
   215  	   strcmp (argv[argc], "image") == 0) &&
   216  	   strcmp (argv[argc+1], "mount") == 0)
   217          {
   218            ret = false;
   219            break;
   220          }
   221      }
   222  
   223    free (argv[0]);
   224    free (argv);
   225    return ret;
   226  }
   227  
   228  int
   229  is_fd_inherited(int fd)
   230  {
   231    if (open_files_set == NULL || fd > open_files_max_fd || fd < 0)
   232    {
   233      return 0;
   234    }
   235    return FD_ISSET(fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE])) ? 1 : 0;
   236  }
   237  
   238  static void __attribute__((constructor)) init()
   239  {
   240    const char *xdg_runtime_dir;
   241    const char *pause;
   242    DIR *d;
   243  
   244    pause = getenv ("_PODMAN_PAUSE");
   245    if (pause && pause[0])
   246      {
   247        do_pause ();
   248        _exit (EXIT_FAILURE);
   249      }
   250  
   251    /* Store how many FDs were open before the Go runtime kicked in.  */
   252    d = opendir ("/proc/self/fd");
   253    if (d)
   254      {
   255        struct dirent *ent;
   256        size_t size = 0;
   257  
   258        for (ent = readdir (d); ent; ent = readdir (d))
   259          {
   260            int fd;
   261  
   262            if (ent->d_name[0] == '.')
   263              continue;
   264  
   265            fd = atoi (ent->d_name);
   266            if (fd == dirfd (d))
   267              continue;
   268  
   269            if (fd >= size * FD_SETSIZE)
   270              {
   271                int i;
   272                size_t new_size;
   273  
   274                new_size = (fd / FD_SETSIZE) + 1;
   275                open_files_set = realloc (open_files_set, new_size * sizeof (fd_set));
   276                if (open_files_set == NULL)
   277                  _exit (EXIT_FAILURE);
   278  
   279                for (i = size; i < new_size; i++)
   280                  FD_ZERO (&(open_files_set[i]));
   281  
   282                size = new_size;
   283              }
   284  
   285            if (fd > open_files_max_fd)
   286              open_files_max_fd = fd;
   287  
   288            FD_SET (fd % FD_SETSIZE, &(open_files_set[fd / FD_SETSIZE]));
   289          }
   290        closedir (d);
   291      }
   292  
   293    /* Shortcut.  If we are able to join the pause pid file, do it now so we don't
   294       need to re-exec.  */
   295    xdg_runtime_dir = getenv ("XDG_RUNTIME_DIR");
   296    if (geteuid () != 0 && xdg_runtime_dir && xdg_runtime_dir[0] && can_use_shortcut ())
   297      {
   298        int r;
   299        int fd;
   300        long pid;
   301        char buf[12];
   302        uid_t uid;
   303        gid_t gid;
   304        char path[PATH_MAX];
   305        const char *const suffix = "/libpod/pause.pid";
   306        char *cwd = getcwd (NULL, 0);
   307        char uid_fmt[16];
   308        char gid_fmt[16];
   309        size_t len;
   310  
   311        if (cwd == NULL)
   312          {
   313            fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
   314            _exit (EXIT_FAILURE);
   315          }
   316  
   317        len = snprintf (path, PATH_MAX, "%s%s", xdg_runtime_dir, suffix);
   318        if (len >= PATH_MAX)
   319          {
   320            fprintf (stderr, "invalid value for XDG_RUNTIME_DIR: %s", strerror (ENAMETOOLONG));
   321            exit (EXIT_FAILURE);
   322          }
   323  
   324        fd = open (path, O_RDONLY);
   325        if (fd < 0)
   326          {
   327            free (cwd);
   328            return;
   329          }
   330  
   331        r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof (buf) - 1));
   332        close (fd);
   333        if (r < 0)
   334          {
   335            free (cwd);
   336            return;
   337          }
   338        buf[r] = '\0';
   339  
   340        pid = strtol (buf, NULL, 10);
   341        if (pid == LONG_MAX)
   342          {
   343            free (cwd);
   344            return;
   345          }
   346  
   347        uid = geteuid ();
   348        gid = getegid ();
   349  
   350        sprintf (path, "/proc/%ld/ns/user", pid);
   351        fd = open (path, O_RDONLY);
   352        if (fd < 0 || setns (fd, 0) < 0)
   353          {
   354            free (cwd);
   355            return;
   356          }
   357        close (fd);
   358  
   359        /* Errors here cannot be ignored as we already joined a ns.  */
   360        sprintf (path, "/proc/%ld/ns/mnt", pid);
   361        fd = open (path, O_RDONLY);
   362        if (fd < 0)
   363          {
   364            fprintf (stderr, "cannot open %s: %s", path, strerror (errno));
   365            exit (EXIT_FAILURE);
   366          }
   367  
   368        sprintf (uid_fmt, "%d", uid);
   369        sprintf (gid_fmt, "%d", gid);
   370  
   371        setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   372        setenv ("_CONTAINERS_ROOTLESS_UID", uid_fmt, 1);
   373        setenv ("_CONTAINERS_ROOTLESS_GID", gid_fmt, 1);
   374  
   375        r = setns (fd, 0);
   376        if (r < 0)
   377          {
   378            fprintf (stderr, "cannot join mount namespace for %ld: %s", pid, strerror (errno));
   379            exit (EXIT_FAILURE);
   380          }
   381        close (fd);
   382  
   383        if (syscall_setresgid (0, 0, 0) < 0)
   384          {
   385            fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
   386            _exit (EXIT_FAILURE);
   387          }
   388  
   389        if (syscall_setresuid (0, 0, 0) < 0)
   390          {
   391            fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
   392            _exit (EXIT_FAILURE);
   393          }
   394  
   395        if (chdir (cwd) < 0)
   396          {
   397            fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
   398            _exit (EXIT_FAILURE);
   399          }
   400  
   401        free (cwd);
   402        rootless_uid_init = uid;
   403        rootless_gid_init = gid;
   404      }
   405  }
   406  
   407  static int
   408  syscall_clone (unsigned long flags, void *child_stack)
   409  {
   410  #if defined(__s390__) || defined(__CRIS__)
   411    return (int) syscall (__NR_clone, child_stack, flags);
   412  #else
   413    return (int) syscall (__NR_clone, flags, child_stack);
   414  #endif
   415  }
   416  
   417  int
   418  reexec_in_user_namespace_wait (int pid, int options)
   419  {
   420    pid_t p;
   421    int status;
   422  
   423    p = TEMP_FAILURE_RETRY (waitpid (pid, &status, 0));
   424    if (p < 0)
   425      return -1;
   426  
   427    if (WIFEXITED (status))
   428      return WEXITSTATUS (status);
   429    if (WIFSIGNALED (status))
   430      return 128 + WTERMSIG (status);
   431    return -1;
   432  }
   433  
   434  static int
   435  create_pause_process (const char *pause_pid_file_path, char **argv)
   436  {
   437    int r, p[2];
   438  
   439    if (pipe (p) < 0)
   440      _exit (EXIT_FAILURE);
   441  
   442    r = fork ();
   443    if (r < 0)
   444      _exit (EXIT_FAILURE);
   445  
   446    if (r)
   447      {
   448        char b;
   449  
   450        close (p[1]);
   451        /* Block until we write the pid file.  */
   452        r = TEMP_FAILURE_RETRY (read (p[0], &b, 1));
   453        close (p[0]);
   454  
   455        reexec_in_user_namespace_wait (r, 0);
   456  
   457        return r == 1 && b == '0' ? 0 : -1;
   458      }
   459    else
   460      {
   461        int fd;
   462        pid_t pid;
   463  
   464        close (p[0]);
   465  
   466        setsid ();
   467        pid = fork ();
   468        if (r < 0)
   469          _exit (EXIT_FAILURE);
   470  
   471        if (pid)
   472          {
   473            char pid_str[12];
   474            char *tmp_file_path = NULL;
   475  
   476            sprintf (pid_str, "%d", pid);
   477  
   478            if (asprintf (&tmp_file_path, "%s.XXXXXX", pause_pid_file_path) < 0)
   479              {
   480                fprintf (stderr, "unable to print to string\n");
   481                kill (pid, SIGKILL);
   482                _exit (EXIT_FAILURE);
   483              }
   484  
   485            if (tmp_file_path == NULL)
   486              {
   487                fprintf (stderr, "temporary file path is NULL\n");
   488                kill (pid, SIGKILL);
   489                _exit (EXIT_FAILURE);
   490              }
   491  
   492            fd = mkstemp (tmp_file_path);
   493            if (fd < 0)
   494              {
   495                fprintf (stderr, "error creating temporary file: %s\n", strerror (errno));
   496                kill (pid, SIGKILL);
   497                _exit (EXIT_FAILURE);
   498              }
   499  
   500            r = TEMP_FAILURE_RETRY (write (fd, pid_str, strlen (pid_str)));
   501            if (r < 0)
   502              {
   503                fprintf (stderr, "cannot write to file descriptor: %s\n", strerror (errno));
   504                kill (pid, SIGKILL);
   505                _exit (EXIT_FAILURE);
   506              }
   507            close (fd);
   508  
   509            /* There can be another process at this point trying to configure the user namespace and the pause
   510             process, do not override the pid file if it already exists. */
   511            if (rename_noreplace (AT_FDCWD, tmp_file_path, AT_FDCWD, pause_pid_file_path) < 0)
   512              {
   513                unlink (tmp_file_path);
   514                kill (pid, SIGKILL);
   515                _exit (EXIT_FAILURE);
   516              }
   517  
   518            r = TEMP_FAILURE_RETRY (write (p[1], "0", 1));
   519            if (r < 0)
   520              {
   521                fprintf (stderr, "cannot write to pipe: %s\n", strerror (errno));
   522                _exit (EXIT_FAILURE);
   523              }
   524            close (p[1]);
   525  
   526            _exit (EXIT_SUCCESS);
   527          }
   528        else
   529          {
   530            int null;
   531  
   532            close (p[1]);
   533  
   534            null = open ("/dev/null", O_RDWR);
   535            if (null >= 0)
   536              {
   537                dup2 (null, 0);
   538                dup2 (null, 1);
   539                dup2 (null, 2);
   540                close (null);
   541              }
   542  
   543            for (fd = 3; fd < open_files_max_fd + 16; fd++)
   544              close (fd);
   545  
   546            setenv ("_PODMAN_PAUSE", "1", 1);
   547            execlp (argv[0], argv[0], NULL);
   548  
   549            /* If the execve fails, then do the pause here.  */
   550            do_pause ();
   551            _exit (EXIT_FAILURE);
   552          }
   553      }
   554  }
   555  
   556  static int
   557  open_namespace (int pid_to_join, const char *ns_file)
   558  {
   559    char ns_path[PATH_MAX];
   560    int ret;
   561  
   562    ret = snprintf (ns_path, PATH_MAX, "/proc/%d/ns/%s", pid_to_join, ns_file);
   563    if (ret == PATH_MAX)
   564      {
   565        fprintf (stderr, "internal error: namespace path too long\n");
   566        return -1;
   567      }
   568  
   569    return open (ns_path, O_CLOEXEC | O_RDONLY);
   570  }
   571  
   572  static void
   573  join_namespace_or_die (const char *name, int ns_fd)
   574  {
   575    if (setns (ns_fd, 0) < 0)
   576      {
   577        fprintf (stderr, "cannot set %s namespace\n", name);
   578        _exit (EXIT_FAILURE);
   579      }
   580  }
   581  
   582  int
   583  reexec_userns_join (int pid_to_join, char *pause_pid_file_path)
   584  {
   585    char uid[16];
   586    char gid[16];
   587    char **argv;
   588    int pid;
   589    int mnt_ns = -1;
   590    int user_ns = -1;
   591    char *cwd = getcwd (NULL, 0);
   592    sigset_t sigset, oldsigset;
   593  
   594    if (cwd == NULL)
   595      {
   596        fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
   597        _exit (EXIT_FAILURE);
   598      }
   599  
   600    sprintf (uid, "%d", geteuid ());
   601    sprintf (gid, "%d", getegid ());
   602  
   603    argv = get_cmd_line_args ();
   604    if (argv == NULL)
   605      {
   606        fprintf (stderr, "cannot read argv: %s\n", strerror (errno));
   607        _exit (EXIT_FAILURE);
   608      }
   609  
   610    user_ns = open_namespace (pid_to_join, "user");
   611    if (user_ns < 0)
   612      return user_ns;
   613    mnt_ns = open_namespace (pid_to_join, "mnt");
   614    if (mnt_ns < 0)
   615      {
   616        close (user_ns);
   617        return mnt_ns;
   618      }
   619  
   620    pid = fork ();
   621    if (pid < 0)
   622      fprintf (stderr, "cannot fork: %s\n", strerror (errno));
   623  
   624    if (pid)
   625      {
   626        int f;
   627  
   628        /* We passed down these fds, close them.  */
   629        close (user_ns);
   630        close (mnt_ns);
   631  
   632        for (f = 3; f < open_files_max_fd; f++)
   633          if (open_files_set == NULL || FD_ISSET (f % FD_SETSIZE, &(open_files_set[f / FD_SETSIZE])))
   634            close (f);
   635        return pid;
   636      }
   637  
   638    if (sigfillset (&sigset) < 0)
   639      {
   640        fprintf (stderr, "cannot fill sigset: %s\n", strerror (errno));
   641        _exit (EXIT_FAILURE);
   642      }
   643    if (sigdelset (&sigset, SIGCHLD) < 0)
   644      {
   645        fprintf (stderr, "cannot sigdelset(SIGCHLD): %s\n", strerror (errno));
   646        _exit (EXIT_FAILURE);
   647      }
   648    if (sigdelset (&sigset, SIGTERM) < 0)
   649      {
   650        fprintf (stderr, "cannot sigdelset(SIGTERM): %s\n", strerror (errno));
   651        _exit (EXIT_FAILURE);
   652      }
   653    if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
   654      {
   655        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   656        _exit (EXIT_FAILURE);
   657      }
   658  
   659    setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   660    setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
   661    setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);
   662  
   663    if (prctl (PR_SET_PDEATHSIG, SIGTERM, 0, 0, 0) < 0)
   664      {
   665        fprintf (stderr, "cannot prctl(PR_SET_PDEATHSIG): %s\n", strerror (errno));
   666        _exit (EXIT_FAILURE);
   667      }
   668  
   669    join_namespace_or_die ("user", user_ns);
   670    join_namespace_or_die ("mnt", mnt_ns);
   671    close (user_ns);
   672    close (mnt_ns);
   673  
   674    if (syscall_setresgid (0, 0, 0) < 0)
   675      {
   676        fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
   677        _exit (EXIT_FAILURE);
   678      }
   679  
   680    if (syscall_setresuid (0, 0, 0) < 0)
   681      {
   682        fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
   683        _exit (EXIT_FAILURE);
   684      }
   685  
   686    if (chdir (cwd) < 0)
   687      {
   688        fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
   689        _exit (EXIT_FAILURE);
   690      }
   691    free (cwd);
   692  
   693    if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
   694      {
   695        /* We ignore errors here as we didn't create the namespace anyway.  */
   696        create_pause_process (pause_pid_file_path, argv);
   697      }
   698    if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
   699      {
   700        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   701        _exit (EXIT_FAILURE);
   702      }
   703  
   704    execvp (argv[0], argv);
   705  
   706    _exit (EXIT_FAILURE);
   707  }
   708  
   709  static void
   710  check_proc_sys_userns_file (const char *path)
   711  {
   712    FILE *fp;
   713    fp = fopen (path, "r");
   714    if (fp)
   715      {
   716        char buf[32];
   717        size_t n_read = fread (buf, 1, sizeof(buf) - 1, fp);
   718        if (n_read > 0)
   719          {
   720            buf[n_read] = '\0';
   721            if (strtol (buf, NULL, 10) == 0)
   722              fprintf (stderr, "user namespaces are not enabled in %s\n", path);
   723          }
   724        fclose (fp);
   725      }
   726  }
   727  
   728  static int
   729  copy_file_to_fd (const char *file_to_read, int outfd)
   730  {
   731    char buf[512];
   732    int fd;
   733  
   734    fd = open (file_to_read, O_RDONLY);
   735    if (fd < 0)
   736      return fd;
   737  
   738    for (;;)
   739      {
   740        ssize_t r, w, t = 0;
   741  
   742        r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof buf));
   743        if (r < 0)
   744          {
   745            close (fd);
   746            return r;
   747          }
   748  
   749        if (r == 0)
   750          break;
   751  
   752        while (t < r)
   753          {
   754            w = TEMP_FAILURE_RETRY (write (outfd, &buf[t], r - t));
   755            if (w < 0)
   756              {
   757                close (fd);
   758                return w;
   759              }
   760            t += w;
   761          }
   762      }
   763    close (fd);
   764    return 0;
   765  }
   766  
   767  int
   768  reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_read, int outputfd)
   769  {
   770    int ret;
   771    pid_t pid;
   772    char b;
   773    char **argv;
   774    char uid[16];
   775    char gid[16];
   776    char *listen_fds = NULL;
   777    char *listen_pid = NULL;
   778    bool do_socket_activation = false;
   779    char *cwd = getcwd (NULL, 0);
   780    sigset_t sigset, oldsigset;
   781  
   782    if (cwd == NULL)
   783      {
   784        fprintf (stderr, "error getting current working directory: %s\n", strerror (errno));
   785        _exit (EXIT_FAILURE);
   786      }
   787  
   788    listen_pid = getenv("LISTEN_PID");
   789    listen_fds = getenv("LISTEN_FDS");
   790  
   791    if (listen_pid != NULL && listen_fds != NULL)
   792      {
   793        if (strtol(listen_pid, NULL, 10) == getpid())
   794          do_socket_activation = true;
   795      }
   796  
   797    sprintf (uid, "%d", geteuid ());
   798    sprintf (gid, "%d", getegid ());
   799  
   800    pid = syscall_clone (CLONE_NEWUSER|CLONE_NEWNS|SIGCHLD, NULL);
   801    if (pid < 0)
   802      {
   803        fprintf (stderr, "cannot clone: %s\n", strerror (errno));
   804        check_proc_sys_userns_file (_max_user_namespaces);
   805        check_proc_sys_userns_file (_unprivileged_user_namespaces);
   806      }
   807    if (pid)
   808      {
   809        if (do_socket_activation)
   810          {
   811            long num_fds;
   812            num_fds = strtol (listen_fds, NULL, 10);
   813            if (num_fds != LONG_MIN && num_fds != LONG_MAX)
   814              {
   815                int f;
   816  
   817                for (f = 3; f < num_fds + 3; f++)
   818                  if (open_files_set == NULL || FD_ISSET (f % FD_SETSIZE, &(open_files_set[f / FD_SETSIZE])))
   819                    close (f);
   820              }
   821            unsetenv ("LISTEN_PID");
   822            unsetenv ("LISTEN_FDS");
   823            unsetenv ("LISTEN_FDNAMES");
   824          }
   825        return pid;
   826      }
   827  
   828    if (sigfillset (&sigset) < 0)
   829      {
   830        fprintf (stderr, "cannot fill sigset: %s\n", strerror (errno));
   831        _exit (EXIT_FAILURE);
   832      }
   833    if (sigdelset (&sigset, SIGCHLD) < 0)
   834      {
   835        fprintf (stderr, "cannot sigdelset(SIGCHLD): %s\n", strerror (errno));
   836        _exit (EXIT_FAILURE);
   837      }
   838    if (sigdelset (&sigset, SIGTERM) < 0)
   839      {
   840        fprintf (stderr, "cannot sigdelset(SIGTERM): %s\n", strerror (errno));
   841        _exit (EXIT_FAILURE);
   842      }
   843    if (sigprocmask (SIG_BLOCK, &sigset, &oldsigset) < 0)
   844      {
   845        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   846        _exit (EXIT_FAILURE);
   847      }
   848  
   849    argv = get_cmd_line_args ();
   850    if (argv == NULL)
   851      {
   852        fprintf (stderr, "cannot read argv: %s\n", strerror (errno));
   853        _exit (EXIT_FAILURE);
   854      }
   855  
   856    if (do_socket_activation)
   857      {
   858        char s[32];
   859        sprintf (s, "%d", getpid());
   860        setenv ("LISTEN_PID", s, true);
   861      }
   862  
   863    setenv ("_CONTAINERS_USERNS_CONFIGURED", "init", 1);
   864    setenv ("_CONTAINERS_ROOTLESS_UID", uid, 1);
   865    setenv ("_CONTAINERS_ROOTLESS_GID", gid, 1);
   866  
   867    ret = TEMP_FAILURE_RETRY (read (ready, &b, 1));
   868    if (ret < 0)
   869      {
   870        fprintf (stderr, "cannot read from sync pipe: %s\n", strerror (errno));
   871        _exit (EXIT_FAILURE);
   872      }
   873    if (ret != 1 || b != '0')
   874      _exit (EXIT_FAILURE);
   875  
   876    if (syscall_setresgid (0, 0, 0) < 0)
   877      {
   878        fprintf (stderr, "cannot setresgid: %s\n", strerror (errno));
   879        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   880        _exit (EXIT_FAILURE);
   881      }
   882  
   883    if (syscall_setresuid (0, 0, 0) < 0)
   884      {
   885        fprintf (stderr, "cannot setresuid: %s\n", strerror (errno));
   886        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   887        _exit (EXIT_FAILURE);
   888      }
   889  
   890    if (chdir (cwd) < 0)
   891      {
   892        fprintf (stderr, "cannot chdir: %s\n", strerror (errno));
   893        TEMP_FAILURE_RETRY (write (ready, "1", 1));
   894        _exit (EXIT_FAILURE);
   895      }
   896    free (cwd);
   897  
   898    if (pause_pid_file_path && pause_pid_file_path[0] != '\0')
   899      {
   900        if (create_pause_process (pause_pid_file_path, argv) < 0)
   901          {
   902            TEMP_FAILURE_RETRY (write (ready, "2", 1));
   903            _exit (EXIT_FAILURE);
   904          }
   905      }
   906  
   907    ret = TEMP_FAILURE_RETRY (write (ready, "0", 1));
   908    if (ret < 0)
   909    {
   910  	  fprintf (stderr, "cannot write to ready pipe: %s\n", strerror (errno));
   911  	  _exit (EXIT_FAILURE);
   912    }
   913    close (ready);
   914  
   915    if (sigprocmask (SIG_SETMASK, &oldsigset, NULL) < 0)
   916      {
   917        fprintf (stderr, "cannot block signals: %s\n", strerror (errno));
   918        _exit (EXIT_FAILURE);
   919      }
   920  
   921    if (file_to_read && file_to_read[0])
   922      {
   923        ret = copy_file_to_fd (file_to_read, outputfd);
   924        close (outputfd);
   925        _exit (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
   926      }
   927  
   928    execvp (argv[0], argv);
   929  
   930    _exit (EXIT_FAILURE);
   931  }