github.com/schwarzm/garden-linux@v0.0.0-20150507151835-33bca2147c47/old/linux_backend/src/wsh/wshd.c (about)

     1  #define _GNU_SOURCE
     2  
     3  #include <assert.h>
     4  #include <errno.h>
     5  #include <fcntl.h>
     6  #include <sched.h>
     7  #include <signal.h>
     8  #include <stdio.h>
     9  #include <stdlib.h>
    10  #include <string.h>
    11  #include <sys/ioctl.h>
    12  #include <sys/ipc.h>
    13  #include <sys/mount.h>
    14  #include <sys/param.h>
    15  #include <sys/resource.h>
    16  #include <sys/shm.h>
    17  #include <sys/signalfd.h>
    18  #include <sys/socket.h>
    19  #include <sys/stat.h>
    20  #include <sys/types.h>
    21  #include <sys/wait.h>
    22  #include <termios.h>
    23  #include <unistd.h>
    24  
    25  #include "barrier.h"
    26  #include "msg.h"
    27  #include "pty.h"
    28  #include "pwd.h"
    29  #include "un.h"
    30  #include "util.h"
    31  
    32  typedef struct wshd_s wshd_t;
    33  
/*
 * State of one wshd instance.
 *
 * The structure is filled in from the command line (wshd__getopt/main),
 * completed by parent_run, and copied byte-for-byte with memcpy() through a
 * System V shared memory segment by child_save_to_shm/child_load_from_shm.
 */
struct wshd_s {
  /* Path to directory where server socket is placed */
  char run_path[256];

  /* Path to directory containing hooks */
  char lib_path[256];

  /* Path to directory that will become root in the new mount namespace */
  char root_path[256];

  /* Process title */
  char title[32];

  /* Extra flags to pass to clone operation (0 or CLONE_NEWUSER, as set by
   * wshd__getopt) */
  int clone_flags;

  /* File descriptor of listening socket (assigned from un_listen() in
   * parent_run) */
  int fd;

  /* barrier_parent is signalled by parent_run and waited on by child_run;
   * barrier_child is signalled by child_continue and waited on by
   * parent_run. */
  barrier_t barrier_parent;
  barrier_t barrier_child;

  /* Map pids to exit status fds. Heap array resized with realloc() by
   * child_pid_to_fd_add/child_pid_to_fd_remove; pid_to_fd_len is the number
   * of entries. */
  struct {
    pid_t pid;
    int fd;
  } *pid_to_fd;
  size_t pid_to_fd_len;
};
    63  
    64  int wshd__usage(wshd_t *w, int argc, char **argv) {
    65    fprintf(stderr, "Usage: %s OPTION...\n", argv[0]);
    66    fprintf(stderr, "\n");
    67  
    68    fprintf(stderr, "  --run PATH   "
    69      "Directory where server socket is placed"
    70      "\n");
    71  
    72    fprintf(stderr, "  --lib PATH   "
    73      "Directory containing hooks"
    74      "\n");
    75  
    76    fprintf(stderr, "  --root PATH  "
    77      "Directory that will become root in the new mount namespace"
    78      "\n");
    79  
    80    fprintf(stderr, "  --title NAME "
    81      "Process title"
    82      "\n");
    83  
    84    fprintf(stderr, "  --userns 1 "
    85      "If specified, use user namespacing"
    86      "\n");
    87  
    88    return 0;
    89  }
    90  
    91  int wshd__getopt(wshd_t *w, int argc, char **argv) {
    92    int i = 1;
    93    int j = argc - i;
    94    int rv;
    95  
    96    w->clone_flags = 0;
    97    while (i < argc) {
    98      if (j >= 2) {
    99        if (strcmp("--run", argv[i]) == 0) {
   100          rv = snprintf(w->run_path, sizeof(w->run_path), "%s", argv[i+1]);
   101          if (rv >= sizeof(w->run_path)) {
   102            goto toolong;
   103          }
   104        } else if (strcmp("--lib", argv[i]) == 0) {
   105          rv = snprintf(w->lib_path, sizeof(w->lib_path), "%s", argv[i+1]);
   106          if (rv >= sizeof(w->lib_path)) {
   107            goto toolong;
   108          }
   109        } else if (strcmp("--root", argv[i]) == 0) {
   110          rv = snprintf(w->root_path, sizeof(w->root_path), "%s", argv[i+1]);
   111          if (rv >= sizeof(w->root_path)) {
   112            goto toolong;
   113          }
   114        } else if (strcmp("--title", argv[i]) == 0) {
   115          rv = snprintf(w->title, sizeof(w->title), "%s", argv[i+1]);
   116          if (rv >= sizeof(w->title)) {
   117            goto toolong;
   118          }
   119        } else if (strcmp("--userns", argv[i]) == 0) {
   120          if (strcmp("disabled", argv[i+1]) != 0) {
   121            w->clone_flags = CLONE_NEWUSER;
   122          }
   123        } else {
   124          goto invalid;
   125        }
   126  
   127        i += 2;
   128        j -= 2;
   129      } else if (j == 1) {
   130        if (strcmp("-h", argv[i]) == 0 ||
   131            strcmp("--help", argv[i]) == 0)
   132        {
   133          wshd__usage(w, argc, argv);
   134          return -1;
   135        } else {
   136          goto invalid;
   137        }
   138      } else {
   139        assert(NULL);
   140      }
   141    }
   142  
   143    return 0;
   144  
   145  toolong:
   146    fprintf(stderr, "%s: argument too long -- %s\n", argv[0], argv[i]);
   147    fprintf(stderr, "Try `%s --help' for more information.\n", argv[0]);
   148    return -1;
   149  
   150  invalid:
   151    fprintf(stderr, "%s: invalid option -- %s\n", argv[0], argv[i]);
   152    fprintf(stderr, "Try `%s --help' for more information.\n", argv[0]);
   153    return -1;
   154  }
   155  
   156  void assert_directory(const char *path) {
   157    int rv;
   158    struct stat st;
   159  
   160    rv = stat(path, &st);
   161    if (rv == -1) {
   162      fprintf(stderr, "stat(\"%s\"): %s\n", path, strerror(errno));
   163      exit(1);
   164    }
   165  
   166    if (!S_ISDIR(st.st_mode)) {
   167      fprintf(stderr, "stat(\"%s\"): %s\n", path, "No such directory");
   168      exit(1);
   169    }
   170  }
   171  
   172  void child_pid_to_fd_add(wshd_t *w, pid_t pid, int fd) {
   173    int len = w->pid_to_fd_len;
   174  
   175    /* Store a copy */
   176    fd = dup(fd);
   177    if (fd == -1) {
   178      perror("dup");
   179      abort();
   180    }
   181  
   182    w->pid_to_fd = realloc(w->pid_to_fd, (len + 1) * sizeof(w->pid_to_fd[0]));
   183    assert(w->pid_to_fd != NULL);
   184  
   185    w->pid_to_fd[len].pid = pid;
   186    w->pid_to_fd[len].fd = fd;
   187    w->pid_to_fd_len++;
   188  }
   189  
   190  int child_pid_to_fd_remove(wshd_t *w, pid_t pid) {
   191    int i;
   192    int len = w->pid_to_fd_len;
   193    int fd = -1;
   194  
   195    for (i = 0; i < len; i++) {
   196      if (w->pid_to_fd[i].pid == pid) {
   197        fd = w->pid_to_fd[i].fd;
   198  
   199        /* Move tail if there is one */
   200        if ((i + 1) < len) {
   201          memmove(&w->pid_to_fd[i], &w->pid_to_fd[i+1], (len - i - 1) * sizeof(w->pid_to_fd[0]));
   202        }
   203  
   204        w->pid_to_fd = realloc(w->pid_to_fd, (w->pid_to_fd_len - 1) * sizeof(w->pid_to_fd[0]));
   205        w->pid_to_fd_len--;
   206  
   207        if (w->pid_to_fd_len) {
   208          assert(w->pid_to_fd != NULL);
   209        } else {
   210          assert(w->pid_to_fd == NULL);
   211        }
   212  
   213        break;
   214      }
   215    }
   216  
   217    return fd;
   218  }
   219  
   220  char **env__add(char **envp, const char *key, const char *value) {
   221    size_t envplen = 0;
   222    char *buf;
   223    size_t buflen;
   224    int rv;
   225  
   226    if (envp == NULL) {
   227      /* Trailing NULL */
   228      envplen = 1;
   229    } else {
   230      while(envp[envplen++] != NULL);
   231    }
   232  
   233    envp = realloc(envp, sizeof(envp[0]) * (envplen + 1));
   234    assert(envp != NULL);
   235  
   236    buflen = strlen(key) + 1 + strlen(value) + 1;
   237    buf = malloc(buflen);
   238    assert(buf != NULL);
   239  
   240    rv = snprintf(buf, buflen, "%s=%s", key, value);
   241    assert(rv == buflen - 1);
   242  
   243    envp[envplen - 1] = buf;
   244    envp[envplen] = NULL;
   245  
   246    return envp;
   247  }
   248  
   249  const char* env__get(char **envp, const char* key) {
   250    if (envp != NULL) {
   251      int i = 0;
   252      while (envp[i] != NULL) {
   253        char* eq = strchr(envp[i], '=');
   254        if (eq != NULL) {
   255          size_t keyLen = eq - envp[i];
   256          if (strlen(key) == keyLen) {
   257            if (memcmp(key, envp[i], keyLen) == 0) {
   258              return eq + 1;
   259            }
   260          }
   261        }
   262        i++;
   263      }
   264    }
   265  
   266    return NULL;
   267  }
   268  
   269  char **child_setup_environment(struct passwd *pw, char **extra_env_vars) {
   270    int rv;
   271    char **envp = extra_env_vars;
   272  
   273    rv = chdir(pw->pw_dir);
   274    if (rv == -1) {
   275      perror("chdir");
   276      return NULL;
   277    }
   278  
   279    envp = env__add(envp, "HOME", pw->pw_dir);
   280    envp = env__add(envp, "USER", pw->pw_name);
   281  
   282    // Use $PATH if provided, otherwise default depending on uid.
   283    const char * envp_path = env__get(envp, "PATH");
   284    if (envp_path != NULL) {
   285        setenv("PATH", envp_path, 1);
   286    } else if (pw->pw_uid == 0) {
   287      const char *sanitizedRootPath = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin";
   288      envp = env__add(envp, "PATH", sanitizedRootPath);
   289      setenv("PATH", sanitizedRootPath, 1);
   290    } else {
   291      const char *sanitizedUserPath = "/usr/local/bin:/usr/bin:/bin";
   292      envp = env__add(envp, "PATH", sanitizedUserPath);
   293      setenv("PATH", sanitizedUserPath, 1);
   294    }
   295  
   296    return envp;
   297  }
   298  
   299  int child_fork(msg_request_t *req, int in, int out, int err) {
   300    int rv;
   301  
   302    rv = fork();
   303    if (rv == -1) {
   304      perror("fork");
   305      exit(1);
   306    }
   307  
   308    if (rv == 0) {
   309      const char *user;
   310      struct passwd *pw;
   311      char *default_argv[] = { "/bin/sh", NULL };
   312      char *default_envp[] = { NULL };
   313      char **argv = default_argv;
   314      char **envp = default_envp;
   315      char **extra_env_vars = NULL;
   316  
   317      rv = dup2(in, STDIN_FILENO);
   318      assert(rv != -1);
   319  
   320      rv = dup2(out, STDOUT_FILENO);
   321      assert(rv != -1);
   322  
   323      rv = dup2(err, STDERR_FILENO);
   324      assert(rv != -1);
   325  
   326      rv = setsid();
   327      assert(rv != -1);
   328  
   329      user = req->user.name;
   330      if (!strlen(user)) {
   331        user = "root";
   332      }
   333  
   334      pw = getpwnam(user);
   335      if (pw == NULL) {
   336        perror("getpwnam");
   337        goto error;
   338      }
   339  
   340      if (strlen(pw->pw_shell)) {
   341        default_argv[0] = strdup(pw->pw_shell);
   342      }
   343  
   344      /* Set controlling terminal if needed */
   345      if (isatty(in)) {
   346        rv = ioctl(STDIN_FILENO, TIOCSCTTY, 1);
   347        assert(rv != -1);
   348      }
   349  
   350      /* Use argv from request if needed */
   351      if (req->arg.count) {
   352        argv = (char **)msg_array_export(&req->arg);
   353        assert(argv != NULL);
   354      }
   355  
   356      rv = msg_rlimit_export(&req->rlim);
   357      if (rv == -1) {
   358        perror("msg_rlimit_export");
   359        goto error;
   360      }
   361  
   362      rv = msg_user_export(&req->user, pw);
   363      if (rv == -1) {
   364        perror("msg_user_export");
   365        goto error;
   366      }
   367  
   368      if (req->env.count) {
   369        extra_env_vars = (char **)msg_array_export(&req->env);
   370        assert(extra_env_vars != NULL);
   371      }
   372  
   373      envp = child_setup_environment(pw, extra_env_vars);
   374      assert(envp != NULL);
   375  
   376      if (strlen(req->dir.path)) {
   377        rv = chdir(req->dir.path);
   378        if (rv == -1) {
   379          perror("chdir");
   380          goto error;
   381        }
   382      }
   383  
   384      // don't mask signals of child process
   385      sigset_t mask;
   386      sigemptyset(&mask);
   387      sigprocmask(SIG_SETMASK, &mask, NULL);
   388  
   389      execvpe(argv[0], argv, envp);
   390      perror("execvpe");
   391  
   392  error:
   393      exit(255);
   394    }
   395  
   396    return rv;
   397  }
   398  
   399  int child_handle_interactive(int fd, wshd_t *w, msg_request_t *req) {
   400    int i, j;
   401    int num_descriptors = 3;
   402    int p[num_descriptors][2];
   403    int p_[num_descriptors];
   404    int rv;
   405    msg_response_t res;
   406  
   407    msg_response_init(&res);
   408  
   409    /* Initialize so that the error handler can do its job */
   410    for (i = 0; i < num_descriptors; i++) {
   411      p[i][0] = -1;
   412      p[i][1] = -1;
   413      p_[i] = -1;
   414    }
   415  
   416    for (i = 1; i < num_descriptors; i++) {
   417      rv = pipe(p[i]);
   418      if (rv == -1) {
   419        perror("pipe");
   420        abort();
   421      }
   422  
   423      fcntl_mix_cloexec(p[i][0]);
   424      fcntl_mix_cloexec(p[i][1]);
   425    }
   426  
   427    rv = openpty(&p[0][0], &p[0][1], NULL);
   428    if (rv < 0) {
   429      perror("openpty");
   430      abort();
   431    }
   432  
   433    fcntl_mix_cloexec(p[0][0]);
   434    fcntl_mix_cloexec(p[0][1]);
   435  
   436    /* Descriptors to send to client */
   437    p_[0] = p[0][0];
   438    p_[1] = p[1][0];
   439    p_[2] = p[2][0];
   440  
   441    rv = un_send_fds(fd, (char *)&res, sizeof(res), p_, num_descriptors);
   442    if (rv == -1) {
   443      goto err;
   444    }
   445  
   446    rv = child_fork(req, p[0][1], p[0][1], p[0][1]);
   447    assert(rv > 0);
   448  
   449    write(p[2][1], &rv, sizeof(rv));
   450  
   451    child_pid_to_fd_add(w, rv, p[1][1]);
   452  
   453  err:
   454    for (i = 0; i < 3; i++) {
   455      for (j = 0; j < 2; j++) {
   456        if (p[i][j] > -1) {
   457          close(p[i][j]);
   458          p[i][j] = -1;
   459        }
   460      }
   461    }
   462  
   463    if (fd > -1) {
   464      close(fd);
   465      fd = -1;
   466    }
   467  
   468    return 0;
   469  }
   470  
   471  int child_handle_noninteractive(int fd, wshd_t *w, msg_request_t *req) {
   472    int i, j;
   473    int num_descriptors = 5;
   474    int p[num_descriptors][2];
   475    int p_[num_descriptors];
   476    int rv;
   477    msg_response_t res;
   478  
   479    msg_response_init(&res);
   480  
   481    /* Initialize so that the error handler can do its job */
   482    for (i = 0; i < num_descriptors; i++) {
   483      p[i][0] = -1;
   484      p[i][1] = -1;
   485      p_[i] = -1;
   486    }
   487  
   488    for (i = 0; i < num_descriptors; i++) {
   489      rv = pipe(p[i]);
   490      if (rv == -1) {
   491        perror("pipe");
   492        abort();
   493      }
   494  
   495      fcntl_mix_cloexec(p[i][0]);
   496      fcntl_mix_cloexec(p[i][1]);
   497    }
   498  
   499    /* Descriptors to send to client */
   500    p_[0] = p[0][1];
   501    p_[1] = p[1][0];
   502    p_[2] = p[2][0];
   503    p_[3] = p[3][0];
   504    p_[4] = p[4][0];
   505  
   506    rv = un_send_fds(fd, (char *)&res, sizeof(res), p_, num_descriptors);
   507    if (rv == -1) {
   508      goto err;
   509    }
   510  
   511    rv = child_fork(req, p[0][0], p[1][1], p[2][1]);
   512    assert(rv > 0);
   513  
   514    write(p[4][1], &rv, sizeof(rv));
   515  
   516    child_pid_to_fd_add(w, rv, p[3][1]);
   517  
   518  err:
   519    for (i = 0; i < 5; i++) {
   520      for (j = 0; j < 2; j++) {
   521        if (p[i][j] > -1) {
   522          close(p[i][j]);
   523          p[i][j] = -1;
   524        }
   525      }
   526    }
   527  
   528    if (fd > -1) {
   529      close(fd);
   530      fd = -1;
   531    }
   532  
   533    return 0;
   534  }
   535  
   536  int child_accept(wshd_t *w) {
   537    int rv, fd;
   538    msg_request_t req;
   539  
   540    rv = accept(w->fd, NULL, NULL);
   541    if (rv == -1) {
   542      perror("accept");
   543      abort();
   544    }
   545  
   546    fd = rv;
   547  
   548    fcntl_mix_cloexec(fd);
   549  
   550    rv = un_recv_fds(fd, (char *)&req, sizeof(req), NULL, 0);
   551    if (rv < 0) {
   552      perror("recvmsg");
   553      exit(255);
   554    }
   555  
   556    if (rv == 0) {
   557      close(fd);
   558      return 0;
   559    }
   560  
   561    assert(rv == sizeof(req));
   562  
   563    if (req.tty) {
   564      return child_handle_interactive(fd, w, &req);
   565    } else {
   566      return child_handle_noninteractive(fd, w, &req);
   567    }
   568  }
   569  
   570  void child_handle_sigchld(wshd_t *w) {
   571    pid_t pid;
   572    int status, exitstatus;
   573    int fd;
   574  
   575    while (1) {
   576      do {
   577        pid = waitpid(-1, &status, WNOHANG);
   578      } while (pid == -1 && errno == EINTR);
   579  
   580      /* Break when there are no more children */
   581      if (pid <= 0) {
   582        break;
   583      }
   584  
   585      /* Processes can be reparented, so a pid may not map to an fd */
   586      fd = child_pid_to_fd_remove(w, pid);
   587      if (fd == -1) {
   588        continue;
   589      }
   590  
   591      if (WIFEXITED(status)) {
   592        exitstatus = WEXITSTATUS(status);
   593  
   594        /* Send exit status to client */
   595        write(fd, &exitstatus, sizeof(exitstatus));
   596      } else {
   597        assert(WIFSIGNALED(status));
   598  
   599        /* No exit status */
   600      }
   601  
   602      close(fd);
   603    }
   604  }
   605  
   606  int child_signalfd(void) {
   607    sigset_t mask;
   608    int rv;
   609    int fd;
   610  
   611    sigemptyset(&mask);
   612    sigaddset(&mask, SIGCHLD);
   613  
   614    rv = sigprocmask(SIG_BLOCK, &mask, NULL);
   615    if (rv == -1) {
   616      perror("sigprocmask");
   617      abort();
   618    }
   619  
   620    fd = signalfd(-1, &mask, SFD_NONBLOCK | SFD_CLOEXEC);
   621    if (fd == -1) {
   622      perror("signalfd");
   623      abort();
   624    }
   625  
   626    return fd;
   627  }
   628  
   629  int child_loop(wshd_t *w) {
   630    int sfd;
   631    int rv;
   632  
   633    close(STDIN_FILENO);
   634    close(STDOUT_FILENO);
   635    close(STDERR_FILENO);
   636  
   637    sfd = child_signalfd();
   638  
   639    for (;;) {
   640      fd_set fds;
   641  
   642      FD_ZERO(&fds);
   643      FD_SET(w->fd, &fds);
   644      FD_SET(sfd, &fds);
   645  
   646      do {
   647        rv = select(FD_SETSIZE, &fds, NULL, NULL, NULL);
   648      } while (rv == -1 && errno == EINTR);
   649  
   650      if (rv == -1) {
   651        perror("select");
   652        abort();
   653      }
   654  
   655      if (FD_ISSET(w->fd, &fds)) {
   656        child_accept(w);
   657      }
   658  
   659      if (FD_ISSET(sfd, &fds)) {
   660        struct signalfd_siginfo fdsi;
   661  
   662        rv = read(sfd, &fdsi, sizeof(fdsi));
   663        assert(rv == sizeof(fdsi));
   664  
   665        /* Ignore siginfo and loop waitpid to catch all children */
   666        child_handle_sigchld(w);
   667      }
   668    }
   669  
   670    return 1;
   671  }
   672  
   673  /* No header defines this */
   674  extern int pivot_root(const char *new_root, const char *put_old);
   675  
   676  void child_save_to_shm(wshd_t *w) {
   677    int rv;
   678    void *w_;
   679  
   680    rv = shmget(0xdeadbeef, sizeof(*w), IPC_CREAT | IPC_EXCL | 0600);
   681    if (rv == -1) {
   682      perror("shmget");
   683      abort();
   684    }
   685  
   686    w_ = shmat(rv, NULL, 0);
   687    if (w_ == (void *)-1) {
   688      perror("shmat");
   689      abort();
   690    }
   691  
   692    memcpy(w_, w, sizeof(*w));
   693  }
   694  
   695  wshd_t *child_load_from_shm(void) {
   696    int rv;
   697    int shmid;
   698    wshd_t *w;
   699    void *w_;
   700  
   701    shmid = shmget(0xdeadbeef, sizeof(*w), 0600);
   702    if (shmid == -1) {
   703      perror("shmget");
   704      abort();
   705    }
   706  
   707    w_ = shmat(shmid, NULL, 0);
   708    if (w_ == (void *)-1) {
   709      perror("shmat");
   710      abort();
   711    }
   712  
   713    w = malloc(sizeof(*w));
   714    if (w == NULL) {
   715      perror("malloc");
   716      abort();
   717    }
   718  
   719    memcpy(w, w_, sizeof(*w));
   720  
   721    rv = shmdt(w_);
   722    if (rv == -1) {
   723      perror("shmdt");
   724      abort();
   725    }
   726  
   727    rv = shmctl(shmid, IPC_RMID, NULL);
   728    if (rv == -1) {
   729      perror("shmctl");
   730      abort();
   731    }
   732  
   733    return w;
   734  }
   735  
   736  int child_run(void *data) {
   737    wshd_t *w = (wshd_t *)data;
   738    int rv;
   739    char pivoted_lib_path[PATH_MAX];
   740    size_t pivoted_lib_path_len;
   741  
   742    /* Wait for parent */
   743    rv = barrier_wait(&w->barrier_parent);
   744    assert(rv == 0);
   745  
   746    /* Prepare lib path for pivot */
   747    strcpy(pivoted_lib_path, "/tmp/garden-host");
   748    pivoted_lib_path_len = strlen(pivoted_lib_path);
   749    realpath(w->lib_path, pivoted_lib_path + pivoted_lib_path_len);
   750  
   751    rv = mount(w->root_path, w->root_path, NULL, MS_BIND|MS_REC, NULL);
   752    if(rv == -1) {
   753      perror("mount");
   754      abort();
   755    }
   756  
   757    rv = chdir(w->root_path);
   758    if (rv == -1) {
   759      perror("chdir");
   760      abort();
   761    }
   762  
   763    /* Ensure /tmp is world-writable as part of container contract */
   764    rv = chmod("tmp", 01777);
   765    if (rv == -1) {
   766      perror("chmod");
   767      abort();
   768    }
   769  
   770    rv = mkdir("tmp/garden-host", 0700);
   771    if (rv == -1 && errno != EEXIST) {
   772      perror("mkdir");
   773      abort();
   774    }
   775  
   776    rv = pivot_root(".", "tmp/garden-host");
   777    if (rv == -1) {
   778      perror("pivot_root");
   779      abort();
   780    }
   781  
   782    rv = chdir("/");
   783    if (rv == -1) {
   784      perror("chdir");
   785      abort();
   786    }
   787  
   788    rv = symlink("/dev/pts/ptmx", "/dev/ptmx");
   789    if (rv == -1 || errno == EEXIST) {
   790      rv = unlink("/dev/ptmx");
   791      if (rv == -1) {
   792        perror("unlink");
   793        abort();
   794      }
   795  
   796      rv = symlink("/dev/pts/ptmx", "/dev/ptmx");
   797    }
   798  
   799    rv = setuid(0);
   800    if (rv == -1) {
   801      perror("setuid");
   802      abort();
   803    }
   804  
   805    rv = setgid(0);
   806    if (rv == -1) {
   807      perror("setgid");
   808      abort();
   809    }
   810  
   811    rv = hook(pivoted_lib_path, "child-after-pivot");
   812    if(rv != 0) {
   813      perror("hook-child-after-pivot");
   814      abort();
   815    }
   816  
   817    child_save_to_shm(w);
   818  
   819    execl("/sbin/wshd", "/sbin/wshd", "--continue", NULL);
   820    perror("exec");
   821    abort();
   822  }
   823  
   824  int child_continue(int argc, char **argv) {
   825    wshd_t *w;
   826    int rv;
   827  
   828    w = child_load_from_shm();
   829  
   830    /* Process MUST not leak file descriptors to children */
   831    barrier_mix_cloexec(&w->barrier_child);
   832    fcntl_mix_cloexec(w->fd);
   833  
   834    if (strlen(w->title) > 0) {
   835      setproctitle(argv, w->title);
   836    }
   837  
   838    /* Clean up temporary pivot_root dir */
   839    rv = umount2("/tmp/garden-host", MNT_DETACH);
   840    if (rv == -1) {
   841      perror("unmount2");
   842      exit(1);
   843    }
   844  
   845    /* Detach this process from its original group */
   846    rv = setsid();
   847    assert(rv > 0 && rv == getpid());
   848  
   849    /* Signal parent */
   850    rv = barrier_signal(&w->barrier_child);
   851    assert(rv == 0);
   852  
   853    return child_loop(w);
   854  }
   855  
   856  pid_t child_start(wshd_t *w) {
   857    long pagesize;
   858    void *stack;
   859    int flags = 0;
   860    pid_t pid;
   861  
   862    pagesize = sysconf(_SC_PAGESIZE);
   863    stack = alloca(pagesize);
   864    assert(stack != NULL);
   865  
   866    /* Point to top of stack (it grows down) */
   867    stack = stack + pagesize;
   868  
   869    /* Setup namespaces */
   870    flags |= CLONE_NEWIPC;
   871    flags |= CLONE_NEWNET;
   872    flags |= CLONE_NEWNS;
   873    flags |= CLONE_NEWPID;
   874    flags |= CLONE_NEWUTS;
   875    flags |= w->clone_flags;
   876  
   877    pid = clone(child_run, stack, flags, w);
   878    if (pid == -1) {
   879      perror("clone");
   880      abort();
   881    }
   882  
   883    return pid;
   884  }
   885  
   886  void parent_setenv_pid(wshd_t *w, int pid) {
   887    char buf[16];
   888    int rv;
   889  
   890    rv = snprintf(buf, sizeof(buf), "%d", pid);
   891    assert(rv < sizeof(buf));
   892  
   893    rv = setenv("PID", buf, 1);
   894    assert(rv == 0);
   895  }
   896  
   897  /* Returns the maximum allowed number of open files. */
   898  long int max_nr_open() {
   899    char file_data[32];
   900    size_t bytes_read;
   901    FILE *f;
   902    long int nr;
   903  
   904    if ((f = fopen("/proc/sys/fs/nr_open", "r")) == NULL) {
   905      perror("Failed to open /proc/sys/fs/nr_open");
   906      abort();
   907    }
   908  
   909    bytes_read = fread(file_data, 1, sizeof(file_data), f);
   910    if (ferror(f) || bytes_read == 0) {
   911      perror("Failed to read /proc/sys/fs/nr_open");
   912      abort();
   913    }
   914  
   915    if (fclose(f)) {
   916      perror("Failed to close /proc/sys/fs/nr_open");
   917      abort();
   918    }
   919  
   920    errno = 0;
   921    nr = strtol(file_data, NULL, 10);
   922    if (errno) {
   923      perror("Contents of /proc/sys/fs/nr_open could not be converted to a long int");
   924      abort();
   925    }
   926    return nr;
   927  }
   928  
   929  /* Sets a hard resource limit to specified value. */
   930  void set_hard_rlimit(char * resource_name, int resource, rlim_t hard_limit) {
   931    char err_text[1024];
   932    struct rlimit lim = {0, 0};
   933    if (getrlimit(resource, &lim)) {
   934      strcpy(err_text, "getrlimit failed to return ");
   935      strcat(err_text, resource_name);
   936      perror(err_text);
   937      abort();
   938    }
   939  
   940    lim.rlim_max = hard_limit;
   941    if (setrlimit(resource, &lim)) {
   942      strcpy(err_text, "setrlimit failed to set ");
   943      strcat(err_text, resource_name);
   944      perror(err_text);
   945      abort();
   946    }
   947  }
   948  
   949  /* Sets hard resource limits to their maximum permitted values. */
   950  void set_hard_rlimits() {
   951    set_hard_rlimit("RLIMIT_AS", RLIMIT_AS, RLIM_INFINITY);
   952    set_hard_rlimit("RLIMIT_CORE", RLIMIT_CORE, RLIM_INFINITY);
   953    set_hard_rlimit("RLIMIT_CPU", RLIMIT_CPU, RLIM_INFINITY);
   954    set_hard_rlimit("RLIMIT_DATA", RLIMIT_DATA, RLIM_INFINITY);
   955    set_hard_rlimit("RLIMIT_FSIZE", RLIMIT_FSIZE, RLIM_INFINITY);
   956    set_hard_rlimit("RLIMIT_LOCKS", RLIMIT_LOCKS, RLIM_INFINITY);
   957    set_hard_rlimit("RLIMIT_MEMLOCK", RLIMIT_MEMLOCK, RLIM_INFINITY);
   958    set_hard_rlimit("RLIMIT_MSGQUEUE", RLIMIT_MSGQUEUE, RLIM_INFINITY);
   959    set_hard_rlimit("RLIMIT_NICE", RLIMIT_NICE, RLIM_INFINITY);
   960    set_hard_rlimit("RLIMIT_NOFILE", RLIMIT_NOFILE, max_nr_open());
   961    set_hard_rlimit("RLIMIT_NPROC", RLIMIT_NPROC, RLIM_INFINITY);
   962    set_hard_rlimit("RLIMIT_RSS", RLIMIT_RSS, RLIM_INFINITY);
   963    set_hard_rlimit("RLIMIT_RTPRIO", RLIMIT_RTPRIO, RLIM_INFINITY);
   964    set_hard_rlimit("RLIMIT_SIGPENDING", RLIMIT_SIGPENDING, RLIM_INFINITY);
   965    set_hard_rlimit("RLIMIT_STACK", RLIMIT_STACK, RLIM_INFINITY);
   966  }
   967  
   968  int parent_run(wshd_t *w) {
   969    char path[MAXPATHLEN];
   970    int rv;
   971    pid_t pid;
   972  
   973    memset(path, 0, sizeof(path));
   974  
   975    strcpy(path + strlen(path), w->run_path);
   976    strcpy(path + strlen(path), "/");
   977    strcpy(path + strlen(path), "wshd.sock");
   978  
   979    w->fd = un_listen(path);
   980  
   981    rv = barrier_open(&w->barrier_parent);
   982    assert(rv == 0);
   983  
   984    rv = barrier_open(&w->barrier_child);
   985    assert(rv == 0);
   986  
   987    /* Unshare mount namespace, so the before clone hook is free to mount
   988     * whatever it needs without polluting the global mount namespace. */
   989    rv = unshare(CLONE_NEWNS);
   990    assert(rv == 0);
   991  
   992    rv = hook(w->lib_path, "parent-before-clone");
   993    assert(rv == 0);
   994  
   995    /* Set hard resource limits to their maximum values so that soft and
   996       hard resource limits can be set to arbitrary values even in an
   997       unprivileged container. */
   998    set_hard_rlimits();
   999  
  1000    pid = child_start(w);
  1001    assert(pid > 0);
  1002  
  1003    parent_setenv_pid(w, pid);
  1004  
  1005    rv = hook(w->lib_path, "parent-after-clone");
  1006    assert(rv == 0);
  1007  
  1008    rv = barrier_signal(&w->barrier_parent);
  1009    if (rv == -1) {
  1010      fprintf(stderr, "Error waking up child process\n");
  1011      exit(1);
  1012    }
  1013  
  1014    rv = barrier_wait(&w->barrier_child);
  1015    if (rv == -1) {
  1016      fprintf(stderr, "Error waiting for acknowledgement from child process\n");
  1017      exit(1);
  1018    }
  1019  
  1020    return 0;
  1021  }
  1022  
  1023  int main(int argc, char **argv) {
  1024    wshd_t *w;
  1025    int rv;
  1026  
  1027    /* Continue child execution in the context of the container */
  1028    if (argc > 1 && strcmp(argv[1], "--continue") == 0) {
  1029      return child_continue(argc, argv);
  1030    }
  1031  
  1032    w = calloc(1, sizeof(*w));
  1033    assert(w != NULL);
  1034  
  1035    rv = wshd__getopt(w, argc, argv);
  1036    if (rv == -1) {
  1037      exit(1);
  1038    }
  1039  
  1040    if (strlen(w->run_path) == 0) {
  1041      strcpy(w->run_path, "run");
  1042    }
  1043  
  1044    if (strlen(w->lib_path) == 0) {
  1045      strcpy(w->lib_path, "lib");
  1046    }
  1047  
  1048    if (strlen(w->root_path) == 0) {
  1049      strcpy(w->root_path, "root");
  1050    }
  1051  
  1052    assert_directory(w->run_path);
  1053    assert_directory(w->lib_path);
  1054    assert_directory(w->root_path);
  1055  
  1056    parent_run(w);
  1057  
  1058    return 0;
  1059  }