github.com/apptainer/singularity@v3.1.1+incompatible/cmd/starter/c/starter.c (about)

     1  /*
     2    Copyright (c) 2018-2019, Sylabs, Inc. All rights reserved.
     3  
     4    This software is licensed under a 3-clause BSD license.  Please
     5    consult LICENSE.md file distributed with the sources of this project regarding
     6    your rights to use or distribute this software.
     7  */
     8  
     9  
    10  #define _GNU_SOURCE
    11  #include <stdio.h>
    12  #include <stdlib.h>
    13  #include <stdarg.h>
    14  #include <unistd.h>
    15  #include <errno.h>
    16  #include <ctype.h>
    17  #include <string.h>
    18  #include <fcntl.h>
    19  #include <poll.h>
    20  #include <grp.h>
    21  #include <link.h>
    22  #include <dirent.h>
    23  #include <libgen.h>
    24  #include <limits.h>
    25  #include <sys/mman.h>
    26  #include <sys/fsuid.h>
    27  #include <sys/mount.h>
    28  #include <sys/wait.h>
    29  #include <sys/prctl.h>
    30  #include <sys/socket.h>
    31  #include <sys/stat.h>
    32  #include <signal.h>
    33  #include <sched.h>
    34  #include <setjmp.h>
    35  #include <sys/syscall.h>
    36  #include <net/if.h>
    37  #include <sys/eventfd.h>
    38  
    39  #ifdef SINGULARITY_SECUREBITS
    40  #  include <linux/securebits.h>
    41  #else
    42  #  include "include/securebits.h"
    43  #endif /* SINGULARITY_SECUREBITS */
    44  
    45  #ifndef PR_SET_NO_NEW_PRIVS
    46  #define PR_SET_NO_NEW_PRIVS 38
    47  #endif
    48  
    49  #ifndef PR_GET_NO_NEW_PRIVS
    50  #define PR_GET_NO_NEW_PRIVS 39
    51  #endif
    52  
    53  #ifndef CLONE_NEWUSER
    54  #define CLONE_NEWUSER       0x10000000
    55  #endif
    56  
    57  #ifndef CLONE_NEWCGROUP
    58  #define CLONE_NEWCGROUP     0x02000000
    59  #endif
    60  
    61  #include "include/capability.h"
    62  #include "include/message.h"
    63  #include "include/starter.h"
    64  
    65  #define CLONE_STACK_SIZE    1024*1024
    66  #define BUFSIZE             512
    67  
/* C and JSON configuration */
struct cConfig *config;

/*
 * Socket process communication: two descriptor pairs, both initialised to
 * -1. cleanup_fd() never closes the master_socket descriptors.
 */
int rpc_socket[2] = {-1, -1};
int master_socket[2] = {-1, -1};

/*
 * Process role identifiers.
 * NOTE(review): presumably the values passed as `stage` to prepare_stage()
 * and stored in `execute` -- confirm against init().
 */
#define STAGE1      1
#define STAGE2      2
#define MASTER      3
#define RPC_SERVER  4

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * holds one of the role identifiers above -- confirm.
 */
unsigned char execute;
    81  
/*
 * Jump context shared between fork_ns() and clone_fn(): fork_ns() saves its
 * execution context in `env` with sigsetjmp() and clone_fn() jumps back to
 * it with siglongjmp().
 */
typedef struct fork_state_s {
    sigjmp_buf env;
} fork_state_t;
    85  
    86  /* copy paste from singularity code */
    87  static int clone_fn(void *data_ptr) {
    88      fork_state_t *state = (fork_state_t *)data_ptr;
    89      siglongjmp(state->env, 1);
    90  }
    91  
    92  static int fork_ns(unsigned int flags) {
    93      fork_state_t state;
    94  
    95      if ( sigsetjmp(state.env, 1) ) {
    96          return 0;
    97      }
    98  
    99      int stack_size = CLONE_STACK_SIZE;
   100      char *child_stack_ptr = malloc(stack_size);
   101      if ( child_stack_ptr == 0 ) {
   102          errno = ENOMEM;
   103          return -1;
   104      }
   105      child_stack_ptr += stack_size;
   106  
   107      int retval = clone(clone_fn, child_stack_ptr, (SIGCHLD|flags), &state);
   108      return retval;
   109  }
   110  
   111  static void priv_escalate(void) {
   112      verbosef("Get root privileges\n");
   113      if ( seteuid(0) < 0 ) {
   114          fatalf("Failed to set effective UID to 0\n");
   115      }
   116  }
   117  
   118  static void set_parent_death_signal(int signo) {
   119      debugf("Set parent death signal to %d\n", signo);
   120      if ( prctl(PR_SET_PDEATHSIG, signo) < 0 ) {
   121          fatalf("Failed to set parent death signal\n");
   122      }
   123  }
   124  
   125  static int prepare_stage(int stage, struct cConfig *config) {
   126      uid_t uid = getuid();
   127      struct __user_cap_header_struct header;
   128      struct __user_cap_data_struct data[2];
   129  
   130      set_parent_death_signal(SIGKILL);
   131  
   132      debugf("Entering in stage %d\n", stage);
   133  
   134      header.version = LINUX_CAPABILITY_VERSION;
   135      header.pid = 0;
   136  
   137      if ( capget(&header, data) < 0 ) {
   138          fatalf("Failed to get processus capabilities\n");
   139      }
   140  
   141      data[1].inheritable = (__u32)(config->capabilities.inheritable >> 32);
   142      data[0].inheritable = (__u32)(config->capabilities.inheritable & 0xFFFFFFFF);
   143      data[1].permitted = (__u32)(config->capabilities.permitted >> 32);
   144      data[0].permitted = (__u32)(config->capabilities.permitted & 0xFFFFFFFF);
   145      data[1].effective = (__u32)(config->capabilities.effective >> 32);
   146      data[0].effective = (__u32)(config->capabilities.effective & 0xFFFFFFFF);
   147  
   148      int last_cap;
   149      for ( last_cap = CAPSET_MAX; ; last_cap-- ) {
   150          if ( prctl(PR_CAPBSET_READ, last_cap) > 0 || last_cap == 0 ) {
   151              break;
   152          }
   153      }
   154  
   155      int caps_index;
   156      for ( caps_index = 0; caps_index <= last_cap; caps_index++ ) {
   157          if ( !(config->capabilities.bounding & (1ULL << caps_index)) ) {
   158              if ( prctl(PR_CAPBSET_DROP, caps_index) < 0 ) {
   159                  fatalf("Failed to drop bounding capabilities set: %s\n", strerror(errno));
   160              }
   161          }
   162      }
   163  
   164      if ( !(config->namespace.flags & CLONE_NEWUSER) ) {
   165          /* apply target UID/GID for root user */
   166          if ( uid == 0 ) {
   167              if ( config->container.numGID != 0 || config->container.targetUID != 0 ) {
   168                  if ( prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP|SECBIT_NO_SETUID_FIXUP_LOCKED) < 0 ) {
   169                      fatalf("Failed to set securebits: %s\n", strerror(errno));
   170                  }
   171              }
   172  
   173              if ( config->container.numGID != 0 ) {
   174                  debugf("Clear additional group IDs\n");
   175  
   176                  if ( setgroups(0, NULL) < 0 ) {
   177                      fatalf("Unable to clear additional group IDs: %s\n", strerror(errno));
   178                  }
   179              }
   180  
   181              if ( config->container.numGID >= 2 ) {
   182                  debugf("Set additional group IDs\n");
   183  
   184                  if ( setgroups(config->container.numGID-1, &config->container.targetGID[1]) < 0 ) {
   185                      fatalf("Failed to set additional groups: %s\n", strerror(errno));
   186                  }
   187              }
   188              if ( config->container.numGID >= 1 ) {
   189                  gid_t targetGID = config->container.targetGID[0];
   190  
   191                  debugf("Set main group ID\n");
   192  
   193                  if ( setresgid(targetGID, targetGID, targetGID) < 0 ) {
   194                      fatalf("Failed to set GID %d: %s\n", targetGID, strerror(errno));
   195                  }
   196              }
   197              if ( config->container.targetUID != 0 ) {
   198                  uid_t targetUID = config->container.targetUID;
   199  
   200                  debugf("Set user ID to %d\n", targetUID);
   201  
   202                  if ( setresuid(targetUID, targetUID, targetUID) < 0 ) {
   203                      fatalf("Failed to drop privileges: %s\n", strerror(errno));
   204                  }
   205              }
   206          } else if ( config->container.isSuid ) {
   207              if ( prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP|SECBIT_NO_SETUID_FIXUP_LOCKED) < 0 ) {
   208                  fatalf("Failed to set securebits: %s\n", strerror(errno));
   209              }
   210  
   211              if ( setresuid(uid, uid, uid) < 0 ) {
   212                  fatalf("Failed to drop privileges: %s\n", strerror(errno));
   213              }
   214          }
   215  
   216          set_parent_death_signal(SIGKILL);
   217      }
   218  
   219      if ( config->container.noNewPrivs ) {
   220          if ( prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0 ) {
   221              fatalf("Failed to set no new privs flag: %s\n", strerror(errno));
   222          }
   223          if ( prctl(PR_GET_NO_NEW_PRIVS, 0, 0 ,0, 0) != 1 ) {
   224              fatalf("Aborting, failed to set no new privs flag: %s\n", strerror(errno));
   225          }
   226      }
   227  
   228      if ( capset(&header, data) < 0 ) {
   229          fatalf("Failed to set process capabilities\n");
   230      }
   231  
   232  #ifdef USER_CAPABILITIES
   233      // set ambient capabilities if supported
   234      for ( caps_index = 0; caps_index <= last_cap; caps_index++ ) {
   235          if ( (config->capabilities.ambient & (1ULL << caps_index)) ) {
   236              if ( prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, caps_index, 0, 0) < 0 ) {
   237                  fatalf("Failed to set ambient capability: %s\n", strerror(errno));
   238              }
   239          }
   240      }
   241  #endif
   242  
   243      return stage;
   244  }
   245  
   246  static int create_namespace(int nstype) {
   247      switch(nstype) {
   248      case CLONE_NEWNET:
   249  #ifdef NS_CLONE_NEWNET
   250          verbosef("Create network namespace\n");
   251  #else
   252          warningf("Skipping network namespace creation, not supported\n");
   253          return(0);
   254  #endif /* NS_CLONE_NEWNET */
   255          break;
   256      case CLONE_NEWIPC:
   257  #ifdef NS_CLONE_NEWIPC
   258          verbosef("Create ipc namespace\n");
   259  #else
   260          warningf("Skipping ipc namespace creation, not supported\n");
   261          return(0);
   262  #endif /* NS_CLONE_NEWIPC */
   263          break;
   264      case CLONE_NEWNS:
   265  #ifdef NS_CLONE_NEWNS
   266          verbosef("Create mount namespace\n");
   267  #else
   268          warningf("Skipping mount namespace creation, not supported\n");
   269          return(0);
   270  #endif /* NS_CLONE_NEWNS */
   271          break;
   272      case CLONE_NEWUTS:
   273  #ifdef NS_CLONE_NEWUTS
   274          verbosef("Create uts namespace\n");
   275  #else
   276          warningf("Skipping uts namespace creation, not supported\n");
   277          return(0);
   278  #endif /* NS_CLONE_NEWUTS */
   279          break;
   280      case CLONE_NEWUSER:
   281  #ifdef NS_CLONE_NEWUSER
   282          verbosef("Create user namespace\n");
   283  #else
   284          warningf("Skipping user namespace creation, not supported\n");
   285  #endif /* NS_CLONE_NEWUSER */
   286          break;
   287  #ifdef NS_CLONE_NEWCGROUP
   288      case CLONE_NEWCGROUP:
   289          verbosef("Create cgroup namespace\n");
   290          break;
   291  #endif /* NS_CLONE_NEWCGROUP */
   292      default:
   293          warningf("Skipping unknown namespace creation\n");
   294          errno = EINVAL;
   295          return(-1);
   296      }
   297      return unshare(nstype);
   298  }
   299  
   300  static int enter_namespace(char *nspath, int nstype) {
   301      int ns_fd;
   302  
   303      switch(nstype) {
   304      case CLONE_NEWPID:
   305          verbosef("Entering in pid namespace\n");
   306  #ifndef NS_CLONE_NEWPID
   307          errno = EINVAL;
   308          return(-1);
   309  #endif /* NS_CLONE_NEWPID */
   310          break;
   311      case CLONE_NEWNET:
   312          verbosef("Entering in network namespace\n");
   313  #ifndef NS_CLONE_NEWNET
   314          errno = EINVAL;
   315          return(-1);
   316  #endif /* NS_CLONE_NEWNET */
   317          break;
   318      case CLONE_NEWIPC:
   319          verbosef("Entering in ipc namespace\n");
   320  #ifndef NS_CLONE_NEWIPC
   321          errno = EINVAL;
   322          return(-1);
   323  #endif /* NS_CLONE_NEWIPC */
   324          break;
   325      case CLONE_NEWNS:
   326          verbosef("Entering in mount namespace\n");
   327  #ifndef NS_CLONE_NEWNS
   328          errno = EINVAL;
   329          return(-1);
   330  #endif /* NS_CLONE_NEWNS */
   331          break;
   332      case CLONE_NEWUTS:
   333          verbosef("Entering in uts namespace\n");
   334  #ifndef NS_CLONE_NEWUTS
   335          errno = EINVAL;
   336          return(-1);
   337  #endif /* NS_CLONE_NEWUTS */
   338          break;
   339      case CLONE_NEWUSER:
   340          verbosef("Entering in user namespace\n");
   341  #ifndef NS_CLONE_NEWUSER
   342          errno = EINVAL;
   343          return(-1);
   344  #endif /* NS_CLONE_NEWUSER */
   345          break;
   346  #ifdef NS_CLONE_NEWCGROUP
   347      case CLONE_NEWCGROUP:
   348          verbosef("Entering in cgroup namespace\n");
   349          break;
   350  #endif /* NS_CLONE_NEWCGROUP */
   351      default:
   352          verbosef("Entering in unknown namespace\n");
   353          errno = EINVAL;
   354          return(-1);
   355      }
   356  
   357      debugf("Opening namespace file descriptor %s\n", nspath);
   358      ns_fd = open(nspath, O_RDONLY);
   359      if ( ns_fd < 0 ) {
   360          return(-1);
   361      }
   362  
   363      if ( setns(ns_fd, nstype) < 0 ) {
   364          int err = errno;
   365          close(ns_fd);
   366          errno = err;
   367          return(-1);
   368      }
   369  
   370      close(ns_fd);
   371      return(0);
   372  }
   373  
   374  static void setup_userns_mappings(struct cConfig *config, pid_t pid, const char *setgroup) {
   375      FILE *map_fp;
   376      int i;
   377      struct idMapping *uidmap;
   378      struct idMapping *gidmap;
   379      char *path = (char *)malloc(PATH_MAX);
   380  
   381      debugf("Write %s to set group file\n", setgroup);
   382      memset(path, 0, PATH_MAX);
   383      if ( snprintf(path, PATH_MAX-1, "/proc/%d/setgroups", pid) < 0 ) {
   384          fatalf("Failed to write path /proc/%d/setgroups in buffer\n", pid);
   385      }
   386  
   387      map_fp = fopen(path, "w+"); // Flawfinder: ignore
   388      if ( map_fp != NULL ) {
   389          fprintf(map_fp, "%s\n", setgroup);
   390          if ( fclose(map_fp) < 0 ) {
   391              fatalf("Failed to write %s to setgroup file: %s\n", setgroup, strerror(errno));
   392          }
   393      } else {
   394          fatalf("Could not write info to setgroups: %s\n", strerror(errno));
   395      }
   396  
   397      debugf("Write to GID map\n");
   398      memset(path, 0, PATH_MAX);
   399      if ( snprintf(path, PATH_MAX-1, "/proc/%d/gid_map", pid) < 0 ) {
   400          fatalf("Failed to write path /proc/%d/gid_map in buffer\n", pid);
   401      }
   402  
   403      map_fp = fopen(path, "w+"); // Flawfinder: ignore
   404      if ( map_fp != NULL ) {
   405          fprintf(map_fp, "%s", config->container.gidMap);
   406          if ( fclose(map_fp) < 0 ) {
   407              fatalf("Failed to write to GID map: %s\n", strerror(errno));
   408          }
   409      } else {
   410          fatalf("Could not write parent info to gid_map: %s\n", strerror(errno));
   411      }
   412  
   413      debugf("Write to UID map\n");
   414      memset(path, 0, PATH_MAX);
   415      if ( snprintf(path, PATH_MAX-1, "/proc/%d/uid_map", pid) < 0 ) {
   416          fatalf("Failed to write path /proc/%d/uid_map in buffer\n", pid);
   417      }
   418  
   419      map_fp = fopen(path, "w+"); // Flawfinder: ignore
   420      if ( map_fp != NULL ) {
   421          fprintf(map_fp, "%s", config->container.uidMap);
   422          if ( fclose(map_fp) < 0 ) {
   423              fatalf("Failed to write to UID map: %s\n", strerror(errno));
   424          }
   425      } else {
   426          fatalf("Could not write parent info to uid_map: %s\n", strerror(errno));
   427      }
   428  
   429      free(path);
   430  }
   431  
   432  static void setup_userns_identity(struct cConfig *config) {
   433      uid_t uidMap = config->container.targetUID;
   434      gid_t gidMap = config->container.targetGID[0];
   435  
   436      if ( setgroups(0, NULL) < 0 ) {
   437          fatalf("Unabled to clear additional group IDs: %s\n", strerror(errno));
   438      }
   439      if ( setresgid(gidMap, gidMap, gidMap) < 0 ) {
   440          fatalf("Failed to change namespace group identity: %s\n", strerror(errno));
   441      }
   442      if ( setresuid(uidMap, uidMap, uidMap) < 0 ) {
   443          fatalf("Failed to change namespace user identity: %s\n", strerror(errno));
   444      }
   445  }
   446  
   447  static void user_namespace_init(struct cConfig *config, int *fork_flags) {
   448      if ( (config->namespace.flags & CLONE_NEWUSER) == 0 && config->namespace.user[0] == 0 ) {
   449          priv_escalate();
   450      } else {
   451          if ( config->container.isSuid ) {
   452              fatalf("Running setuid workflow with user namespace is not allowed\n");
   453          }
   454          if ( config->namespace.user[0] != 0 ) {
   455              if ( enter_namespace(config->namespace.user, CLONE_NEWUSER) < 0 ) {
   456                  fatalf("Failed to enter in user namespace: %s\n", strerror(errno));
   457              }
   458              if ( !config->container.sharedMount ) {
   459                  setup_userns_identity(config);
   460              }
   461          } else if ( config->container.sharedMount ) {
   462              verbosef("Create user namespace\n");
   463  
   464              if ( unshare(CLONE_NEWUSER) < 0 ) {
   465                  fatalf("Failed to create user namespace\n");
   466              }
   467  
   468              setup_userns_mappings(config, getpid(), "deny");
   469          } else {
   470              *fork_flags |= CLONE_NEWUSER;
   471              priv_escalate();
   472          }
   473      }
   474  }
   475  
   476  static char *shared_mount_namespace_init(struct cConfig *config) {
   477      if ( config->namespace.mount[0] == 0 && config->container.sharedMount ) {
   478          unsigned long propagation = config->container.mountPropagation;
   479  
   480          if ( propagation == 0 ) {
   481              propagation = MS_PRIVATE | MS_REC;
   482          }
   483          if ( unshare(CLONE_FS) < 0 ) {
   484              fatalf("Failed to unshare root file system: %s\n", strerror(errno));
   485          }
   486          if ( create_namespace(CLONE_NEWNS) < 0 ) {
   487              fatalf("Failed to create mount namespace: %s\n", strerror(errno));
   488          }
   489          if ( mount(NULL, "/", NULL, propagation, NULL) < 0 ) {
   490              fatalf("Failed to set mount propagation: %s\n", strerror(errno));
   491          }
   492          /* set shared mount propagation to share mount points between master and container process */
   493          if ( mount(NULL, "/", NULL, MS_SHARED|MS_REC, NULL) < 0 ) {
   494              fatalf("Failed to propagate as SHARED: %s\n", strerror(errno));
   495          }
   496      }
   497  }
   498  
   499  static void pid_namespace_init(struct cConfig *config, int *fork_flags) {
   500      if ( config->namespace.pid[0] != 0 ) {
   501          if ( enter_namespace(config->namespace.pid, CLONE_NEWPID) < 0 ) {
   502              fatalf("Failed to enter in pid namespace: %s\n", strerror(errno));
   503          }
   504      } else if ( config->namespace.flags & CLONE_NEWPID ) {
   505          verbosef("Create pid namespace\n");
   506          *fork_flags |= CLONE_NEWPID;
   507      }
   508  }
   509  
   510  static void network_namespace_init(struct cConfig *config) {
   511      if ( config->namespace.network[0] != 0 ) {
   512          if ( enter_namespace(config->namespace.network, CLONE_NEWNET) < 0 ) {
   513              fatalf("Failed to enter in network namespace: %s\n", strerror(errno));
   514          }
   515      } else if ( config->namespace.flags & CLONE_NEWNET ) {
   516          if ( create_namespace(CLONE_NEWNET) < 0 ) {
   517              fatalf("Failed to create network namespace: %s\n", strerror(errno));
   518          }
   519  
   520          if ( config->container.bringLoopbackInterface ) {
   521              struct ifreq req;
   522              int sockfd = socket(AF_INET, SOCK_DGRAM, 0);
   523  
   524              if ( sockfd < 0 ) {
   525                  fatalf("Unable to open AF_INET socket: %s\n", strerror(errno));
   526              }
   527  
   528              memset(&req, 0, sizeof(req));
   529              strncpy(req.ifr_name, "lo", IFNAMSIZ);
   530  
   531              req.ifr_flags |= IFF_UP;
   532  
   533              debugf("Bringing up network loopback interface\n");
   534              if ( ioctl(sockfd, SIOCSIFFLAGS, &req) < 0 ) {
   535                  fatalf("Failed to set flags on interface: %s\n", strerror(errno));
   536              }
   537              close(sockfd);
   538          }
   539      }
   540  }
   541  
   542  static void uts_namespace_init(struct cConfig *config) {
   543      if ( config->namespace.uts[0] != 0 ) {
   544          if ( enter_namespace(config->namespace.uts, CLONE_NEWUTS) < 0 ) {
   545              fatalf("Failed to enter in uts namespace: %s\n", strerror(errno));
   546          }
   547      } else if ( config->namespace.flags & CLONE_NEWUTS ) {
   548          if ( create_namespace(CLONE_NEWUTS) < 0 ) {
   549              fatalf("Failed to create uts namespace: %s\n", strerror(errno));
   550          }
   551      }
   552  }
   553  
   554  static void ipc_namespace_init(struct cConfig *config) {
   555      if ( config->namespace.ipc[0] != 0 ) {
   556          if ( enter_namespace(config->namespace.ipc, CLONE_NEWIPC) < 0 ) {
   557              fatalf("Failed to enter in ipc namespace: %s\n", strerror(errno));
   558          }
   559      } else if ( config->namespace.flags & CLONE_NEWIPC ) {
   560          if ( create_namespace(CLONE_NEWIPC) < 0 ) {
   561              fatalf("Failed to create ipc namespace: %s\n", strerror(errno));
   562          }
   563      }
   564  }
   565  
   566  static void cgroup_namespace_init(struct cConfig *config) {
   567      if ( config->namespace.cgroup[0] != 0 ) {
   568          if ( enter_namespace(config->namespace.cgroup, CLONE_NEWCGROUP) < 0 ) {
   569              fatalf("Failed to enter in cgroup namespace: %s\n", strerror(errno));
   570          }
   571      } else if ( config->namespace.flags & CLONE_NEWCGROUP ) {
   572          if ( create_namespace(CLONE_NEWCGROUP) < 0 ) {
   573              fatalf("Failed to create cgroup namespace: %s\n", strerror(errno));
   574          }
   575      }
   576  }
   577  
   578  static void mount_namespace_init(struct cConfig *config) {
   579      if ( config->namespace.mount[0] != 0 ) {
   580          if ( enter_namespace(config->namespace.mount, CLONE_NEWNS) < 0 ) {
   581              fatalf("Failed to enter in mount namespace: %s\n", strerror(errno));
   582          }
   583      } else if ( config->namespace.flags & CLONE_NEWNS ) {
   584          if ( !config->container.sharedMount ) {
   585              unsigned long propagation = config->container.mountPropagation;
   586  
   587              if ( unshare(CLONE_FS) < 0 ) {
   588                  fatalf("Failed to unshare root file system: %s\n", strerror(errno));
   589              }
   590              if ( create_namespace(CLONE_NEWNS) < 0 ) {
   591                  fatalf("Failed to create mount namespace: %s\n", strerror(errno));
   592              }
   593              if ( propagation && mount(NULL, "/", NULL, propagation, NULL) < 0 ) {
   594                  fatalf("Failed to set mount propagation: %s\n", strerror(errno));
   595              }
   596          } else {
   597              /* create a namespace for container process to separate master during pivot_root */
   598              if ( create_namespace(CLONE_NEWNS) < 0 ) {
   599                  fatalf("Failed to create mount namespace: %s\n", strerror(errno));
   600              }
   601  
   602              /* set shared propagation to propagate few mount points to master */
   603              if ( mount(NULL, "/", NULL, MS_SHARED|MS_REC, NULL) < 0 ) {
   604                  fatalf("Failed to propagate as SHARED: %s\n", strerror(errno));
   605              }
   606          }
   607      }
   608  }
   609  
   610  static unsigned char is_suid(void) {
   611      ElfW(auxv_t) *auxv;
   612      unsigned char suid = 0;
   613      char *buffer = (char *)malloc(4096);
   614      int proc_auxv = open("/proc/self/auxv", O_RDONLY);
   615  
   616      verbosef("Check if we are running as setuid\n");
   617  
   618      if ( proc_auxv < 0 ) {
   619          fatalf("Can't open /proc/self/auxv: %s\n", strerror(errno));
   620      }
   621  
   622      /* use auxiliary vectors to determine if running privileged */
   623      memset(buffer, 0, 4096);
   624      if ( read(proc_auxv, buffer, 4088) < 0 ) {
   625          fatalf("Can't read auxiliary vectors: %s\n", strerror(errno));
   626      }
   627  
   628      auxv = (ElfW(auxv_t) *)buffer;
   629  
   630      for (; auxv->a_type != AT_NULL; auxv++) {
   631          if ( auxv->a_type == AT_SECURE ) {
   632              suid = (int)auxv->a_un.a_val;
   633              break;
   634          }
   635      }
   636  
   637      free(buffer);
   638      close(proc_auxv);
   639  
   640      return suid;
   641  }
   642  
   643  static struct fdlist *list_fd(void) {
   644      int i = 0;
   645      int fd_proc;
   646      DIR *dir;
   647      struct dirent *dirent;
   648      struct fdlist *fl = (struct fdlist *)malloc(sizeof(struct fdlist));
   649  
   650      if ( fl == NULL ) {
   651          fatalf("Memory allocation failed: %s\n", strerror(errno));
   652      }
   653  
   654      fl->fds = NULL;
   655      fl->num = 0;
   656  
   657      if ( ( fd_proc = open("/proc/self/fd", O_RDONLY) ) < 0 ) {
   658          fatalf("Failed to open /proc/self/fd: %s\n", strerror(errno));
   659      }
   660  
   661      if ( ( dir = fdopendir(fd_proc) ) == NULL ) {
   662          fatalf("Failed to list /proc/self/fd directory: %s\n", strerror(errno));
   663      }
   664  
   665      while ( ( dirent = readdir(dir ) ) ) {
   666          if ( strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0 ) {
   667              continue;
   668          }
   669          if ( atoi(dirent->d_name) == fd_proc ) {
   670              continue;
   671          }
   672          fl->num++;
   673      }
   674  
   675      rewinddir(dir);
   676  
   677      fl->fds = (int *)malloc(sizeof(int)*fl->num);
   678      if ( fl->fds == NULL ) {
   679          fatalf("Memory allocation failed: %s\n", strerror(errno));
   680      }
   681  
   682      while ( ( dirent = readdir(dir ) ) ) {
   683          int cv;
   684          if ( strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0 ) {
   685              continue;
   686          }
   687  
   688          cv = atoi(dirent->d_name);
   689          if ( cv == fd_proc ) {
   690              continue;
   691          }
   692  
   693          fl->fds[i++] = atoi(dirent->d_name);
   694      }
   695  
   696      closedir(dir);
   697      close(fd_proc);
   698  
   699      return fl;
   700  }
   701  
   702  static void cleanup_fd(struct fdlist *fd_before, struct fdlist *fd_after) {
   703      int i, j;
   704      char *source = (char *)malloc(PATH_MAX);
   705      char *target = (char *)malloc(PATH_MAX);
   706  
   707      if ( source == NULL || target == NULL ) {
   708          fatalf("Memory allocation failed: %s", strerror(errno));
   709      }
   710  
   711      /*
   712       *  close unattended file descriptors opened during stage 1
   713       *  execution, that may not be accurate depending of fs operations done
   714       *  in stage 1, but should work for most engines.
   715       */
   716      for ( i = 0; i < fd_after->num; i++ ) {
   717          struct stat st;
   718          int found;
   719  
   720          if ( fd_after->fds[i] == master_socket[0] || fd_after->fds[i] == master_socket[1] ) {
   721              continue;
   722          }
   723  
   724          found = 0;
   725          for ( j = 0; j < fd_before->num; j++ ) {
   726              if ( fd_before->fds[j] == fd_after->fds[i] ) {
   727                  found = 1;
   728                  break;
   729              }
   730          }
   731          if ( found == 1 ) {
   732              continue;
   733          }
   734  
   735          memset(target, 0, PATH_MAX);
   736          snprintf(source, PATH_MAX, "/proc/self/fd/%d", fd_after->fds[i]);
   737  
   738          /* fd with link generating error are closed */
   739          if ( readlink(source, target, PATH_MAX) < 0 ) {
   740              close(fd_after->fds[i]);
   741              continue;
   742          }
   743          /* fd pointing to /dev/tty or anonymous inodes are closed */
   744          debugf("Check file descriptor %s pointing to %s\n", source, target);
   745          if ( strcmp(target, "/dev/tty") == 0 || strncmp(target, "anon_", 5) == 0 ) {
   746              debugf("Closing %s\n", source);
   747              close(fd_after->fds[i]);
   748              continue;
   749          }
   750          /* set force close on exec for remaining fd */
   751          if ( fcntl(fd_after->fds[i], F_SETFD, FD_CLOEXEC) < 0 ) {
   752              debugf("Can't set FD_CLOEXEC on file descriptor %d: %s", fd_after->fds[i], strerror(errno));
   753          }
   754      }
   755  
   756      free(source);
   757      free(target);
   758  
   759      if ( fd_before->fds ) {
   760          free(fd_before->fds);
   761      }
   762      if ( fd_after->fds ) {
   763          free(fd_after->fds);
   764      }
   765  
   766      free(fd_before);
   767      free(fd_after);
   768  }
   769  
   770  static void set_terminal_control(pid_t pid) {
   771      pid_t tcpgrp = tcgetpgrp(STDIN_FILENO);
   772      pid_t pgrp = getpgrp();
   773  
   774      if ( tcpgrp == pgrp ) {
   775          debugf("Pass terminal control to child\n");
   776  
   777          if ( setpgid(pid, pid) < 0 ) {
   778              fatalf("Failed to set child process group: %s\n", strerror(errno));
   779          }
   780          if ( tcsetpgrp(STDIN_FILENO, pid) < 0 ) {
   781              fatalf("Failed to set child as foreground process: %s\n", strerror(errno));
   782          }
   783      }
   784  }
   785  
   786  static void event_stop(int fd) {
   787      unsigned long long counter;
   788  
   789      if ( read(fd, &counter, sizeof(counter)) != sizeof(counter) ) {
   790          fatalf("Failed to receive sync signal: %s\n", strerror(errno));
   791      }
   792  }
   793  
   794  static void event_start(int fd) {
   795      unsigned long long counter = 1;
   796  
   797      if ( write(fd, &counter, sizeof(counter)) != sizeof(counter) ) {
   798          fatalf("Failed to synchronize with master: %s\n", strerror(errno));
   799      }
   800  }
   801  
   802  static void fix_fsuid(uid_t uid) {
   803      setfsuid(uid);
   804  
   805      if ( setfsuid(uid) != uid ) {
   806          fatalf("Failed to set filesystem uid to %d\n", uid);
   807      }
   808  }
   809  
   810  static void fix_streams(void) {
   811      struct stat st;
   812      int i = 0;
   813      int null = open("/dev/null", O_RDONLY);
   814  
   815      if ( null <= 2 ) {
   816          i = null;
   817      }
   818  
   819      for ( ; i <= 2; i++ ) {
   820          if ( fstat(i, &st) < 0 && errno == EBADF ) {
   821              if ( dup2(null, i) < 0 ) {
   822                  fatalf("Error while fixing IO streams: %s", strerror(errno));
   823              }
   824          }
   825      }
   826  
   827      if ( null > 2 ) {
   828          close(null);
   829      }
   830  }
   831  
   832  static char *dupenv(const char *env) {
   833      char *var = getenv(env);
   834  
   835      if ( var != NULL ) {
   836          return strdup(var);
   837      } else {
   838          fatalf("%s environment variable isn't set\n", env);
   839      }
   840  
   841      return NULL;
   842  }
   843  
   844  static void exit_with_status(const char *name, int status) {
   845      if ( WIFEXITED(status) ) {
   846          verbosef("%s exited with status %d\n", name, WEXITSTATUS(status));
   847          exit(WEXITSTATUS(status));
   848      } else if ( WIFSIGNALED(status) ) {
   849          verbosef("%s interrupted by signal number %d\n", name, WTERMSIG(status));
   850          kill(getpid(), WTERMSIG(status));
   851      }
   852      fatalf("%s exited with unknown status\n", name);
   853  }
   854  
   855  void do_exit(int sig) {
   856      if ( sig == SIGUSR1 ) {
   857          exit(0);
   858      }
   859      exit(1);
   860  }
   861  
   862  __attribute__((constructor)) static void init(void) {
   863      uid_t uid = getuid();
   864      gid_t gid = getgid();
   865      sigset_t mask;
   866      pid_t stage_pid;
   867      char *loglevel;
   868      char *pipe_fd_env;
   869      int status;
   870      int forkfd = -1;
   871      int pipe_fd = -1;
   872      int fork_flags = 0;
   873      int join_chroot = 0;
   874      int sync_pipe[2];
   875      struct pollfd fds[2];
   876      struct fdlist *fd_before;
   877      struct fdlist *fd_after;
   878  
   879  #ifndef SINGULARITY_NO_NEW_PRIVS
   880      fatalf("Host kernel is outdated and does not support PR_SET_NO_NEW_PRIVS!\n");
   881  #endif
   882  
   883      loglevel = dupenv("SINGULARITY_MESSAGELEVEL");
   884  
   885      pipe_fd_env = getenv("PIPE_EXEC_FD");
   886      if ( pipe_fd_env != NULL ) {
   887          if ( sscanf(pipe_fd_env, "%d", &pipe_fd) != 1 ) {
   888              fatalf("Failed to parse PIPE_EXEC_FD environment variable: %s\n", strerror(errno));
   889          }
   890          debugf("PIPE_EXEC_FD value: %d\n", pipe_fd);
   891          if ( pipe_fd < 0 || pipe_fd >= sysconf(_SC_OPEN_MAX) ) {
   892              fatalf("Bad PIPE_EXEC_FD file descriptor value\n");
   893          }
   894      } else {
   895          fatalf("PIPE_EXEC_FD environment variable isn't set\n");
   896      }
   897  
   898      verbosef("Container runtime\n");
   899  
   900      // initialize starter configuration and share it with child processes
   901      config = (struct cConfig *)mmap(NULL, sizeof(struct cConfig), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
   902      if ( config == MAP_FAILED ) {
   903          fatalf("Memory allocation failed: %s\n", strerror(errno));
   904      }
   905  
   906      config->container.isSuid = is_suid();
   907  
   908      if ( config->container.isSuid || geteuid() == 0 ) {
   909          /* force kernel to load overlay module to ease detection later */
   910          if ( mount("none", "/", "overlay", MS_SILENT, "") < 0 ) {
   911              if ( errno != EINVAL ) {
   912                  debugf("Overlay seems not supported by kernel\n");
   913              } else {
   914                  debugf("Overlay seems supported by kernel\n");
   915              }
   916          }
   917      }
   918  
   919      if ( config->container.isSuid ) {
   920          debugf("Drop privileges\n");
   921          if ( setegid(gid) < 0 || seteuid(uid) < 0 ) {
   922              fatalf("Failed to drop privileges: %s\n", strerror(errno));
   923          }
   924      }
   925  
   926      /* reset environment variables */
   927      clearenv();
   928  
   929      if ( loglevel != NULL ) {
   930          setenv("SINGULARITY_MESSAGELEVEL", loglevel, 1);
   931          free(loglevel);
   932      }
   933  
   934      /* read json configuration from stdin */
   935      debugf("Read json configuration from pipe\n");
   936  
   937      if ( ( config->json.size = read(pipe_fd, config->json.config, MAX_JSON_SIZE - 1) ) <= 0 ) {
   938          fatalf("Read JSON configuration from pipe failed: %s\n", strerror(errno));
   939      }
   940      close(pipe_fd);
   941  
   942      fix_streams();
   943  
   944      fd_before = list_fd();
   945  
   946      /* block SIGCHLD signal handled later by stage 2/master */
   947      debugf("Set child signal mask\n");
   948      sigemptyset(&mask);
   949      sigaddset(&mask, SIGCHLD);
   950      if (sigprocmask(SIG_SETMASK, &mask, NULL) == -1) {
   951          fatalf("Blocked signals error: %s\n", strerror(errno));
   952      }
   953  
   954      /*
   955       *  use CLONE_FILES to share file descriptors opened during stage 1,
   956       *  this is a lazy implementation to avoid passing file descriptors
   957       *  between wrapper and stage 1 over unix socket.
   958       *  This is required so that all processes works with same files/directories
   959       *  to minimize race conditions
   960       */
   961      stage_pid = fork_ns(CLONE_FILES|CLONE_FS);
   962      if ( stage_pid == 0 ) {
   963          /*
   964           *  stage1 is responsible for singularity configuration file parsing, handle user input,
   965           *  read capabilities, check what namespaces is required.
   966           */
   967          if ( config->container.isSuid ) {
   968              priv_escalate();
   969              execute = prepare_stage(STAGE1, config);
   970          } else {
   971              set_parent_death_signal(SIGKILL);
   972              execute = STAGE1;
   973          }
   974  
   975          verbosef("Spawn stage 1\n");
   976          return;
   977      } else if ( stage_pid < 0 ) {
   978          fatalf("Failed to spawn stage 1\n");
   979      }
   980  
   981      debugf("Wait completion of stage1\n");
   982      if ( wait(&status) != stage_pid ) {
   983          fatalf("Can't wait child\n");
   984      }
   985  
   986      if ( WIFEXITED(status) && WEXITSTATUS(status) != 0 ) {
   987          verbosef("stage 1 exited with status %d\n", WEXITSTATUS(status));
   988          exit(WEXITSTATUS(status));
   989      } else if ( WIFSIGNALED(status) ) {
   990          verbosef("stage 1 interrupted by signal number %d\n", WTERMSIG(status));
   991          kill(getpid(), WTERMSIG(status));
   992      }
   993  
   994      debugf("Create socketpair for master communication channel\n");
   995      if ( socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, master_socket) < 0 ) {
   996          fatalf("Failed to create communication socket: %s\n", strerror(errno));
   997      }
   998  
   999      if ( config->container.isInstance ) {
  1000          verbosef("Run as instance\n");
  1001          int forked = fork();
  1002          if ( forked == 0 ) {
  1003              if ( setsid() < 0 ) {
  1004                  fatalf("Can't set session leader: %s\n", strerror(errno));
  1005              }
  1006              umask(0);
  1007          } else {
  1008              sigset_t usrmask;
  1009              static struct sigaction action;
  1010  
  1011              action.sa_sigaction = (void *)&do_exit;
  1012              action.sa_flags = SA_SIGINFO|SA_RESTART;
  1013  
  1014              close(master_socket[0]);
  1015              close(master_socket[1]);
  1016  
  1017              sigemptyset(&usrmask);
  1018              sigaddset(&usrmask, SIGUSR1);
  1019              sigaddset(&usrmask, SIGUSR2);
  1020  
  1021              if (sigprocmask(SIG_SETMASK, &usrmask, NULL) == -1) {
  1022                  fatalf("Blocked signals error: %s\n", strerror(errno));
  1023              }
  1024              if (sigaction(SIGUSR2, &action, NULL) < 0) {
  1025                  fatalf("Failed to install signal handler for SIGUSR2\n");
  1026              }
  1027              if (sigaction(SIGUSR1, &action, NULL) < 0) {
  1028                  fatalf("Failed to install signal handler for SIGUSR1\n");
  1029              }
  1030              if (sigprocmask(SIG_UNBLOCK, &usrmask, NULL) == -1) {
  1031                  fatalf("Unblock signals error: %s\n", strerror(errno));
  1032              }
  1033              while ( waitpid(forked, &status, 0) <= 0 ) {
  1034                  continue;
  1035              }
  1036              exit_with_status("instance", status);
  1037          }
  1038      }
  1039  
  1040      /* relinquish CPU to apply current directory change for current thread */
  1041      sched_yield();
  1042  
  1043      fd_after = list_fd();
  1044  
  1045      cleanup_fd(fd_before, fd_after);
  1046  
  1047      user_namespace_init(config, &fork_flags);
  1048  
  1049      shared_mount_namespace_init(config);
  1050  
  1051      if ( fork_flags == CLONE_NEWUSER ) {
  1052          forkfd = eventfd(0, 0);
  1053          if ( forkfd < 0 ) {
  1054              fatalf("Failed to create fork sync pipe between master and child: %s\n", strerror(errno));
  1055          }
  1056      }
  1057  
  1058      if ( !config->container.joinMount ) {
  1059          debugf("Create RPC socketpair for communication between stage 2 and RPC server\n");
  1060          if ( socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0, rpc_socket) < 0 ) {
  1061              fatalf("Failed to create communication socket: %s\n", strerror(errno));
  1062          }
  1063      }
  1064  
  1065      /* Use setfsuid to address issue about root_squash filesystems option */
  1066      if ( config->container.isSuid ) {
  1067          fix_fsuid(uid);
  1068      }
  1069  
  1070      /* sync master and near child with an eventfd */
  1071      if ( pipe(sync_pipe) < 0 ) {
  1072          fatalf("Failed to create sync pipe: %s\n", strerror(errno));
  1073      }
  1074  
  1075      pid_namespace_init(config, &fork_flags);
  1076  
  1077      stage_pid = fork_ns(fork_flags);
  1078  
  1079      if ( stage_pid == 0 ) {
  1080          /* at this stage we are PID 1 if PID namespace requested */
  1081          set_parent_death_signal(SIGKILL);
  1082  
  1083          if ( forkfd >= 0 ) {
  1084              // wait parent write user namespace mappings
  1085              event_stop(forkfd);
  1086              close(forkfd);
  1087  
  1088              setup_userns_identity(config);
  1089          }
  1090  
  1091          close(master_socket[0]);
  1092  
  1093          network_namespace_init(config);
  1094  
  1095          uts_namespace_init(config);
  1096  
  1097          ipc_namespace_init(config);
  1098  
  1099          cgroup_namespace_init(config);
  1100  
  1101          mount_namespace_init(config);
  1102  
  1103          close(sync_pipe[0]);
  1104          sync_pipe[0] = 0;
  1105          if ( write(sync_pipe[1], &sync_pipe[0], sizeof(int)) < 0 ) {
  1106              fatalf("Failed to send sync event: %s\n", strerror(errno));
  1107          }
  1108          close(sync_pipe[1]);
  1109  
  1110          if ( !config->container.joinMount ) {
  1111              close(rpc_socket[0]);
  1112  
  1113              /*
  1114               * fork is a convenient way to apply capabilities and privileges drop
  1115               * from single thread context before entering in stage 2
  1116               */
  1117              int process = fork_ns(CLONE_FS);
  1118  
  1119              if ( process == 0 ) {
  1120                  verbosef("Spawn RPC server\n");
  1121                  execute = RPC_SERVER;
  1122              } else if ( process > 0 ) {
  1123                  int status;
  1124  
  1125                  execute = prepare_stage(STAGE2, config);
  1126  
  1127                  if ( wait(&status) != process ) {
  1128                      fatalf("Error while waiting RPC server: %s\n", strerror(errno));
  1129                  }
  1130                  if ( rpc_socket[1] != -1 ) {
  1131                      close(rpc_socket[1]);
  1132                  }
  1133              } else {
  1134                  fatalf("Fork failed: %s\n", strerror(errno));
  1135              }
  1136          } else {
  1137              verbosef("Spawn stage 2\n");
  1138              verbosef("Don't execute RPC server, joining instance\n");
  1139              execute = prepare_stage(STAGE2, config);
  1140          }
  1141          return;
  1142      } else if ( stage_pid > 0 ) {
  1143          if ( config->namespace.pid[0] != 0 && config->namespace.flags & CLONE_NEWNS ) {
  1144              if ( enter_namespace("/proc/self/ns/pid", CLONE_NEWPID) < 0 ) {
  1145                  fatalf("Failed to enter in pid namespace: %s\n", strerror(errno));
  1146              }
  1147          }
  1148  
  1149          if ( forkfd >= 0 ) {
  1150              setup_userns_mappings(config, stage_pid, "allow");
  1151  
  1152              event_start(forkfd);
  1153              close(forkfd);
  1154          }
  1155  
  1156          set_terminal_control(stage_pid);
  1157  
  1158          config->container.pid = stage_pid;
  1159  
  1160          verbosef("Spawn master process\n");
  1161  
  1162          close(master_socket[1]);
  1163  
  1164          // wait child finish namespaces initialization
  1165          close(sync_pipe[1]);
  1166          sync_pipe[1] = -1;
  1167          if ( read(sync_pipe[0], &sync_pipe[1], sizeof(int)) < 0 ) {
  1168              fatalf("Failed to receive sync event: %s\n", strerror(errno));
  1169          }
  1170          close(sync_pipe[0]);
  1171  
  1172          // value not set, child has exited before sending data
  1173          if ( sync_pipe[1] == -1 ) {
  1174              waitpid(stage_pid, &status, 0);
  1175              exit_with_status("stage 2", status);
  1176          }
  1177  
  1178          if ( config->container.joinMount ) {
  1179              if ( config->container.isSuid && setresuid(uid, uid, uid) < 0 ) {
  1180                  fatalf("Failed to drop privileges permanently\n");
  1181              }
  1182              debugf("Wait stage 2 child process\n");
  1183              waitpid(stage_pid, &status, 0);
  1184  
  1185              pid_t pgrp = getpgrp();
  1186              pid_t tcpgrp = tcgetpgrp(STDIN_FILENO);
  1187  
  1188              if ( tcpgrp > 0 && pgrp != tcpgrp ) {
  1189                  if ( signal(SIGTTOU, SIG_IGN) == SIG_ERR ) {
  1190                      fatalf("failed to ignore SIGTTOU signal: %s\n", strerror(errno));
  1191                  }
  1192                  if ( tcsetpgrp(STDIN_FILENO, pgrp) < 0 ) {
  1193                      fatalf("Failed to set parent as foreground process: %s\n", strerror(errno));
  1194                  }
  1195              }
  1196              exit_with_status("stage 2", status);
  1197          } else {
  1198              close(rpc_socket[1]);
  1199  
  1200              if ( config->container.isSuid && setresuid(uid, uid, 0) < 0 ) {
  1201                  fatalf("Failed to drop privileges\n");
  1202              }
  1203              execute = MASTER;
  1204              return;
  1205          }
  1206      }
  1207      fatalf("Failed to create container namespaces\n");
  1208  }