// Copyright 2015 The rkt Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#define _GNU_SOURCE
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/vfs.h>
#include <dirent.h>
#include <inttypes.h>
#include <stdbool.h>

/* Print a formatted error message, prefixed with "Error: ", to stderr. */
#define err_out(_fmt, _args...)						\
	fprintf(stderr, "Error: " _fmt "\n", ##_args)

/* Running count of exit_if() checks evaluated so far. It is used as the
 * process exit status, so the status tells which check (in execution order)
 * failed. */
static int exit_err;

/* Exit with an error message if _cond is true.
 *
 * The counter is bumped before _cond is evaluated, whether or not the check
 * fails. The status is clamped to 255: only the low 8 bits of an exit status
 * reach the parent, so an unclamped counter that is a multiple of 256 would
 * be reported as success. */
#define exit_if(_cond, _fmt, _args...)					\
	do {								\
		exit_err++;						\
		if (_cond) {						\
			err_out(_fmt, ##_args);				\
			exit(exit_err > 255 ? 255 : exit_err);		\
		}							\
	} while (0)

/* Like exit_if(), but appends strerror(errno) to the message. */
#define pexit_if(_cond, _fmt, _args...)					\
	exit_if(_cond, _fmt ": %s", ##_args, strerror(errno))

/* Print an error message and jump to _lbl if _cond is true. */
#define goto_if(_cond, _lbl, _fmt, _args...)				\
	do {								\
		if (_cond) {						\
			err_out(_fmt, ##_args);				\
			goto _lbl;					\
		}							\
	} while (0)

/* Like goto_if(), but appends strerror(errno) to the message. */
#define pgoto_if(_cond, _lbl, _fmt, _args...)				\
	goto_if(_cond, _lbl, _fmt ": %s", ##_args, strerror(errno))

/* Number of elements of a true array (not a pointer). */
#define nelems(_array) \
	(sizeof(_array) / sizeof((_array)[0]))
/* Length of a string literal, excluding the terminating NUL. */
#define lenof(_str) \
	(sizeof(_str) - 1)

#define MACHINE_ID_LEN		lenof("0123456789abcdef0123456789abcdef")
#define MACHINE_NAME_LEN	lenof("rkt-01234567-89ab-cdef-0123-456789abcdef")

#define UNMAPPED ((uid_t) -1)

#ifndef CGROUP2_SUPER_MAGIC
#define CGROUP2_SUPER_MAGIC 0x63677270
#endif

/* permission masks */
#define WORLD_READABLE	 0444
#define WORLD_WRITABLE	 0222

/* A directory to create below the root, with its mode. */
typedef struct _dir_op_t {
	const char	*name;
	mode_t		mode;
} dir_op_t;

/* One mount(2) invocation; target is relative to the stage2 root. */
typedef struct _mount_point_t {
	const char	*source;
	const char	*target;
	const char	*type;
	const char	*options;
	unsigned long	flags;
	const bool	skip_if_dst_exists; // Only respected for files_mount_table
} mount_point;

#define dir(_name, _mode) \
	{ .name = _name, .mode = _mode }

/* Mount mnt->source on "<root>/<mnt->target>".
 *
 * Exits the process if the resulting path does not fit the buffer or if
 * mount(2) fails. */
static void mount_at(const char *root, const mount_point *mnt)
{
	char to[4096];

	/* A negative snprintf() result converts to a huge size_t and is
	 * therefore reported as an error as well. */
	exit_if((size_t) snprintf(to, sizeof(to), "%s/%s", root, mnt->target) >= sizeof(to),
		"Path too long: \"%s\"", to);
	pexit_if(mount(mnt->source, to, mnt->type,
		       mnt->flags, mnt->options) == -1,
		 "Mounting \"%s\" on \"%s\" failed", mnt->source, to);
}

/* Check /proc/self/mountinfo for an existing mount at or below "<root>/sys".
 *
 * Returns 0 if such a mount is listed (nothing to do), 1 if /sys still has
 * to be mounted. Exits the process on I/O or allocation failure. */
static int mount_sys_required(const char *root)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;
	char *sys_dir;
	char *sys_subdir;
	size_t sys_subdir_len;
	int required = 1;

	pexit_if((f = fopen("/proc/self/mountinfo", "re")) == NULL,
		 "Unable to open /proc/self/mountinfo");

	/* The paths to compare against do not change per line: build once. */
	exit_if(asprintf(&sys_dir, "%s/sys", root) == -1,
		"Calling asprintf failed");
	exit_if(asprintf(&sys_subdir, "%s/sys/", root) == -1,
		"Calling asprintf failed");
	sys_subdir_len = strlen(sys_subdir);

	while (getline(&line, &len, f) != -1) {
		char *mountpoint = NULL;
		bool found;

		/* The fifth field is taken as the mount point. Skip lines
		 * that do not have one instead of using an unset pointer. */
		if (sscanf(line, "%*s %*s %*s %*s %ms", &mountpoint) != 1)
			continue;

		/* Either exactly $ROOTFS/sys, or a subdirectory of it. */
		found = strcmp(sys_dir, mountpoint) == 0 ||
			strncmp(sys_subdir, mountpoint, sys_subdir_len) == 0;
		free(mountpoint);

		if (found) {
			required = 0;
			break;
		}
	}

	free(line);
	free(sys_subdir);
	free(sys_dir);

	pexit_if(fclose(f) != 0, "Unable to close /proc/self/mountinfo");

	return required;
}

/* Bind mount the host /sys on "<root>/sys".
 *
 * The mount is recursive with the unified cgroup hierarchy or when running
 * in a user namespace, and non-recursive otherwise. Exits on failure. */
static void mount_sys(const char *root)
{
	struct statfs fs;
	const mount_point sys_bind_rec = { "/sys", "sys", "bind", NULL, MS_BIND|MS_REC, false };
	const mount_point sys_bind = { "/sys", "sys", "bind", NULL, MS_BIND, false };

	pexit_if(statfs("/sys/fs/cgroup", &fs) != 0,
		 "Cannot statfs /sys/fs/cgroup");
	if (fs.f_type == (__typeof__(fs.f_type)) CGROUP2_SUPER_MAGIC) {
		/* With the unified cgroup hierarchy, recursive bind mounts
		 * are fine. */
		mount_at(root, &sys_bind_rec);
		return;
	}

	// For security reasons recent Linux kernels do not allow to bind-mount non-recursively
	// if it would give read-write access to other subdirectories mounted as read-only.
	// Hence we have to check if we are in a user namespaced environment and bind mount recursively instead.
	if (access("/proc/1/uid_map", F_OK) == 0) {
		FILE *f;
		int k;
		/* Parsed as unsigned long so that the conversion specifier
		 * matches the object type whatever the width of uid_t. */
		unsigned long uid_base, uid_shift, uid_range;

		pexit_if((f = fopen("/proc/1/uid_map", "re")) == NULL,
			 "Unable to open /proc/1/uid_map");

		k = fscanf(f, "%lu %lu %lu", &uid_base, &uid_shift, &uid_range);

		pexit_if(fclose(f) != 0, "Unable to close /proc/1/uid_map");
		/* A parse failure does not set errno: no strerror() here. */
		exit_if(k != 3, "Invalid uid_map format");

		// do a recursive bind mount if we are in a user namespace having a parent namespace set,
		// i.e. either one of uid base, shift, or the range is set, see user_namespaces(7).
		if (uid_base != 0 || uid_shift != 0 ||
		    uid_range != (unsigned long) UNMAPPED) {
			mount_at(root, &sys_bind_rec);
			return;
		}
	}

	/* With cgroup-v1, rkt and systemd-nspawn add more cgroup
	 * bind-mounts to control which files are read-only. To avoid
	 * a quadratic progression, prepare-app does not bind mount
	 * /sys recursively. See:
	 * https://github.com/rkt/rkt/issues/2351 */
	mount_at(root, &sys_bind);
}

/* Recreate every symlink found in /rkt/volumes under /dev/.rkt.
 *
 * A missing /rkt/volumes directory is not an error; an already existing
 * destination link is left alone. Exits the process on any other failure. */
static void copy_volume_symlinks(void)
{
	DIR *volumes_dir;
	struct dirent *de;
	const char *rkt_volume_links_path = "/rkt/volumes";
	const char *dev_rkt_path = "/dev/.rkt";

	pexit_if(mkdir(dev_rkt_path, 0700) == -1 && errno != EEXIST,
		 "Failed to create directory \"%s\"", dev_rkt_path);

	pexit_if((volumes_dir = opendir(rkt_volume_links_path)) == NULL && errno != ENOENT,
		 "Failed to open directory \"%s\"", rkt_volume_links_path);
	if (volumes_dir == NULL)
		return;

	for (;;) {
		char *link_path;
		char *new_link;
		char target[4096];
		ssize_t target_len;

		/* readdir() returns NULL both at end of directory and on
		 * error; only errno tells them apart. */
		errno = 0;
		de = readdir(volumes_dir);
		if (de == NULL)
			break;

		if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, ".."))
			continue;

		exit_if(asprintf(&link_path, "%s/%s", rkt_volume_links_path, de->d_name) == -1,
			"Calling asprintf failed");
		exit_if(asprintf(&new_link, "%s/%s", dev_rkt_path, de->d_name) == -1,
			"Calling asprintf failed");

		/* readlink() does not NUL-terminate: keep one byte spare and
		 * terminate explicitly. */
		target_len = readlink(link_path, target, sizeof(target) - 1);
		pexit_if(target_len == -1,
			 "Error reading \"%s\" link", link_path);
		target[target_len] = '\0';

		pexit_if(symlink(target, new_link) == -1 && errno != EEXIST,
			 "Failed to create volume symlink \"%s\"", new_link);

		free(new_link);
		free(link_path);
	}

	pexit_if(errno != 0,
		 "Error reading \"%s\" directory", rkt_volume_links_path);
	pexit_if(closedir(volumes_dir),
		 "Error closing \"%s\" directory", rkt_volume_links_path);
}

/* Determine if the specified ptmx device (or symlink to a device)
 * is usable by all users.
 *
 * dirfd: Open file descriptor of a root directory.
 * path: Relative path to ptmx device below the path specified by dirfd.
 *
 * The device is considered usable when it is a character device with
 * device number 5:2 that is readable and writable by user, group and other.
 *
 * Returns true on success, else false (including when dirfd is negative,
 * path is NULL or the path cannot be stat'ed).
 */
bool
ptmx_device_usable (int dirfd, const char *path)
{
	struct stat st;
	int perms;
	bool world_readable, world_writable;
	bool is_char;
	bool dev_type;
	dev_t expected_dev;

	if (dirfd < 0 || ! path) {
		return false;
	}

	expected_dev = makedev(5, 2);

	/* No AT_SYMLINK_NOFOLLOW: a symlink is resolved to its target. */
	if (fstatat (dirfd, path, &st, 0) < 0) {
		return false;
	}

	is_char = S_ISCHR (st.st_mode);
	dev_type = (expected_dev == st.st_rdev);

	perms = (st.st_mode & ACCESSPERMS);

	world_readable = (perms & WORLD_READABLE) == WORLD_READABLE;
	world_writable = (perms & WORLD_WRITABLE) == WORLD_WRITABLE;

	return (is_char && dev_type && world_readable && world_writable);
}

/* Prepare the stage2 root file system given as argv[1]: make it a mount
 * point, create the standard directories, bind mount device nodes, /proc,
 * /sys and a few files into it, make /dev/ptmx usable and create the
 * /dev/.rkt and /dev/log symlinks.
 *
 * Exits with a non-zero status on the first failing step. */
int main(int argc, char *argv[])
{
	static const char *unlink_paths[] = {
		"dev/shm",
		NULL
	};
	static const dir_op_t dirs[] = {
		dir("dev",	0755),
		dir("dev/net",	0755),
		dir("dev/shm",	0755),
		dir("etc",	0755),
		dir("proc",	0755),
		dir("sys",	0755),
		dir("tmp",	01777),
		dir("dev/pts",	0755),
		dir("run",			0755),
		dir("run/systemd",		0755),
		dir("run/systemd/journal",	0755),
	};
	static const char *devnodes[] = {
		"/dev/null",
		"/dev/zero",
		"/dev/full",
		"/dev/random",
		"/dev/urandom",
		"/dev/tty",
		"/dev/net/tun",
		"/dev/console",
		NULL
	};
	static const mount_point dirs_mount_table[] = {
		{ "/proc", "/proc", "bind", NULL, MS_BIND|MS_REC, false },
		{ "/dev/shm", "/dev/shm", "bind", NULL, MS_BIND, false },
		{ "/dev/pts", "/dev/pts", "bind", NULL, MS_BIND, false },
		{ "/run/systemd/journal", "/run/systemd/journal", "bind", NULL, MS_BIND, false },
		/* /sys is handled separately */
	};
	static const mount_point files_mount_table[] = {
		{ "/etc/rkt-resolv.conf", "/etc/resolv.conf", "bind", NULL, MS_BIND, false },
		{ "/etc/rkt-hosts", "/etc/hosts", "bind", NULL, MS_BIND, false },
		{ "/etc/hosts-fallback", "/etc/hosts", "bind", NULL, MS_BIND, true }, // only create as fallback
		{ "/proc/sys/kernel/hostname", "/etc/hostname", "bind", NULL, MS_BIND, false },
		// TODO @alepuccetti this could be removed when https://github.com/systemd/systemd/issues/3544 is solved
		{ "/run/systemd/notify", "/run/systemd/notify", "bind", NULL, MS_BIND, false },
	};
	const char *root;
	int rootfd;
	char to[4096];
	size_t i;
	bool ptmx_usable, pts_ptmx_usable;

	exit_if(argc < 2,
		"Usage: %s /path/to/root", argv[0]);

	root = argv[1];

	/* Make stage2's root a mount point. Chrooting an application in a
	 * directory which is not a mount point is not nice because the
	 * application would not be able to remount "/" as a private mount.
	 * This allows Docker to run inside rkt.
	 * The recursive flag is to preserve volumes mounted previously by
	 * systemd-nspawn via "rkt run -volume".
	 * */
	pexit_if(mount(root, root, "bind", MS_BIND | MS_REC, NULL) == -1,
		 "Make / a mount point failed");

	rootfd = open(root, O_DIRECTORY | O_CLOEXEC);
	pexit_if(rootfd < 0,
		 "Failed to open directory \"%s\"", root);

	/* Some images have annoying symlinks that are resolved as dangling
	 * links before the chroot in stage1. E.g. "/dev/shm" -> "/run/shm"
	 * Just remove the symlinks.
	 */
	for (i = 0; unlink_paths[i]; i++) {
		pexit_if(unlinkat(rootfd, unlink_paths[i], 0) != 0
			 && errno != ENOENT && errno != EISDIR,
			 "Failed to unlink \"%s\"", unlink_paths[i]);
	}

	/* Create the directories */
	umask(0);
	for (i = 0; i < nelems(dirs); i++) {
		const dir_op_t *d = &dirs[i];
		pexit_if(mkdirat(rootfd, d->name, d->mode) == -1 &&
			 errno != EEXIST,
			 "Failed to create directory \"%s/%s\"", root, d->name);
	}

	close(rootfd);

	/* systemd-nspawn already creates few /dev entries in the container
	 * namespace: copy_devnodes()
	 * http://cgit.freedesktop.org/systemd/systemd/tree/src/nspawn/nspawn.c?h=v219#n1345
	 *
	 * But they are not visible by the apps because they are "protected" by
	 * the chroot.
	 *
	 * Bind mount them individually over the chroot border.
	 *
	 * Do NOT bind mount the whole directory /dev because it would shadow
	 * potential individual bind mount by stage0 ("rkt run --volume...").
	 *
	 * Do NOT use mknod, it would not work for /dev/console because it is
	 * a bind mount to a pts and pts device nodes only work when they live
	 * on a devpts filesystem.
	 */
	for (i = 0; devnodes[i]; i++) {
		const char *from = devnodes[i];
		int fd;

		/* If the file does not exist, skip it. It might be because
		 * the kernel does not provide it (e.g. kernel compiled without
		 * CONFIG_TUN) or because systemd-nspawn does not provide it
		 * (/dev/net/tun is not available with systemd-nspawn < v217).
		 */
		if (access(from, F_OK) != 0)
			continue;

		exit_if((size_t) snprintf(to, sizeof(to), "%s%s", root, from) >= sizeof(to),
			"Path too long: \"%s\"", to);

		/* The mode does not matter: it will be bind-mounted over.
		 * Creation is best effort; a real problem with the target
		 * shows up as a mount() failure just below.
		 */
		fd = open(to, O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY, 0644);
		if (fd != -1)
			close(fd);

		pexit_if(mount(from, to, "bind", MS_BIND, NULL) == -1,
			 "Mounting \"%s\" on \"%s\" failed", from, to);
	}

	/* Bind mount directories */
	for (i = 0; i < nelems(dirs_mount_table); i++) {
		mount_at(root, &dirs_mount_table[i]);
	}

	/* Bind mount /sys: handled differently, depending on cgroups */
	if (mount_sys_required(root))
		mount_sys(root);

	/* Bind mount files, if the source exists.
	 * By default, overwrite dst unless skip_if_dst_exists is true. */
	for (i = 0; i < nelems(files_mount_table); i++) {
		const mount_point *mnt = &files_mount_table[i];
		int fd;

		exit_if((size_t) snprintf(to, sizeof(to), "%s/%s", root, mnt->target) >= sizeof(to),
			"Path too long: \"%s\"", to);
		if (access(mnt->source, F_OK) != 0)
			continue;
		if (mnt->skip_if_dst_exists && access(to, F_OK) == 0)
			continue;
		/* A bind mount needs an existing target: create an empty
		 * file when there is none. */
		if (access(to, F_OK) != 0) {
			pexit_if((fd = creat(to, 0644)) == -1,
				 "Cannot create file: \"%s\"", to);
			pexit_if(close(fd) == -1,
				 "Cannot close file: \"%s\"", to);
		}
		pexit_if(mount(mnt->source, to, mnt->type,
			       mnt->flags, mnt->options) == -1,
			 "Mounting \"%s\" on \"%s\" failed", mnt->source, to);
	}

	/* Now that all mounts have been handled, reopen the root
	 * directory to special-case the handling of ptmx devices.
	 */
	rootfd = open(root, O_DIRECTORY | O_CLOEXEC);
	pexit_if(rootfd < 0,
		 "Failed to open directory \"%s\"", root);

	ptmx_usable = ptmx_device_usable (rootfd, "dev/ptmx");
	pts_ptmx_usable = ptmx_device_usable (rootfd, "dev/pts/ptmx");

	if (pts_ptmx_usable) {
		/* Prefer the devpts instance: point dev/ptmx at it. */
		if (! ptmx_usable) {
			pexit_if(unlinkat(rootfd, "dev/ptmx", 0) != 0
				 && errno != ENOENT,
				 "Failed to unlink \"%s\"", "dev/ptmx");
			pexit_if(symlinkat("/dev/pts/ptmx", rootfd, "dev/ptmx") == -1,
				 "Failed to create /dev/ptmx symlink");
		}
	} else {
		/* No usable dev/pts/ptmx: create the device node itself. */
		if (! ptmx_usable) {
			int perms = (WORLD_READABLE + WORLD_WRITABLE);

			pexit_if(unlinkat(rootfd, "dev/ptmx", 0) != 0
				 && errno != ENOENT,
				 "Failed to unlink \"%s\"", "dev/ptmx");
			pexit_if(mknodat (rootfd, "dev/ptmx", (S_IFCHR|perms), makedev (5, 2)) < 0,
				 "Failed to create device: \"%s\"", "dev/ptmx");
		}
	}

	close(rootfd);

	/* Copy symlinks to device node volumes to "/dev/.rkt" so they can be
	 * used in the DeviceAllow= option of the app's unit file (systemd
	 * needs the path to start with "/dev"). */
	copy_volume_symlinks();

	/* /dev/log -> /run/systemd/journal/dev-log */
	exit_if((size_t) snprintf(to, sizeof(to), "%s/dev/log", root) >= sizeof(to),
		"Path too long: \"%s\"", to);
	pexit_if(symlink("/run/systemd/journal/dev-log", to) == -1 && errno != EEXIST,
		 "Failed to create /dev/log symlink");

	return EXIT_SUCCESS;
}