// Source: github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/rootfs_linux.go

package libcontainer

import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"

	securejoin "github.com/cyphar/filepath-securejoin"
	"github.com/moby/sys/mountinfo"
	"github.com/mrunalp/fileutils"
	"github.com/opencontainers/runtime-spec/specs-go"
	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/devices"
	"github.com/opencontainers/runc/libcontainer/userns"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// defaultMountFlags is the flag set used for the mounts runc creates on its
// own in this file (e.g. the tmpfs backing a cgroup v1 hierarchy).
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV

// mountConfig contains mount data not specific to a mount point.
type mountConfig struct {
	// root is the container rootfs path (config.Rootfs).
	root string
	// label is the mount label (config.MountLabel).
	label string
	// cgroup2Path is the cgroup v2 path received in the init config; it is
	// used as the bind source when a fresh cgroup2 mount is not possible.
	cgroup2Path string
	// rootlessCgroups mirrors initConfig.RootlessCgroups.
	rootlessCgroups bool
	// cgroupns is true when the container config requests a new cgroup
	// namespace.
	cgroupns bool
}

// mountEntry contains mount data specific to a mount point.
type mountEntry struct {
	*configs.Mount
	// srcFile, when non-nil, is the mount source handed over by the parent
	// process as an open file; it is then used instead of the Source path.
	srcFile *mountSource
}

// srcName is only meant for error messages, it returns a "friendly" name.
49 func (m mountEntry) srcName() string { 50 if m.srcFile != nil { 51 return m.srcFile.file.Name() 52 } 53 return m.Source 54 } 55 56 func (m mountEntry) srcStat() (os.FileInfo, *syscall.Stat_t, error) { 57 var ( 58 st os.FileInfo 59 err error 60 ) 61 if m.srcFile != nil { 62 st, err = m.srcFile.file.Stat() 63 } else { 64 st, err = os.Stat(m.Source) 65 } 66 if err != nil { 67 return nil, nil, err 68 } 69 return st, st.Sys().(*syscall.Stat_t), nil 70 } 71 72 func (m mountEntry) srcStatfs() (*unix.Statfs_t, error) { 73 var st unix.Statfs_t 74 if m.srcFile != nil { 75 if err := unix.Fstatfs(int(m.srcFile.file.Fd()), &st); err != nil { 76 return nil, os.NewSyscallError("fstatfs", err) 77 } 78 } else { 79 if err := unix.Statfs(m.Source, &st); err != nil { 80 return nil, &os.PathError{Op: "statfs", Path: m.Source, Err: err} 81 } 82 } 83 return &st, nil 84 } 85 86 // needsSetupDev returns true if /dev needs to be set up. 87 func needsSetupDev(config *configs.Config) bool { 88 for _, m := range config.Mounts { 89 if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" { 90 return false 91 } 92 } 93 return true 94 } 95 96 // prepareRootfs sets up the devices, mount points, and filesystems for use 97 // inside a new mount namespace. It doesn't set anything as ro. You must call 98 // finalizeRootfs after this function to finish setting up the rootfs. 
99 func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) { 100 config := iConfig.Config 101 if err := prepareRoot(config); err != nil { 102 return fmt.Errorf("error preparing rootfs: %w", err) 103 } 104 105 mountConfig := &mountConfig{ 106 root: config.Rootfs, 107 label: config.MountLabel, 108 cgroup2Path: iConfig.Cgroup2Path, 109 rootlessCgroups: iConfig.RootlessCgroups, 110 cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), 111 } 112 for _, m := range config.Mounts { 113 entry := mountEntry{Mount: m} 114 // Figure out whether we need to request runc to give us an 115 // open_tree(2)-style mountfd. For idmapped mounts, this is always 116 // necessary. For bind-mounts, this is only necessary if we cannot 117 // resolve the parent mount (this is only hit if you are running in a 118 // userns -- but for rootless the host-side thread can't help). 119 wantSourceFile := m.IsIDMapped() 120 if m.IsBind() && !config.RootlessEUID { 121 if _, err := os.Stat(m.Source); err != nil { 122 wantSourceFile = true 123 } 124 } 125 if wantSourceFile { 126 // Request a source file from the host. 127 if err := writeSyncArg(pipe, procMountPlease, m); err != nil { 128 return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err) 129 } 130 sync, err := readSyncFull(pipe, procMountFd) 131 if err != nil { 132 return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err) 133 } 134 if sync.File == nil { 135 return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source) 136 } 137 defer sync.File.Close() 138 // Sanity-check to make sure we didn't get the wrong fd back. Note 139 // that while m.Source might contain symlinks, the (*os.File).Name 140 // is based on the path provided to os.OpenFile, not what it 141 // resolves to. So this should never happen. 
142 if sync.File.Name() != m.Source { 143 return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name()) 144 } 145 // Unmarshal the procMountFd argument (the file is sync.File). 146 var src *mountSource 147 if sync.Arg == nil { 148 return fmt.Errorf("sync %q is missing an argument", sync.Type) 149 } 150 if err := json.Unmarshal(*sync.Arg, &src); err != nil { 151 return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err) 152 } 153 if src == nil { 154 return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source) 155 } 156 src.file = sync.File 157 entry.srcFile = src 158 } 159 if err := mountToRootfs(mountConfig, entry); err != nil { 160 return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) 161 } 162 } 163 164 setupDev := needsSetupDev(config) 165 if setupDev { 166 if err := createDevices(config); err != nil { 167 return fmt.Errorf("error creating device nodes: %w", err) 168 } 169 if err := setupPtmx(config); err != nil { 170 return fmt.Errorf("error setting up ptmx: %w", err) 171 } 172 if err := setupDevSymlinks(config.Rootfs); err != nil { 173 return fmt.Errorf("error setting up /dev symlinks: %w", err) 174 } 175 } 176 177 // Signal the parent to run the pre-start hooks. 178 // The hooks are run after the mounts are setup, but before we switch to the new 179 // root, so that the old root is still available in the hooks for any mount 180 // manipulations. 181 // Note that iConfig.Cwd is not guaranteed to exist here. 182 if err := syncParentHooks(pipe); err != nil { 183 return err 184 } 185 186 // The reason these operations are done here rather than in finalizeRootfs 187 // is because the console-handling code gets quite sticky if we have to set 188 // up the console before doing the pivot_root(2). 
This is because the 189 // Console API has to also work with the ExecIn case, which means that the 190 // API must be able to deal with being inside as well as outside the 191 // container. It's just cleaner to do this here (at the expense of the 192 // operation not being perfectly split). 193 194 if err := unix.Chdir(config.Rootfs); err != nil { 195 return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err} 196 } 197 198 s := iConfig.SpecState 199 s.Pid = unix.Getpid() 200 s.Status = specs.StateCreating 201 if err := iConfig.Config.Hooks.Run(configs.CreateContainer, s); err != nil { 202 return err 203 } 204 205 if config.NoPivotRoot { 206 err = msMoveRoot(config.Rootfs) 207 } else if config.Namespaces.Contains(configs.NEWNS) { 208 err = pivotRoot(config.Rootfs) 209 } else { 210 err = chroot() 211 } 212 if err != nil { 213 return fmt.Errorf("error jailing process inside rootfs: %w", err) 214 } 215 216 if setupDev { 217 if err := reOpenDevNull(); err != nil { 218 return fmt.Errorf("error reopening /dev/null inside container: %w", err) 219 } 220 } 221 222 if cwd := iConfig.Cwd; cwd != "" { 223 // Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...". 224 // However, we are safe to call MkDirAll directly because we are in the jail here. 225 if err := os.MkdirAll(cwd, 0o755); err != nil { 226 return err 227 } 228 } 229 230 return nil 231 } 232 233 // finalizeRootfs sets anything to ro if necessary. You must call 234 // prepareRootfs first. 235 func finalizeRootfs(config *configs.Config) (err error) { 236 // All tmpfs mounts and /dev were previously mounted as rw 237 // by mountPropagate. Remount them read-only as requested. 
238 for _, m := range config.Mounts { 239 if m.Flags&unix.MS_RDONLY != unix.MS_RDONLY { 240 continue 241 } 242 if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" { 243 if err := remountReadonly(m); err != nil { 244 return err 245 } 246 } 247 } 248 249 // set rootfs ( / ) as readonly 250 if config.Readonlyfs { 251 if err := setReadonly(); err != nil { 252 return fmt.Errorf("error setting rootfs as readonly: %w", err) 253 } 254 } 255 256 if config.Umask != nil { 257 unix.Umask(int(*config.Umask)) 258 } else { 259 unix.Umask(0o022) 260 } 261 return nil 262 } 263 264 // /tmp has to be mounted as private to allow MS_MOVE to work in all situations 265 func prepareTmp(topTmpDir string) (string, error) { 266 tmpdir, err := os.MkdirTemp(topTmpDir, "runctop") 267 if err != nil { 268 return "", err 269 } 270 if err := mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { 271 return "", err 272 } 273 if err := mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { 274 return "", err 275 } 276 return tmpdir, nil 277 } 278 279 func cleanupTmp(tmpdir string) { 280 _ = unix.Unmount(tmpdir, 0) 281 _ = os.RemoveAll(tmpdir) 282 } 283 284 func mountCgroupV1(m *configs.Mount, c *mountConfig) error { 285 binds, err := getCgroupMounts(m) 286 if err != nil { 287 return err 288 } 289 var merged []string 290 for _, b := range binds { 291 ss := filepath.Base(b.Destination) 292 if strings.Contains(ss, ",") { 293 merged = append(merged, ss) 294 } 295 } 296 tmpfs := &configs.Mount{ 297 Source: "tmpfs", 298 Device: "tmpfs", 299 Destination: m.Destination, 300 Flags: defaultMountFlags, 301 Data: "mode=755", 302 PropagationFlags: m.PropagationFlags, 303 } 304 305 if err := mountToRootfs(c, mountEntry{Mount: tmpfs}); err != nil { 306 return err 307 } 308 309 for _, b := range binds { 310 if c.cgroupns { 311 subsystemPath := filepath.Join(c.root, b.Destination) 312 if err := os.MkdirAll(subsystemPath, 0o755); err != nil { 313 return err 314 } 315 if err := 
utils.WithProcfd(c.root, b.Destination, func(dstFd string) error { 316 flags := defaultMountFlags 317 if m.Flags&unix.MS_RDONLY != 0 { 318 flags = flags | unix.MS_RDONLY 319 } 320 var ( 321 source = "cgroup" 322 data = filepath.Base(subsystemPath) 323 ) 324 if data == "systemd" { 325 data = cgroups.CgroupNamePrefix + data 326 source = "systemd" 327 } 328 return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data) 329 }); err != nil { 330 return err 331 } 332 } else { 333 if err := mountToRootfs(c, mountEntry{Mount: b}); err != nil { 334 return err 335 } 336 } 337 } 338 for _, mc := range merged { 339 for _, ss := range strings.Split(mc, ",") { 340 // symlink(2) is very dumb, it will just shove the path into 341 // the link and doesn't do any checks or relative path 342 // conversion. Also, don't error out if the cgroup already exists. 343 if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) { 344 return err 345 } 346 } 347 } 348 return nil 349 } 350 351 func mountCgroupV2(m *configs.Mount, c *mountConfig) error { 352 dest, err := securejoin.SecureJoin(c.root, m.Destination) 353 if err != nil { 354 return err 355 } 356 if err := os.MkdirAll(dest, 0o755); err != nil { 357 return err 358 } 359 err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error { 360 return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data) 361 }) 362 if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { 363 return err 364 } 365 366 // When we are in UserNS but CgroupNS is not unshared, we cannot mount 367 // cgroup2 (#2158), so fall back to bind mount. 
368 bindM := &configs.Mount{ 369 Device: "bind", 370 Source: fs2.UnifiedMountpoint, 371 Destination: m.Destination, 372 Flags: unix.MS_BIND | m.Flags, 373 PropagationFlags: m.PropagationFlags, 374 } 375 if c.cgroupns && c.cgroup2Path != "" { 376 // Emulate cgroupns by bind-mounting the container cgroup path 377 // rather than the whole /sys/fs/cgroup. 378 bindM.Source = c.cgroup2Path 379 } 380 // mountToRootfs() handles remounting for MS_RDONLY. 381 err = mountToRootfs(c, mountEntry{Mount: bindM}) 382 if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { 383 // ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed 384 // outside the userns+mountns. 385 // 386 // Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted 387 // with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`). 388 err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error { 389 return maskPath(procfd, c.label) 390 }) 391 } 392 return err 393 } 394 395 func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { 396 // Set up a scratch dir for the tmpfs on the host. 397 tmpdir, err := prepareTmp("/tmp") 398 if err != nil { 399 return fmt.Errorf("tmpcopyup: failed to setup tmpdir: %w", err) 400 } 401 defer cleanupTmp(tmpdir) 402 tmpDir, err := os.MkdirTemp(tmpdir, "runctmpdir") 403 if err != nil { 404 return fmt.Errorf("tmpcopyup: failed to create tmpdir: %w", err) 405 } 406 defer os.RemoveAll(tmpDir) 407 408 // Configure the *host* tmpdir as if it's the container mount. We change 409 // m.Destination since we are going to mount *on the host*. 
410 oldDest := m.Destination 411 m.Destination = tmpDir 412 err = mountPropagate(m, "/", mountLabel) 413 m.Destination = oldDest 414 if err != nil { 415 return err 416 } 417 defer func() { 418 if Err != nil { 419 if err := unmount(tmpDir, unix.MNT_DETACH); err != nil { 420 logrus.Warnf("tmpcopyup: %v", err) 421 } 422 } 423 }() 424 425 return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) { 426 // Copy the container data to the host tmpdir. We append "/" to force 427 // CopyDirectory to resolve the symlink rather than trying to copy the 428 // symlink itself. 429 if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil { 430 return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err) 431 } 432 // Now move the mount into the container. 433 if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil { 434 return fmt.Errorf("tmpcopyup: failed to move mount: %w", err) 435 } 436 return nil 437 }) 438 } 439 440 const ( 441 // The atime "enum" flags (which are mutually exclusive). 442 mntAtimeEnumFlags = unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME 443 // All atime-related flags. 444 mntAtimeFlags = mntAtimeEnumFlags | unix.MS_NODIRATIME 445 // Flags which can be locked when inheriting mounts in a different userns. 446 // In the kernel, these are the mounts that are locked using MNT_LOCK_*. 447 mntLockFlags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC | 448 unix.MS_NOSUID | mntAtimeFlags 449 ) 450 451 func statfsToMountFlags(st unix.Statfs_t) int { 452 // From <linux/statfs.h>. 453 const ST_NOSYMFOLLOW = 0x2000 //nolint:revive 454 455 var flags int 456 for _, f := range []struct { 457 st, ms int 458 }{ 459 // See calculate_f_flags() in fs/statfs.c. 
460 {unix.ST_RDONLY, unix.MS_RDONLY}, 461 {unix.ST_NOSUID, unix.MS_NOSUID}, 462 {unix.ST_NODEV, unix.MS_NODEV}, 463 {unix.ST_NOEXEC, unix.MS_NOEXEC}, 464 {unix.ST_MANDLOCK, unix.MS_MANDLOCK}, 465 {unix.ST_SYNCHRONOUS, unix.MS_SYNCHRONOUS}, 466 {unix.ST_NOATIME, unix.MS_NOATIME}, 467 {unix.ST_NODIRATIME, unix.MS_NODIRATIME}, 468 {unix.ST_RELATIME, unix.MS_RELATIME}, 469 {ST_NOSYMFOLLOW, unix.MS_NOSYMFOLLOW}, 470 // There is no ST_STRICTATIME -- see below. 471 } { 472 if int(st.Flags)&f.st == f.st { 473 flags |= f.ms 474 } 475 } 476 // MS_STRICTATIME is a "fake" MS_* flag. It isn't stored in mnt->mnt_flags, 477 // and so it doesn't show up in statfs(2). If none of the other flags in 478 // atime enum are present, the mount is MS_STRICTATIME. 479 if flags&mntAtimeEnumFlags == 0 { 480 flags |= unix.MS_STRICTATIME 481 } 482 return flags 483 } 484 485 func mountToRootfs(c *mountConfig, m mountEntry) error { 486 rootfs := c.root 487 488 // procfs and sysfs are special because we need to ensure they are actually 489 // mounted on a specific path in a container without any funny business. 490 switch m.Device { 491 case "proc", "sysfs": 492 // If the destination already exists and is not a directory, we bail 493 // out. This is to avoid mounting through a symlink or similar -- which 494 // has been a "fun" attack scenario in the past. 495 // TODO: This won't be necessary once we switch to libpathrs and we can 496 // stop all of these symlink-exchange attacks. 497 dest := filepath.Clean(m.Destination) 498 if !strings.HasPrefix(dest, rootfs) { 499 // Do not use securejoin as it resolves symlinks. 
500 dest = filepath.Join(rootfs, dest) 501 } 502 if err := checkProcMount(rootfs, dest, m); err != nil { 503 return err 504 } 505 if fi, err := os.Lstat(dest); err != nil { 506 if !os.IsNotExist(err) { 507 return err 508 } 509 } else if !fi.IsDir() { 510 return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device) 511 } 512 if err := os.MkdirAll(dest, 0o755); err != nil { 513 return err 514 } 515 // Selinux kernels do not support labeling of /proc or /sys. 516 return mountPropagate(m, rootfs, "") 517 } 518 519 mountLabel := c.label 520 dest, err := securejoin.SecureJoin(rootfs, m.Destination) 521 if err != nil { 522 return err 523 } 524 if err := checkProcMount(rootfs, dest, m); err != nil { 525 return err 526 } 527 528 switch m.Device { 529 case "mqueue": 530 if err := os.MkdirAll(dest, 0o755); err != nil { 531 return err 532 } 533 if err := mountPropagate(m, rootfs, ""); err != nil { 534 return err 535 } 536 return label.SetFileLabel(dest, mountLabel) 537 case "tmpfs": 538 if stat, err := os.Stat(dest); err != nil { 539 if err := os.MkdirAll(dest, 0o755); err != nil { 540 return err 541 } 542 } else { 543 dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode())) 544 if m.Data != "" { 545 dt = dt + "," + m.Data 546 } 547 m.Data = dt 548 } 549 550 if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP { 551 err = doTmpfsCopyUp(m, rootfs, mountLabel) 552 } else { 553 err = mountPropagate(m, rootfs, mountLabel) 554 } 555 556 return err 557 case "bind": 558 fi, _, err := m.srcStat() 559 if err != nil { 560 // error out if the source of a bind mount does not exist as we will be 561 // unable to bind anything to it. 562 return err 563 } 564 if err := createIfNotExists(dest, fi.IsDir()); err != nil { 565 return err 566 } 567 // open_tree()-related shenanigans are all handled in mountViaFds. 
568 if err := mountPropagate(m, rootfs, mountLabel); err != nil { 569 return err 570 } 571 572 // The initial MS_BIND won't change the mount options, we need to do a 573 // separate MS_BIND|MS_REMOUNT to apply the mount options. We skip 574 // doing this if the user has not specified any mount flags at all 575 // (including cleared flags) -- in which case we just keep the original 576 // mount flags. 577 // 578 // Note that the fact we check whether any clearing flags are set is in 579 // contrast to mount(8)'s current behaviour, but is what users probably 580 // expect. See <https://github.com/util-linux/util-linux/issues/2433>. 581 if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 { 582 if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { 583 flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT 584 // The runtime-spec says we SHOULD map to the relevant mount(8) 585 // behaviour. However, it's not clear whether we want the 586 // "mount --bind -o ..." or "mount --bind -o remount,..." 587 // behaviour here -- both of which are somewhat broken[1]. 588 // 589 // So, if the user has passed "remount" as a mount option, we 590 // implement the "mount --bind -o remount" behaviour, otherwise 591 // we implement the spiritual intent of the "mount --bind -o" 592 // behaviour, which should match what users expect. Maybe 593 // mount(8) will eventually implement this behaviour too.. 594 // 595 // [1]: https://github.com/util-linux/util-linux/issues/2433 596 597 // Initially, we emulate "mount --bind -o ..." where we set 598 // only the requested flags (clearing any existing flags). The 599 // only difference from mount(8) is that we do this 600 // unconditionally, regardless of whether any set-me mount 601 // options have been requested. 
602 // 603 // TODO: We are not doing any special handling of the atime 604 // flags here, which means that the mount will inherit the old 605 // atime flags if the user didn't explicitly request a 606 // different set of flags. This also has the mount(8) bug where 607 // "nodiratime,norelatime" will result in a 608 // "nodiratime,relatime" mount. 609 mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "") 610 if mountErr == nil { 611 return nil 612 } 613 614 // If the mount failed, the mount may contain locked mount 615 // flags. In that case, we emulate "mount --bind -o 616 // remount,...", where we take the existing mount flags of the 617 // mount and apply the request flags (including clearing flags) 618 // on top. The main divergence we have from mount(8) here is 619 // that we handle atimes correctly to make sure we error out if 620 // we cannot fulfil the requested mount flags. 621 622 st, err := m.srcStatfs() 623 if err != nil { 624 return err 625 } 626 srcFlags := statfsToMountFlags(*st) 627 // If the user explicitly request one of the locked flags *not* 628 // be set, we need to return an error to avoid producing mounts 629 // that don't match the user's request. 630 if srcFlags&m.ClearedFlags&mntLockFlags != 0 { 631 return mountErr 632 } 633 634 // If an MS_*ATIME flag was requested, it must match the 635 // existing one. This handles two separate kernel bugs, and 636 // matches the logic of can_change_locked_flags() but without 637 // these bugs: 638 // 639 // * (2.6.30+) Since commit 613cbe3d4870 ("Don't set relatime 640 // when noatime is specified"), MS_RELATIME is ignored when 641 // MS_NOATIME is set. This means that us inheriting MS_NOATIME 642 // from a mount while requesting MS_RELATIME would *silently* 643 // produce an MS_NOATIME mount. 
644 // 645 // * (2.6.30+) Since its introduction in commit d0adde574b84 646 // ("Add a strictatime mount option"), MS_STRICTATIME has 647 // caused any passed MS_RELATIME and MS_NOATIME flags to be 648 // ignored which results in us *silently* producing 649 // MS_STRICTATIME mounts even if the user requested MS_RELATIME 650 // or MS_NOATIME. 651 if m.Flags&mntAtimeFlags != 0 && m.Flags&mntAtimeFlags != srcFlags&mntAtimeFlags { 652 return mountErr 653 } 654 655 // Retry the mount with the existing lockable mount flags 656 // applied. 657 flags |= srcFlags & mntLockFlags 658 mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "") 659 logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr) 660 return mountErr 661 }); err != nil { 662 return err 663 } 664 } 665 666 if m.Relabel != "" { 667 if err := label.Validate(m.Relabel); err != nil { 668 return err 669 } 670 shared := label.IsShared(m.Relabel) 671 if err := label.Relabel(m.Source, mountLabel, shared); err != nil { 672 return err 673 } 674 } 675 return setRecAttr(m.Mount, rootfs) 676 case "cgroup": 677 if cgroups.IsCgroup2UnifiedMode() { 678 return mountCgroupV2(m.Mount, c) 679 } 680 return mountCgroupV1(m.Mount, c) 681 default: 682 if err := os.MkdirAll(dest, 0o755); err != nil { 683 return err 684 } 685 return mountPropagate(m, rootfs, mountLabel) 686 } 687 } 688 689 func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { 690 mounts, err := cgroups.GetCgroupMounts(false) 691 if err != nil { 692 return nil, err 693 } 694 695 // We don't need to use /proc/thread-self here because runc always runs 696 // with every thread in the same cgroup. This lets us avoid having to do 697 // runtime.LockOSThread. 
698 cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") 699 if err != nil { 700 return nil, err 701 } 702 703 var binds []*configs.Mount 704 705 for _, mm := range mounts { 706 dir, err := mm.GetOwnCgroup(cgroupPaths) 707 if err != nil { 708 return nil, err 709 } 710 relDir, err := filepath.Rel(mm.Root, dir) 711 if err != nil { 712 return nil, err 713 } 714 binds = append(binds, &configs.Mount{ 715 Device: "bind", 716 Source: filepath.Join(mm.Mountpoint, relDir), 717 Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)), 718 Flags: unix.MS_BIND | unix.MS_REC | m.Flags, 719 PropagationFlags: m.PropagationFlags, 720 }) 721 } 722 723 return binds, nil 724 } 725 726 // Taken from <include/linux/proc_ns.h>. If a file is on a filesystem of type 727 // PROC_SUPER_MAGIC, we're guaranteed that only the root of the superblock will 728 // have this inode number. 729 const procRootIno = 1 730 731 // checkProcMount checks to ensure that the mount destination is not over the top of /proc. 732 // dest is required to be an abs path and have any symlinks resolved before calling this function. 733 // 734 // If m is nil, don't stat the filesystem. This is used for restore of a checkpoint. 735 func checkProcMount(rootfs, dest string, m mountEntry) error { 736 const procPath = "/proc" 737 path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest) 738 if err != nil { 739 return err 740 } 741 // pass if the mount path is located outside of /proc 742 if strings.HasPrefix(path, "..") { 743 return nil 744 } 745 if path == "." { 746 // Only allow bind-mounts on top of /proc, and only if the source is a 747 // procfs mount. 
748 if m.IsBind() { 749 fsSt, err := m.srcStatfs() 750 if err != nil { 751 return err 752 } 753 if fsSt.Type == unix.PROC_SUPER_MAGIC { 754 if _, uSt, err := m.srcStat(); err != nil { 755 return err 756 } else if uSt.Ino != procRootIno { 757 // We cannot error out in this case, because we've 758 // supported these kinds of mounts for a long time. 759 // However, we would expect users to bind-mount the root of 760 // a real procfs on top of /proc in the container. We might 761 // want to block this in the future. 762 logrus.Warnf("bind-mount %v (source %v) is of type procfs but is not the root of a procfs (inode %d). Future versions of runc might block this configuration -- please report an issue to <https://github.com/opencontainers/runc> if you see this warning.", dest, m.srcName(), uSt.Ino) 763 } 764 return nil 765 } 766 } else if m.Device == "proc" { 767 // Fresh procfs-type mounts are always safe to mount on top of /proc. 768 return nil 769 } 770 return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest) 771 } 772 773 // Here dest is definitely under /proc. Do not allow those, 774 // except for a few specific entries emulated by lxcfs. 775 validProcMounts := []string{ 776 "/proc/cpuinfo", 777 "/proc/diskstats", 778 "/proc/meminfo", 779 "/proc/stat", 780 "/proc/swaps", 781 "/proc/uptime", 782 "/proc/loadavg", 783 "/proc/slabinfo", 784 "/proc/net/dev", 785 "/proc/sys/kernel/ns_last_pid", 786 "/proc/sys/crypto/fips_enabled", 787 } 788 for _, valid := range validProcMounts { 789 path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) 790 if err != nil { 791 return err 792 } 793 if path == "." { 794 return nil 795 } 796 } 797 798 return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest) 799 } 800 801 func setupDevSymlinks(rootfs string) error { 802 // In theory, these should be links to /proc/thread-self, but systems 803 // expect these to be /proc/self and this matches how most distributions 804 // work. 
805 links := [][2]string{ 806 {"/proc/self/fd", "/dev/fd"}, 807 {"/proc/self/fd/0", "/dev/stdin"}, 808 {"/proc/self/fd/1", "/dev/stdout"}, 809 {"/proc/self/fd/2", "/dev/stderr"}, 810 } 811 // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink 812 // in /dev if it exists in /proc. 813 if _, err := os.Stat("/proc/kcore"); err == nil { 814 links = append(links, [2]string{"/proc/kcore", "/dev/core"}) 815 } 816 for _, link := range links { 817 var ( 818 src = link[0] 819 dst = filepath.Join(rootfs, link[1]) 820 ) 821 if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { 822 return err 823 } 824 } 825 return nil 826 } 827 828 // If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs 829 // this method will make them point to `/dev/null` in this container's rootfs. This 830 // needs to be called after we chroot/pivot into the container's rootfs so that any 831 // symlinks are resolved locally. 832 func reOpenDevNull() error { 833 var stat, devNullStat unix.Stat_t 834 file, err := os.OpenFile("/dev/null", os.O_RDWR, 0) 835 if err != nil { 836 return err 837 } 838 defer file.Close() //nolint: errcheck 839 if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil { 840 return &os.PathError{Op: "fstat", Path: file.Name(), Err: err} 841 } 842 for fd := 0; fd < 3; fd++ { 843 if err := unix.Fstat(fd, &stat); err != nil { 844 return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(fd), Err: err} 845 } 846 if stat.Rdev == devNullStat.Rdev { 847 // Close and re-open the fd. 848 if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil { 849 return &os.PathError{ 850 Op: "dup3", 851 Path: "fd " + strconv.Itoa(int(file.Fd())), 852 Err: err, 853 } 854 } 855 } 856 } 857 return nil 858 } 859 860 // Create the device nodes in the container. 
861 func createDevices(config *configs.Config) error { 862 useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) 863 for _, node := range config.Devices { 864 865 // The /dev/ptmx device is setup by setupPtmx() 866 if utils.CleanPath(node.Path) == "/dev/ptmx" { 867 continue 868 } 869 870 // containers running in a user namespace are not allowed to mknod 871 // devices so we can just bind mount it from the host. 872 if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { 873 return err 874 } 875 } 876 return nil 877 } 878 879 func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error { 880 f, err := os.Create(dest) 881 if err != nil && !os.IsExist(err) { 882 return err 883 } 884 if f != nil { 885 _ = f.Close() 886 } 887 return utils.WithProcfd(rootfs, dest, func(dstFd string) error { 888 return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "") 889 }) 890 } 891 892 // Creates the device node in the rootfs of the container. 893 func createDeviceNode(rootfs string, node *devices.Device, bind bool) error { 894 if node.Path == "" { 895 // The node only exists for cgroup reasons, ignore it here. 
896 return nil 897 } 898 dest, err := securejoin.SecureJoin(rootfs, node.Path) 899 if err != nil { 900 return err 901 } 902 if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil { 903 return err 904 } 905 if bind { 906 return bindMountDeviceNode(rootfs, dest, node) 907 } 908 if err := mknodDevice(dest, node); err != nil { 909 if errors.Is(err, os.ErrExist) { 910 return nil 911 } else if errors.Is(err, os.ErrPermission) { 912 return bindMountDeviceNode(rootfs, dest, node) 913 } 914 return err 915 } 916 return nil 917 } 918 919 func mknodDevice(dest string, node *devices.Device) error { 920 fileMode := node.FileMode 921 switch node.Type { 922 case devices.BlockDevice: 923 fileMode |= unix.S_IFBLK 924 case devices.CharDevice: 925 fileMode |= unix.S_IFCHR 926 case devices.FifoDevice: 927 fileMode |= unix.S_IFIFO 928 default: 929 return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) 930 } 931 dev, err := node.Mkdev() 932 if err != nil { 933 return err 934 } 935 if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil { 936 return &os.PathError{Op: "mknod", Path: dest, Err: err} 937 } 938 // Ensure permission bits (can be different because of umask). 939 if err := os.Chmod(dest, fileMode); err != nil { 940 return err 941 } 942 return os.Chown(dest, int(node.Uid), int(node.Gid)) 943 } 944 945 // Get the parent mount point of directory passed in as argument. Also return 946 // optional fields. 
947 func getParentMount(rootfs string) (string, string, error) { 948 mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(rootfs)) 949 if err != nil { 950 return "", "", err 951 } 952 if len(mi) < 1 { 953 return "", "", fmt.Errorf("could not find parent mount of %s", rootfs) 954 } 955 956 // find the longest mount point 957 var idx, maxlen int 958 for i := range mi { 959 if len(mi[i].Mountpoint) > maxlen { 960 maxlen = len(mi[i].Mountpoint) 961 idx = i 962 } 963 } 964 return mi[idx].Mountpoint, mi[idx].Optional, nil 965 } 966 967 // Make parent mount private if it was shared 968 func rootfsParentMountPrivate(rootfs string) error { 969 sharedMount := false 970 971 parentMount, optionalOpts, err := getParentMount(rootfs) 972 if err != nil { 973 return err 974 } 975 976 optsSplit := strings.Split(optionalOpts, " ") 977 for _, opt := range optsSplit { 978 if strings.HasPrefix(opt, "shared:") { 979 sharedMount = true 980 break 981 } 982 } 983 984 // Make parent mount PRIVATE if it was shared. It is needed for two 985 // reasons. First of all pivot_root() will fail if parent mount is 986 // shared. Secondly when we bind mount rootfs it will propagate to 987 // parent namespace and we don't want that to happen. 988 if sharedMount { 989 return mount("", parentMount, "", unix.MS_PRIVATE, "") 990 } 991 992 return nil 993 } 994 995 func prepareRoot(config *configs.Config) error { 996 flag := unix.MS_SLAVE | unix.MS_REC 997 if config.RootPropagation != 0 { 998 flag = config.RootPropagation 999 } 1000 if err := mount("", "/", "", uintptr(flag), ""); err != nil { 1001 return err 1002 } 1003 1004 // Make parent mount private to make sure following bind mount does 1005 // not propagate in other namespaces. Also it will help with kernel 1006 // check pass in pivot_root. 
(IS_SHARED(new_mnt->mnt_parent)) 1007 if err := rootfsParentMountPrivate(config.Rootfs); err != nil { 1008 return err 1009 } 1010 1011 return mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "") 1012 } 1013 1014 func setReadonly() error { 1015 flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY) 1016 1017 err := mount("", "/", "", flags, "") 1018 if err == nil { 1019 return nil 1020 } 1021 var s unix.Statfs_t 1022 if err := unix.Statfs("/", &s); err != nil { 1023 return &os.PathError{Op: "statfs", Path: "/", Err: err} 1024 } 1025 flags |= uintptr(s.Flags) 1026 return mount("", "/", "", flags, "") 1027 } 1028 1029 func setupPtmx(config *configs.Config) error { 1030 ptmx := filepath.Join(config.Rootfs, "dev/ptmx") 1031 if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { 1032 return err 1033 } 1034 if err := os.Symlink("pts/ptmx", ptmx); err != nil { 1035 return err 1036 } 1037 return nil 1038 } 1039 1040 // pivotRoot will call pivot_root such that rootfs becomes the new root 1041 // filesystem, and everything else is cleaned up. 1042 func pivotRoot(rootfs string) error { 1043 // While the documentation may claim otherwise, pivot_root(".", ".") is 1044 // actually valid. What this results in is / being the new root but 1045 // /proc/self/cwd being the old root. Since we can play around with the cwd 1046 // with pivot_root this allows us to pivot without creating directories in 1047 // the rootfs. Shout-outs to the LXC developers for giving us this idea. 
1048 1049 oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) 1050 if err != nil { 1051 return &os.PathError{Op: "open", Path: "/", Err: err} 1052 } 1053 defer unix.Close(oldroot) //nolint: errcheck 1054 1055 newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) 1056 if err != nil { 1057 return &os.PathError{Op: "open", Path: rootfs, Err: err} 1058 } 1059 defer unix.Close(newroot) //nolint: errcheck 1060 1061 // Change to the new root so that the pivot_root actually acts on it. 1062 if err := unix.Fchdir(newroot); err != nil { 1063 return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err} 1064 } 1065 1066 if err := unix.PivotRoot(".", "."); err != nil { 1067 return &os.PathError{Op: "pivot_root", Path: ".", Err: err} 1068 } 1069 1070 // Currently our "." is oldroot (according to the current kernel code). 1071 // However, purely for safety, we will fchdir(oldroot) since there isn't 1072 // really any guarantee from the kernel what /proc/self/cwd will be after a 1073 // pivot_root(2). 1074 1075 if err := unix.Fchdir(oldroot); err != nil { 1076 return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err} 1077 } 1078 1079 // Make oldroot rslave to make sure our unmounts don't propagate to the 1080 // host (and thus bork the machine). We don't use rprivate because this is 1081 // known to cause issues due to races where we still have a reference to a 1082 // mount while a process in the host namespace are trying to operate on 1083 // something they think has no mounts (devicemapper in particular). 1084 if err := mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { 1085 return err 1086 } 1087 // Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. 1088 if err := unmount(".", unix.MNT_DETACH); err != nil { 1089 return err 1090 } 1091 1092 // Switch back to our shiny new root. 
1093 if err := unix.Chdir("/"); err != nil { 1094 return &os.PathError{Op: "chdir", Path: "/", Err: err} 1095 } 1096 return nil 1097 } 1098 1099 func msMoveRoot(rootfs string) error { 1100 // Before we move the root and chroot we have to mask all "full" sysfs and 1101 // procfs mounts which exist on the host. This is because while the kernel 1102 // has protections against mounting procfs if it has masks, when using 1103 // chroot(2) the *host* procfs mount is still reachable in the mount 1104 // namespace and the kernel permits procfs mounts inside --no-pivot 1105 // containers. 1106 // 1107 // Users shouldn't be using --no-pivot except in exceptional circumstances, 1108 // but to avoid such a trivial security flaw we apply a best-effort 1109 // protection here. The kernel only allows a mount of a pseudo-filesystem 1110 // like procfs or sysfs if there is a *full* mount (the root of the 1111 // filesystem is mounted) without any other locked mount points covering a 1112 // subtree of the mount. 1113 // 1114 // So we try to unmount (or mount tmpfs on top of) any mountpoint which is 1115 // a full mount of either sysfs or procfs (since those are the most 1116 // concerning filesystems to us). 1117 mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) { 1118 // Collect every sysfs and procfs filesystem, except for those which 1119 // are non-full mounts or are inside the rootfs of the container. 1120 if info.Root != "/" || 1121 (info.FSType != "proc" && info.FSType != "sysfs") || 1122 strings.HasPrefix(info.Mountpoint, rootfs) { 1123 skip = true 1124 } 1125 return 1126 }) 1127 if err != nil { 1128 return err 1129 } 1130 for _, info := range mountinfos { 1131 p := info.Mountpoint 1132 // Be sure umount events are not propagated to the host. 
1133 if err := mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { 1134 if errors.Is(err, unix.ENOENT) { 1135 // If the mountpoint doesn't exist that means that we've 1136 // already blasted away some parent directory of the mountpoint 1137 // and so we don't care about this error. 1138 continue 1139 } 1140 return err 1141 } 1142 if err := unmount(p, unix.MNT_DETACH); err != nil { 1143 if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) { 1144 return err 1145 } else { 1146 // If we have not privileges for umounting (e.g. rootless), then 1147 // cover the path. 1148 if err := mount("tmpfs", p, "tmpfs", 0, ""); err != nil { 1149 return err 1150 } 1151 } 1152 } 1153 } 1154 1155 // Move the rootfs on top of "/" in our mount namespace. 1156 if err := mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { 1157 return err 1158 } 1159 return chroot() 1160 } 1161 1162 func chroot() error { 1163 if err := unix.Chroot("."); err != nil { 1164 return &os.PathError{Op: "chroot", Path: ".", Err: err} 1165 } 1166 if err := unix.Chdir("/"); err != nil { 1167 return &os.PathError{Op: "chdir", Path: "/", Err: err} 1168 } 1169 return nil 1170 } 1171 1172 // createIfNotExists creates a file or a directory only if it does not already exist. 1173 func createIfNotExists(path string, isDir bool) error { 1174 if _, err := os.Stat(path); err != nil { 1175 if os.IsNotExist(err) { 1176 if isDir { 1177 return os.MkdirAll(path, 0o755) 1178 } 1179 if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { 1180 return err 1181 } 1182 f, err := os.OpenFile(path, os.O_CREATE, 0o755) 1183 if err != nil { 1184 return err 1185 } 1186 _ = f.Close() 1187 } 1188 } 1189 return nil 1190 } 1191 1192 // readonlyPath will make a path read only. 
1193 func readonlyPath(path string) error { 1194 if err := mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { 1195 if errors.Is(err, os.ErrNotExist) { 1196 return nil 1197 } 1198 return err 1199 } 1200 1201 var s unix.Statfs_t 1202 if err := unix.Statfs(path, &s); err != nil { 1203 return &os.PathError{Op: "statfs", Path: path, Err: err} 1204 } 1205 flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC) 1206 1207 if err := mount(path, path, "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil { 1208 return err 1209 } 1210 1211 return nil 1212 } 1213 1214 // remountReadonly will remount an existing mount point and ensure that it is read-only. 1215 func remountReadonly(m *configs.Mount) error { 1216 var ( 1217 dest = m.Destination 1218 flags = m.Flags 1219 ) 1220 for i := 0; i < 5; i++ { 1221 // There is a special case in the kernel for 1222 // MS_REMOUNT | MS_BIND, which allows us to change only the 1223 // flags even as an unprivileged user (i.e. user namespace) 1224 // assuming we don't drop any security related flags (nodev, 1225 // nosuid, etc.). So, let's use that case so that we can do 1226 // this re-mount without failing in a userns. 1227 flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY 1228 if err := mount("", dest, "", uintptr(flags), ""); err != nil { 1229 if errors.Is(err, unix.EBUSY) { 1230 time.Sleep(100 * time.Millisecond) 1231 continue 1232 } 1233 return err 1234 } 1235 return nil 1236 } 1237 return fmt.Errorf("unable to mount %s as readonly max retries reached", dest) 1238 } 1239 1240 // maskPath masks the top of the specified path inside a container to avoid 1241 // security issues from processes reading information from non-namespace aware 1242 // mounts ( proc/kcore ). 1243 // For files, maskPath bind mounts /dev/null over the top of the specified path. 1244 // For directories, maskPath mounts read-only tmpfs over the top of the specified path. 
1245 func maskPath(path string, mountLabel string) error { 1246 if err := mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) { 1247 if errors.Is(err, unix.ENOTDIR) { 1248 return mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) 1249 } 1250 return err 1251 } 1252 return nil 1253 } 1254 1255 // writeSystemProperty writes the value to a path under /proc/sys as determined from the key. 1256 // For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. 1257 func writeSystemProperty(key, value string) error { 1258 keyPath := strings.Replace(key, ".", "/", -1) 1259 return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644) 1260 } 1261 1262 // Do the mount operation followed by additional mounts required to take care 1263 // of propagation flags. This will always be scoped inside the container rootfs. 1264 func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { 1265 var ( 1266 data = label.FormatMountLabel(m.Data, mountLabel) 1267 flags = m.Flags 1268 ) 1269 // Delay mounting the filesystem read-only if we need to do further 1270 // operations on it. We need to set up files in "/dev", and other tmpfs 1271 // mounts may need to be chmod-ed after mounting. These mounts will be 1272 // remounted ro later in finalizeRootfs(), if necessary. 1273 if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" { 1274 flags &= ^unix.MS_RDONLY 1275 } 1276 1277 // Because the destination is inside a container path which might be 1278 // mutating underneath us, we verify that we are actually going to mount 1279 // inside the container with WithProcfd() -- mounting through a procfd 1280 // mounts on the target. 
1281 if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { 1282 return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data) 1283 }); err != nil { 1284 return err 1285 } 1286 // We have to apply mount propagation flags in a separate WithProcfd() call 1287 // because the previous call invalidates the passed procfd -- the mount 1288 // target needs to be re-opened. 1289 if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { 1290 for _, pflag := range m.PropagationFlags { 1291 if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil { 1292 return err 1293 } 1294 } 1295 return nil 1296 }); err != nil { 1297 return fmt.Errorf("change mount propagation through procfd: %w", err) 1298 } 1299 return nil 1300 } 1301 1302 func setRecAttr(m *configs.Mount, rootfs string) error { 1303 if m.RecAttr == nil { 1304 return nil 1305 } 1306 return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { 1307 return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr) 1308 }) 1309 }