github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/fs.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"fmt"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/fd"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/gofer"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/ramfs"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/user"
	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/cgroupfs"
	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/devpts"
	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/devtmpfs"
	gofervfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/gofer"
	procvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/proc"
	sysvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/sys"
	tmpfsvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/runsc/config"
	"github.com/SagerNet/gvisor/runsc/specutils"

	// Include filesystem types that OCI spec might mount.
	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/dev"
	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/host"
	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/proc"
	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/sys"
	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/tmpfs"
	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/tty"
)

const (
	// Device name for root mount.
	rootDevice = "9pfs-/"

	// MountPrefix is the annotation prefix for mount hints.
	MountPrefix = "dev.gvisor.spec.mount."

	// Supported filesystems that map to different internal filesystems.
	bind   = "bind"
	nonefs = "none"
)

// tmpfs has some extra supported options that we must pass through.
var tmpfsAllowedData = []string{"mode", "uid", "gid"}

func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
	// Upper layer uses the same flags as lower, but it must be read-write.
	upperFlags := lowerFlags
	upperFlags.ReadOnly = false

	tmpFS := mustFindFilesystem("tmpfs")
	if !fs.IsDir(lower.StableAttr) {
		// Create overlay on top of mount file, e.g. /etc/hostname.
		msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags)
		return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
	}

	// Create overlay on top of mount dir.
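	// The upper layer is a fresh tmpfs mount (named "<name>-upper") so that
	// all writes land in the upper layer while the lower layer stays
	// untouched.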
	upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
	if err != nil {
		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
	}

	// Replicate permissions and owner from lower to upper mount point.
	attr, err := lower.UnstableAttr(ctx)
	if err != nil {
		return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
	}
	if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) {
		return nil, fmt.Errorf("error setting permission to upper mount point")
	}
	if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil {
		return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
	}

	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
}

// compileMounts returns the supported mounts from the mount spec, adding any
// mandatory mounts that are required by the OCI specification.
func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []specs.Mount {
	// Keep track of whether proc and sys were mounted.
	var procMounted, sysMounted, devMounted, devptsMounted bool
	var mounts []specs.Mount

	// Mount all submounts from the spec.
	for _, m := range spec.Mounts {
		if !specutils.IsSupportedDevMount(m, vfs2Enabled) {
			log.Warningf("ignoring dev mount at %q", m.Destination)
			continue
		}
		// Unconditionally drop any cgroupfs mounts. If requested, we'll add our
		// own below.
		if m.Type == cgroupfs.Name {
			continue
		}
		switch filepath.Clean(m.Destination) {
		case "/proc":
			procMounted = true
		case "/sys":
			sysMounted = true
		case "/dev":
			m.Type = devtmpfs.Name
			devMounted = true
		case "/dev/pts":
			m.Type = devpts.Name
			devptsMounted = true
		}
		mounts = append(mounts, m)
	}

	// Mount proc and sys even if the user did not ask for it, as the spec
	// says we SHOULD.
	var mandatoryMounts []specs.Mount

	if conf.Cgroupfs {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        tmpfsvfs2.Name,
			Destination: "/sys/fs/cgroup",
		})
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        cgroupfs.Name,
			Destination: "/sys/fs/cgroup/memory",
			Options:     []string{"memory"},
		})
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        cgroupfs.Name,
			Destination: "/sys/fs/cgroup/cpu",
			Options:     []string{"cpu"},
		})
	}

	if !procMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        procvfs2.Name,
			Destination: "/proc",
		})
	}
	if !sysMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        sysvfs2.Name,
			Destination: "/sys",
		})
	}
	if !devMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        devtmpfs.Name,
			Destination: "/dev",
		})
	}
	if !devptsMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        devpts.Name,
			Destination: "/dev/pts",
		})
	}

	// The mandatory mounts should be ordered right after the root, in case
	// there are submounts of these mandatory mounts already in the spec.
	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)

	return mounts
}

// p9MountData creates a slice of p9 mount data.
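//
// For illustration only: with fd 5, shared file access, and VFS1, the
// resulting options are roughly
//
//	trans=fd,rfdno=5,wfdno=5,privateunixsocket=true,cache=remote_revalidating
//
// (callers join the returned slice with ",").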
func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
	opts := []string{
		"trans=fd",
		"rfdno=" + strconv.Itoa(fd),
		"wfdno=" + strconv.Itoa(fd),
	}
	if !vfs2 {
		// privateunixsocket is always enabled in VFS2. VFS1 requires explicit
		// enablement.
		opts = append(opts, "privateunixsocket=true")
	}
	if fa == config.FileAccessShared {
		opts = append(opts, "cache=remote_revalidating")
	}
	return opts
}

// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
// keys.
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
	var out []string
	for _, o := range opts {
		ok, err := parseMountOption(o, allowedKeys...)
		if err != nil {
			return nil, err
		}
		if ok {
			out = append(out, o)
		}
	}
	return out, nil
}

func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
	kv := strings.SplitN(opt, "=", 3)
	if len(kv) > 2 {
		return false, fmt.Errorf("invalid option %q", opt)
	}
	return specutils.ContainsStr(allowedKeys, kv[0]), nil
}

// mountDevice returns a device string based on the fs type and target
// of the mount.
func mountDevice(m *specs.Mount) string {
	if m.Type == bind {
		// Make a device string that includes the target, which is consistent across
		// S/R and uniquely identifies the connection.
		return "9pfs-" + m.Destination
	}
	// All other fs types use device "none".
	return "none"
}

func mountFlags(opts []string) fs.MountSourceFlags {
	mf := fs.MountSourceFlags{}
	// Note: changes to supported options must be reflected in
	// isSupportedMountFlag() as well.
	for _, o := range opts {
		switch o {
		case "rw":
			mf.ReadOnly = false
		case "ro":
			mf.ReadOnly = true
		case "noatime":
			mf.NoAtime = true
		case "noexec":
			mf.NoExec = true
		case "bind", "rbind":
			// These are the same as a mount with type="bind".
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}
	return mf
}

func isSupportedMountFlag(fstype, opt string) bool {
	switch opt {
	case "rw", "ro", "noatime", "noexec":
		return true
	}
	if fstype == tmpfsvfs2.Name {
		ok, err := parseMountOption(opt, tmpfsAllowedData...)
		return ok && err == nil
	}
	if fstype == cgroupfs.Name {
		ok, err := parseMountOption(opt, cgroupfs.SupportedMountOptions...)
		return ok && err == nil
	}
	return false
}

func mustFindFilesystem(name string) fs.Filesystem {
	fs, ok := fs.FindFilesystem(name)
	if !ok {
		panic(fmt.Sprintf("could not find filesystem %q", name))
	}
	return fs
}

// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
	// Construct a ramfs tree of mount points. The contents never
	// change, so this can be fully caching. There's no real
	// filesystem backing this tree, so we set the filesystem to
	// nil.
	msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
	mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
	if err != nil {
		return nil, fmt.Errorf("creating mount tree: %v", err)
	}
	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
	if err != nil {
		return nil, fmt.Errorf("adding mount overlay: %v", err)
	}
	return overlayInode, err
}

// subtargets takes a set of Mounts and returns only the targets that are
// children of the given root. The returned paths are relative to the root.
func subtargets(root string, mnts []specs.Mount) []string {
	var targets []string
	for _, mnt := range mnts {
		if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
			targets = append(targets, relPath)
		}
	}
	return targets
}

func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
	if conf.VFS2 {
		return setupContainerVFS2(ctx, conf, mntr, procArgs)
	}
	mns, err := mntr.setupFS(conf, procArgs)
	if err != nil {
		return err
	}

	// Set namespace here so that it can be found in ctx.
	procArgs.MountNamespace = mns

	// Resolve the executable path from working dir and environment.
	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
	if err != nil {
		return err
	}
	procArgs.Filename = resolved
	return nil
}

func adjustDirentCache(k *kernel.Kernel) error {
	var hl unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &hl); err != nil {
		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
	}
	if hl.Cur != unix.RLIM_INFINITY {
		newSize := hl.Cur / 2
		if newSize < gofer.DefaultDirentCacheSize {
			log.Infof("Setting gofer dirent cache size to %d", newSize)
			gofer.DefaultDirentCacheSize = newSize
			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
		}
	}
	return nil
}

type fdDispenser struct {
	fds []*fd.FD
}

func (f *fdDispenser) remove() int {
	if f.empty() {
		panic("fdDispenser out of fds")
	}
	rv := f.fds[0].Release()
	f.fds = f.fds[1:]
	return rv
}

func (f *fdDispenser) empty() bool {
	return len(f.fds) == 0
}

type shareType int

const (
	invalid shareType = iota

	// container shareType indicates that the mount is used by a single container.
	container

	// pod shareType indicates that the mount is used by more than one container
	// inside the pod.
	pod

	// shared shareType indicates that the mount can also be shared with a process
	// outside the pod, e.g. NFS.
	shared
)

func parseShare(val string) (shareType, error) {
	switch val {
	case "container":
		return container, nil
	case "pod":
		return pod, nil
	case "shared":
		return shared, nil
	default:
		return 0, fmt.Errorf("invalid share value %q", val)
	}
}

func (s shareType) String() string {
	switch s {
	case invalid:
		return "invalid"
	case container:
		return "container"
	case pod:
		return "pod"
	case shared:
		return "shared"
	default:
		return fmt.Sprintf("invalid share value %d", s)
	}
}

// mountHint represents extra information about mounts that are provided via
// annotations. They can override mount type, and provide sharing information
// so that mounts can be correctly shared inside the pod.
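//
// As a purely illustrative example (the volume name "shared-tmp" is made up),
// the following annotations declare a tmpfs volume that is mounted once and
// then shared by every container in the pod:
//
//	dev.gvisor.spec.mount.shared-tmp.source=/tmp/shared
//	dev.gvisor.spec.mount.shared-tmp.type=tmpfs
//	dev.gvisor.spec.mount.shared-tmp.share=pod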
type mountHint struct {
	name  string
	share shareType
	mount specs.Mount

	// root is the inode where the volume is mounted. For mounts with 'pod' share
	// the volume is mounted once and then bind mounted inside the containers.
	root *fs.Inode

	// vfsMount is the master mount for the volume. For mounts with 'pod' share
	// the master volume is bind mounted inside the containers.
	vfsMount *vfs.Mount
}

func (m *mountHint) setField(key, val string) error {
	switch key {
	case "source":
		if len(val) == 0 {
			return fmt.Errorf("source cannot be empty")
		}
		m.mount.Source = val
	case "type":
		return m.setType(val)
	case "share":
		share, err := parseShare(val)
		if err != nil {
			return err
		}
		m.share = share
	case "options":
		return m.setOptions(val)
	default:
		return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
	}
	return nil
}

func (m *mountHint) setType(val string) error {
	switch val {
	case "tmpfs", "bind":
		m.mount.Type = val
	default:
		return fmt.Errorf("invalid type %q", val)
	}
	return nil
}

func (m *mountHint) setOptions(val string) error {
	opts := strings.Split(val, ",")
	if err := specutils.ValidateMountOptions(opts); err != nil {
		return err
	}
	// Sort options so they can be compared with container mount options later on.
	sort.Strings(opts)
	m.mount.Options = opts
	return nil
}

func (m *mountHint) isSupported() bool {
	return m.mount.Type == tmpfsvfs2.Name && m.share == pod
}

// checkCompatible verifies that the shared mount is compatible with the master.
// For now enforce that all options are the same. Once bind mount is properly
// supported, then we should ensure the master is less restrictive than the
// container, e.g. master can be 'rw' while container mounts as 'ro'.
func (m *mountHint) checkCompatible(mount *specs.Mount) error {
	// Remove options that don't affect the mount's behavior.
	masterOpts := filterUnsupportedOptions(&m.mount)
	replicaOpts := filterUnsupportedOptions(mount)

	if len(masterOpts) != len(replicaOpts) {
		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
	}

	sort.Strings(masterOpts)
	sort.Strings(replicaOpts)
	for i, opt := range masterOpts {
		if opt != replicaOpts[i] {
			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
		}
	}
	return nil
}

func (m *mountHint) fileAccessType() config.FileAccessType {
	if m.share == container {
		return config.FileAccessExclusive
	}
	return config.FileAccessShared
}

func filterUnsupportedOptions(mount *specs.Mount) []string {
	rv := make([]string, 0, len(mount.Options))
	for _, o := range mount.Options {
		if isSupportedMountFlag(mount.Type, o) {
			rv = append(rv, o)
		}
	}
	return rv
}

// podMountHints contains a collection of mountHints for the pod.
type podMountHints struct {
	mounts map[string]*mountHint
}

func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
	mnts := make(map[string]*mountHint)
	for k, v := range spec.Annotations {
		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
		if strings.HasPrefix(k, MountPrefix) {
			// Remove the prefix and split the rest.
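			// e.g. "dev.gvisor.spec.mount.shared-tmp.share" => ["shared-tmp", "share"].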
			parts := strings.Split(k[len(MountPrefix):], ".")
			if len(parts) != 2 {
				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
			}
			name := parts[0]
			if len(name) == 0 {
				return nil, fmt.Errorf("invalid mount name: %s", name)
			}
			mnt := mnts[name]
			if mnt == nil {
				mnt = &mountHint{name: name}
				mnts[name] = mnt
			}
			if err := mnt.setField(parts[1], v); err != nil {
				return nil, err
			}
		}
	}

	// Validate all hints after done parsing.
	for name, m := range mnts {
		log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
		if m.share == invalid {
			return nil, fmt.Errorf("share field for %q has not been set", m.name)
		}
		if len(m.mount.Source) == 0 {
			return nil, fmt.Errorf("source field for %q has not been set", m.name)
		}
		if len(m.mount.Type) == 0 {
			return nil, fmt.Errorf("type field for %q has not been set", m.name)
		}

		// Check for duplicate mount sources.
		for name2, m2 := range mnts {
			if name != name2 && m.mount.Source == m2.mount.Source {
				return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
			}
		}
	}

	return &podMountHints{mounts: mnts}, nil
}

func (p *podMountHints) findMount(mount *specs.Mount) *mountHint {
	for _, m := range p.mounts {
		if m.mount.Source == mount.Source {
			return m
		}
	}
	return nil
}

type containerMounter struct {
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// fds is the list of FDs to be dispensed for mounts that require it.
	fds fdDispenser

	k *kernel.Kernel

	hints *podMountHints
}

func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter {
	return &containerMounter{
		root:   info.spec.Root,
		mounts: compileMounts(info.spec, info.conf, vfs2Enabled),
		fds:    fdDispenser{fds: info.goferFDs},
		k:      k,
		hints:  hints,
	}
}

// processHints processes annotations that contain hints about how volumes
// should be mounted (e.g. a volume shared between containers). It must be
// called for the root container only.
func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
	if conf.VFS2 {
		return c.processHintsVFS2(conf, creds)
	}
	ctx := c.k.SupervisorContext()
	for _, hint := range c.hints.mounts {
		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
		// common gofer to mount all shared volumes.
		if hint.mount.Type != tmpfsvfs2.Name {
			continue
		}
		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
		inode, err := c.mountSharedMaster(ctx, conf, hint)
		if err != nil {
			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
		}
		hint.root = inode
	}
	return nil
}

// setupFS is used to set up the file system for all containers. This is the
// main entry point method, with most of the others being internal only. It
// returns the mount namespace that is created for the container.
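//
// At a high level, it builds a context with root credentials, creates the
// mount namespace with the 9P-backed root, and then mounts every submount
// from the spec inside it.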
func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
	log.Infof("Configuring container's file system")

	// Create context with root credentials to mount the filesystem (the current
	// user may not be privileged enough).
	rootProcArgs := *procArgs
	rootProcArgs.WorkingDirectory = "/"
	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
	rootProcArgs.Umask = 0022
	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
	rootCtx := rootProcArgs.NewContext(c.k)

	mns, err := c.createMountNamespace(rootCtx, conf)
	if err != nil {
		return nil, err
	}

	// Set namespace here so that it can be found in rootCtx.
	rootProcArgs.MountNamespace = mns

	if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
		return nil, err
	}
	return mns, nil
}

func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
	rootInode, err := c.createRootMount(ctx, conf)
	if err != nil {
		return nil, fmt.Errorf("creating filesystem for container: %v", err)
	}
	mns, err := fs.NewMountNamespace(ctx, rootInode)
	if err != nil {
		return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
	}
	return mns, nil
}

func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
	root := mns.Root()
	defer root.DecRef(ctx)

	for i := range c.mounts {
		m := &c.mounts[i]
		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
			}
		} else {
			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
			}
		}
	}

	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
		return fmt.Errorf("mount submount %q: %v", "tmp", err)
	}

	if err := c.checkDispenser(); err != nil {
		return err
	}
	return nil
}

func (c *containerMounter) checkDispenser() error {
	if !c.fds.empty() {
		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
	}
	return nil
}

// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod. It returns the root mount's inode.
func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
	// Map mount type to filesystem name, and parse out the options that we are
	// capable of dealing with.
	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, &hint.mount)
	if err != nil {
		return nil, err
	}
	if len(fsName) == 0 {
		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
	}

	// Mount with revalidate because it's shared among containers.
	opts = append(opts, "cache=revalidate")

	// All filesystem names should have been mapped to something we know.
	filesystem := mustFindFilesystem(fsName)

	mf := mountFlags(hint.mount.Options)
	if useOverlay {
		// All writes go to upper, be paranoid and make lower readonly.
		mf.ReadOnly = true
	}

	inode, err := filesystem.Mount(ctx, mountDevice(&hint.mount), mf, strings.Join(opts, ","), nil)
	if err != nil {
		return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
	}

	if useOverlay {
		log.Debugf("Adding overlay on top of shared mount %q", hint.name)
		inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
		if err != nil {
			return nil, err
		}
	}

	return inode, nil
}

// createRootMount creates the root filesystem.
func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
	// First construct the filesystem from the spec.Root.
	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}

	fd := c.fds.remove()
	log.Infof("Mounting root over 9P, ioFD: %d", fd)
	p9FS := mustFindFilesystem("9p")
	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)

	// We can't check for overlayfs here because sandbox is chroot'ed and gofer
	// can only send mount options for specs.Mounts (specs.Root is missing
	// Options field). So assume root is always on top of overlayfs.
	opts = append(opts, "overlayfs_stale_read")

	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
	if err != nil {
		return nil, fmt.Errorf("creating root mount point: %v", err)
	}

	// We need to overlay the root on top of a ramfs with stub directories
	// for submount paths. "/dev", "/sys", "/proc" and "/tmp" are always
	// mounted even if they are not in the spec.
	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
	if err != nil {
		return nil, fmt.Errorf("adding submount overlay: %v", err)
	}

	if conf.Overlay && !c.root.Readonly {
		log.Debugf("Adding overlay on top of root mount")
		// Overlay a tmpfs filesystem on top of the root.
		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
		if err != nil {
			return nil, err
		}
	}

	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
	return rootInode, nil
}

// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
// used for mounts.
func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs.Mount) (string, []string, bool, error) {
	specutils.MaybeConvertToBindMount(m)

	var (
		fsName     string
		opts       []string
		useOverlay bool
	)
	switch m.Type {
	case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
		fsName = m.Type
	case nonefs:
		fsName = sysvfs2.Name
	case tmpfsvfs2.Name:
		fsName = m.Type

		var err error
		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
		if err != nil {
			return "", nil, false, err
		}

	case bind:
		fd := c.fds.remove()
		fsName = gofervfs2.Name
		opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2)
		// If configured, add overlay to all writable mounts.
		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
	case cgroupfs.Name:
		fsName = m.Type
		var err error
		opts, err = parseAndFilterOptions(m.Options, cgroupfs.SupportedMountOptions...)
		if err != nil {
			return "", nil, false, err
		}
	default:
		log.Warningf("ignoring unknown filesystem type %q", m.Type)
	}
	return fsName, opts, useOverlay, nil
}

func (c *containerMounter) getMountAccessType(conf *config.Config, mount *specs.Mount) config.FileAccessType {
	if hint := c.hints.findMount(mount); hint != nil {
		return hint.fileAccessType()
	}
	return conf.FileAccessMounts
}

// mountSubmount mounts volumes inside the container's root. Because mounts may
// be readonly, a lower ramfs overlay is added to create the mount point dir.
// Another overlay is added with tmpfs on top if Config.Overlay is true.
// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m *specs.Mount) error {
	// Map mount type to filesystem name, and parse out the options that we are
	// capable of dealing with.
	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
	if err != nil {
		return err
	}
	if fsName == "" {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil
	}

	// All filesystem names should have been mapped to something we know.
	filesystem := mustFindFilesystem(fsName)

	mf := mountFlags(m.Options)
	if useOverlay {
		// All writes go to upper, be paranoid and make lower readonly.
		mf.ReadOnly = true
	}

	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
	if err != nil {
		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
		// Check to see if this is a common error due to a Linux bug.
		// This error is generated here in order to cause it to be
		// printed to the user using Docker via 'runsc create' etc. rather
		// than simply printed to the logs for the 'runsc boot' command.
		//
		// We check the error message string rather than type because the
		// actual error types (unix.EIO, unix.EPIPE) are lost by file system
		// implementation (e.g. p9).
		// TODO(github.com/SagerNet/issue/1765): Remove message when bug is resolved.
		if strings.Contains(err.Error(), unix.EIO.Error()) || strings.Contains(err.Error(), unix.EPIPE.Error()) {
			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
		}
		return err
	}

	// If there are submounts, we need to overlay the mount on top of a ramfs
	// with stub directories for submount paths.
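	// For example, if this mount is "/foo" and the spec also mounts "/foo/bar",
	// a stub "bar/" directory is provided by the ramfs layer so the overlay has
	// a mount point for it.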
	submounts := subtargets(m.Destination, c.mounts)
	if len(submounts) > 0 {
		log.Infof("Adding submount overlay over %q", m.Destination)
		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
		if err != nil {
			return fmt.Errorf("adding submount overlay: %v", err)
		}
	}

	if useOverlay {
		log.Debugf("Adding overlay on top of mount %q", m.Destination)
		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
		if err != nil {
			return err
		}
	}

	maxTraversals := uint(0)
	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
	if err != nil {
		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
	}
	defer dirent.DecRef(ctx)
	if err := mns.Mount(ctx, dirent, inode); err != nil {
		return fmt.Errorf("mount %q error: %v", m.Destination, err)
	}

	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
	return nil
}

// mountSharedSubmount bind mounts to a previously mounted volume that is shared
// among containers in the same pod.
func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount *specs.Mount, source *mountHint) error {
	if err := source.checkCompatible(mount); err != nil {
		return err
	}

	maxTraversals := uint(0)
	target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
	if err != nil {
		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
	}
	defer target.DecRef(ctx)

	// Take a ref on the inode that is about to be (re)-mounted.
	source.root.IncRef()
	if err := mns.Mount(ctx, target, source.root); err != nil {
		source.root.DecRef(ctx)
		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
	}

	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
	return nil
}

// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m *specs.Mount) error {
	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
	if err != nil {
		return err
	}
	if fsName == "" {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil
	}

	newMount := fs.MountArgs{
		Dev:        mountDevice(m),
		Flags:      mountFlags(m.Options),
		DataString: strings.Join(opts, ","),
	}
	if useOverlay {
		newMount.Flags.ReadOnly = true
	}
	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
	log.Infof("Added mount at %q: %+v", fsName, newMount)
	return nil
}

// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
// the mounts to the environment.
func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
	renv := &fs.RestoreEnvironment{
		MountSources: make(map[string][]fs.MountArgs),
	}

	// Add root mount.
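	// As in createRootMount(), the root consumes the next gofer FD and is
	// restored via the gofer (9P) filesystem.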
	fd := c.fds.remove()
	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)

	mf := fs.MountSourceFlags{}
	if c.root.Readonly || conf.Overlay {
		mf.ReadOnly = true
	}

	rootMount := fs.MountArgs{
		Dev:        rootDevice,
		Flags:      mf,
		DataString: strings.Join(opts, ","),
	}
	renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount)

	// Add submounts.
	var tmpMounted bool
	for i := range c.mounts {
		m := &c.mounts[i]
		if err := c.addRestoreMount(conf, renv, m); err != nil {
			return nil, err
		}
		if filepath.Clean(m.Destination) == "/tmp" {
			tmpMounted = true
		}
	}

	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
	if !tmpMounted {
		tmpMount := specs.Mount{
			Type:        tmpfsvfs2.Name,
			Destination: "/tmp",
		}
		if err := c.addRestoreMount(conf, renv, &tmpMount); err != nil {
			return nil, err
		}
	}

	return renv, nil
}

// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
//  1. /tmp is mounted explicitly: we should not override the user's wish
//  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
	for _, m := range c.mounts {
		if filepath.Clean(m.Destination) == "/tmp" {
			log.Debugf("Explicit %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
			return nil
		}
	}

	maxTraversals := uint(0)
	tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
	switch {
	case err == nil:
		// Found '/tmp' in filesystem, check if it's empty.
		defer tmp.DecRef(ctx)
		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
		if err != nil {
			return err
		}
		defer f.DecRef(ctx)
		serializer := &fs.CollectEntriesSerializer{}
		if err := f.Readdir(ctx, serializer); err != nil {
			return err
		}
		// If more than "." and ".." is found, skip internal tmpfs to prevent hiding
		// existing files.
		if len(serializer.Order) > 2 {
			log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
			return nil
		}
		log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
		fallthrough

	case linuxerr.Equals(linuxerr.ENOENT, err):
		// No '/tmp' found (or fallthrough from above). Safe to mount internal
		// tmpfs.
		tmpMount := specs.Mount{
			Type:        tmpfsvfs2.Name,
			Destination: "/tmp",
			// Sticky bit is added to prevent accidental deletion of files from
			// another user. This is normally done for /tmp.
			Options: []string{"mode=01777"},
		}
		return c.mountSubmount(ctx, conf, mns, root, &tmpMount)

	default:
		return err
	}
}