github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/boot/vfs.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package boot 16 17 import ( 18 "fmt" 19 "os" 20 "path" 21 "path/filepath" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 27 specs "github.com/opencontainers/runtime-spec/specs-go" 28 "github.com/metacubex/gvisor/pkg/abi/linux" 29 "github.com/metacubex/gvisor/pkg/abi/nvgpu" 30 "github.com/metacubex/gvisor/pkg/abi/tpu" 31 "github.com/metacubex/gvisor/pkg/cleanup" 32 "github.com/metacubex/gvisor/pkg/context" 33 "github.com/metacubex/gvisor/pkg/devutil" 34 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 35 "github.com/metacubex/gvisor/pkg/fd" 36 "github.com/metacubex/gvisor/pkg/fspath" 37 "github.com/metacubex/gvisor/pkg/log" 38 "github.com/metacubex/gvisor/pkg/sentry/devices/accel" 39 "github.com/metacubex/gvisor/pkg/sentry/devices/memdev" 40 "github.com/metacubex/gvisor/pkg/sentry/devices/nvproxy" 41 "github.com/metacubex/gvisor/pkg/sentry/devices/tpuproxy" 42 "github.com/metacubex/gvisor/pkg/sentry/devices/ttydev" 43 "github.com/metacubex/gvisor/pkg/sentry/devices/tundev" 44 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/cgroupfs" 45 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/dev" 46 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/devpts" 47 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/devtmpfs" 48 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/erofs" 49 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/fuse" 50 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/gofer" 51 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/mqfs" 52 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/overlay" 53 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/proc" 54 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/sys" 55 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs" 56 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/user" 57 "github.com/metacubex/gvisor/pkg/sentry/inet" 58 "github.com/metacubex/gvisor/pkg/sentry/kernel" 59 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 60 "github.com/metacubex/gvisor/pkg/sentry/pgalloc" 61 "github.com/metacubex/gvisor/pkg/sentry/vfs" 62 "github.com/metacubex/gvisor/runsc/config" 63 "github.com/metacubex/gvisor/runsc/specutils" 64 ) 65 66 // Supported filesystems that map to different internal filesystems. 67 const ( 68 Bind = "bind" 69 Nonefs = "none" 70 ) 71 72 // SelfFilestorePrefix is the prefix of the self filestore file name. 73 const SelfFilestorePrefix = ".gvisor.filestore." 74 75 const ( 76 pciPathGlobTPUv4 = "/sys/devices/pci0000:00/*/accel/accel*" 77 pciPathGlobTPUv5 = "/sys/devices/pci0000:00/*/vfio-dev/vfio*" 78 ) 79 80 // SelfFilestorePath returns the path at which the self filestore file is 81 // stored for a given mount. 82 func SelfFilestorePath(mountSrc, sandboxID string) string { 83 // We will place the filestore file in a gVisor specific hidden file inside 84 // the mount being overlaid itself. The same volume can be overlaid by 85 // multiple sandboxes. So make the filestore file unique to a sandbox by 86 // suffixing the sandbox ID. 87 return path.Join(mountSrc, selfFilestoreName(sandboxID)) 88 } 89 90 func selfFilestoreName(sandboxID string) string { 91 return SelfFilestorePrefix + sandboxID 92 } 93 94 // tmpfs has some extra supported options that we must pass through. 95 var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"} 96 97 func registerFilesystems(k *kernel.Kernel, info *containerInfo) error { 98 ctx := k.SupervisorContext() 99 vfsObj := k.VFS() 100 101 vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 102 AllowUserMount: true, 103 AllowUserList: true, 104 }) 105 vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 106 AllowUserList: true, 107 // TODO(b/29356795): Users may mount this once the terminals are in a 108 // usable state. 109 AllowUserMount: true, 110 }) 111 vfsObj.MustRegisterFilesystemType(dev.Name, &dev.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{}) 112 vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 113 AllowUserMount: true, 114 AllowUserList: true, 115 }) 116 vfsObj.MustRegisterFilesystemType(erofs.Name, &erofs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 117 AllowUserList: true, 118 }) 119 vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 120 AllowUserMount: true, 121 AllowUserList: true, 122 }) 123 vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 124 AllowUserList: true, 125 }) 126 vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 127 AllowUserMount: true, 128 AllowUserList: true, 129 }) 130 vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 131 AllowUserMount: true, 132 AllowUserList: true, 133 }) 134 vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 135 AllowUserMount: true, 136 AllowUserList: true, 137 }) 138 vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 139 AllowUserMount: true, 140 AllowUserList: true, 141 }) 142 vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ 143 AllowUserMount: true, 144 AllowUserList: true, 145 }) 146 147 // Register devices. 148 if err := memdev.Register(vfsObj); err != nil { 149 return fmt.Errorf("registering memdev: %w", err) 150 } 151 if err := ttydev.Register(vfsObj); err != nil { 152 return fmt.Errorf("registering ttydev: %w", err) 153 } 154 tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx)) 155 if tunSupported { 156 if err := tundev.Register(vfsObj); err != nil { 157 return fmt.Errorf("registering tundev: %v", err) 158 } 159 } 160 if err := fuse.Register(vfsObj); err != nil { 161 return fmt.Errorf("registering fusedev: %w", err) 162 } 163 164 if err := nvproxyRegisterDevices(info, vfsObj); err != nil { 165 return err 166 } 167 168 if err := tpuProxyRegisterDevices(info, vfsObj); err != nil { 169 return err 170 } 171 172 return nil 173 } 174 175 func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { 176 // Create context with root credentials to mount the filesystem (the current 177 // user may not be privileged enough). 178 rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace) 179 rootProcArgs := *procArgs 180 rootProcArgs.WorkingDirectory = "/" 181 rootProcArgs.Credentials = rootCreds 182 rootProcArgs.Umask = 0022 183 rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals 184 rootCtx := rootProcArgs.NewContext(mntr.k) 185 186 mns, err := mntr.mountAll(rootCtx, rootCreds, info.spec, info.conf, &rootProcArgs) 187 if err != nil { 188 return fmt.Errorf("failed to setupFS: %w", err) 189 } 190 procArgs.MountNamespace = mns 191 192 // If cgroups are mounted, then only check for the cgroup mounts per 193 // container. Otherwise the root cgroups will be enabled. 194 if mntr.cgroupsMounted { 195 cgroupRegistry := mntr.k.CgroupRegistry() 196 for _, ctrl := range kernel.CgroupCtrls { 197 cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+mntr.containerID) 198 if err != nil { 199 return fmt.Errorf("cgroup mount for controller %v not found", ctrl) 200 } 201 if procArgs.InitialCgroups == nil { 202 procArgs.InitialCgroups = make(map[kernel.Cgroup]struct{}, len(kernel.CgroupCtrls)) 203 } 204 procArgs.InitialCgroups[cg] = struct{}{} 205 } 206 } 207 208 mnsRoot := mns.Root(rootCtx) 209 defer mnsRoot.DecRef(rootCtx) 210 211 if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil { 212 return fmt.Errorf("failed to create device files: %w", err) 213 } 214 215 // We are executing a file directly. Do not resolve the executable path. 216 if procArgs.File != nil { 217 return nil 218 } 219 // Resolve the executable path from working dir and environment. 220 resolved, err := user.ResolveExecutablePath(ctx, procArgs) 221 if err != nil { 222 return err 223 } 224 procArgs.Filename = resolved 225 return nil 226 } 227 228 // compileMounts returns the supported mounts from the mount spec, adding any 229 // mandatory mounts that are required by the OCI specification. 230 // 231 // This function must NOT add/remove any gofer mounts or change their order. 232 func compileMounts(spec *specs.Spec, conf *config.Config, containerID string) []specs.Mount { 233 // Keep track of whether proc and sys were mounted. 234 var procMounted, sysMounted, devMounted, devptsMounted, cgroupsMounted bool 235 var mounts []specs.Mount 236 237 // Mount all submounts from the spec. 238 for _, m := range spec.Mounts { 239 // Mount all the cgroup controllers when "/sys/fs/cgroup" mount 240 // is present. If any other cgroup controller mounts are there, 241 // it will be a no-op, drop them. 242 if m.Type == cgroupfs.Name && cgroupsMounted { 243 continue 244 } 245 246 switch filepath.Clean(m.Destination) { 247 case "/proc": 248 procMounted = true 249 case "/sys": 250 sysMounted = true 251 case "/dev": 252 m.Type = dev.Name 253 devMounted = true 254 case "/dev/pts": 255 m.Type = devpts.Name 256 devptsMounted = true 257 case "/sys/fs/cgroup": 258 cgroupsMounted = true 259 } 260 261 mounts = append(mounts, m) 262 } 263 264 // Mount proc and sys even if the user did not ask for it, as the spec 265 // says we SHOULD. 266 var mandatoryMounts []specs.Mount 267 268 if !procMounted { 269 mandatoryMounts = append(mandatoryMounts, specs.Mount{ 270 Type: proc.Name, 271 Destination: "/proc", 272 }) 273 } 274 if !sysMounted { 275 mandatoryMounts = append(mandatoryMounts, specs.Mount{ 276 Type: sys.Name, 277 Destination: "/sys", 278 }) 279 } 280 if !devMounted { 281 mandatoryMounts = append(mandatoryMounts, specs.Mount{ 282 Type: dev.Name, 283 Destination: "/dev", 284 }) 285 } 286 if !devptsMounted { 287 mandatoryMounts = append(mandatoryMounts, specs.Mount{ 288 Type: devpts.Name, 289 Destination: "/dev/pts", 290 }) 291 } 292 293 // The mandatory mounts should be ordered right after the root, in case 294 // there are submounts of these mandatory mounts already in the spec. 295 mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...) 296 297 return mounts 298 } 299 300 // goferMountData creates a slice of gofer mount data. 301 func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string { 302 opts := []string{ 303 "trans=fd", 304 "rfdno=" + strconv.Itoa(fd), 305 "wfdno=" + strconv.Itoa(fd), 306 } 307 if fa == config.FileAccessShared { 308 opts = append(opts, "cache=remote_revalidating") 309 } 310 if conf.DirectFS { 311 opts = append(opts, "directfs") 312 } 313 if !conf.HostFifo.AllowOpen() { 314 opts = append(opts, "disable_fifo_open") 315 } 316 return opts 317 } 318 319 // consumeMountOptions consumes mount options from opts based on allowedKeys 320 // and returns the remaining and consumed options. 321 func consumeMountOptions(opts []string, allowedKeys ...string) ([]string, []string, error) { 322 var rem, out []string 323 for _, o := range opts { 324 ok, err := parseMountOption(o, allowedKeys...) 325 if err != nil { 326 return nil, nil, err 327 } 328 if ok { 329 out = append(out, o) 330 } else { 331 rem = append(rem, o) 332 } 333 } 334 return rem, out, nil 335 } 336 337 func parseMountOption(opt string, allowedKeys ...string) (bool, error) { 338 kv := strings.SplitN(opt, "=", 3) 339 if len(kv) > 2 { 340 return false, fmt.Errorf("invalid option %q", opt) 341 } 342 return specutils.ContainsStr(allowedKeys, kv[0]), nil 343 } 344 345 type fdDispenser struct { 346 fds []*fd.FD 347 } 348 349 func (f *fdDispenser) remove() int { 350 return f.removeAsFD().Release() 351 } 352 353 func (f *fdDispenser) removeAsFD() *fd.FD { 354 if f.empty() { 355 panic("fdDispenser out of fds") 356 } 357 rv := f.fds[0] 358 f.fds = f.fds[1:] 359 return rv 360 } 361 362 func (f *fdDispenser) empty() bool { 363 return len(f.fds) == 0 364 } 365 366 type containerMounter struct { 367 root *specs.Root 368 369 // mounts is the set of submounts for the container. It's a copy from the spec 370 // that may be freely modified without affecting the original spec. 371 mounts []specs.Mount 372 373 // goferFDs is the list of FDs to be dispensed for gofer mounts. 374 goferFDs fdDispenser 375 376 // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or 377 // overlayfs mount for certain gofer mounts. 378 goferFilestoreFDs fdDispenser 379 380 // devGoferFD is the FD to attach the sandbox to the dev gofer. 381 devGoferFD *fd.FD 382 383 // goferMountConfs contains information about how the gofer mounts have been 384 // configured. The first entry is for rootfs and the following entries are 385 // for bind mounts in Spec.Mounts (in the same order). 386 goferMountConfs []GoferMountConf 387 388 k *kernel.Kernel 389 390 // hints is the set of pod mount hints for the sandbox. 391 hints *PodMountHints 392 393 // sharedMounts is a map of shared mounts that can be reused across 394 // containers. 395 sharedMounts map[string]*vfs.Mount 396 397 // productName is the value to show in 398 // /sys/devices/virtual/dmi/id/product_name. 399 productName string 400 401 // containerID is the ID for the container. 402 containerID string 403 404 // sandboxID is the ID for the whole sandbox. 405 sandboxID string 406 containerName string 407 408 // cgroupsMounted indicates if cgroups are mounted in the container. 409 // This is used to set the InitialCgroups before starting the container 410 // process. 411 cgroupsMounted bool 412 } 413 414 func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter { 415 return &containerMounter{ 416 root: info.spec.Root, 417 mounts: compileMounts(info.spec, info.conf, info.procArgs.ContainerID), 418 goferFDs: fdDispenser{fds: info.goferFDs}, 419 goferFilestoreFDs: fdDispenser{fds: info.goferFilestoreFDs}, 420 devGoferFD: info.devGoferFD, 421 goferMountConfs: info.goferMountConfs, 422 k: k, 423 hints: hints, 424 sharedMounts: sharedMounts, 425 productName: productName, 426 containerID: info.procArgs.ContainerID, 427 sandboxID: sandboxID, 428 containerName: info.containerName, 429 } 430 } 431 432 func (c *containerMounter) checkDispenser() error { 433 if !c.goferFDs.empty() { 434 return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.goferFDs) 435 } 436 if !c.goferFilestoreFDs.empty() { 437 return fmt.Errorf("not all gofer Filestore FDs were consumed, remaining: %v", c.goferFilestoreFDs) 438 } 439 if c.devGoferFD != nil && c.devGoferFD.FD() >= 0 { 440 return fmt.Errorf("dev gofer FD was not consumed: %d", c.devGoferFD.FD()) 441 } 442 return nil 443 } 444 445 func getMountAccessType(conf *config.Config, hint *MountHint) config.FileAccessType { 446 if hint != nil { 447 return hint.fileAccessType() 448 } 449 return conf.FileAccessMounts 450 } 451 452 func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, spec *specs.Spec, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { 453 log.Infof("Configuring container's file system") 454 455 mns, err := c.createMountNamespace(rootCtx, conf, rootCreds) 456 if err != nil { 457 return nil, fmt.Errorf("creating mount namespace: %w", err) 458 } 459 rootProcArgs.MountNamespace = mns 460 461 root := mns.Root(rootCtx) 462 defer root.DecRef(rootCtx) 463 if root.Mount().ReadOnly() { 464 // Switch to ReadWrite while we setup submounts. 465 if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil { 466 return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err) 467 } 468 // Restore back to ReadOnly at the end. 469 defer func() { 470 if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil { 471 panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err)) 472 } 473 }() 474 } 475 476 // Mount submounts. 477 if err := c.mountSubmounts(rootCtx, spec, conf, mns, rootCreds); err != nil { 478 return nil, fmt.Errorf("mounting submounts: %w", err) 479 } 480 481 return mns, nil 482 } 483 484 // createMountNamespace creates the container's root mount and namespace. 485 func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { 486 ioFD := c.goferFDs.remove() 487 rootfsConf := c.goferMountConfs[0] 488 489 var ( 490 fsName string 491 opts *vfs.MountOptions 492 ) 493 switch { 494 case rootfsConf.ShouldUseLisafs(): 495 fsName = gofer.Name 496 497 data := goferMountData(ioFD, conf.FileAccess, conf) 498 499 // We can't check for overlayfs here because sandbox is chroot'ed and gofer 500 // can only send mount options for specs.Mounts (specs.Root is missing 501 // Options field). So assume root is always on top of overlayfs. 502 data = append(data, "overlayfs_stale_read") 503 504 // Configure the gofer dentry cache size. 505 gofer.SetDentryCacheSize(conf.DCache) 506 507 opts = &vfs.MountOptions{ 508 ReadOnly: c.root.Readonly, 509 GetFilesystemOptions: vfs.GetFilesystemOptions{ 510 InternalMount: true, 511 Data: strings.Join(data, ","), 512 InternalData: gofer.InternalFilesystemOptions{ 513 UniqueID: vfs.RestoreID{ 514 ContainerName: c.containerName, 515 Path: "/", 516 }, 517 }, 518 }, 519 } 520 521 case rootfsConf.ShouldUseErofs(): 522 fsName = erofs.Name 523 opts = &vfs.MountOptions{ 524 ReadOnly: c.root.Readonly, 525 GetFilesystemOptions: vfs.GetFilesystemOptions{ 526 InternalMount: true, 527 Data: fmt.Sprintf("ifd=%d", ioFD), 528 InternalData: erofs.InternalFilesystemOptions{ 529 UniqueID: vfs.RestoreID{ 530 ContainerName: c.containerName, 531 Path: "/", 532 }, 533 }, 534 }, 535 } 536 537 default: 538 return nil, fmt.Errorf("unsupported rootfs config: %+v", rootfsConf) 539 } 540 541 log.Infof("Mounting root with %s, ioFD: %d", fsName, ioFD) 542 543 if rootfsConf.ShouldUseOverlayfs() { 544 log.Infof("Adding overlay on top of root") 545 var ( 546 err error 547 cleanup func() 548 filestoreFD *fd.FD 549 ) 550 if rootfsConf.IsFilestorePresent() { 551 filestoreFD = c.goferFilestoreFDs.removeAsFD() 552 } 553 opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, filestoreFD, rootfsConf, "/") 554 if err != nil { 555 return nil, fmt.Errorf("mounting root with overlay: %w", err) 556 } 557 defer cleanup() 558 fsName = overlay.Name 559 } 560 561 // The namespace root mount can't be changed, so let's mount a dummy 562 // read-only tmpfs here. It simplifies creation of containers without 563 // leaking the root file system. 564 mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs", 565 &vfs.MountOptions{ReadOnly: true, Locked: true}, c.k) 566 if err != nil { 567 return nil, fmt.Errorf("setting up mount namespace: %w", err) 568 } 569 defer mns.DecRef(ctx) 570 571 mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts) 572 if err != nil { 573 return nil, fmt.Errorf("creating root file system: %w", err) 574 } 575 defer mnt.DecRef(ctx) 576 root := mns.Root(ctx) 577 defer root.DecRef(ctx) 578 target := &vfs.PathOperation{ 579 Root: root, 580 Start: root, 581 } 582 if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil { 583 return nil, fmt.Errorf("mounting root file system: %w", err) 584 } 585 586 mns.IncRef() 587 return mns, nil 588 } 589 590 // configureOverlay mounts the lower layer using "lowerOpts", mounts the upper 591 // layer using tmpfs, and return overlay mount options. "cleanup" must be called 592 // after the options have been used to mount the overlay, to release refs on 593 // lower and upper mounts. 594 func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, mountConf GoferMountConf, dst string) (*vfs.MountOptions, func(), error) { 595 // First copy options from lower layer to upper layer and overlay. Clear 596 // filesystem specific options. 597 upperOpts := *lowerOpts 598 upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true} 599 600 overlayOpts := *lowerOpts 601 overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true} 602 603 // All writes go to the upper layer, be paranoid and make lower readonly. 604 lowerOpts.ReadOnly = true 605 lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts) 606 if err != nil { 607 return nil, nil, err 608 } 609 cu := cleanup.Make(func() { lower.DecRef(ctx) }) 610 defer cu.Clean() 611 612 // Determine the lower layer's root's type. 613 lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root()) 614 stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{ 615 Root: lowerRootVD, 616 Start: lowerRootVD, 617 }, &vfs.StatOptions{ 618 Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE, 619 }) 620 if err != nil { 621 return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err) 622 } 623 if stat.Mask&linux.STATX_TYPE == 0 { 624 return nil, nil, fmt.Errorf("failed to get file type of lower layer's root") 625 } 626 rootType := stat.Mode & linux.S_IFMT 627 if rootType != linux.S_IFDIR && rootType != linux.S_IFREG { 628 return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType) 629 } 630 631 // Upper is a tmpfs mount to keep all modifications inside the sandbox. 632 tmpfsOpts := tmpfs.FilesystemOpts{ 633 RootFileType: uint16(rootType), 634 // If a mount is being overlaid, it should not be limited by the default 635 // tmpfs size limit. 636 DisableDefaultSizeLimit: true, 637 } 638 if filestoreFD != nil { 639 // Create memory file for disk-backed overlays. 640 mf, err := createPrivateMemoryFile(filestoreFD.ReleaseToFile("overlay-filestore"), vfs.RestoreID{ContainerName: c.containerName, Path: dst}) 641 if err != nil { 642 return nil, nil, fmt.Errorf("failed to create memory file for overlay: %v", err) 643 } 644 tmpfsOpts.MemoryFile = mf 645 } 646 upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts 647 upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts) 648 if err != nil { 649 return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err) 650 } 651 cu.Add(func() { upper.DecRef(ctx) }) 652 653 // If the overlay mount consists of a regular file, copy up its contents 654 // from the lower layer, since in the overlay the otherwise-empty upper 655 // layer file will take precedence. 656 upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root()) 657 if rootType == linux.S_IFREG { 658 lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ 659 Root: lowerRootVD, 660 Start: lowerRootVD, 661 }, &vfs.OpenOptions{ 662 Flags: linux.O_RDONLY, 663 }) 664 if err != nil { 665 return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err) 666 } 667 defer lowerFD.DecRef(ctx) 668 upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{ 669 Root: upperRootVD, 670 Start: upperRootVD, 671 }, &vfs.OpenOptions{ 672 Flags: linux.O_WRONLY, 673 }) 674 if err != nil { 675 return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err) 676 } 677 defer upperFD.DecRef(ctx) 678 if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil { 679 return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err) 680 } 681 } 682 683 // We need to hide the filestore from the containerized application. 684 if mountConf.IsSelfBacked() { 685 if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{ 686 Root: upperRootVD, 687 Start: upperRootVD, 688 Path: fspath.Parse(selfFilestoreName(c.sandboxID)), 689 }); err != nil { 690 return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err) 691 } 692 } 693 694 // Propagate the lower layer's root's owner, group, and mode to the upper 695 // layer's root for consistency with VFS1. 696 err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{ 697 Root: upperRootVD, 698 Start: upperRootVD, 699 }, &vfs.SetStatOptions{ 700 Stat: linux.Statx{ 701 Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask, 702 UID: stat.UID, 703 GID: stat.GID, 704 Mode: stat.Mode, 705 }, 706 }) 707 if err != nil { 708 return nil, nil, err 709 } 710 711 // Configure overlay with both layers. 712 overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{ 713 UpperRoot: upperRootVD, 714 LowerRoots: []vfs.VirtualDentry{lowerRootVD}, 715 } 716 return &overlayOpts, cu.Release(), nil 717 } 718 719 func (c *containerMounter) mountSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { 720 mounts, err := c.prepareMounts() 721 if err != nil { 722 return err 723 } 724 725 for i := range mounts { 726 submount := &mounts[i] 727 log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options) 728 var ( 729 mnt *vfs.Mount 730 err error 731 ) 732 733 if submount.hint != nil && submount.hint.ShouldShareMount() { 734 sharedMount, err := c.getSharedMount(ctx, spec, conf, submount, creds) 735 if err != nil { 736 return fmt.Errorf("getting shared mount %q: %w", submount.hint.Name, err) 737 } 738 mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount, sharedMount) 739 if err != nil { 740 return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err) 741 } 742 } else if submount.mount.Type == cgroupfs.Name { 743 // Mount all the cgroups controllers. 744 if err := c.mountCgroupSubmounts(ctx, spec, conf, mns, creds, submount); err != nil { 745 return fmt.Errorf("mount cgroup %q: %w", submount.mount.Destination, err) 746 } 747 } else { 748 mnt, err = c.mountSubmount(ctx, spec, conf, mns, creds, submount) 749 if err != nil { 750 return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err) 751 } 752 } 753 754 if mnt != nil && mnt.ReadOnly() { 755 // Switch to ReadWrite while we setup submounts. 756 if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { 757 return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err) 758 } 759 // Restore back to ReadOnly at the end. 760 defer func() { 761 if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { 762 panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err)) 763 } 764 }() 765 } 766 } 767 768 if err := c.mountTmp(ctx, spec, conf, creds, mns); err != nil { 769 return fmt.Errorf(`mount submount "/tmp": %w`, err) 770 } 771 return nil 772 } 773 774 type mountInfo struct { 775 mount *specs.Mount 776 goferFD *fd.FD 777 hint *MountHint 778 goferMountConf GoferMountConf 779 filestoreFD *fd.FD 780 } 781 782 func (c *containerMounter) prepareMounts() ([]mountInfo, error) { 783 // If device gofer exists, connect to it. 784 if c.devGoferFD != nil { 785 if err := c.k.AddDevGofer(c.containerID, c.devGoferFD.Release()); err != nil { 786 return nil, err 787 } 788 } 789 // Associate bind mounts with their FDs before sorting since there is an 790 // undocumented assumption that FDs are dispensed in the order in which 791 // they are required by mounts. 792 var mounts []mountInfo 793 goferMntIdx := 1 // First index is for rootfs. 794 for i := range c.mounts { 795 info := mountInfo{ 796 mount: &c.mounts[i], 797 hint: c.hints.FindMount(c.mounts[i].Source), 798 } 799 specutils.MaybeConvertToBindMount(info.mount) 800 if specutils.IsGoferMount(*info.mount) { 801 info.goferMountConf = c.goferMountConfs[goferMntIdx] 802 if info.goferMountConf.ShouldUseLisafs() { 803 info.goferFD = c.goferFDs.removeAsFD() 804 } 805 if info.goferMountConf.IsFilestorePresent() { 806 info.filestoreFD = c.goferFilestoreFDs.removeAsFD() 807 } 808 if info.goferMountConf.ShouldUseTmpfs() { 809 specutils.ChangeMountType(info.mount, tmpfs.Name) 810 } 811 goferMntIdx++ 812 } 813 mounts = append(mounts, info) 814 } 815 if err := c.checkDispenser(); err != nil { 816 return nil, err 817 } 818 819 // Sort the mounts so that we don't place children before parents. 820 sort.Slice(mounts, func(i, j int) bool { 821 return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination) 822 }) 823 824 return mounts, nil 825 } 826 827 func (c *containerMounter) mountSubmount(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) { 828 fsName, opts, err := getMountNameAndOptions(spec, conf, submount, c.productName, c.containerName) 829 if err != nil { 830 return nil, fmt.Errorf("mountOptions failed: %w", err) 831 } 832 if len(fsName) == 0 { 833 // Filesystem is not supported (e.g. cgroup), just skip it. 834 return nil, nil 835 } 836 837 if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil { 838 return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err) 839 } 840 841 if submount.goferMountConf.ShouldUseOverlayfs() { 842 log.Infof("Adding overlay on top of mount %q", submount.mount.Destination) 843 var cleanup func() 844 opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.filestoreFD, submount.goferMountConf, submount.mount.Destination) 845 if err != nil { 846 return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err) 847 } 848 defer cleanup() 849 fsName = overlay.Name 850 } 851 852 root := mns.Root(ctx) 853 defer root.DecRef(ctx) 854 target := &vfs.PathOperation{ 855 Root: root, 856 Start: root, 857 Path: fspath.Parse(submount.mount.Destination), 858 } 859 mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts) 860 if err != nil { 861 return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts) 862 } 863 log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data) 864 return mnt, nil 865 } 866 867 // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values 868 // used for mounts. 869 func getMountNameAndOptions(spec *specs.Spec, conf *config.Config, m *mountInfo, productName, containerName string) (string, *vfs.MountOptions, error) { 870 fsName := m.mount.Type 871 var ( 872 mopts = m.mount.Options 873 data []string 874 internalData any 875 ) 876 877 // Find filesystem name and FS specific data field. 878 switch m.mount.Type { 879 case devpts.Name, dev.Name, proc.Name: 880 // Nothing to do. 881 882 case Nonefs: 883 fsName = sys.Name 884 885 case sys.Name: 886 sysData := &sys.InternalData{EnableTPUProxyPaths: specutils.TPUProxyIsEnabled(spec, conf)} 887 if len(productName) > 0 { 888 sysData.ProductName = productName 889 } 890 internalData = sysData 891 892 case tmpfs.Name: 893 var err error 894 mopts, data, err = consumeMountOptions(mopts, tmpfsAllowedData...) 895 if err != nil { 896 return "", nil, err 897 } 898 if m.filestoreFD != nil { 899 mf, err := createPrivateMemoryFile(m.filestoreFD.ReleaseToFile("tmpfs-filestore"), vfs.RestoreID{ContainerName: containerName, Path: m.mount.Destination}) 900 if err != nil { 901 return "", nil, fmt.Errorf("failed to create memory file for tmpfs: %v", err) 902 } 903 internalData = tmpfs.FilesystemOpts{ 904 MemoryFile: mf, 905 // If a mount is being overlaid with tmpfs, it should not be limited by 906 // the default tmpfs size limit. 907 DisableDefaultSizeLimit: true, 908 } 909 } 910 911 case Bind: 912 fsName = gofer.Name 913 if m.goferFD == nil { 914 // Check that an FD was provided to fails fast. 915 return "", nil, fmt.Errorf("gofer mount requires a connection FD") 916 } 917 var err error 918 mopts, data, err = consumeMountOptions(mopts, gofer.SupportedMountOptions...) 919 if err != nil { 920 return "", nil, err 921 } 922 data = append(data, goferMountData(m.goferFD.Release(), getMountAccessType(conf, m.hint), conf)...) 923 internalData = gofer.InternalFilesystemOptions{ 924 UniqueID: vfs.RestoreID{ 925 ContainerName: containerName, 926 Path: m.mount.Destination, 927 }, 928 } 929 930 case cgroupfs.Name: 931 var err error 932 mopts, data, err = consumeMountOptions(mopts, cgroupfs.SupportedMountOptions...) 933 if err != nil { 934 return "", nil, err 935 } 936 937 default: 938 log.Warningf("ignoring unknown filesystem type %q", m.mount.Type) 939 return "", nil, nil 940 } 941 942 opts := ParseMountOptions(mopts) 943 opts.GetFilesystemOptions = vfs.GetFilesystemOptions{ 944 Data: strings.Join(data, ","), 945 InternalData: internalData, 946 InternalMount: true, 947 } 948 949 return fsName, opts, nil 950 } 951 952 // ParseMountOptions converts specs.Mount.Options to vfs.MountOptions. 953 func ParseMountOptions(opts []string) *vfs.MountOptions { 954 mountOpts := &vfs.MountOptions{ 955 GetFilesystemOptions: vfs.GetFilesystemOptions{ 956 InternalMount: true, 957 }, 958 } 959 // Note: update mountHint.CheckCompatible when more options are added. 960 for _, o := range opts { 961 switch o { 962 case "ro": 963 mountOpts.ReadOnly = true 964 case "noatime": 965 mountOpts.Flags.NoATime = true 966 case "noexec": 967 mountOpts.Flags.NoExec = true 968 case "rw", "atime", "exec": 969 // These use the default value and don't need to be set. 970 case "bind", "rbind": 971 // These are the same as a mount with type="bind". 972 default: 973 log.Warningf("ignoring unknown mount option %q", o) 974 } 975 } 976 return mountOpts 977 } 978 979 func parseKeyValue(s string) (string, string, bool) { 980 tokens := strings.SplitN(s, "=", 2) 981 if len(tokens) < 2 { 982 return "", "", false 983 } 984 return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true 985 } 986 987 func createPrivateMemoryFile(file *os.File, restoreID vfs.RestoreID) (*pgalloc.MemoryFile, error) { 988 mfOpts := pgalloc.MemoryFileOpts{ 989 // Private memory files are usually backed by files on disk. Ideally we 990 // would confirm with fstatfs(2) but that is prohibited by seccomp. 991 DiskBackedFile: true, 992 // Disk backed files need to be decommited on destroy to release disk space. 993 DecommitOnDestroy: true, 994 // sentry's seccomp filters don't allow the mmap(2) syscalls that 995 // pgalloc.IMAWorkAroundForMemFile() uses. Users of private memory files 996 // are expected to have performed the work around outside the sandbox. 997 DisableIMAWorkAround: true, 998 // Private memory files need to be restored correctly using this ID. 999 RestoreID: restoreID.String(), 1000 } 1001 return pgalloc.NewMemoryFile(file, mfOpts) 1002 } 1003 1004 // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so. 1005 // Technically we don't have to mount tmpfs at /tmp, as we could just rely on 1006 // the host /tmp, but this is a nice optimization, and fixes some apps that call 1007 // mknod in /tmp. It's unsafe to mount tmpfs if: 1008 // 1. /tmp is mounted explicitly: we should not override user's wish 1009 // 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp 1010 // 1011 // Note that when there are submounts inside of '/tmp', directories for the 1012 // mount points must be present, making '/tmp' not empty anymore. 1013 func (c *containerMounter) mountTmp(ctx context.Context, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error { 1014 for _, m := range c.mounts { 1015 // m.Destination has been cleaned, so it's to use equality here. 1016 if m.Destination == "/tmp" { 1017 log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m) 1018 return nil 1019 } 1020 } 1021 1022 root := mns.Root(ctx) 1023 defer root.DecRef(ctx) 1024 pop := vfs.PathOperation{ 1025 Root: root, 1026 Start: root, 1027 Path: fspath.Parse("/tmp"), 1028 } 1029 fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) 1030 switch { 1031 case err == nil: 1032 defer fd.DecRef(ctx) 1033 1034 err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { 1035 if dirent.Name != "." && dirent.Name != ".." { 1036 return linuxerr.ENOTEMPTY 1037 } 1038 return nil 1039 })) 1040 switch { 1041 case err == nil: 1042 log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) 1043 case linuxerr.Equals(linuxerr.ENOTEMPTY, err): 1044 // If more than "." and ".." is found, skip internal tmpfs to prevent 1045 // hiding existing files. 1046 log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) 1047 return nil 1048 default: 1049 return fmt.Errorf("fd.IterDirents failed: %v", err) 1050 } 1051 fallthrough 1052 1053 case linuxerr.Equals(linuxerr.ENOENT, err): 1054 // No '/tmp' found (or fallthrough from above). It's safe to mount internal 1055 // tmpfs. 1056 tmpMount := specs.Mount{ 1057 Type: tmpfs.Name, 1058 Destination: "/tmp", 1059 // Sticky bit is added to prevent accidental deletion of files from 1060 // another user. This is normally done for /tmp. 1061 Options: []string{"mode=01777"}, 1062 } 1063 if _, err := c.mountSubmount(ctx, spec, conf, mns, creds, &mountInfo{mount: &tmpMount}); err != nil { 1064 return fmt.Errorf("mountSubmount failed: %v", err) 1065 } 1066 return nil 1067 1068 case linuxerr.Equals(linuxerr.ENOTDIR, err): 1069 // Not a dir?! Let it be. 1070 return nil 1071 1072 default: 1073 return fmt.Errorf(`opening "/tmp" inside container: %w`, err) 1074 } 1075 } 1076 1077 func (c *containerMounter) getSharedMount(ctx context.Context, spec *specs.Spec, conf *config.Config, mount *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) { 1078 sharedMount, ok := c.sharedMounts[mount.hint.Mount.Source] 1079 if ok { 1080 log.Infof("Using existing shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type) 1081 if mount.goferFD != nil { 1082 panic(fmt.Errorf("extra goferFD provided for shared mount %q", mount.hint.Name)) 1083 } 1084 if mount.filestoreFD != nil { 1085 mount.filestoreFD.Close() 1086 } 1087 return sharedMount, nil 1088 } 1089 log.Infof("Mounting master of shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type) 1090 sharedMount, err := c.mountSharedMaster(ctx, spec, conf, mount, creds) 1091 if err != nil { 1092 return nil, fmt.Errorf("mounting shared master %q: %v", mount.hint.Name, err) 1093 } 1094 c.sharedMounts[mount.hint.Mount.Source] = sharedMount 1095 return sharedMount, nil 1096 } 1097 1098 // mountCgroupMounts mounts the cgroups which are shared across all containers. 1099 // Postcondition: Initialized k.cgroupMounts on success. 1100 func (l *Loader) mountCgroupMounts(conf *config.Config, creds *auth.Credentials) error { 1101 ctx := l.k.SupervisorContext() 1102 for _, sopts := range kernel.CgroupCtrls { 1103 mopts := &vfs.MountOptions{ 1104 GetFilesystemOptions: vfs.GetFilesystemOptions{ 1105 Data: string(sopts), 1106 InternalMount: true, 1107 }, 1108 } 1109 fs, root, err := l.k.VFS().NewFilesystem(ctx, creds, "cgroup", cgroupfs.Name, mopts) 1110 if err != nil { 1111 return err 1112 } 1113 1114 mount := l.k.VFS().NewDisconnectedMount(fs, root, mopts) 1115 // Private so that mounts created by containers do not appear 1116 // in other container's cgroup paths. 1117 l.k.VFS().SetMountPropagation(mount, linux.MS_PRIVATE, false) 1118 l.k.AddCgroupMount(string(sopts), &kernel.CgroupMount{ 1119 Fs: fs, 1120 Root: root, 1121 Mount: mount, 1122 }) 1123 } 1124 log.Infof("created cgroup mounts for controllers %v", kernel.CgroupCtrls) 1125 return nil 1126 } 1127 1128 // mountCgroupSubmounts mounts all the cgroup controller submounts for the 1129 // container. The cgroup submounts are created under the root controller mount 1130 // with containerID as the directory name and then bind mounts this directory 1131 // inside the container's mount namespace. 1132 func (c *containerMounter) mountCgroupSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) error { 1133 root := mns.Root(ctx) 1134 defer root.DecRef(ctx) 1135 1136 // Mount "/sys/fs/cgroup" in the container's mount namespace. 1137 submount.mount.Type = tmpfs.Name 1138 mnt, err := c.mountSubmount(ctx, spec, conf, mns, creds, submount) 1139 if err != nil { 1140 return err 1141 } 1142 if mnt != nil && mnt.ReadOnly() { 1143 // Switch to ReadWrite while we setup submounts. 1144 if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil { 1145 return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err) 1146 } 1147 // Restore back to ReadOnly at the end. 1148 defer func() { 1149 if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil { 1150 panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err)) 1151 } 1152 }() 1153 } 1154 1155 // Mount all the cgroup controllers in the container's mount namespace. 1156 mountCtx := vfs.WithRoot(vfs.WithMountNamespace(ctx, mns), root) 1157 for _, ctrl := range kernel.CgroupCtrls { 1158 ctrlName := string(ctrl) 1159 cgroupMnt := c.k.GetCgroupMount(ctrlName) 1160 if cgroupMnt == nil { 1161 return fmt.Errorf("cgroup mount for controller %s not found", ctrlName) 1162 } 1163 1164 cgroupMntVD := vfs.MakeVirtualDentry(cgroupMnt.Mount, cgroupMnt.Root) 1165 sourcePop := vfs.PathOperation{ 1166 Root: cgroupMntVD, 1167 Start: cgroupMntVD, 1168 // Use the containerID as the cgroup path. 1169 Path: fspath.Parse(c.containerID), 1170 } 1171 if err := c.k.VFS().MkdirAt(mountCtx, creds, &sourcePop, &vfs.MkdirOptions{ 1172 Mode: 0755, 1173 }); err != nil { 1174 log.Infof("error in creating directory %v", err) 1175 return err 1176 } 1177 1178 // Bind mount the new cgroup directory into the container's mount namespace. 1179 destination := "/sys/fs/cgroup/" + ctrlName 1180 if err := c.k.VFS().MakeSyntheticMountpoint(mountCtx, destination, root, creds); err != nil { 1181 // Log a warning, but attempt the mount anyway. 1182 log.Warningf("Failed to create mount point %q: %v", destination, err) 1183 } 1184 1185 target := &vfs.PathOperation{ 1186 Root: root, 1187 Start: root, 1188 Path: fspath.Parse(destination), 1189 } 1190 if err := c.k.VFS().BindAt(mountCtx, creds, &sourcePop, target, false); err != nil { 1191 log.Infof("error in bind mounting %v", err) 1192 return err 1193 } 1194 } 1195 c.cgroupsMounted = true 1196 return nil 1197 } 1198 1199 // mountSharedMaster mounts the master of a volume that is shared among 1200 // containers in a pod. 1201 func (c *containerMounter) mountSharedMaster(ctx context.Context, spec *specs.Spec, conf *config.Config, mntInfo *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) { 1202 // Mount the master using the options from the hint (mount annotations). 1203 origOpts := mntInfo.mount.Options 1204 mntInfo.mount.Options = mntInfo.hint.Mount.Options 1205 fsName, opts, err := getMountNameAndOptions(spec, conf, mntInfo, c.productName, c.containerName) 1206 mntInfo.mount.Options = origOpts 1207 if err != nil { 1208 return nil, err 1209 } 1210 if len(fsName) == 0 { 1211 return nil, fmt.Errorf("mount type not supported %q", mntInfo.hint.Mount.Type) 1212 } 1213 return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) 1214 } 1215 1216 // mountSharedSubmount binds mount to a previously mounted volume that is shared 1217 // among containers in the same pod. 1218 func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mntInfo *mountInfo, sharedMount *vfs.Mount) (*vfs.Mount, error) { 1219 if err := mntInfo.hint.checkCompatible(mntInfo.mount); err != nil { 1220 return nil, err 1221 } 1222 1223 // Generate mount point specific opts using mntInfo.mount. 1224 opts := ParseMountOptions(mntInfo.mount.Options) 1225 newMnt := c.k.VFS().NewDisconnectedMount(sharedMount.Filesystem(), sharedMount.Root(), opts) 1226 defer newMnt.DecRef(ctx) 1227 1228 root := mns.Root(ctx) 1229 defer root.DecRef(ctx) 1230 target := &vfs.PathOperation{ 1231 Root: root, 1232 Start: root, 1233 Path: fspath.Parse(mntInfo.mount.Destination), 1234 } 1235 1236 if err := c.makeMountPoint(ctx, creds, mns, mntInfo.mount.Destination); err != nil { 1237 return nil, fmt.Errorf("creating mount point %q: %w", mntInfo.mount.Destination, err) 1238 } 1239 1240 if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { 1241 return nil, err 1242 } 1243 log.Infof("Mounted %q type shared bind to %q", mntInfo.mount.Destination, mntInfo.hint.Name) 1244 return newMnt, nil 1245 } 1246 1247 func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error { 1248 root := mns.Root(ctx) 1249 defer root.DecRef(ctx) 1250 target := &vfs.PathOperation{ 1251 Root: root, 1252 Start: root, 1253 Path: fspath.Parse(dest), 1254 } 1255 // First check if mount point exists. When overlay is enabled, gofer doesn't 1256 // allow changes to the FS, making MakeSytheticMountpoint() ineffective 1257 // because MkdirAt fails with EROFS even if file exists. 1258 vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{}) 1259 if err == nil { 1260 // File exists, we're done. 1261 vd.DecRef(ctx) 1262 return nil 1263 } 1264 return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds) 1265 } 1266 1267 // configureRestore returns an updated context.Context including filesystem 1268 // state used by restore defined by conf. 1269 func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) { 1270 // Compare createMountNamespace(); rootfs always consumes a gofer FD and a 1271 // filestore FD is consumed if the rootfs GoferMountConf indicates so. 1272 fdmap := make(map[vfs.RestoreID]int) 1273 1274 rootKey := vfs.RestoreID{ContainerName: c.containerName, Path: "/"} 1275 fdmap[rootKey] = c.goferFDs.remove() 1276 1277 mfmap := make(map[string]*pgalloc.MemoryFile) 1278 if rootfsConf := c.goferMountConfs[0]; rootfsConf.IsFilestorePresent() { 1279 mf, err := createPrivateMemoryFile(c.goferFilestoreFDs.removeAsFD().ReleaseToFile("overlay-filestore"), rootKey) 1280 if err != nil { 1281 return ctx, fmt.Errorf("failed to create private memory file for mount rootfs: %w", err) 1282 } 1283 mfmap[rootKey.String()] = mf 1284 } 1285 // prepareMounts() consumes the remaining FDs for submounts. 1286 mounts, err := c.prepareMounts() 1287 if err != nil { 1288 return ctx, err 1289 } 1290 for i := range mounts { 1291 submount := &mounts[i] 1292 if submount.goferFD != nil { 1293 key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination} 1294 fdmap[key] = submount.goferFD.Release() 1295 } 1296 if submount.filestoreFD != nil { 1297 key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination} 1298 mf, err := createPrivateMemoryFile(submount.filestoreFD.ReleaseToFile("overlay-filestore"), key) 1299 if err != nil { 1300 return ctx, fmt.Errorf("failed to create private memory file for mount %q: %w", submount.mount.Destination, err) 1301 } 1302 mfmap[key.String()] = mf 1303 } 1304 } 1305 return context.WithValue(context.WithValue(ctx, vfs.CtxRestoreFilesystemFDMap, fdmap), pgalloc.CtxMemoryFileMap, mfmap), nil 1306 } 1307 1308 func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error { 1309 if info.spec.Linux != nil { 1310 // Create any device files specified in the spec. 1311 for _, dev := range info.spec.Linux.Devices { 1312 if err := createDeviceFile(ctx, creds, info, vfsObj, root, dev); err != nil { 1313 return err 1314 } 1315 } 1316 } 1317 if specutils.GPUFunctionalityRequestedViaHook(info.spec, info.conf) { 1318 // When using nvidia-container-runtime-hook, devices are not injected into 1319 // spec.Linux.Devices. So manually create appropriate device files. 1320 mode := os.FileMode(0666) 1321 nvidiaDevs := []specs.LinuxDevice{ 1322 specs.LinuxDevice{Path: "/dev/nvidiactl", Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: nvgpu.NV_CONTROL_DEVICE_MINOR, FileMode: &mode}, 1323 specs.LinuxDevice{Path: "/dev/nvidia-uvm", Type: "c", Major: int64(info.nvidiaUVMDevMajor), Minor: nvgpu.NVIDIA_UVM_PRIMARY_MINOR_NUMBER, FileMode: &mode}, 1324 } 1325 devClient := devutil.GoferClientFromContext(ctx) 1326 if devClient == nil { 1327 return fmt.Errorf("dev gofer client not found in context") 1328 } 1329 names, err := devClient.DirentNames(ctx) 1330 if err != nil { 1331 return fmt.Errorf("failed to get names of dirents from dev gofer: %w", err) 1332 } 1333 nvidiaDeviceRegex := regexp.MustCompile(`^nvidia(\d+)$`) 1334 for _, name := range names { 1335 ms := nvidiaDeviceRegex.FindStringSubmatch(name) 1336 if ms == nil { 1337 continue 1338 } 1339 minor, err := strconv.ParseUint(ms[1], 10, 32) 1340 if err != nil { 1341 return fmt.Errorf("invalid nvidia device name %q: %w", name, err) 1342 } 1343 nvidiaDevs = append(nvidiaDevs, specs.LinuxDevice{Path: fmt.Sprintf("/dev/nvidia%d", minor), Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: int64(minor), FileMode: &mode}) 1344 } 1345 for _, nvidiaDev := range nvidiaDevs { 1346 if err := createDeviceFile(ctx, creds, info, vfsObj, root, nvidiaDev); err != nil { 1347 return err 1348 } 1349 } 1350 } 1351 return nil 1352 } 1353 1354 func createDeviceFile(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, devSpec specs.LinuxDevice) error { 1355 mode := linux.FileMode(devSpec.FileMode.Perm()) 1356 var major, minor uint32 1357 // See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices. 1358 switch devSpec.Type { 1359 case "b": 1360 mode |= linux.S_IFBLK 1361 major = uint32(devSpec.Major) 1362 minor = uint32(devSpec.Minor) 1363 case "c", "u": 1364 mode |= linux.S_IFCHR 1365 major = uint32(devSpec.Major) 1366 minor = uint32(devSpec.Minor) 1367 case "p": 1368 mode |= linux.S_IFIFO 1369 default: 1370 return fmt.Errorf("specified device at %q has invalid type %q", devSpec.Path, devSpec.Type) 1371 } 1372 if devSpec.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && major != info.nvidiaUVMDevMajor { 1373 // nvidia-uvm's major device number is dynamically assigned, so the 1374 // number that it has on the host may differ from the number that 1375 // it has in sentry VFS; switch from the former to the latter. 1376 log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", devSpec.Major, info.nvidiaUVMDevMajor) 1377 major = info.nvidiaUVMDevMajor 1378 } 1379 return dev.CreateDeviceFile(ctx, vfsObj, creds, root, devSpec.Path, major, minor, mode, devSpec.UID, devSpec.GID) 1380 } 1381 1382 // registerTPUDevice registers a TPU device in vfsObj based on the given device ID. 1383 func registerTPUDevice(vfsObj *vfs.VirtualFilesystem, minor uint32, deviceID int64) error { 1384 switch deviceID { 1385 case tpu.TPUV4DeviceID, tpu.TPUV4liteDeviceID: 1386 return accel.RegisterTPUDevice(vfsObj, minor, deviceID == tpu.TPUV4liteDeviceID) 1387 case tpu.TPUV5eDeviceID: 1388 return tpuproxy.RegisterTPUDevice(vfsObj, minor) 1389 default: 1390 return fmt.Errorf("unsupported TPU device with ID: 0x%x", deviceID) 1391 } 1392 } 1393 1394 // pathGlobToPathRegex is a map that points a TPU PCI path glob to its path regex. 1395 // TPU v4 devices are accessible via /sys/devices/pci0000:00/<pci_address>/accel/accel# on the host. 1396 // TPU v5 devices are accessible via at /sys/devices/pci0000:00/<pci_address>/vfio-dev/vfio# on the host. 1397 var pathGlobToPathRegex = map[string]string{ 1398 pciPathGlobTPUv4: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/accel/accel(\d+)$`, 1399 pciPathGlobTPUv5: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/vfio-dev/vfio(\d+)$`, 1400 } 1401 1402 func tpuProxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error { 1403 if !specutils.TPUProxyIsEnabled(info.spec, info.conf) { 1404 return nil 1405 } 1406 // Enumerate all potential PCI paths where TPU devices are available and register the found TPU devices. 1407 for pciPathGlobal, pathRegex := range pathGlobToPathRegex { 1408 pciAddrs, err := filepath.Glob(pciPathGlobal) 1409 if err != nil { 1410 return fmt.Errorf("enumerating PCI device files: %w", err) 1411 } 1412 pciPathRegex := regexp.MustCompile(pathRegex) 1413 for _, pciPath := range pciAddrs { 1414 ms := pciPathRegex.FindStringSubmatch(pciPath) 1415 if ms == nil { 1416 continue 1417 } 1418 deviceNum, err := strconv.ParseUint(ms[1], 10, 32) 1419 if err != nil { 1420 return fmt.Errorf("parsing PCI device number: %w", err) 1421 } 1422 var deviceIDBytes []byte 1423 if deviceIDBytes, err = os.ReadFile(path.Join(pciPath, "device/device")); err != nil { 1424 return fmt.Errorf("reading PCI device ID: %w", err) 1425 } 1426 deviceIDStr := strings.Replace(string(deviceIDBytes), "0x", "", -1) 1427 deviceID, err := strconv.ParseInt(strings.TrimSpace(deviceIDStr), 16, 64) 1428 if err != nil { 1429 return fmt.Errorf("parsing PCI device ID: %w", err) 1430 } 1431 if err := registerTPUDevice(vfsObj, uint32(deviceNum), deviceID); err != nil { 1432 return fmt.Errorf("registering TPU driver: %w", err) 1433 } 1434 } 1435 } 1436 if err := tpuproxy.RegisterVfioDevice(vfsObj); err != nil { 1437 return fmt.Errorf("registering vfio driver: %w", err) 1438 } 1439 return nil 1440 } 1441 1442 func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error { 1443 if !specutils.NVProxyEnabled(info.spec, info.conf) { 1444 return nil 1445 } 1446 uvmDevMajor, err := vfsObj.GetDynamicCharDevMajor() 1447 if err != nil { 1448 return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err) 1449 } 1450 if err := nvproxy.Register(vfsObj, info.nvidiaDriverVersion, uvmDevMajor); err != nil { 1451 return fmt.Errorf("registering nvproxy driver: %w", err) 1452 } 1453 info.nvidiaUVMDevMajor = uvmDevMajor 1454 return nil 1455 }