gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/vfs.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"fmt"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"slices"
	"sort"
	"strconv"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/abi/nvgpu"
	"gvisor.dev/gvisor/pkg/abi/tpu"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/devutil"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/devices/accel"
	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy"
	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy"
	"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
	"gvisor.dev/gvisor/pkg/sentry/devices/tundev"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/dev"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/specutils"
)

// Supported filesystems that map to different internal filesystems.
const (
	Bind   = "bind"
	Nonefs = "none"
)

// SelfFilestorePrefix is the prefix of the self filestore file name.
const SelfFilestorePrefix = ".gvisor.filestore."

const (
	pciPathGlobTPUv4 = "/sys/devices/pci0000:00/*/accel/accel*"
	pciPathGlobTPUv5 = "/sys/devices/pci0000:00/*/vfio-dev/vfio*"
)

// SelfFilestorePath returns the path at which the self filestore file is
// stored for a given mount.
func SelfFilestorePath(mountSrc, sandboxID string) string {
	// We will place the filestore file in a gVisor specific hidden file inside
	// the mount being overlaid itself. The same volume can be overlaid by
	// multiple sandboxes. So make the filestore file unique to a sandbox by
	// suffixing the sandbox ID.
	return path.Join(mountSrc, selfFilestoreName(sandboxID))
}

func selfFilestoreName(sandboxID string) string {
	return SelfFilestorePrefix + sandboxID
}

// tmpfs has some extra supported options that we must pass through.
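// For illustration (values hypothetical), a spec entry such as
// "mode=1777,size=64m,uid=1000,gid=1000" is consumed from the mount options
// and forwarded to the sentry tmpfs mount data unchanged.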
var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"}

func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
	ctx := k.SupervisorContext()
	vfsObj := k.VFS()

	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserList: true,
		// TODO(b/29356795): Users may mount this once the terminals are in a
		// usable state.
		AllowUserMount: true,
	})
	vfsObj.MustRegisterFilesystemType(dev.Name, &dev.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{})
	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(erofs.Name, &erofs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserList: true,
	})
	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserList: true,
	})
	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})

	// Register devices.
	if err := memdev.Register(vfsObj); err != nil {
		return fmt.Errorf("registering memdev: %w", err)
	}
	if err := ttydev.Register(vfsObj); err != nil {
		return fmt.Errorf("registering ttydev: %w", err)
	}
	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
	if tunSupported {
		if err := tundev.Register(vfsObj); err != nil {
			return fmt.Errorf("registering tundev: %v", err)
		}
	}
	if err := fuse.Register(vfsObj); err != nil {
		return fmt.Errorf("registering fusedev: %w", err)
	}

	if err := nvproxyRegisterDevices(info, vfsObj); err != nil {
		return err
	}

	if err := tpuProxyRegisterDevices(info, vfsObj); err != nil {
		return err
	}

	return nil
}

func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
	// Create context with root credentials to mount the filesystem (the current
	// user may not be privileged enough).
	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
	rootProcArgs := *procArgs
	rootProcArgs.WorkingDirectory = "/"
	rootProcArgs.Credentials = rootCreds
	rootProcArgs.Umask = 0022
	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
	rootCtx := rootProcArgs.NewContext(mntr.k)

	mns, err := mntr.mountAll(rootCtx, rootCreds, info.spec, info.conf, &rootProcArgs)
	if err != nil {
		return fmt.Errorf("failed to setupFS: %w", err)
	}
	procArgs.MountNamespace = mns

	// If cgroups are mounted, then only check for the cgroup mounts per
	// container. Otherwise the root cgroups will be enabled.
	if mntr.cgroupsMounted {
		cgroupRegistry := mntr.k.CgroupRegistry()
		for _, ctrl := range kernel.CgroupCtrls {
			cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+mntr.containerID)
			if err != nil {
				return fmt.Errorf("cgroup mount for controller %v not found", ctrl)
			}
			if procArgs.InitialCgroups == nil {
				procArgs.InitialCgroups = make(map[kernel.Cgroup]struct{}, len(kernel.CgroupCtrls))
			}
			procArgs.InitialCgroups[cg] = struct{}{}
		}
	}

	mnsRoot := mns.Root(rootCtx)
	defer mnsRoot.DecRef(rootCtx)

	if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil {
		return fmt.Errorf("failed to create device files: %w", err)
	}

	// We are executing a file directly. Do not resolve the executable path.
	if procArgs.File != nil {
		return nil
	}
	// Resolve the executable path from working dir and environment.
	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
	if err != nil {
		return err
	}
	procArgs.Filename = resolved
	return nil
}

// compileMounts returns the supported mounts from the mount spec, adding any
// mandatory mounts that are required by the OCI specification.
//
// This function must NOT add/remove any gofer mounts or change their order.
func compileMounts(spec *specs.Spec, conf *config.Config, containerID string) []specs.Mount {
	// Keep track of whether proc and sys were mounted.
	var procMounted, sysMounted, devMounted, devptsMounted, cgroupsMounted bool
	var mounts []specs.Mount

	// Mount all submounts from the spec.
	for _, m := range spec.Mounts {
		// Mount all the cgroup controllers when the "/sys/fs/cgroup" mount is
		// present. Any other cgroup controller mounts would then be no-ops, so
		// drop them.
		if m.Type == cgroupfs.Name && cgroupsMounted {
			continue
		}

		switch filepath.Clean(m.Destination) {
		case "/proc":
			procMounted = true
		case "/sys":
			sysMounted = true
		case "/dev":
			m.Type = dev.Name
			devMounted = true
		case "/dev/pts":
			m.Type = devpts.Name
			devptsMounted = true
		case "/sys/fs/cgroup":
			cgroupsMounted = true
		}

		mounts = append(mounts, m)
	}

	// Mount proc and sys even if the user did not ask for it, as the spec
	// says we SHOULD.
	var mandatoryMounts []specs.Mount

	if !procMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        proc.Name,
			Destination: "/proc",
		})
	}
	if !sysMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        sys.Name,
			Destination: "/sys",
		})
	}
	if !devMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        dev.Name,
			Destination: "/dev",
		})
	}
	if !devptsMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        devpts.Name,
			Destination: "/dev/pts",
		})
	}

	// The mandatory mounts should be ordered right after the root, in case
	// there are submounts of these mandatory mounts already in the spec.
	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)

	return mounts
}

// goferMountData creates a slice of gofer mount data.
func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string {
	opts := []string{
		"trans=fd",
		"rfdno=" + strconv.Itoa(fd),
		"wfdno=" + strconv.Itoa(fd),
	}
	if fa == config.FileAccessShared {
		opts = append(opts, "cache=remote_revalidating")
	}
	if conf.DirectFS {
		opts = append(opts, "directfs")
	}
	if !conf.HostFifo.AllowOpen() {
		opts = append(opts, "disable_fifo_open")
	}
	return opts
}

// consumeMountOptions consumes mount options from opts based on allowedKeys
// and returns the remaining and consumed options.
func consumeMountOptions(opts []string, allowedKeys ...string) ([]string, []string, error) {
	var rem, out []string
	for _, o := range opts {
		ok, err := parseMountOption(o, allowedKeys...)
		if err != nil {
			return nil, nil, err
		}
		if ok {
			out = append(out, o)
		} else {
			rem = append(rem, o)
		}
	}
	return rem, out, nil
}

func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
	kv := strings.SplitN(opt, "=", 3)
	if len(kv) > 2 {
		return false, fmt.Errorf("invalid option %q", opt)
	}
	return slices.Contains(allowedKeys, kv[0]), nil
}

type fdDispenser struct {
	fds []*fd.FD
}

func (f *fdDispenser) remove() int {
	return f.removeAsFD().Release()
}

func (f *fdDispenser) removeAsFD() *fd.FD {
	if f.empty() {
		panic("fdDispenser out of fds")
	}
	rv := f.fds[0]
	f.fds = f.fds[1:]
	return rv
}

func (f *fdDispenser) empty() bool {
	return len(f.fds) == 0
}

type containerMounter struct {
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// goferFDs is the list of FDs to be dispensed for gofer mounts.
	goferFDs fdDispenser

	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
	// overlayfs mount for certain gofer mounts.
	goferFilestoreFDs fdDispenser

	// devGoferFD is the FD to attach the sandbox to the dev gofer.
	devGoferFD *fd.FD

	// goferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	goferMountConfs []GoferMountConf

	k *kernel.Kernel

	// hints is the set of pod mount hints for the sandbox.
	hints *PodMountHints

	// sharedMounts is a map of shared mounts that can be reused across
	// containers.
	sharedMounts map[string]*vfs.Mount

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// containerID is the ID for the container.
	containerID string

	// sandboxID is the ID for the whole sandbox.
	sandboxID     string
	containerName string

	// cgroupsMounted indicates if cgroups are mounted in the container.
	// This is used to set the InitialCgroups before starting the container
	// process.
	cgroupsMounted bool
}

func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter {
	return &containerMounter{
		root:              info.spec.Root,
		mounts:            compileMounts(info.spec, info.conf, info.procArgs.ContainerID),
		goferFDs:          fdDispenser{fds: info.goferFDs},
		goferFilestoreFDs: fdDispenser{fds: info.goferFilestoreFDs},
		devGoferFD:        info.devGoferFD,
		goferMountConfs:   info.goferMountConfs,
		k:                 k,
		hints:             hints,
		sharedMounts:      sharedMounts,
		productName:       productName,
		containerID:       info.cid,
		sandboxID:         sandboxID,
		containerName:     info.containerName,
	}
}

func (c *containerMounter) checkDispenser() error {
	if !c.goferFDs.empty() {
		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.goferFDs)
	}
	if !c.goferFilestoreFDs.empty() {
		return fmt.Errorf("not all gofer Filestore FDs were consumed, remaining: %v", c.goferFilestoreFDs)
	}
	if c.devGoferFD != nil && c.devGoferFD.FD() >= 0 {
		return fmt.Errorf("dev gofer FD was not consumed: %d", c.devGoferFD.FD())
	}
	return nil
}

func getMountAccessType(conf *config.Config, hint *MountHint) config.FileAccessType {
	if hint != nil {
		return hint.fileAccessType()
	}
	return conf.FileAccessMounts
}

func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, spec *specs.Spec, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
	log.Infof("Configuring container's file system")

	mns, err := c.createMountNamespace(rootCtx, conf, rootCreds)
	if err != nil {
		return nil, fmt.Errorf("creating mount namespace: %w", err)
	}
	rootProcArgs.MountNamespace = mns

	root := mns.Root(rootCtx)
	defer root.DecRef(rootCtx)
	if root.Mount().ReadOnly() {
		// Switch to ReadWrite while we setup submounts.
		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
		}
		// Restore back to ReadOnly at the end.
		defer func() {
			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
			}
		}()
	}

	// Mount submounts.
	if err := c.mountSubmounts(rootCtx, spec, conf, mns, rootCreds); err != nil {
		return nil, fmt.Errorf("mounting submounts: %w", err)
	}

	return mns, nil
}

// createMountNamespace creates the container's root mount and namespace.
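// For a lisafs-backed rootfs, the gofer mount data assembled below looks
// roughly like "trans=fd,rfdno=<ioFD>,wfdno=<ioFD>,overlayfs_stale_read"
// (FD number illustrative; see goferMountData).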
func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
	ioFD := c.goferFDs.remove()
	rootfsConf := c.goferMountConfs[0]

	var (
		fsName string
		opts   *vfs.MountOptions
	)
	switch {
	case rootfsConf.ShouldUseLisafs():
		fsName = gofer.Name

		data := goferMountData(ioFD, conf.FileAccess, conf)

		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
		// can only send mount options for specs.Mounts (specs.Root is missing
		// Options field). So assume root is always on top of overlayfs.
		data = append(data, "overlayfs_stale_read")

		// Configure the gofer dentry cache size.
		gofer.SetDentryCacheSize(conf.DCache)

		opts = &vfs.MountOptions{
			ReadOnly: c.root.Readonly,
			GetFilesystemOptions: vfs.GetFilesystemOptions{
				InternalMount: true,
				Data:          strings.Join(data, ","),
				InternalData: gofer.InternalFilesystemOptions{
					UniqueID: vfs.RestoreID{
						ContainerName: c.containerName,
						Path:          "/",
					},
				},
			},
		}

	case rootfsConf.ShouldUseErofs():
		fsName = erofs.Name
		opts = &vfs.MountOptions{
			ReadOnly: c.root.Readonly,
			GetFilesystemOptions: vfs.GetFilesystemOptions{
				InternalMount: true,
				Data:          fmt.Sprintf("ifd=%d", ioFD),
				InternalData: erofs.InternalFilesystemOptions{
					UniqueID: vfs.RestoreID{
						ContainerName: c.containerName,
						Path:          "/",
					},
				},
			},
		}

	default:
		return nil, fmt.Errorf("unsupported rootfs config: %+v", rootfsConf)
	}

	log.Infof("Mounting root with %s, ioFD: %d", fsName, ioFD)

	if rootfsConf.ShouldUseOverlayfs() {
		log.Infof("Adding overlay on top of root")
		var (
			err         error
			cleanup     func()
			filestoreFD *fd.FD
		)
		if rootfsConf.IsFilestorePresent() {
			filestoreFD = c.goferFilestoreFDs.removeAsFD()
		}
		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, filestoreFD, rootfsConf, "/")
		if err != nil {
			return nil, fmt.Errorf("mounting root with overlay: %w", err)
		}
		defer cleanup()
		fsName = overlay.Name
	}

	// The namespace root mount can't be changed, so let's mount a dummy
	// read-only tmpfs here. It simplifies creation of containers without
	// leaking the root file system.
	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs",
		&vfs.MountOptions{ReadOnly: true, Locked: true}, c.k)
	if err != nil {
		return nil, fmt.Errorf("setting up mount namespace: %w", err)
	}
	defer mns.DecRef(ctx)

	mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts)
	if err != nil {
		return nil, fmt.Errorf("creating root file system: %w", err)
	}
	defer mnt.DecRef(ctx)
	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
	}
	if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil {
		return nil, fmt.Errorf("mounting root file system: %w", err)
	}

	mns.IncRef()
	return mns, nil
}

// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
// layer using tmpfs, and returns overlay mount options. "cleanup" must be
// called after the options have been used to mount the overlay, to release
// refs on lower and upper mounts.
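// As an illustrative sketch (paths hypothetical), a writable bind mount at
// "/data" ends up stacked as:
//
//	lower:   the gofer (or erofs) mount, forced read-only
//	upper:   a tmpfs mount holding all modifications, optionally disk-backed
//	overlay: overlay.FilesystemOptions{UpperRoot: upper, LowerRoots: {lower}}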
func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, mountConf GoferMountConf, dst string) (*vfs.MountOptions, func(), error) {
	// First copy options from lower layer to upper layer and overlay. Clear
	// filesystem specific options.
	upperOpts := *lowerOpts
	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true}

	overlayOpts := *lowerOpts
	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true}

	// All writes go to the upper layer, be paranoid and make lower readonly.
	lowerOpts.ReadOnly = true
	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
	if err != nil {
		return nil, nil, err
	}
	cu := cleanup.Make(func() { lower.DecRef(ctx) })
	defer cu.Clean()

	// Determine the lower layer's root's type.
	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
		Root:  lowerRootVD,
		Start: lowerRootVD,
	}, &vfs.StatOptions{
		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
	})
	if err != nil {
		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
	}
	if stat.Mask&linux.STATX_TYPE == 0 {
		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
	}
	rootType := stat.Mode & linux.S_IFMT
	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
	}

	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
	tmpfsOpts := tmpfs.FilesystemOpts{
		RootFileType: uint16(rootType),
		// If a mount is being overlaid, it should not be limited by the default
		// tmpfs size limit.
		DisableDefaultSizeLimit: true,
	}
	if filestoreFD != nil {
		// Create memory file for disk-backed overlays.
		mf, err := createPrivateMemoryFile(filestoreFD.ReleaseToFile("overlay-filestore"), vfs.RestoreID{ContainerName: c.containerName, Path: dst})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to create memory file for overlay: %v", err)
		}
		tmpfsOpts.MemoryFile = mf
	}
	upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts
	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
	}
	cu.Add(func() { upper.DecRef(ctx) })

	// If the overlay mount consists of a regular file, copy up its contents
	// from the lower layer, since in the overlay the otherwise-empty upper
	// layer file will take precedence.
	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
	if rootType == linux.S_IFREG {
		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
			Root:  lowerRootVD,
			Start: lowerRootVD,
		}, &vfs.OpenOptions{
			Flags: linux.O_RDONLY,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
		}
		defer lowerFD.DecRef(ctx)
		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
			Root:  upperRootVD,
			Start: upperRootVD,
		}, &vfs.OpenOptions{
			Flags: linux.O_WRONLY,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
		}
		defer upperFD.DecRef(ctx)
		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
		}
	}

	// We need to hide the filestore from the containerized application.
	if mountConf.IsSelfBacked() {
		if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{
			Root:  upperRootVD,
			Start: upperRootVD,
			Path:  fspath.Parse(selfFilestoreName(c.sandboxID)),
		}); err != nil {
			return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err)
		}
	}

	// Propagate the lower layer's root's owner, group, and mode to the upper
	// layer's root for consistency with VFS1.
	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
		Root:  upperRootVD,
		Start: upperRootVD,
	}, &vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
			UID:  stat.UID,
			GID:  stat.GID,
			Mode: stat.Mode,
		},
	})
	if err != nil {
		return nil, nil, err
	}

	// Configure overlay with both layers.
	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
		UpperRoot:  upperRootVD,
		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
	}
	return &overlayOpts, cu.Release(), nil
}

func (c *containerMounter) mountSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
	mounts, err := c.prepareMounts()
	if err != nil {
		return err
	}

	for i := range mounts {
		submount := &mounts[i]
		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
		var (
			mnt *vfs.Mount
			err error
		)

		if submount.hint != nil && submount.hint.ShouldShareMount() {
			sharedMount, err := c.getSharedMount(ctx, spec, conf, submount, creds)
			if err != nil {
				return fmt.Errorf("getting shared mount %q: %w", submount.hint.Name, err)
			}
			mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount, sharedMount)
			if err != nil {
				return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err)
			}
		} else if submount.mount.Type == cgroupfs.Name {
			// Mount all the cgroups controllers.
			if err := c.mountCgroupSubmounts(ctx, spec, conf, mns, creds, submount); err != nil {
				return fmt.Errorf("mount cgroup %q: %w", submount.mount.Destination, err)
			}
		} else {
			mnt, err = c.mountSubmount(ctx, spec, conf, mns, creds, submount)
			if err != nil {
				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
			}
		}

		if mnt != nil && mnt.ReadOnly() {
			// Switch to ReadWrite while we setup submounts.
			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
			}
			// Restore back to ReadOnly at the end.
			defer func() {
				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
				}
			}()
		}
	}

	if err := c.mountTmp(ctx, spec, conf, creds, mns); err != nil {
		return fmt.Errorf(`mount submount "/tmp": %w`, err)
	}
	return nil
}

type mountInfo struct {
	mount          *specs.Mount
	goferFD        *fd.FD
	hint           *MountHint
	goferMountConf GoferMountConf
	filestoreFD    *fd.FD
}

func (c *containerMounter) prepareMounts() ([]mountInfo, error) {
	// If device gofer exists, connect to it.
	if c.devGoferFD != nil {
		if err := c.k.AddDevGofer(c.containerName, c.devGoferFD.Release()); err != nil {
			return nil, err
		}
	}
	// Associate bind mounts with their FDs before sorting since there is an
	// undocumented assumption that FDs are dispensed in the order in which
	// they are required by mounts.
	var mounts []mountInfo
	goferMntIdx := 1 // First index is for rootfs.
	for i := range c.mounts {
		info := mountInfo{
			mount: &c.mounts[i],
			hint:  c.hints.FindMount(c.mounts[i].Source),
		}
		specutils.MaybeConvertToBindMount(info.mount)
		if specutils.IsGoferMount(*info.mount) {
			info.goferMountConf = c.goferMountConfs[goferMntIdx]
			if info.goferMountConf.ShouldUseLisafs() {
				info.goferFD = c.goferFDs.removeAsFD()
			}
			if info.goferMountConf.IsFilestorePresent() {
				info.filestoreFD = c.goferFilestoreFDs.removeAsFD()
			}
			if info.goferMountConf.ShouldUseTmpfs() {
				specutils.ChangeMountType(info.mount, tmpfs.Name)
			}
			goferMntIdx++
		}
		mounts = append(mounts, info)
	}
	if err := c.checkDispenser(); err != nil {
		return nil, err
	}

	// Sort the mounts so that we don't place children before parents.
	sort.Slice(mounts, func(i, j int) bool {
		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
	})

	return mounts, nil
}

func (c *containerMounter) mountSubmount(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) {
	fsName, opts, err := getMountNameAndOptions(spec, conf, submount, c.productName, c.containerName)
	if err != nil {
		return nil, fmt.Errorf("mountOptions failed: %w", err)
	}
	if len(fsName) == 0 {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil, nil
	}

	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
	}

	if submount.goferMountConf.ShouldUseOverlayfs() {
		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
		var cleanup func()
		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.filestoreFD, submount.goferMountConf, submount.mount.Destination)
		if err != nil {
			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
		}
		defer cleanup()
		fsName = overlay.Name
	}

	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(submount.mount.Destination),
	}
	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
	if err != nil {
		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
	}
	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
	return mnt, nil
}

// getMountNameAndOptions retrieves the fsName and opts used for mounts.
func getMountNameAndOptions(spec *specs.Spec, conf *config.Config, m *mountInfo, productName, containerName string) (string, *vfs.MountOptions, error) {
	fsName := m.mount.Type
	var (
		mopts        = m.mount.Options
		data         []string
		internalData any
	)

	// Find filesystem name and FS specific data field.
	switch m.mount.Type {
	case devpts.Name, dev.Name, proc.Name:
		// Nothing to do.

	case Nonefs:
		fsName = sys.Name

	case sys.Name:
		sysData := &sys.InternalData{EnableTPUProxyPaths: specutils.TPUProxyIsEnabled(spec, conf)}
		if len(productName) > 0 {
			sysData.ProductName = productName
		}
		internalData = sysData

	case tmpfs.Name:
		var err error
		mopts, data, err = consumeMountOptions(mopts, tmpfsAllowedData...)
		if err != nil {
			return "", nil, err
		}
		if m.filestoreFD != nil {
			mf, err := createPrivateMemoryFile(m.filestoreFD.ReleaseToFile("tmpfs-filestore"), vfs.RestoreID{ContainerName: containerName, Path: m.mount.Destination})
			if err != nil {
				return "", nil, fmt.Errorf("failed to create memory file for tmpfs: %v", err)
			}
			internalData = tmpfs.FilesystemOpts{
				MemoryFile: mf,
				// If a mount is being overlaid with tmpfs, it should not be limited by
				// the default tmpfs size limit.
				DisableDefaultSizeLimit: true,
			}
		}

	case Bind:
		fsName = gofer.Name
		if m.goferFD == nil {
			// Check that an FD was provided, to fail fast.
			return "", nil, fmt.Errorf("gofer mount requires a connection FD")
		}
		var err error
		mopts, data, err = consumeMountOptions(mopts, gofer.SupportedMountOptions...)
		if err != nil {
			return "", nil, err
		}
		data = append(data, goferMountData(m.goferFD.Release(), getMountAccessType(conf, m.hint), conf)...)
		internalData = gofer.InternalFilesystemOptions{
			UniqueID: vfs.RestoreID{
				ContainerName: containerName,
				Path:          m.mount.Destination,
			},
		}

	case cgroupfs.Name:
		var err error
		mopts, data, err = consumeMountOptions(mopts, cgroupfs.SupportedMountOptions...)
		if err != nil {
			return "", nil, err
		}

	default:
		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
		return "", nil, nil
	}

	opts := ParseMountOptions(mopts)
	opts.GetFilesystemOptions = vfs.GetFilesystemOptions{
		Data:          strings.Join(data, ","),
		InternalData:  internalData,
		InternalMount: true,
	}

	return fsName, opts, nil
}

// ParseMountOptions converts specs.Mount.Options to vfs.MountOptions.
func ParseMountOptions(opts []string) *vfs.MountOptions {
	mountOpts := &vfs.MountOptions{
		GetFilesystemOptions: vfs.GetFilesystemOptions{
			InternalMount: true,
		},
	}
	// Note: update mountHint.CheckCompatible when more options are added.
	for _, o := range opts {
		switch o {
		case "ro":
			mountOpts.ReadOnly = true
		case "noatime":
			mountOpts.Flags.NoATime = true
		case "noexec":
			mountOpts.Flags.NoExec = true
		case "rw", "atime", "exec":
			// These use the default value and don't need to be set.
		case "bind", "rbind":
			// These are the same as a mount with type="bind".
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}
	return mountOpts
}

func parseKeyValue(s string) (string, string, bool) {
	tokens := strings.SplitN(s, "=", 2)
	if len(tokens) < 2 {
		return "", "", false
	}
	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
}

func createPrivateMemoryFile(file *os.File, restoreID vfs.RestoreID) (*pgalloc.MemoryFile, error) {
	mfOpts := pgalloc.MemoryFileOpts{
		// Private memory files are usually backed by files on disk. Ideally we
		// would confirm with fstatfs(2) but that is prohibited by seccomp.
		DiskBackedFile: true,
		// Disk backed files need to be decommitted on destroy to release disk space.
		DecommitOnDestroy: true,
		// sentry's seccomp filters don't allow the mmap(2) syscalls that
		// pgalloc.IMAWorkAroundForMemFile() uses. Users of private memory files
		// are expected to have performed the workaround outside the sandbox.
		DisableIMAWorkAround: true,
		// Private memory files need to be restored correctly using this ID.
		RestoreID: restoreID.String(),
	}
	return pgalloc.NewMemoryFile(file, mfOpts)
}

// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
//  1. /tmp is mounted explicitly: we should not override user's wish
//  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
func (c *containerMounter) mountTmp(ctx context.Context, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
	for _, m := range c.mounts {
		// m.Destination has been cleaned, so it's safe to use equality here.
		if m.Destination == "/tmp" {
			log.Debugf(`Explicit "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
			return nil
		}
	}

	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	pop := vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse("/tmp"),
	}
	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
	switch {
	case err == nil:
		defer fd.DecRef(ctx)

		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
			if dirent.Name != "." && dirent.Name != ".." {
				return linuxerr.ENOTEMPTY
			}
			return nil
		}))
		switch {
		case err == nil:
			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
			// If more than "." and ".." is found, skip internal tmpfs to prevent
			// hiding existing files.
			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
			return nil
		default:
			return fmt.Errorf("fd.IterDirents failed: %v", err)
		}
		fallthrough

	case linuxerr.Equals(linuxerr.ENOENT, err):
		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
		// tmpfs.
		tmpMount := specs.Mount{
			Type:        tmpfs.Name,
			Destination: "/tmp",
			// Sticky bit is added to prevent accidental deletion of files from
			// another user. This is normally done for /tmp.
			Options: []string{"mode=01777"},
		}
		if _, err := c.mountSubmount(ctx, spec, conf, mns, creds, &mountInfo{mount: &tmpMount}); err != nil {
			return fmt.Errorf("mountSubmount failed: %v", err)
		}
		return nil

	case linuxerr.Equals(linuxerr.ENOTDIR, err):
		// Not a dir?! Let it be.
		return nil

	default:
		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
	}
}

func (c *containerMounter) getSharedMount(ctx context.Context, spec *specs.Spec, conf *config.Config, mount *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) {
	sharedMount, ok := c.sharedMounts[mount.hint.Mount.Source]
	if ok {
		log.Infof("Using existing shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type)
		if mount.goferFD != nil {
			panic(fmt.Errorf("extra goferFD provided for shared mount %q", mount.hint.Name))
		}
		if mount.filestoreFD != nil {
			mount.filestoreFD.Close()
		}
		return sharedMount, nil
	}
	log.Infof("Mounting master of shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type)
	sharedMount, err := c.mountSharedMaster(ctx, spec, conf, mount, creds)
	if err != nil {
		return nil, fmt.Errorf("mounting shared master %q: %v", mount.hint.Name, err)
	}
	c.sharedMounts[mount.hint.Mount.Source] = sharedMount
	return sharedMount, nil
}

// mountCgroupMounts mounts the cgroups which are shared across all containers.
// Postcondition: Initialized k.cgroupMounts on success.
func (l *Loader) mountCgroupMounts(conf *config.Config, creds *auth.Credentials) error {
	ctx := l.k.SupervisorContext()
	for _, sopts := range kernel.CgroupCtrls {
		mopts := &vfs.MountOptions{
			GetFilesystemOptions: vfs.GetFilesystemOptions{
				Data:          string(sopts),
				InternalMount: true,
			},
		}
		fs, root, err := l.k.VFS().NewFilesystem(ctx, creds, "cgroup", cgroupfs.Name, mopts)
		if err != nil {
			return err
		}

		mount := l.k.VFS().NewDisconnectedMount(fs, root, mopts)
		// Private so that mounts created by containers do not appear
		// in other containers' cgroup paths.
		l.k.VFS().SetMountPropagation(mount, linux.MS_PRIVATE, false)
		l.k.AddCgroupMount(string(sopts), &kernel.CgroupMount{
			Fs:    fs,
			Root:  root,
			Mount: mount,
		})
	}
	log.Infof("created cgroup mounts for controllers %v", kernel.CgroupCtrls)
	return nil
}

// mountCgroupSubmounts mounts all the cgroup controller submounts for the
// container. The cgroup submounts are created under the root controller mount
// with containerID as the directory name and then bind mounts this directory
// inside the container's mount namespace.
func (c *containerMounter) mountCgroupSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) error {
	root := mns.Root(ctx)
	defer root.DecRef(ctx)

	// Mount "/sys/fs/cgroup" in the container's mount namespace.
	submount.mount.Type = tmpfs.Name
	mnt, err := c.mountSubmount(ctx, spec, conf, mns, creds, submount)
	if err != nil {
		return err
	}
	if mnt != nil && mnt.ReadOnly() {
		// Switch to ReadWrite while we setup submounts.
		if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
			return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
		}
		// Restore back to ReadOnly at the end.
		defer func() {
			if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
				panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
			}
		}()
	}

	// Mount all the cgroup controllers in the container's mount namespace.
	mountCtx := vfs.WithRoot(vfs.WithMountNamespace(ctx, mns), root)
	for _, ctrl := range kernel.CgroupCtrls {
		ctrlName := string(ctrl)
		cgroupMnt := c.k.GetCgroupMount(ctrlName)
		if cgroupMnt == nil {
			return fmt.Errorf("cgroup mount for controller %s not found", ctrlName)
		}

		cgroupMntVD := vfs.MakeVirtualDentry(cgroupMnt.Mount, cgroupMnt.Root)
		sourcePop := vfs.PathOperation{
			Root:  cgroupMntVD,
			Start: cgroupMntVD,
			// Use the containerID as the cgroup path.
			Path: fspath.Parse(c.containerID),
		}
		if err := c.k.VFS().MkdirAt(mountCtx, creds, &sourcePop, &vfs.MkdirOptions{
			Mode: 0755,
		}); err != nil {
			log.Infof("error in creating directory %v", err)
			return err
		}

		// Bind mount the new cgroup directory into the container's mount namespace.
		destination := "/sys/fs/cgroup/" + ctrlName
		if err := c.k.VFS().MakeSyntheticMountpoint(mountCtx, destination, root, creds); err != nil {
			// Log a warning, but attempt the mount anyway.
			log.Warningf("Failed to create mount point %q: %v", destination, err)
		}

		target := &vfs.PathOperation{
			Root:  root,
			Start: root,
			Path:  fspath.Parse(destination),
		}
		if err := c.k.VFS().BindAt(mountCtx, creds, &sourcePop, target, false); err != nil {
			log.Infof("error in bind mounting %v", err)
			return err
		}
	}
	c.cgroupsMounted = true
	return nil
}

// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod.
func (c *containerMounter) mountSharedMaster(ctx context.Context, spec *specs.Spec, conf *config.Config, mntInfo *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) {
	// Mount the master using the options from the hint (mount annotations).
	origOpts := mntInfo.mount.Options
	mntInfo.mount.Options = mntInfo.hint.Mount.Options
	fsName, opts, err := getMountNameAndOptions(spec, conf, mntInfo, c.productName, c.containerName)
	mntInfo.mount.Options = origOpts
	if err != nil {
		return nil, err
	}
	if len(fsName) == 0 {
		return nil, fmt.Errorf("mount type not supported %q", mntInfo.hint.Mount.Type)
	}
	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
}

// mountSharedSubmount binds mount to a previously mounted volume that is shared
// among containers in the same pod.
func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mntInfo *mountInfo, sharedMount *vfs.Mount) (*vfs.Mount, error) {
	if err := mntInfo.hint.checkCompatible(mntInfo.mount); err != nil {
		return nil, err
	}

	// Generate mount point specific opts using mntInfo.mount.
	opts := ParseMountOptions(mntInfo.mount.Options)
	newMnt := c.k.VFS().NewDisconnectedMount(sharedMount.Filesystem(), sharedMount.Root(), opts)
	defer newMnt.DecRef(ctx)

	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(mntInfo.mount.Destination),
	}

	if err := c.makeMountPoint(ctx, creds, mns, mntInfo.mount.Destination); err != nil {
		return nil, fmt.Errorf("creating mount point %q: %w", mntInfo.mount.Destination, err)
	}

	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
		return nil, err
	}
	log.Infof("Mounted %q type shared bind to %q", mntInfo.mount.Destination, mntInfo.hint.Name)
	return newMnt, nil
}

func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(dest),
	}
	// First check if mount point exists. When overlay is enabled, gofer doesn't
	// allow changes to the FS, making MakeSyntheticMountpoint() ineffective
	// because MkdirAt fails with EROFS even if file exists.
	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
	if err == nil {
		// File exists, we're done.
		vd.DecRef(ctx)
		return nil
	}
	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
}

// configureRestore populates fdmap and mfmap with the gofer FDs and private
// MemoryFiles needed to restore the container's filesystem state.
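// For illustration, the rootfs entry is keyed by
// vfs.RestoreID{ContainerName: c.containerName, Path: "/"}; each submount is
// keyed by the same container name and its mount destination.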
func (c *containerMounter) configureRestore(fdmap map[vfs.RestoreID]int, mfmap map[string]*pgalloc.MemoryFile) error {
	// Compare createMountNamespace(); rootfs always consumes a gofer FD and a
	// filestore FD is consumed if the rootfs GoferMountConf indicates so.
	rootKey := vfs.RestoreID{ContainerName: c.containerName, Path: "/"}
	fdmap[rootKey] = c.goferFDs.remove()

	if rootfsConf := c.goferMountConfs[0]; rootfsConf.IsFilestorePresent() {
		mf, err := createPrivateMemoryFile(c.goferFilestoreFDs.removeAsFD().ReleaseToFile("overlay-filestore"), rootKey)
		if err != nil {
			return fmt.Errorf("failed to create private memory file for mount rootfs: %w", err)
		}
		mfmap[rootKey.String()] = mf
	}
	// prepareMounts() consumes the remaining FDs for submounts.
	mounts, err := c.prepareMounts()
	if err != nil {
		return err
	}
	for i := range mounts {
		submount := &mounts[i]
		if submount.goferFD != nil {
			key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination}
			fdmap[key] = submount.goferFD.Release()
		}
		if submount.filestoreFD != nil {
			key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination}
			mf, err := createPrivateMemoryFile(submount.filestoreFD.ReleaseToFile("overlay-filestore"), key)
			if err != nil {
				return fmt.Errorf("failed to create private memory file for mount %q: %w", submount.mount.Destination, err)
			}
			mfmap[key.String()] = mf
		}
	}
	return nil
}

func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error {
	if info.spec.Linux != nil {
		// Create any device files specified in the spec.
		for _, dev := range info.spec.Linux.Devices {
			if err := createDeviceFile(ctx, creds, info, vfsObj, root, dev); err != nil {
				return err
			}
		}
	}
	if specutils.GPUFunctionalityRequestedViaHook(info.spec, info.conf) {
		// When using nvidia-container-runtime-hook, devices are not injected into
		// spec.Linux.Devices. So manually create appropriate device files.
		mode := os.FileMode(0666)
		nvidiaDevs := []specs.LinuxDevice{
			specs.LinuxDevice{Path: "/dev/nvidiactl", Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: nvgpu.NV_CONTROL_DEVICE_MINOR, FileMode: &mode},
			specs.LinuxDevice{Path: "/dev/nvidia-uvm", Type: "c", Major: int64(info.nvidiaUVMDevMajor), Minor: nvgpu.NVIDIA_UVM_PRIMARY_MINOR_NUMBER, FileMode: &mode},
		}
		devClient := devutil.GoferClientFromContext(ctx)
		if devClient == nil {
			return fmt.Errorf("dev gofer client not found in context")
		}
		names, err := devClient.DirentNames(ctx)
		if err != nil {
			return fmt.Errorf("failed to get names of dirents from dev gofer: %w", err)
		}
		nvidiaDeviceRegex := regexp.MustCompile(`^nvidia(\d+)$`)
		for _, name := range names {
			ms := nvidiaDeviceRegex.FindStringSubmatch(name)
			if ms == nil {
				continue
			}
			minor, err := strconv.ParseUint(ms[1], 10, 32)
			if err != nil {
				return fmt.Errorf("invalid nvidia device name %q: %w", name, err)
			}
			nvidiaDevs = append(nvidiaDevs, specs.LinuxDevice{Path: fmt.Sprintf("/dev/nvidia%d", minor), Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: int64(minor), FileMode: &mode})
		}
		for _, nvidiaDev := range nvidiaDevs {
			if err := createDeviceFile(ctx, creds, info, vfsObj, root, nvidiaDev); err != nil {
				return err
			}
		}
	}
	return nil
}

func createDeviceFile(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, devSpec specs.LinuxDevice) error {
	mode := linux.FileMode(devSpec.FileMode.Perm())
	var major, minor uint32
	// See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices.
	switch devSpec.Type {
	case "b":
		mode |= linux.S_IFBLK
		major = uint32(devSpec.Major)
		minor = uint32(devSpec.Minor)
	case "c", "u":
		mode |= linux.S_IFCHR
		major = uint32(devSpec.Major)
		minor = uint32(devSpec.Minor)
	case "p":
		mode |= linux.S_IFIFO
	default:
		return fmt.Errorf("specified device at %q has invalid type %q", devSpec.Path, devSpec.Type)
	}
	if devSpec.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && major != info.nvidiaUVMDevMajor {
		// nvidia-uvm's major device number is dynamically assigned, so the
		// number that it has on the host may differ from the number that
		// it has in sentry VFS; switch from the former to the latter.
		log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", devSpec.Major, info.nvidiaUVMDevMajor)
		major = info.nvidiaUVMDevMajor
	}
	return dev.CreateDeviceFile(ctx, vfsObj, creds, root, devSpec.Path, major, minor, mode, devSpec.UID, devSpec.GID)
}

// registerTPUDevice registers a TPU device in vfsObj based on the given device ID.
func registerTPUDevice(vfsObj *vfs.VirtualFilesystem, minor uint32, deviceID int64) error {
	switch deviceID {
	case tpu.TPUV4DeviceID, tpu.TPUV4liteDeviceID:
		return accel.RegisterTPUDevice(vfsObj, minor, deviceID == tpu.TPUV4liteDeviceID)
	case tpu.TPUV5eDeviceID:
		return tpuproxy.RegisterTPUDevice(vfsObj, minor)
	default:
		return fmt.Errorf("unsupported TPU device with ID: 0x%x", deviceID)
	}
}

// pathGlobToPathRegex is a map that points a TPU PCI path glob to its path regex.
// TPU v4 devices are accessible via /sys/devices/pci0000:00/<pci_address>/accel/accel# on the host.
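// For example, a hypothetical v4 path like
// /sys/devices/pci0000:00/0000:00:05.0/accel/accel0 matches the regex below
// and yields device number 0.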
// TPU v5 devices are accessible at /sys/devices/pci0000:00/<pci_address>/vfio-dev/vfio# on the host.
var pathGlobToPathRegex = map[string]string{
	pciPathGlobTPUv4: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/accel/accel(\d+)$`,
	pciPathGlobTPUv5: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/vfio-dev/vfio(\d+)$`,
}

func tpuProxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
	if !specutils.TPUProxyIsEnabled(info.spec, info.conf) {
		return nil
	}
	// Enumerate all potential PCI paths where TPU devices are available and register the found TPU devices.
	for pciPathGlobal, pathRegex := range pathGlobToPathRegex {
		pciAddrs, err := filepath.Glob(pciPathGlobal)
		if err != nil {
			return fmt.Errorf("enumerating PCI device files: %w", err)
		}
		pciPathRegex := regexp.MustCompile(pathRegex)
		for _, pciPath := range pciAddrs {
			ms := pciPathRegex.FindStringSubmatch(pciPath)
			if ms == nil {
				continue
			}
			deviceNum, err := strconv.ParseUint(ms[1], 10, 32)
			if err != nil {
				return fmt.Errorf("parsing PCI device number: %w", err)
			}
			var deviceIDBytes []byte
			if deviceIDBytes, err = os.ReadFile(path.Join(pciPath, "device/device")); err != nil {
				return fmt.Errorf("reading PCI device ID: %w", err)
			}
			deviceIDStr := strings.Replace(string(deviceIDBytes), "0x", "", -1)
			deviceID, err := strconv.ParseInt(strings.TrimSpace(deviceIDStr), 16, 64)
			if err != nil {
				return fmt.Errorf("parsing PCI device ID: %w", err)
			}
			if err := registerTPUDevice(vfsObj, uint32(deviceNum), deviceID); err != nil {
				return fmt.Errorf("registering TPU driver: %w", err)
			}
		}
	}
	if err := tpuproxy.RegisterVfioDevice(vfsObj); err != nil {
		return fmt.Errorf("registering vfio driver: %w", err)
	}
	return nil
}

func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
	if !specutils.NVProxyEnabled(info.spec, info.conf) {
		return nil
	}
	uvmDevMajor, err := vfsObj.GetDynamicCharDevMajor()
	if err != nil {
		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
	}
	if err := nvproxy.Register(vfsObj, info.nvidiaDriverVersion, uvmDevMajor); err != nil {
		return fmt.Errorf("registering nvproxy driver: %w", err)
	}
	info.nvidiaUVMDevMajor = uvmDevMajor
	return nil
}