github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/boot/vfs.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"fmt"
	"path"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/fd"
	"github.com/MerlinKodo/gvisor/pkg/fspath"
	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/accel"
	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/memdev"
	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/nvproxy"
	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/ttydev"
	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/tundev"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/cgroupfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/devpts"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/devtmpfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/fuse"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/gofer"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/mqfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/overlay"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/proc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/sys"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user"
	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/runsc/config"
	"github.com/MerlinKodo/gvisor/runsc/specutils"
	specs "github.com/opencontainers/runtime-spec/specs-go"
)

// Supported filesystems that map to different internal filesystems.
const (
	Bind   = "bind"
	Nonefs = "none"
)

// SelfOverlayFilestorePrefix is the prefix in the file name of the
// self overlay filestore file.
const SelfOverlayFilestorePrefix = ".gvisor.overlay.img."

// SelfOverlayFilestorePath returns the path at which the self overlay
// filestore file is stored for a given mount.
func SelfOverlayFilestorePath(mountSrc, sandboxID string) string {
	// We will place the filestore file in a gVisor specific hidden file inside
	// the mount being overlay-ed itself. The same volume can be overlay-ed by
	// multiple sandboxes. So make the filestore file unique to a sandbox by
	// suffixing the sandbox ID.
	return path.Join(mountSrc, selfOverlayFilestoreName(sandboxID))
}

func selfOverlayFilestoreName(sandboxID string) string {
	return SelfOverlayFilestorePrefix + sandboxID
}

// tmpfs has some extra supported options that we must pass through.
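// For example (illustrative values, not from the original source), a spec
// option list such as
//
//	["rw", "size=64m", "mode=1777", "nosuid"]
//
// is filtered by parseAndFilterOptions down to "size=64m,mode=1777", since
// only the keys listed below are forwarded to the tmpfs mount.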
var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"}

func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
	ctx := k.SupervisorContext()
	creds := auth.NewRootCredentials(k.RootUserNamespace())
	vfsObj := k.VFS()

	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserList: true,
		// TODO(b/29356795): Users may mount this once the terminals are in a
		// usable state.
		AllowUserMount: false,
	})
	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserList: true,
	})
	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
	vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})

	// Register devices.
	if err := memdev.Register(vfsObj); err != nil {
		return fmt.Errorf("registering memdev: %w", err)
	}
	if err := ttydev.Register(vfsObj); err != nil {
		return fmt.Errorf("registering ttydev: %w", err)
	}
	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
	if tunSupported {
		if err := tundev.Register(vfsObj); err != nil {
			return fmt.Errorf("registering tundev: %v", err)
		}
	}
	if err := fuse.Register(vfsObj); err != nil {
		return fmt.Errorf("registering fusedev: %w", err)
	}

	// Setup files in devtmpfs.
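	// (Illustrative note, not in the original file: the calls below populate
	// the sentry's devtmpfs with the usual device nodes, e.g. /dev/null and
	// /dev/zero from memdev, /dev/net/tun from tundev when supported, and
	// /dev/fuse from the fuse package.)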
	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
	if err != nil {
		return fmt.Errorf("creating devtmpfs accessor: %w", err)
	}
	defer a.Release(ctx)

	if err := a.UserspaceInit(ctx); err != nil {
		return fmt.Errorf("initializing userspace: %w", err)
	}
	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
	}
	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
	}
	if tunSupported {
		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
		}
	}
	if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
		return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
	}

	if err := nvproxyRegisterDevicesAndCreateFiles(ctx, info, k, vfsObj, a); err != nil {
		return err
	}

	if err := tpuProxyRegisterDevicesAndCreateFiles(ctx, info, k, vfsObj, a); err != nil {
		return err
	}

	return nil
}

func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
	// Create context with root credentials to mount the filesystem (the current
	// user may not be privileged enough).
	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
	rootProcArgs := *procArgs
	rootProcArgs.WorkingDirectory = "/"
	rootProcArgs.Credentials = rootCreds
	rootProcArgs.Umask = 0022
	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
	rootCtx := rootProcArgs.NewContext(mntr.k)

	mns, err := mntr.mountAll(rootCtx, rootCreds, info.conf, &rootProcArgs)
	if err != nil {
		return fmt.Errorf("failed to setupFS: %w", err)
	}
	procArgs.MountNamespace = mns

	mnsRoot := mns.Root(rootCtx)
	defer mnsRoot.DecRef(rootCtx)

	if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil {
		return fmt.Errorf("failed to create device files: %w", err)
	}

	// We are executing a file directly. Do not resolve the executable path.
	if procArgs.File != nil {
		return nil
	}
	// Resolve the executable path from working dir and environment.
	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
	if err != nil {
		return err
	}
	procArgs.Filename = resolved
	return nil
}

// compileMounts returns the supported mounts from the mount spec, adding any
// mandatory mounts that are required by the OCI specification.
//
// This function must NOT add/remove any gofer mounts or change their order.
func compileMounts(spec *specs.Spec, conf *config.Config) []specs.Mount {
	// Keep track of whether proc and sys were mounted.
	var procMounted, sysMounted, devMounted, devptsMounted bool
	var mounts []specs.Mount

	// Mount all submounts from the spec.
	for _, m := range spec.Mounts {
		// Unconditionally drop any cgroupfs mounts. If requested, we'll add our
		// own below.
		if m.Type == cgroupfs.Name {
			continue
		}
		switch filepath.Clean(m.Destination) {
		case "/proc":
			procMounted = true
		case "/sys":
			sysMounted = true
		case "/dev":
			m.Type = devtmpfs.Name
			devMounted = true
		case "/dev/pts":
			m.Type = devpts.Name
			devptsMounted = true
		}
		mounts = append(mounts, m)
	}

	// Mount proc and sys even if the user did not ask for it, as the spec
	// says we SHOULD.
	var mandatoryMounts []specs.Mount

	if conf.Cgroupfs {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        tmpfs.Name,
			Destination: "/sys/fs/cgroup",
		})
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        cgroupfs.Name,
			Destination: "/sys/fs/cgroup/memory",
			Options:     []string{"memory"},
		})
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        cgroupfs.Name,
			Destination: "/sys/fs/cgroup/cpu",
			Options:     []string{"cpu"},
		})
	}

	if !procMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        proc.Name,
			Destination: "/proc",
		})
	}
	if !sysMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        sys.Name,
			Destination: "/sys",
		})
	}
	if !devMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        devtmpfs.Name,
			Destination: "/dev",
		})
	}
	if !devptsMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        devpts.Name,
			Destination: "/dev/pts",
		})
	}

	// The mandatory mounts should be ordered right after the root, in case
	// there are submounts of these mandatory mounts already in the spec.
	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)

	return mounts
}

// goferMountData creates a slice of gofer mount data.
func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string {
	opts := []string{
		"trans=fd",
		"rfdno=" + strconv.Itoa(fd),
		"wfdno=" + strconv.Itoa(fd),
	}
	if fa == config.FileAccessShared {
		opts = append(opts, "cache=remote_revalidating")
	}
	if conf.DirectFS {
		opts = append(opts, "directfs")
	}
	if !conf.HostFifo.AllowOpen() {
		opts = append(opts, "disable_fifo_open")
	}
	return opts
}

// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
// keys.
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
	var out []string
	for _, o := range opts {
		ok, err := parseMountOption(o, allowedKeys...)
		if err != nil {
			return nil, err
		}
		if ok {
			out = append(out, o)
		}
	}
	return out, nil
}

func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
	kv := strings.SplitN(opt, "=", 3)
	if len(kv) > 2 {
		return false, fmt.Errorf("invalid option %q", opt)
	}
	return specutils.ContainsStr(allowedKeys, kv[0]), nil
}

type fdDispenser struct {
	fds []*fd.FD
}

func (f *fdDispenser) remove() int {
	return f.removeAsFD().Release()
}

func (f *fdDispenser) removeAsFD() *fd.FD {
	if f.empty() {
		panic("fdDispenser out of fds")
	}
	rv := f.fds[0]
	f.fds = f.fds[1:]
	return rv
}

func (f *fdDispenser) empty() bool {
	return len(f.fds) == 0
}

type containerMounter struct {
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// fds is the list of FDs to be dispensed for mounts that require it.
	fds fdDispenser

	// overlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	overlayFilestoreFDs fdDispenser

	// overlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in `mounts` slice above (in the same order).
	overlayMediums []OverlayMedium

	k *kernel.Kernel

	// hints is the set of pod mount hints for the sandbox.
	hints *PodMountHints

	// sharedMounts is a map of shared mounts that can be reused across
	// containers.
	sharedMounts map[string]*vfs.Mount

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// sandboxID is the ID for the whole sandbox.
	sandboxID string
}

func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter {
	return &containerMounter{
		root:                info.spec.Root,
		mounts:              compileMounts(info.spec, info.conf),
		fds:                 fdDispenser{fds: info.goferFDs},
		overlayFilestoreFDs: fdDispenser{fds: info.overlayFilestoreFDs},
		overlayMediums:      info.overlayMediums,
		k:                   k,
		hints:               hints,
		sharedMounts:        sharedMounts,
		productName:         productName,
		sandboxID:           sandboxID,
	}
}

func (c *containerMounter) checkDispenser() error {
	if !c.fds.empty() {
		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
	}
	return nil
}

func getMountAccessType(conf *config.Config, mount *specs.Mount, hint *MountHint) config.FileAccessType {
	if hint != nil {
		return hint.fileAccessType()
	}
	return conf.FileAccessMounts
}

func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
	log.Infof("Configuring container's file system")

	mns, err := c.createMountNamespace(rootCtx, conf, rootCreds)
	if err != nil {
		return nil, fmt.Errorf("creating mount namespace: %w", err)
	}
	rootProcArgs.MountNamespace = mns

	root := mns.Root(rootCtx)
	defer root.DecRef(rootCtx)
	if root.Mount().ReadOnly() {
		// Switch to ReadWrite while we setup submounts.
		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
		}
		// Restore back to ReadOnly at the end.
		defer func() {
			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
			}
		}()
	}

	// Mount submounts.
	if err := c.mountSubmounts(rootCtx, conf, mns, rootCreds); err != nil {
		return nil, fmt.Errorf("mounting submounts: %w", err)
	}

	return mns, nil
}

// createMountNamespace creates the container's root mount and namespace.
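// The resulting layout is roughly (illustrative sketch, not a literal VFS
// dump):
//
//	namespace root: read-only tmpfs "rootfs" (empty placeholder)
//	└── "/": gofer-backed root mount, optionally wrapped in an overlay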
func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
	ioFD := c.fds.remove()
	data := goferMountData(ioFD, conf.FileAccess, conf)

	// We can't check for overlayfs here because sandbox is chroot'ed and gofer
	// can only send mount options for specs.Mounts (specs.Root is missing
	// Options field). So assume root is always on top of overlayfs.
	data = append(data, "overlayfs_stale_read")

	// Configure the gofer dentry cache size.
	gofer.SetDentryCacheSize(conf.DCache)

	log.Infof("Mounting root with gofer, ioFD: %d", ioFD)
	opts := &vfs.MountOptions{
		ReadOnly: c.root.Readonly,
		GetFilesystemOptions: vfs.GetFilesystemOptions{
			Data: strings.Join(data, ","),
			InternalData: gofer.InternalFilesystemOptions{
				UniqueID: "/",
			},
		},
		InternalMount: true,
	}

	fsName := gofer.Name
	if c.overlayMediums[0].IsEnabled() {
		log.Infof("Adding overlay on top of root")
		var (
			err              error
			cleanup          func()
			overlayFilestore *fd.FD
		)
		if c.overlayMediums[0].IsBackedByHostFile() {
			overlayFilestore = c.overlayFilestoreFDs.removeAsFD()
		}
		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, overlayFilestore, c.overlayMediums[0])
		if err != nil {
			return nil, fmt.Errorf("mounting root with overlay: %w", err)
		}
		defer cleanup()
		fsName = overlay.Name
	}

	// The namespace root mount can't be changed, so let's mount a dummy
	// read-only tmpfs here. It simplifies creation of containers without
	// leaking the root file system.
	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs",
		&vfs.MountOptions{ReadOnly: true}, c.k)
	if err != nil {
		return nil, fmt.Errorf("setting up mount namespace: %w", err)
	}
	defer mns.DecRef(ctx)

	mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts)
	if err != nil {
		return nil, fmt.Errorf("creating root file system: %w", err)
	}
	defer mnt.DecRef(ctx)
	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
	}
	if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil {
		return nil, fmt.Errorf("mounting root file system: %w", err)
	}

	mns.IncRef()
	return mns, nil
}

// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
// layer using tmpfs, and returns overlay mount options. "cleanup" must be
// called after the options have been used to mount the overlay, to release
// refs on lower and upper mounts.
func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, medium OverlayMedium) (*vfs.MountOptions, func(), error) {
	// First copy options from lower layer to upper layer and overlay. Clear
	// filesystem specific options.
	upperOpts := *lowerOpts
	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}

	overlayOpts := *lowerOpts
	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}

	// All writes go to the upper layer, be paranoid and make lower readonly.
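	//
	// The layering assembled below looks roughly like this (illustrative
	// sketch only):
	//
	//	overlay (options returned to the caller)
	//	├── upper: tmpfs, read-write, optionally backed by filestoreFD
	//	└── lower: lowerFSName mount, forced read-only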
	lowerOpts.ReadOnly = true
	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
	if err != nil {
		return nil, nil, err
	}
	cu := cleanup.Make(func() { lower.DecRef(ctx) })
	defer cu.Clean()

	// Determine the lower layer's root's type.
	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
		Root:  lowerRootVD,
		Start: lowerRootVD,
	}, &vfs.StatOptions{
		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
	})
	if err != nil {
		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
	}
	if stat.Mask&linux.STATX_TYPE == 0 {
		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
	}
	rootType := stat.Mode & linux.S_IFMT
	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
	}

	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
	tmpfsOpts := tmpfs.FilesystemOpts{
		RootFileType: uint16(rootType),
		FilestoreFD:  filestoreFD,
		// If a mount is being overlaid, it should not be limited by the default
		// tmpfs size limit.
		DisableDefaultSizeLimit: true,
	}
	upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts
	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
	}
	cu.Add(func() { upper.DecRef(ctx) })

	// If the overlay mount consists of a regular file, copy up its contents
	// from the lower layer, since in the overlay the otherwise-empty upper
	// layer file will take precedence.
	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
	if rootType == linux.S_IFREG {
		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
			Root:  lowerRootVD,
			Start: lowerRootVD,
		}, &vfs.OpenOptions{
			Flags: linux.O_RDONLY,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
		}
		defer lowerFD.DecRef(ctx)
		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
			Root:  upperRootVD,
			Start: upperRootVD,
		}, &vfs.OpenOptions{
			Flags: linux.O_WRONLY,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
		}
		defer upperFD.DecRef(ctx)
		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
		}
	}

	// We need to hide the filestore from the containerized application.
	if medium == SelfMedium {
		if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{
			Root:  upperRootVD,
			Start: upperRootVD,
			Path:  fspath.Parse(selfOverlayFilestoreName(c.sandboxID)),
		}); err != nil {
			return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err)
		}
	}

	// Propagate the lower layer's root's owner, group, and mode to the upper
	// layer's root for consistency with VFS1.
	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
		Root:  upperRootVD,
		Start: upperRootVD,
	}, &vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
			UID:  stat.UID,
			GID:  stat.GID,
			Mode: stat.Mode,
		},
	})
	if err != nil {
		return nil, nil, err
	}

	// Configure overlay with both layers.
	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
		UpperRoot:  upperRootVD,
		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
	}
	return &overlayOpts, cu.Release(), nil
}

func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
	mounts, err := c.prepareMounts()
	if err != nil {
		return err
	}

	for i := range mounts {
		submount := &mounts[i]
		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
		var (
			mnt *vfs.Mount
			err error
		)

		if submount.hint != nil && submount.hint.shouldShareMount() {
			sharedMount, ok := c.sharedMounts[submount.hint.Mount.Source]
			if !ok {
				return fmt.Errorf("shared mount %q not found", submount.hint.Name)
			}
			mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount.mount, submount.hint, sharedMount)
			if err != nil {
				return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err)
			}
		} else {
			mnt, err = c.mountSubmount(ctx, conf, mns, creds, submount)
			if err != nil {
				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
			}
		}

		if mnt != nil && mnt.ReadOnly() {
			// Switch to ReadWrite while we setup submounts.
			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
			}
			// Restore back to ReadOnly at the end.
			defer func() {
				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
				}
			}()
		}
	}

	if err := c.mountTmp(ctx, conf, creds, mns); err != nil {
		return fmt.Errorf(`mount submount "/tmp": %w`, err)
	}
	return nil
}

type mountInfo struct {
	mount              *specs.Mount
	fd                 int
	hint               *MountHint
	overlayMedium      OverlayMedium
	overlayFilestoreFD *fd.FD
}

func newNonGoferMountInfo(mount *specs.Mount) *mountInfo {
	return &mountInfo{mount: mount, fd: -1}
}

func (c *containerMounter) prepareMounts() ([]mountInfo, error) {
	// Associate bind mounts with their FDs before sorting since there is an
	// undocumented assumption that FDs are dispensed in the order in which
	// they are required by mounts.
	var mounts []mountInfo
	goferMntIdx := 1 // First index is for rootfs.
	for i := range c.mounts {
		m := &c.mounts[i]
		specutils.MaybeConvertToBindMount(m)

		// Only bind mounts use host FDs; see
		// containerMounter.getMountNameAndOptions.
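		// (Illustrative example, not from the original source: with a rootfs
		// plus two gofer bind mounts, goferFDs arrive as [rootfs, bind0, bind1]
		// and overlayMediums is indexed the same way, so bind0 consumes the
		// next dispensed FD and overlayMediums[1].)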
		info := mountInfo{
			mount:         m,
			fd:            -1,
			hint:          c.hints.FindMount(m),
			overlayMedium: NoOverlay,
		}
		if specutils.IsGoferMount(*m) {
			info.fd = c.fds.remove()
			info.overlayMedium = c.overlayMediums[goferMntIdx]
			if info.overlayMedium.IsBackedByHostFile() {
				info.overlayFilestoreFD = c.overlayFilestoreFDs.removeAsFD()
			}
			goferMntIdx++
		}
		mounts = append(mounts, info)
	}
	if err := c.checkDispenser(); err != nil {
		return nil, err
	}

	// Sort the mounts so that we don't place children before parents.
	sort.Slice(mounts, func(i, j int) bool {
		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
	})

	return mounts, nil
}

func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) {
	fsName, opts, err := getMountNameAndOptions(conf, submount, c.productName)
	if err != nil {
		return nil, fmt.Errorf("mountOptions failed: %w", err)
	}
	if len(fsName) == 0 {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil, nil
	}

	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
	}

	if submount.overlayMedium.IsEnabled() {
		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
		var cleanup func()
		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.overlayFilestoreFD, submount.overlayMedium)
		if err != nil {
			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
		}
		defer cleanup()
		fsName = overlay.Name
	}

	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(submount.mount.Destination),
	}
	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
	if err != nil {
		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
	}
	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
	return mnt, nil
}

// getMountNameAndOptions retrieves the fsName and opts used for mounts.
func getMountNameAndOptions(conf *config.Config, m *mountInfo, productName string) (string, *vfs.MountOptions, error) {
	fsName := m.mount.Type
	var (
		data         []string
		internalData any
	)

	// Find filesystem name and FS specific data field.
	switch m.mount.Type {
	case devpts.Name, devtmpfs.Name, proc.Name:
		// Nothing to do.

	case Nonefs:
		fsName = sys.Name

	case sys.Name:
		sysData := &sys.InternalData{EnableAccelSysfs: conf.TPUProxy}
		if len(productName) > 0 {
			sysData.ProductName = productName
		}
		internalData = sysData

	case tmpfs.Name:
		var err error
		data, err = parseAndFilterOptions(m.mount.Options, tmpfsAllowedData...)
		if err != nil {
			return "", nil, err
		}

	case Bind:
		fsName = gofer.Name
		if m.fd < 0 {
			// Check that an FD was provided to fail fast.
			return "", nil, fmt.Errorf("gofer mount requires a connection FD")
		}
		data = goferMountData(m.fd, getMountAccessType(conf, m.mount, m.hint), conf)
		internalData = gofer.InternalFilesystemOptions{
			UniqueID: m.mount.Destination,
		}

	case cgroupfs.Name:
		var err error
		data, err = parseAndFilterOptions(m.mount.Options, cgroupfs.SupportedMountOptions...)
		if err != nil {
			return "", nil, err
		}

	default:
		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
		return "", nil, nil
	}

	opts := ParseMountOptions(m.mount.Options)
	opts.GetFilesystemOptions = vfs.GetFilesystemOptions{
		Data:         strings.Join(data, ","),
		InternalData: internalData,
	}

	return fsName, opts, nil
}

// ParseMountOptions converts specs.Mount.Options to vfs.MountOptions.
func ParseMountOptions(opts []string) *vfs.MountOptions {
	mountOpts := &vfs.MountOptions{
		InternalMount: true,
	}
	// Note: update mountHint.CheckCompatible when more options are added.
	for _, o := range opts {
		switch o {
		case "ro":
			mountOpts.ReadOnly = true
		case "noatime":
			mountOpts.Flags.NoATime = true
		case "noexec":
			mountOpts.Flags.NoExec = true
		case "rw", "atime", "exec":
			// These use the default value and don't need to be set.
		case "bind", "rbind":
			// These are the same as a mount with type="bind".
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}
	return mountOpts
}

func parseKeyValue(s string) (string, string, bool) {
	tokens := strings.SplitN(s, "=", 2)
	if len(tokens) < 2 {
		return "", "", false
	}
	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
}

// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
// 1. /tmp is mounted explicitly: we should not override user's wish
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
	for _, m := range c.mounts {
		// m.Destination has been cleaned, so it's safe to use equality here.
		if m.Destination == "/tmp" {
			log.Debugf(`Explicit "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
			return nil
		}
	}

	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	pop := vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse("/tmp"),
	}
	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
	switch {
	case err == nil:
		defer fd.DecRef(ctx)

		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
			if dirent.Name != "." && dirent.Name != ".." {
				return linuxerr.ENOTEMPTY
			}
			return nil
		}))
		switch {
		case err == nil:
			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
			// If more than "." and ".." is found, skip internal tmpfs to prevent
			// hiding existing files.
			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
			return nil
		default:
			return fmt.Errorf("fd.IterDirents failed: %v", err)
		}
		fallthrough

	case linuxerr.Equals(linuxerr.ENOENT, err):
		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
		// tmpfs.
		tmpMount := specs.Mount{
			Type:        tmpfs.Name,
			Destination: "/tmp",
			// Sticky bit is added to prevent accidental deletion of files from
			// another user. This is normally done for /tmp.
			Options: []string{"mode=01777"},
		}
		if _, err := c.mountSubmount(ctx, conf, mns, creds, newNonGoferMountInfo(&tmpMount)); err != nil {
			return fmt.Errorf("mountSubmount failed: %v", err)
		}
		return nil

	case linuxerr.Equals(linuxerr.ENOTDIR, err):
		// Not a dir?! Let it be.
		return nil

	default:
		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
	}
}

// processHints processes annotations that contain hints about how volumes
// should be mounted (e.g. a volume shared between containers).
// Precondition: Must only be called once during the loader sequence for the
// root container.
// Postcondition: Initializes l.sharedMounts on success.
func (l *Loader) processHints(conf *config.Config, creds *auth.Credentials) error {
	ctx := l.k.SupervisorContext()
	var sharedMounts map[string]*vfs.Mount
	for _, hint := range l.mountHints.Mounts {
		if !hint.shouldShareMount() {
			continue
		}

		log.Infof("Mounting master of shared mount %q from %q type %q", hint.Name, hint.Mount.Source, hint.Mount.Type)
		mnt, err := l.mountSharedMaster(ctx, conf, hint, creds)
		if err != nil {
			return fmt.Errorf("mounting shared master %q: %v", hint.Name, err)
		}
		if sharedMounts == nil {
			sharedMounts = make(map[string]*vfs.Mount)
		}
		sharedMounts[hint.Mount.Source] = mnt
	}
	l.sharedMounts = sharedMounts
	return nil
}

// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod.
func (l *Loader) mountSharedMaster(ctx context.Context, conf *config.Config, hint *MountHint, creds *auth.Credentials) (*vfs.Mount, error) {
	// Map mount type to filesystem name, and parse out the options that we are
	// capable of dealing with.
	mntInfo := newNonGoferMountInfo(&hint.Mount)
	fsName, opts, err := getMountNameAndOptions(conf, mntInfo, l.productName)
	if err != nil {
		return nil, err
	}
	if len(fsName) == 0 {
		return nil, fmt.Errorf("mount type not supported %q", hint.Mount.Type)
	}
	return l.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
}

// mountSharedSubmount bind-mounts to a previously mounted volume that is
// shared among containers in the same pod.
func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount *specs.Mount, srcHint *MountHint, srcMount *vfs.Mount) (*vfs.Mount, error) {
	if err := srcHint.checkCompatible(mount); err != nil {
		return nil, err
	}

	// Ignore data and useOverlay because these were already applied to
	// the master mount.
	_, opts, err := getMountNameAndOptions(conf, newNonGoferMountInfo(mount), c.productName)
	if err != nil {
		return nil, err
	}
	newMnt := c.k.VFS().NewDisconnectedMount(srcMount.Filesystem(), srcMount.Root(), opts)
	defer newMnt.DecRef(ctx)

	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(mount.Destination),
	}

	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
	}

	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
		return nil, err
	}
	log.Infof("Mounted %q type shared bind to %q", mount.Destination, srcHint.Name)
	return newMnt, nil
}

func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(dest),
	}
	// First check if the mount point exists. When overlay is enabled, the gofer
	// doesn't allow changes to the FS, making MakeSyntheticMountpoint()
	// ineffective because MkdirAt fails with EROFS even if the file exists.
	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
	if err == nil {
		// File exists, we're done.
		vd.DecRef(ctx)
		return nil
	}
	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
}

// configureRestore returns an updated context.Context including the
// filesystem state used by restore.
func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) {
	fdmap := make(map[string]int)
	fdmap["/"] = c.fds.remove()
	mounts, err := c.prepareMounts()
	if err != nil {
		return ctx, err
	}
	for i := range c.mounts {
		submount := &mounts[i]
		if submount.fd >= 0 {
			fdmap[submount.mount.Destination] = submount.fd
		}
	}
	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
}

func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error {
	if info.spec.Linux == nil {
		return nil
	}
	for _, dev := range info.spec.Linux.Devices {
		pop := vfs.PathOperation{
			Root:  root,
			Start: root,
			Path:  fspath.Parse(dev.Path),
		}
		opts := vfs.MknodOptions{
			Mode: linux.FileMode(dev.FileMode.Perm()),
		}
		// See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices.
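		// For example (illustrative entry, not from the original source), a
		// spec device such as
		//
		//	{"path": "/dev/fuse", "type": "c", "major": 10, "minor": 229, "fileMode": 0666}
		//
		// results in a character device node being created via MknodAt below.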
		switch dev.Type {
		case "b":
			opts.Mode |= linux.S_IFBLK
			opts.DevMajor = uint32(dev.Major)
			opts.DevMinor = uint32(dev.Minor)
		case "c", "u":
			opts.Mode |= linux.S_IFCHR
			opts.DevMajor = uint32(dev.Major)
			opts.DevMinor = uint32(dev.Minor)
		case "p":
			opts.Mode |= linux.S_IFIFO
		default:
			return fmt.Errorf("specified device at %q has invalid type %q", dev.Path, dev.Type)
		}
		if dev.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && opts.DevMajor != info.nvidiaUVMDevMajor {
			// nvidia-uvm's major device number is dynamically assigned, so the
			// number that it has on the host may differ from the number that
			// it has in sentry VFS; switch from the former to the latter.
			log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", dev.Major, info.nvidiaUVMDevMajor)
			opts.DevMajor = info.nvidiaUVMDevMajor
		}
		if err := vfsObj.MkdirAllAt(ctx, path.Dir(dev.Path), root, creds, &vfs.MkdirOptions{
			Mode: 0o755,
		}, true /* mustBeDir */); err != nil {
			return fmt.Errorf("failed to create ancestor directories of %q: %w", dev.Path, err)
		}
		// EEXIST is silently ignored; compare
		// opencontainers/runc:libcontainer/rootfs_linux.go:createDeviceNode().
		created := true
		if err := vfsObj.MknodAt(ctx, creds, &pop, &opts); err != nil {
			if linuxerr.Equals(linuxerr.EEXIST, err) {
				created = false
			} else {
				return fmt.Errorf("failed to create device file at %q: %w", dev.Path, err)
			}
		}
		if created && (dev.UID != nil || dev.GID != nil) {
			var opts vfs.SetStatOptions
			if dev.UID != nil {
				opts.Stat.Mask |= linux.STATX_UID
				opts.Stat.UID = *dev.UID
			}
			if dev.GID != nil {
				opts.Stat.Mask |= linux.STATX_GID
				opts.Stat.GID = *dev.GID
			}
			if err := vfsObj.SetStatAt(ctx, creds, &pop, &opts); err != nil {
				return fmt.Errorf("failed to set UID/GID for device file %q: %w", dev.Path, err)
			}
		}
	}
	return nil
}

func tpuProxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerInfo, k *kernel.Kernel, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
	if !info.conf.TPUProxy {
		return nil
	}
	// At this point /dev/accel just contains the TPU devices that have been
	// mounted into the sandbox chroot. Enumerate all of them and create sentry
	// devices.
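	// (Illustrative example, not from the original source: host nodes
	// /dev/accel0 and /dev/accel1 would be registered as sentry accel devices
	// 0 and 1 and re-created in devtmpfs below.)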
	paths, err := filepath.Glob("/dev/accel*")
	if err != nil {
		return fmt.Errorf("enumerating accel device files: %w", err)
	}
	for _, path := range paths {
		accelDeviceRegex := regexp.MustCompile(`^/dev/accel(\d+)$`)
		if ms := accelDeviceRegex.FindStringSubmatch(path); ms != nil {
			deviceNum, _ := strconv.ParseUint(ms[1], 10, 32)
			if err := accel.Register(vfsObj, uint32(deviceNum)); err != nil {
				return fmt.Errorf("registering accel driver: %w", err)
			}
			if err := accel.CreateDevtmpfsFile(ctx, a, uint32(deviceNum)); err != nil {
				return fmt.Errorf("creating accel device file %q: %w", deviceNum, err)
			}
		}
	}
	return nil
}

func nvproxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerInfo, k *kernel.Kernel, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
	if !specutils.GPUFunctionalityRequested(info.spec, info.conf) {
		return nil
	}
	uvmDevMajor, err := k.VFS().GetDynamicCharDevMajor()
	if err != nil {
		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
	}
	if err := nvproxy.Register(vfsObj, uvmDevMajor); err != nil {
		return fmt.Errorf("registering nvproxy driver: %w", err)
	}
	info.nvidiaUVMDevMajor = uvmDevMajor
	if info.conf.NVProxyDocker {
		// In Docker mode, create all the device files now.
		// In non-Docker mode, these are instead created as part of
		// `createDeviceFiles`, using the spec's Device list.
		minors, err := specutils.FindAllGPUDevices("/")
		if err != nil {
			return fmt.Errorf("getting nvidia devices: %w", err)
		}
		if err := nvproxy.CreateDriverDevtmpfsFiles(ctx, a, uvmDevMajor); err != nil {
			return fmt.Errorf("creating nvproxy devtmpfs files: %w", err)
		}
		for _, minor := range minors {
			if err := nvproxy.CreateIndexDevtmpfsFile(ctx, a, minor); err != nil {
				return fmt.Errorf("creating nvproxy devtmpfs file for device minor %d: %w", minor, err)
			}
		}
	}
	return nil
}
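
// Illustrative note (not part of the original file): for a shared-access bind
// mount served over host FD 7 with DirectFS enabled, goferMountData above
// produces mount data along the lines of
//
//	trans=fd,rfdno=7,wfdno=7,cache=remote_revalidating,directfs
//
// which getMountNameAndOptions joins into vfs.GetFilesystemOptions.Data. The
// FD number here is hypothetical.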