github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/cmd/boot.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cmd 16 17 import ( 18 "context" 19 "fmt" 20 "io/ioutil" 21 "os" 22 "os/exec" 23 "path/filepath" 24 "runtime" 25 "runtime/debug" 26 "strconv" 27 "strings" 28 "time" 29 30 "github.com/MerlinKodo/gvisor/pkg/coretag" 31 "github.com/MerlinKodo/gvisor/pkg/cpuid" 32 "github.com/MerlinKodo/gvisor/pkg/log" 33 "github.com/MerlinKodo/gvisor/pkg/metric" 34 "github.com/MerlinKodo/gvisor/pkg/ring0" 35 "github.com/MerlinKodo/gvisor/pkg/sentry/platform" 36 "github.com/MerlinKodo/gvisor/runsc/boot" 37 "github.com/MerlinKodo/gvisor/runsc/cmd/util" 38 "github.com/MerlinKodo/gvisor/runsc/config" 39 "github.com/MerlinKodo/gvisor/runsc/flag" 40 "github.com/MerlinKodo/gvisor/runsc/profile" 41 "github.com/MerlinKodo/gvisor/runsc/specutils" 42 "github.com/google/subcommands" 43 specs "github.com/opencontainers/runtime-spec/specs-go" 44 "golang.org/x/sys/unix" 45 ) 46 47 // Note that directfsSandboxCaps is the same as caps defined in gofer.go 48 // except CAP_SYS_CHROOT because we don't need to chroot in directfs mode. 
49 var directfsSandboxCaps = []string{ 50 "CAP_CHOWN", 51 "CAP_DAC_OVERRIDE", 52 "CAP_DAC_READ_SEARCH", 53 "CAP_FOWNER", 54 "CAP_FSETID", 55 } 56 57 // directfsSandboxLinuxCaps is the minimal set of capabilities needed by the 58 // sandbox to operate on files in directfs mode. 59 var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{ 60 Bounding: directfsSandboxCaps, 61 Effective: directfsSandboxCaps, 62 Permitted: directfsSandboxCaps, 63 } 64 65 // Boot implements subcommands.Command for the "boot" command which starts a 66 // new sandbox. It should not be called directly. 67 type Boot struct { 68 // bundleDir is the directory containing the OCI spec. 69 bundleDir string 70 71 // specFD is the file descriptor that the spec will be read from. 72 specFD int 73 74 // controllerFD is the file descriptor of a stream socket for the 75 // control server that is donated to this process. 76 controllerFD int 77 78 // deviceFD is the file descriptor for the platform device file. 79 deviceFD int 80 81 // ioFDs is the list of FDs used to connect to FS gofers. 82 ioFDs intFlags 83 84 // overlayFilestoreFDs are FDs to the regular files that will back the tmpfs 85 // upper mount in the overlay mounts. 86 overlayFilestoreFDs intFlags 87 88 // overlayMediums contains information about how the gofer mounts have been 89 // overlaid. The first entry is for rootfs and the following entries are for 90 // bind mounts in Spec.Mounts (in the same order). 91 overlayMediums boot.OverlayMediumFlags 92 93 // stdioFDs are the fds for stdin, stdout, and stderr. They must be 94 // provided in that order. 95 stdioFDs intFlags 96 97 // passFDs are mappings of user-supplied host to guest file descriptors. 98 passFDs fdMappings 99 100 // execFD is the host file descriptor used for program execution. 101 execFD int 102 103 // applyCaps determines if capabilities defined in the spec should be applied 104 // to the process. 
105 applyCaps bool 106 107 // setUpChroot is set to true if the sandbox is started in an empty root. 108 setUpRoot bool 109 110 // cpuNum number of CPUs to create inside the sandbox. 111 cpuNum int 112 113 // totalMem sets the initial amount of total memory to report back to the 114 // container. 115 totalMem uint64 116 117 // totalHostMem is the total memory reported by host /proc/meminfo. 118 totalHostMem uint64 119 120 // userLogFD is the file descriptor to write user logs to. 121 userLogFD int 122 123 // startSyncFD is the file descriptor to synchronize runsc and sandbox. 124 startSyncFD int 125 126 // mountsFD is the file descriptor to read list of mounts after they have 127 // been resolved (direct paths, no symlinks). They are resolved outside the 128 // sandbox (e.g. gofer) and sent through this FD. 129 mountsFD int 130 131 podInitConfigFD int 132 133 sinkFDs intFlags 134 135 // pidns is set if the sandbox is in its own pid namespace. 136 pidns bool 137 138 // attached is set to true to kill the sandbox process when the parent process 139 // terminates. This flag is set when the command execve's itself because 140 // parent death signal doesn't propagate through execve when uid/gid changes. 141 attached bool 142 143 // productName is the value to show in 144 // /sys/devices/virtual/dmi/id/product_name. 145 productName string 146 147 // FDs for profile data. 148 profileFDs profile.FDArgs 149 150 // procMountSyncFD is a file descriptor that has to be closed when the 151 // procfs mount isn't needed anymore. 152 procMountSyncFD int 153 154 // syncUsernsFD is the file descriptor that has to be closed when the 155 // boot process should invoke setuid/setgid for root user. This is mainly 156 // used to synchronize rootless user namespace initialization. 157 syncUsernsFD int 158 159 // nvidiaDevMinors is a list of device minors for Nvidia GPU devices exposed 160 // to the sandbox. 
161 nvidiaDevMinors boot.NvidiaDevMinors 162 } 163 164 // Name implements subcommands.Command.Name. 165 func (*Boot) Name() string { 166 return "boot" 167 } 168 169 // Synopsis implements subcommands.Command.Synopsis. 170 func (*Boot) Synopsis() string { 171 return "launch a sandbox process" 172 } 173 174 // Usage implements subcommands.Command.Usage. 175 func (*Boot) Usage() string { 176 return `boot [flags] <container id>` 177 } 178 179 // SetFlags implements subcommands.Command.SetFlags. 180 func (b *Boot) SetFlags(f *flag.FlagSet) { 181 f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") 182 f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") 183 f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") 184 f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") 185 f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") 186 f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted") 187 f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.") 188 f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") 189 f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo") 190 f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") 191 f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name") 192 193 // Open FDs that are donated to the sandbox. 
194 f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") 195 f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") 196 f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") 197 f.Var(&b.ioFDs, "io-fds", "list of FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec") 198 f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") 199 f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.") 200 f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.") 201 f.Var(&b.overlayFilestoreFDs, "overlay-filestore-fds", "FDs to the regular files that will back the tmpfs upper mount in the overlay mounts.") 202 f.Var(&b.overlayMediums, "overlay-mediums", "information about how the gofer mounts have been overlaid.") 203 f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") 204 f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") 205 f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") 206 f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.") 207 f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.") 208 f.Var(&b.nvidiaDevMinors, "nvidia-dev-minors", "list of device minors for Nvidia GPU devices exposed to the sandbox.") 209 210 // Profiling flags. 211 b.profileFDs.SetFromFlags(f) 212 } 213 214 // Execute implements subcommands.Command.Execute. It starts a sandbox in a 215 // waiting state. 
func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	// The spec, controller and start-sync FDs are mandatory, and exactly one
	// positional argument (the container ID, see f.Arg(0) below) is expected.
	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	// args[0] must be the *config.Config; args[1] (used at the very end) must
	// be a *unix.WaitStatus that receives the application's exit status. The
	// type assertions panic otherwise.
	conf := args[0].(*config.Config)

	// Set traceback level.
	debug.SetTraceback(conf.Traceback)

	// Initialize CPUID information.
	cpuid.Initialize()

	// Initialize ring0 library.
	ring0.InitDefault()

	// argOverride collects flag values that must differ when this process
	// re-executes itself, so that steps already performed here are not
	// repeated after the exec.
	argOverride := make(map[string]string)
	if len(b.productName) == 0 {
		// Do this before chroot takes effect, otherwise we can't read /sys.
		if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil {
			log.Warningf("Not setting product_name: %v", err)
		} else {
			b.productName = strings.TrimSpace(string(product))
			log.Infof("Setting product_name: %q", b.productName)
			argOverride["product-name"] = b.productName
		}
	}

	if b.attached {
		// Ensure this process is killed after parent process terminates when
		// attached mode is enabled. In the unfortunate event that the parent
		// terminates before this point, this process leaks.
		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
			util.Fatalf("error setting parent death signal: %v", err)
		}
	}

	if b.syncUsernsFD >= 0 {
		syncUsernsForRootless(b.syncUsernsFD)
		// The synchronization is done; the re-executed process must not try
		// it again.
		argOverride["sync-userns-fd"] = "-1"
	}

	// Get the spec from the specFD. We *must* keep this os.File alive past
	// the call to setCapsAndCallSelf, otherwise the FD will be closed and the
	// child process cannot read it.
	specFile := os.NewFile(uintptr(b.specFD), "spec file")
	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	if b.setUpRoot {
		if err := setUpChroot(b.pidns, spec, conf, b.nvidiaDevMinors); err != nil {
			util.Fatalf("error setting up chroot: %v", err)
		}
		// The chroot is in place; don't set it up again after re-exec.
		argOverride["setup-root"] = "false"

		if !conf.Rootless {
			// /proc is umounted from a forked process, because the
			// current one is going to re-execute itself without
			// capabilities.
			cmd, w := execProcUmounter()
			defer cmd.Wait()
			defer w.Close()
			if b.procMountSyncFD != -1 {
				panic("procMountSyncFD is set")
			}
			// Hand the write end of the umounter pipe to the re-executed
			// process through the proc-mount-sync-fd flag.
			b.procMountSyncFD = int(w.Fd())
			argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD)

			// Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be
			// re-executed. procMountSyncFD should remain open.
			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
				util.Fatalf("error clearing CLOEXEC: %v", errno)
			}

			if !b.applyCaps {
				// Remove the args that have already been done before calling self.
				// Note: this local args shadows the variadic parameter within
				// this scope.
				args := prepareArgs(b.Name(), f, argOverride)

				// Note that we've already read the spec from the spec FD, and
				// we will read it again after the exec call. This works
				// because the ReadSpecFromFile function seeks to the beginning
				// of the file before reading.
				util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args))

				// This prevents the specFile finalizer from running and closing
				// the specFD, which we have passed to ourselves when
				// re-execing.
				runtime.KeepAlive(specFile)
				panic("unreachable")
			}
		}
	}

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	if b.applyCaps {
		caps := spec.Process.Capabilities
		if caps == nil {
			caps = &specs.LinuxCapabilities{}
		}

		gPlatform, err := platform.Lookup(conf.Platform)
		if err != nil {
			util.Fatalf("loading platform: %v", err)
		}
		if gPlatform.Requirements().RequiresCapSysPtrace {
			// Ptrace platform requires extra capabilities.
			const c = "CAP_SYS_PTRACE"
			caps.Bounding = append(caps.Bounding, c)
			caps.Effective = append(caps.Effective, c)
			caps.Permitted = append(caps.Permitted, c)
		}

		if conf.DirectFS {
			// directfs mode needs the file-related capabilities on top of
			// whatever the spec asks for.
			caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps)
		}
		// Capabilities are applied by the re-exec below; don't redo it.
		argOverride["apply-caps"] = "false"

		// Remove the args that have already been done before calling self.
		// Note: this local args shadows the variadic parameter within this
		// scope.
		args := prepareArgs(b.Name(), f, argOverride)

		// Note that we've already read the spec from the spec FD, and
		// we will read it again after the exec call. This works
		// because the ReadSpecFromFile function seeks to the beginning
		// of the file before reading.
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps))

		// This prevents the specFile finalizer from running and closing
		// the specFD, which we have passed to ourselves when
		// re-execing.
		runtime.KeepAlive(specFile)
		panic("unreachable")
	}

	if b.syncUsernsFD >= 0 {
		// syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID.
		// We expect that setCapsAndCallSelf has to be called in this case.
		panic("unreachable")
	}

	// Close specFile to avoid exposing it to the sandbox.
	if err := specFile.Close(); err != nil {
		util.Fatalf("closing specFile: %v", err)
	}

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			// Not fatal: keep running with the existing limit.
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Read resolved mount list and replace the original one from the spec.
	mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
	cleanMounts, err := specutils.ReadMounts(mountsFile)
	if err != nil {
		mountsFile.Close()
		util.Fatalf("Error reading mounts file: %v", err)
	}
	mountsFile.Close()
	spec.Mounts = cleanMounts

	if conf.DirectFS {
		// sandbox should run with a umask of 0, because we want to preserve file
		// modes exactly as sent by the sentry, which would have already applied
		// the application umask.
		unix.Umask(0)
	}

	if conf.EnableCoreTags {
		if err := coretag.Enable(); err != nil {
			util.Fatalf("Failed to core tag sentry: %v", err)
		}

		// Verify that all sentry threads are properly core tagged, and log
		// current core tag.
		coreTags, err := coretag.GetAllCoreTags(os.Getpid())
		if err != nil {
			util.Fatalf("Failed read current core tags: %v", err)
		}
		if len(coreTags) != 1 {
			util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags)
		}
		log.Infof("Core tag enabled (core tag=%d)", coreTags[0])
	}

	// Create the loader.
	bootArgs := boot.Args{
		ID:                  f.Arg(0),
		Spec:                spec,
		Conf:                conf,
		ControllerFD:        b.controllerFD,
		Device:              os.NewFile(uintptr(b.deviceFD), "platform device"),
		GoferFDs:            b.ioFDs.GetArray(),
		StdioFDs:            b.stdioFDs.GetArray(),
		PassFDs:             b.passFDs.GetArray(),
		ExecFD:              b.execFD,
		OverlayFilestoreFDs: b.overlayFilestoreFDs.GetArray(),
		OverlayMediums:      b.overlayMediums.GetArray(),
		NumCPU:              b.cpuNum,
		TotalMem:            b.totalMem,
		TotalHostMem:        b.totalHostMem,
		UserLogFD:           b.userLogFD,
		ProductName:         b.productName,
		PodInitConfigFD:     b.podInitConfigFD,
		SinkFDs:             b.sinkFDs.GetArray(),
		ProfileOpts:         b.profileFDs.ToOpts(),
	}
	l, err := boot.New(bootArgs)
	if err != nil {
		util.Fatalf("creating loader: %v", err)
	}

	// Fatalf exits the process and doesn't run defers.
	// 'l' must be destroyed explicitly after this point!

	if b.procMountSyncFD != -1 {
		l.PreSeccompCallback = func() {
			// Call validateOpenFDs() before umounting /proc.
			validateOpenFDs(bootArgs.PassFDs)
			// Umount /proc right before installing seccomp filters.
			umountProc(b.procMountSyncFD)
		}
	}

	// Prepare metrics.
	// This needs to happen after the kernel is initialized (such that all metrics are registered)
	// but before the start-sync file is notified, as the parent process needs to query for
	// registered metrics prior to sending the start signal.
	metric.Initialize()
	if metric.ProfilingMetricWriter != nil {
		if err := metric.StartProfilingMetrics(conf.ProfilingMetrics, time.Duration(conf.ProfilingMetricsRate)*time.Microsecond); err != nil {
			l.Destroy()
			util.Fatalf("unable to start profiling metrics: %v", err)
		}
		defer metric.StopProfilingMetrics()
	}

	// Notify the parent process the sandbox has booted (and that the controller
	// is up).
	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
	buf := make([]byte, 1)
	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
		l.Destroy()
		util.Fatalf("unable to write into the start-sync descriptor: %v", err)
	}
	// Close startSyncFile now, because 'l.Run()' only returns when the sandbox exits.
	startSyncFile.Close()

	// Wait for the start signal from runsc.
	l.WaitForStartSignal()

	// Run the application and wait for it to finish.
	if err := l.Run(); err != nil {
		l.Destroy()
		util.Fatalf("running sandbox: %v", err)
	}

	// Report the application's exit status to the caller through args[1].
	ws := l.WaitExit()
	log.Infof("application exiting with %+v", ws)
	waitStatus := args[1].(*unix.WaitStatus)
	*waitStatus = unix.WaitStatus(ws)
	l.Destroy()
	return subcommands.ExitSuccess
}

// prepareArgs returns the args that can be used to re-execute the current
// program. It manipulates the flags of the subcommands.Command identified by
// subCmdName and fSet is the flag.FlagSet of this subcommand. It applies the
// flags specified by override map. In case of conflict, flag is overridden.
//
// The result is laid out as: every element of os.Args up to and including the
// first one equal to subCmdName, then one "--name=value" entry per explicitly
// set flag, then the override entries that did not match a set flag, then the
// non-flag arguments of fSet. The leftover override entries are appended in
// map iteration order, which Go leaves unspecified.
//
// Postcondition: prepareArgs() takes ownership of override map (matched
// entries are deleted from it).
func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string {
	var args []string
	// Add all args up until (and including) the sub command.
	for _, arg := range os.Args {
		args = append(args, arg)
		if arg == subCmdName {
			break
		}
	}
	// Set sub command flags. Iterate through all the explicitly set flags.
	fSet.Visit(func(gf *flag.Flag) {
		// If a conflict is found with override, then prefer override flag.
		if ov, ok := override[gf.Name]; ok {
			args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov))
			// Drop it so it is not emitted a second time below.
			delete(override, gf.Name)
			return
		}
		// Otherwise pass through the original flag.
		args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value))
	})
	// Apply remaining override flags (that didn't conflict above).
	for of, ov := range override {
		args = append(args, fmt.Sprintf("--%s=%s", of, ov))
	}
	// Add the non-flag arguments at the end.
	args = append(args, fSet.Args()...)
	return args
}

// execProcUmounter starts a child process (specutils.ExePath invoked as
// "umount --sync-fd=3 /proc") whose job is to umount /proc once it is
// signalled through a pipe. It returns the started command and the write end
// of that pipe; see umountProc for the signalling side.
//
// Any failure to create the pipe or start the child is fatal.
func execProcUmounter() (*exec.Cmd, *os.File) {
	r, w, err := os.Pipe()
	if err != nil {
		util.Fatalf("error creating a pipe: %v", err)
	}
	// The parent does not need the read end once the child has been started.
	defer r.Close()

	cmd := exec.Command(specutils.ExePath)
	cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc")
	// The read end is the first extra file, which os/exec exposes to the
	// child as FD 3, matching --sync-fd=3 above.
	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		util.Fatalf("error executing umounter: %v", err)
	}
	return cmd, w
}

// umountProc writes to syncFD signalling the process started by
// execProcUmounter() to umount /proc. It then closes syncFD, waits for the
// umounter process to exit successfully and verifies that /proc/self can no
// longer be accessed. Every failure along the way is fatal.
func umountProc(syncFD int) {
	syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD")
	buf := make([]byte, 1)
	if w, err := syncFile.Write(buf); err != nil || w != 1 {
		util.Fatalf("unable to write into the proc umounter descriptor: %v", err)
	}
	syncFile.Close()

	// Reap the umounter process and make sure it exited with status 0.
	var waitStatus unix.WaitStatus
	if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil {
		util.Fatalf("error waiting for the proc umounter process: %v", err)
	}
	if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 {
		util.Fatalf("the proc umounter process failed: %v", waitStatus)
	}
	// Anything other than ENOENT (including success) means /proc is still
	// reachable.
	if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT {
		util.Fatalf("/proc is still accessible")
	}
}

// validateOpenFDs checks that the sandbox process does not have any open
// directory FDs.
575 func validateOpenFDs(passFDs []boot.FDMapping) { 576 passHostFDs := make(map[int]struct{}) 577 for _, passFD := range passFDs { 578 passHostFDs[passFD.Host] = struct{}{} 579 } 580 const selfFDDir = "/proc/self/fd" 581 if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error { 582 if err != nil { 583 return err 584 } 585 if d.Type() != os.ModeSymlink { 586 // All entries are symlinks. Ignore the callback for fd directory itself. 587 return nil 588 } 589 if fdInfo, err := os.Stat(path); err != nil { 590 if os.IsNotExist(err) { 591 // Ignore FDs that are now closed. For example, the FD to selfFDDir that 592 // was opened by filepath.WalkDir() to read dirents. 593 return nil 594 } 595 return fmt.Errorf("os.Stat(%s) failed: %v", path, err) 596 } else if !fdInfo.IsDir() { 597 return nil 598 } 599 // Uh-oh. This is a directory FD. 600 fdNo, err := strconv.Atoi(d.Name()) 601 if err != nil { 602 return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err) 603 } 604 dirLink, err := os.Readlink(path) 605 if err != nil { 606 return fmt.Errorf("os.Readlink(%s) failed: %v", path, err) 607 } 608 if _, ok := passHostFDs[fdNo]; ok { 609 // Passed FDs are allowed to be directories. The user must be knowing 610 // what they are doing. Log a warning regardless. 611 log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink) 612 return nil 613 } 614 return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink) 615 }); err != nil { 616 util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err) 617 } 618 }