github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/cmd/boot.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cmd 16 17 import ( 18 "context" 19 "fmt" 20 "io/ioutil" 21 "os" 22 "os/exec" 23 "path/filepath" 24 "runtime" 25 "runtime/debug" 26 "strconv" 27 "strings" 28 "time" 29 30 "github.com/google/subcommands" 31 specs "github.com/opencontainers/runtime-spec/specs-go" 32 "golang.org/x/sys/unix" 33 "github.com/metacubex/gvisor/pkg/coretag" 34 "github.com/metacubex/gvisor/pkg/cpuid" 35 "github.com/metacubex/gvisor/pkg/log" 36 "github.com/metacubex/gvisor/pkg/metric" 37 "github.com/metacubex/gvisor/pkg/ring0" 38 "github.com/metacubex/gvisor/pkg/sentry/platform" 39 "github.com/metacubex/gvisor/runsc/boot" 40 "github.com/metacubex/gvisor/runsc/cmd/util" 41 "github.com/metacubex/gvisor/runsc/config" 42 "github.com/metacubex/gvisor/runsc/flag" 43 "github.com/metacubex/gvisor/runsc/profile" 44 "github.com/metacubex/gvisor/runsc/specutils" 45 ) 46 47 // Note that directfsSandboxCaps is the same as caps defined in gofer.go 48 // except CAP_SYS_CHROOT because we don't need to chroot in directfs mode. 49 var directfsSandboxCaps = []string{ 50 "CAP_CHOWN", 51 "CAP_DAC_OVERRIDE", 52 "CAP_DAC_READ_SEARCH", 53 "CAP_FOWNER", 54 "CAP_FSETID", 55 } 56 57 // directfsSandboxLinuxCaps is the minimal set of capabilities needed by the 58 // sandbox to operate on files in directfs mode. 59 var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{ 60 Bounding: directfsSandboxCaps, 61 Effective: directfsSandboxCaps, 62 Permitted: directfsSandboxCaps, 63 } 64 65 // Boot implements subcommands.Command for the "boot" command which starts a 66 // new sandbox. It should not be called directly. 67 type Boot struct { 68 // bundleDir is the directory containing the OCI spec. 69 bundleDir string 70 71 // specFD is the file descriptor that the spec will be read from. 72 specFD int 73 74 // controllerFD is the file descriptor of a stream socket for the 75 // control server that is donated to this process. 76 controllerFD int 77 78 // deviceFD is the file descriptor for the platform device file. 79 deviceFD int 80 81 // ioFDs is the list of FDs used to connect to FS gofers. 82 ioFDs intFlags 83 84 // devIoFD is the FD to connect to dev gofer. 85 devIoFD int 86 87 // goferFilestoreFDs are FDs to the regular files that will back the tmpfs or 88 // overlayfs mount for certain gofer mounts. 89 goferFilestoreFDs intFlags 90 91 // goferMountConfs contains information about how the gofer mounts have been 92 // configured. The first entry is for rootfs and the following entries are 93 // for bind mounts in Spec.Mounts (in the same order). 94 goferMountConfs boot.GoferMountConfFlags 95 96 // stdioFDs are the fds for stdin, stdout, and stderr. They must be 97 // provided in that order. 98 stdioFDs intFlags 99 100 // passFDs are mappings of user-supplied host to guest file descriptors. 101 passFDs fdMappings 102 103 // execFD is the host file descriptor used for program execution. 104 execFD int 105 106 // applyCaps determines if capabilities defined in the spec should be applied 107 // to the process. 108 applyCaps bool 109 110 // setUpChroot is set to true if the sandbox is started in an empty root. 111 setUpRoot bool 112 113 // cpuNum number of CPUs to create inside the sandbox. 114 cpuNum int 115 116 // totalMem sets the initial amount of total memory to report back to the 117 // container. 118 totalMem uint64 119 120 // totalHostMem is the total memory reported by host /proc/meminfo. 121 totalHostMem uint64 122 123 // userLogFD is the file descriptor to write user logs to. 124 userLogFD int 125 126 // startSyncFD is the file descriptor to synchronize runsc and sandbox. 127 startSyncFD int 128 129 // mountsFD is the file descriptor to read list of mounts after they have 130 // been resolved (direct paths, no symlinks). They are resolved outside the 131 // sandbox (e.g. gofer) and sent through this FD. When mountsFD is not 132 // provided, there is no cleaning required for mounts and the mounts in 133 // the spec can be used as is. 134 mountsFD int 135 136 podInitConfigFD int 137 138 sinkFDs intFlags 139 140 // pidns is set if the sandbox is in its own pid namespace. 141 pidns bool 142 143 // attached is set to true to kill the sandbox process when the parent process 144 // terminates. This flag is set when the command execve's itself because 145 // parent death signal doesn't propagate through execve when uid/gid changes. 146 attached bool 147 148 // productName is the value to show in 149 // /sys/devices/virtual/dmi/id/product_name. 150 productName string 151 152 // FDs for profile data. 153 profileFDs profile.FDArgs 154 155 // procMountSyncFD is a file descriptor that has to be closed when the 156 // procfs mount isn't needed anymore. 157 procMountSyncFD int 158 159 // syncUsernsFD is the file descriptor that has to be closed when the 160 // boot process should invoke setuid/setgid for root user. This is mainly 161 // used to synchronize rootless user namespace initialization. 162 syncUsernsFD int 163 164 // nvidiaDriverVersion is the Nvidia driver version on the host. 165 nvidiaDriverVersion string 166 } 167 168 // Name implements subcommands.Command.Name. 169 func (*Boot) Name() string { 170 return "boot" 171 } 172 173 // Synopsis implements subcommands.Command.Synopsis. 174 func (*Boot) Synopsis() string { 175 return "launch a sandbox process" 176 } 177 178 // Usage implements subcommands.Command.Usage. 179 func (*Boot) Usage() string { 180 return `boot [flags] <container id>` 181 } 182 183 // SetFlags implements subcommands.Command.SetFlags. 184 func (b *Boot) SetFlags(f *flag.FlagSet) { 185 f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") 186 f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") 187 f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") 188 f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") 189 f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") 190 f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted") 191 f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.") 192 f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") 193 f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo") 194 f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") 195 f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name") 196 f.StringVar(&b.nvidiaDriverVersion, "nvidia-driver-version", "", "Nvidia driver version on the host") 197 198 // Open FDs that are donated to the sandbox. 199 f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") 200 f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") 201 f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") 202 f.Var(&b.ioFDs, "io-fds", "list of image FDs and/or socket FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec") 203 f.IntVar(&b.devIoFD, "dev-io-fd", -1, "FD to connect dev gofer client") 204 f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") 205 f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.") 206 f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.") 207 f.Var(&b.goferFilestoreFDs, "gofer-filestore-fds", "FDs to the regular files that will back the overlayfs or tmpfs mount if a gofer mount is to be overlaid.") 208 f.Var(&b.goferMountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured.") 209 f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") 210 f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") 211 f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is an optional file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") 212 f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.") 213 f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.") 214 215 // Profiling flags. 216 b.profileFDs.SetFromFlags(f) 217 } 218 219 // Execute implements subcommands.Command.Execute. It starts a sandbox in a 220 // waiting state. 221 func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { 222 if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 { 223 f.Usage() 224 return subcommands.ExitUsageError 225 } 226 227 conf := args[0].(*config.Config) 228 229 // Set traceback level 230 debug.SetTraceback(conf.Traceback) 231 232 // Initialize CPUID information. 233 cpuid.Initialize() 234 235 // Initialize ring0 library. 236 ring0.InitDefault() 237 238 argOverride := make(map[string]string) 239 if len(b.productName) == 0 { 240 // Do this before chroot takes effect, otherwise we can't read /sys. 241 if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil { 242 log.Warningf("Not setting product_name: %v", err) 243 } else { 244 b.productName = strings.TrimSpace(string(product)) 245 log.Infof("Setting product_name: %q", b.productName) 246 argOverride["product-name"] = b.productName 247 } 248 } 249 250 if b.attached { 251 // Ensure this process is killed after parent process terminates when 252 // attached mode is enabled. In the unfortunate event that the parent 253 // terminates before this point, this process leaks. 254 if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil { 255 util.Fatalf("error setting parent death signal: %v", err) 256 } 257 } 258 259 if b.syncUsernsFD >= 0 { 260 syncUsernsForRootless(b.syncUsernsFD) 261 argOverride["sync-userns-fd"] = "-1" 262 } 263 264 // Get the spec from the specFD. We *must* keep this os.File alive past 265 // the call setCapsAndCallSelf, otherwise the FD will be closed and the 266 // child process cannot read it 267 specFile := os.NewFile(uintptr(b.specFD), "spec file") 268 spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf) 269 if err != nil { 270 util.Fatalf("reading spec: %v", err) 271 } 272 273 if b.setUpRoot { 274 if err := setUpChroot(b.pidns, spec, conf); err != nil { 275 util.Fatalf("error setting up chroot: %v", err) 276 } 277 argOverride["setup-root"] = "false" 278 279 if !conf.Rootless { 280 // /proc is umounted from a forked process, because the 281 // current one is going to re-execute itself without 282 // capabilities. 283 cmd, w := execProcUmounter() 284 defer cmd.Wait() 285 defer w.Close() 286 if b.procMountSyncFD != -1 { 287 panic("procMountSyncFD is set") 288 } 289 b.procMountSyncFD = int(w.Fd()) 290 argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD) 291 292 // Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be 293 // re-executed. procMountSyncFD should remain open. 294 if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 { 295 util.Fatalf("error clearing CLOEXEC: %v", errno) 296 } 297 298 if !b.applyCaps { 299 // Remove the args that have already been done before calling self. 300 args := prepareArgs(b.Name(), f, argOverride) 301 302 // Note that we've already read the spec from the spec FD, and 303 // we will read it again after the exec call. This works 304 // because the ReadSpecFromFile function seeks to the beginning 305 // of the file before reading. 306 util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args)) 307 308 // This prevents the specFile finalizer from running and closed 309 // the specFD, which we have passed to ourselves when 310 // re-execing. 311 runtime.KeepAlive(specFile) 312 panic("unreachable") 313 } 314 } 315 } 316 317 specutils.LogSpecDebug(spec, conf.OCISeccomp) 318 319 if b.applyCaps { 320 caps := spec.Process.Capabilities 321 if caps == nil { 322 caps = &specs.LinuxCapabilities{} 323 } 324 325 gPlatform, err := platform.Lookup(conf.Platform) 326 if err != nil { 327 util.Fatalf("loading platform: %v", err) 328 } 329 if gPlatform.Requirements().RequiresCapSysPtrace { 330 // Ptrace platform requires extra capabilities. 331 const c = "CAP_SYS_PTRACE" 332 caps.Bounding = append(caps.Bounding, c) 333 caps.Effective = append(caps.Effective, c) 334 caps.Permitted = append(caps.Permitted, c) 335 } 336 337 if conf.DirectFS { 338 caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps) 339 } 340 argOverride["apply-caps"] = "false" 341 342 // Remove the args that have already been done before calling self. 343 args := prepareArgs(b.Name(), f, argOverride) 344 345 // Note that we've already read the spec from the spec FD, and 346 // we will read it again after the exec call. This works 347 // because the ReadSpecFromFile function seeks to the beginning 348 // of the file before reading. 349 util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps)) 350 351 // This prevents the specFile finalizer from running and closed 352 // the specFD, which we have passed to ourselves when 353 // re-execing. 354 runtime.KeepAlive(specFile) 355 panic("unreachable") 356 } 357 358 if b.syncUsernsFD >= 0 { 359 // syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID. 360 // We expect that setCapsAndCallSelf has to be called in this case. 361 panic("unreachable") 362 } 363 364 // Close specFile to avoid exposing it to the sandbox. 365 if err := specFile.Close(); err != nil { 366 util.Fatalf("closing specFile: %v", err) 367 } 368 369 // At this point we won't re-execute, so it's safe to limit via rlimits. Any 370 // limit >= 0 works. If the limit is lower than the current number of open 371 // files, then Setrlimit will succeed, and the next open will fail. 372 if conf.FDLimit > -1 { 373 rlimit := unix.Rlimit{ 374 Cur: uint64(conf.FDLimit), 375 Max: uint64(conf.FDLimit), 376 } 377 switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err { 378 case nil: 379 case unix.EPERM: 380 log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit) 381 default: 382 util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err) 383 } 384 } 385 386 // When mountsFD is not provided, there is no cleaning required. 387 if b.mountsFD >= 0 { 388 // Read resolved mount list and replace the original one from the spec. 389 mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") 390 cleanMounts, err := specutils.ReadMounts(mountsFile) 391 if err != nil { 392 mountsFile.Close() 393 util.Fatalf("Error reading mounts file: %v", err) 394 } 395 mountsFile.Close() 396 spec.Mounts = cleanMounts 397 } 398 399 if conf.DirectFS { 400 // sandbox should run with a umask of 0, because we want to preserve file 401 // modes exactly as sent by the sentry, which would have already applied 402 // the application umask. 403 unix.Umask(0) 404 } 405 406 if conf.EnableCoreTags { 407 if err := coretag.Enable(); err != nil { 408 util.Fatalf("Failed to core tag sentry: %v", err) 409 } 410 411 // Verify that all sentry threads are properly core tagged, and log 412 // current core tag. 413 coreTags, err := coretag.GetAllCoreTags(os.Getpid()) 414 if err != nil { 415 util.Fatalf("Failed read current core tags: %v", err) 416 } 417 if len(coreTags) != 1 { 418 util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags) 419 } 420 log.Infof("Core tag enabled (core tag=%d)", coreTags[0]) 421 } 422 423 // Create the loader. 424 bootArgs := boot.Args{ 425 ID: f.Arg(0), 426 Spec: spec, 427 Conf: conf, 428 ControllerFD: b.controllerFD, 429 Device: os.NewFile(uintptr(b.deviceFD), "platform device"), 430 GoferFDs: b.ioFDs.GetArray(), 431 DevGoferFD: b.devIoFD, 432 StdioFDs: b.stdioFDs.GetArray(), 433 PassFDs: b.passFDs.GetArray(), 434 ExecFD: b.execFD, 435 GoferFilestoreFDs: b.goferFilestoreFDs.GetArray(), 436 GoferMountConfs: b.goferMountConfs.GetArray(), 437 NumCPU: b.cpuNum, 438 TotalMem: b.totalMem, 439 TotalHostMem: b.totalHostMem, 440 UserLogFD: b.userLogFD, 441 ProductName: b.productName, 442 PodInitConfigFD: b.podInitConfigFD, 443 SinkFDs: b.sinkFDs.GetArray(), 444 ProfileOpts: b.profileFDs.ToOpts(), 445 NvidiaDriverVersion: b.nvidiaDriverVersion, 446 } 447 l, err := boot.New(bootArgs) 448 if err != nil { 449 util.Fatalf("creating loader: %v", err) 450 } 451 452 // Fatalf exits the process and doesn't run defers. 453 // 'l' must be destroyed explicitly after this point! 454 455 if b.procMountSyncFD != -1 { 456 l.PreSeccompCallback = func() { 457 // Call validateOpenFDs() before umounting /proc. 458 validateOpenFDs(bootArgs.PassFDs) 459 // Umount /proc right before installing seccomp filters. 460 umountProc(b.procMountSyncFD) 461 } 462 } 463 464 if conf.TestOnlyAutosaveImagePath != "" { 465 fName := filepath.Join(conf.TestOnlyAutosaveImagePath, checkpointFileName) 466 f, err := os.OpenFile(fName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) 467 if err != nil { 468 util.Fatalf("error in creating state file %v", err) 469 } 470 defer f.Close() 471 boot.EnableAutosave(l, f) 472 } 473 474 // Prepare metrics. 475 // This needs to happen after the kernel is initialized (such that all metrics are registered) 476 // but before the start-sync file is notified, as the parent process needs to query for 477 // registered metrics prior to sending the start signal. 478 metric.Initialize() 479 if metric.ProfilingMetricWriter != nil { 480 if err := metric.StartProfilingMetrics(conf.ProfilingMetrics, time.Duration(conf.ProfilingMetricsRate)*time.Microsecond); err != nil { 481 l.Destroy() 482 util.Fatalf("unable to start profiling metrics: %v", err) 483 } 484 defer metric.StopProfilingMetrics() 485 } 486 487 // Notify the parent process the sandbox has booted (and that the controller 488 // is up). 489 startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") 490 buf := make([]byte, 1) 491 if w, err := startSyncFile.Write(buf); err != nil || w != 1 { 492 l.Destroy() 493 util.Fatalf("unable to write into the start-sync descriptor: %v", err) 494 } 495 // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. 496 startSyncFile.Close() 497 498 // Wait for the start signal from runsc. 499 l.WaitForStartSignal() 500 501 // Run the application and wait for it to finish. 502 if err := l.Run(); err != nil { 503 l.Destroy() 504 util.Fatalf("running sandbox: %v", err) 505 } 506 507 ws := l.WaitExit() 508 log.Infof("application exiting with %+v", ws) 509 waitStatus := args[1].(*unix.WaitStatus) 510 *waitStatus = unix.WaitStatus(ws) 511 l.Destroy() 512 return subcommands.ExitSuccess 513 } 514 515 // prepareArgs returns the args that can be used to re-execute the current 516 // program. It manipulates the flags of the subcommands.Command identified by 517 // subCmdName and fSet is the flag.FlagSet of this subcommand. It applies the 518 // flags specified by override map. In case of conflict, flag is overriden. 519 // 520 // Postcondition: prepareArgs() takes ownership of override map. 521 func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string { 522 var args []string 523 // Add all args up until (and including) the sub command. 524 for _, arg := range os.Args { 525 args = append(args, arg) 526 if arg == subCmdName { 527 break 528 } 529 } 530 // Set sub command flags. Iterate through all the explicitly set flags. 531 fSet.Visit(func(gf *flag.Flag) { 532 // If a conflict is found with override, then prefer override flag. 533 if ov, ok := override[gf.Name]; ok { 534 args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov)) 535 delete(override, gf.Name) 536 return 537 } 538 // Otherwise pass through the original flag. 539 args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value)) 540 }) 541 // Apply remaining override flags (that didn't conflict above). 542 for of, ov := range override { 543 args = append(args, fmt.Sprintf("--%s=%s", of, ov)) 544 } 545 // Add the non-flag arguments at the end. 546 args = append(args, fSet.Args()...) 547 return args 548 } 549 550 // execProcUmounter execute a child process that umounts /proc when the 551 // returned pipe is closed. 552 func execProcUmounter() (*exec.Cmd, *os.File) { 553 r, w, err := os.Pipe() 554 if err != nil { 555 util.Fatalf("error creating a pipe: %v", err) 556 } 557 defer r.Close() 558 559 cmd := exec.Command(specutils.ExePath) 560 cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc") 561 cmd.ExtraFiles = append(cmd.ExtraFiles, r) 562 cmd.Stdin = os.Stdin 563 cmd.Stdout = os.Stdout 564 cmd.Stderr = os.Stderr 565 if err := cmd.Start(); err != nil { 566 util.Fatalf("error executing umounter: %v", err) 567 } 568 return cmd, w 569 } 570 571 // umountProc writes to syncFD signalling the process started by 572 // execProcUmounter() to umount /proc. 573 func umountProc(syncFD int) { 574 syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD") 575 buf := make([]byte, 1) 576 if w, err := syncFile.Write(buf); err != nil || w != 1 { 577 util.Fatalf("unable to write into the proc umounter descriptor: %v", err) 578 } 579 syncFile.Close() 580 581 var waitStatus unix.WaitStatus 582 if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil { 583 util.Fatalf("error waiting for the proc umounter process: %v", err) 584 } 585 if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 { 586 util.Fatalf("the proc umounter process failed: %v", waitStatus) 587 } 588 if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT { 589 util.Fatalf("/proc is still accessible") 590 } 591 } 592 593 // validateOpenFDs checks that the sandbox process does not have any open 594 // directory FDs. 595 func validateOpenFDs(passFDs []boot.FDMapping) { 596 passHostFDs := make(map[int]struct{}) 597 for _, passFD := range passFDs { 598 passHostFDs[passFD.Host] = struct{}{} 599 } 600 const selfFDDir = "/proc/self/fd" 601 if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error { 602 if err != nil { 603 return err 604 } 605 if d.Type() != os.ModeSymlink { 606 // All entries are symlinks. Ignore the callback for fd directory itself. 607 return nil 608 } 609 if fdInfo, err := os.Stat(path); err != nil { 610 if os.IsNotExist(err) { 611 // Ignore FDs that are now closed. For example, the FD to selfFDDir that 612 // was opened by filepath.WalkDir() to read dirents. 613 return nil 614 } 615 return fmt.Errorf("os.Stat(%s) failed: %v", path, err) 616 } else if !fdInfo.IsDir() { 617 return nil 618 } 619 // Uh-oh. This is a directory FD. 620 fdNo, err := strconv.Atoi(d.Name()) 621 if err != nil { 622 return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err) 623 } 624 dirLink, err := os.Readlink(path) 625 if err != nil { 626 return fmt.Errorf("os.Readlink(%s) failed: %v", path, err) 627 } 628 if _, ok := passHostFDs[fdNo]; ok { 629 // Passed FDs are allowed to be directories. The user must be knowing 630 // what they are doing. Log a warning regardless. 631 log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink) 632 return nil 633 } 634 return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink) 635 }); err != nil { 636 util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err) 637 } 638 }