gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/cmd/boot.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"strconv"
	"strings"
	"time"

	"github.com/google/subcommands"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/coretag"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/ring0"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/runsc/boot"
	"gvisor.dev/gvisor/runsc/cmd/util"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/flag"
	"gvisor.dev/gvisor/runsc/profile"
	"gvisor.dev/gvisor/runsc/specutils"
)

// Note that directfsSandboxCaps is the same as the caps defined in gofer.go,
// except CAP_SYS_CHROOT, because we don't need to chroot in directfs mode.
var directfsSandboxCaps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
}

// directfsSandboxLinuxCaps is the minimal set of capabilities needed by the
// sandbox to operate on files in directfs mode.
var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{
	Bounding:  directfsSandboxCaps,
	Effective: directfsSandboxCaps,
	Permitted: directfsSandboxCaps,
}

// Boot implements subcommands.Command for the "boot" command which starts a
// new sandbox. It should not be called directly.
type Boot struct {
	// bundleDir is the directory containing the OCI spec.
	bundleDir string

	// specFD is the file descriptor that the spec will be read from.
	specFD int

	// controllerFD is the file descriptor of a stream socket for the
	// control server that is donated to this process.
	controllerFD int

	// deviceFD is the file descriptor for the platform device file.
	deviceFD int

	// ioFDs is the list of FDs used to connect to FS gofers.
	ioFDs intFlags

	// devIoFD is the FD to connect to the dev gofer.
	devIoFD int

	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
	// overlayfs mount for certain gofer mounts.
	goferFilestoreFDs intFlags

	// goferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	goferMountConfs boot.GoferMountConfFlags

	// stdioFDs are the fds for stdin, stdout, and stderr. They must be
	// provided in that order.
	stdioFDs intFlags

	// passFDs are mappings of user-supplied host to guest file descriptors.
	passFDs fdMappings

	// execFD is the host file descriptor used for program execution.
	execFD int

	// applyCaps determines if capabilities defined in the spec should be applied
	// to the process.
	applyCaps bool

	// setUpRoot is set to true if the sandbox is started in an empty root.
	setUpRoot bool

	// cpuNum is the number of CPUs to create inside the sandbox.
	cpuNum int

	// totalMem sets the initial amount of total memory to report back to the
	// container.
	totalMem uint64

	// totalHostMem is the total memory reported by host /proc/meminfo.
	totalHostMem uint64

	// userLogFD is the file descriptor to write user logs to.
	userLogFD int

	// startSyncFD is the file descriptor used to synchronize runsc and the sandbox.
	startSyncFD int

	// mountsFD is the file descriptor to read the list of mounts after they have
	// been resolved (direct paths, no symlinks). They are resolved outside the
	// sandbox (e.g. gofer) and sent through this FD. When mountsFD is not
	// provided, there is no cleaning required for mounts and the mounts in
	// the spec can be used as is.
	mountsFD int

	podInitConfigFD int

	sinkFDs intFlags

	// pidns is set if the sandbox is in its own pid namespace.
	pidns bool

	// attached is set to true to kill the sandbox process when the parent process
	// terminates. This flag is set when the command execve's itself because
	// the parent death signal doesn't propagate through execve when uid/gid changes.
	attached bool

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// profileFDs are the FDs for profile data.
	profileFDs profile.FDArgs

	// profilingMetricsFD is a file descriptor to write Sentry metrics data to.
	profilingMetricsFD int

	// profilingMetricsLossy sets whether profilingMetricsFD is a lossy channel.
	// If so, the format used to write to it will contain a checksum.
	profilingMetricsLossy bool

	// procMountSyncFD is a file descriptor that has to be closed when the
	// procfs mount isn't needed anymore.
	procMountSyncFD int

	// syncUsernsFD is the file descriptor that has to be closed when the
	// boot process should invoke setuid/setgid for the root user. This is mainly
	// used to synchronize rootless user namespace initialization.
	syncUsernsFD int

	// nvidiaDriverVersion is the Nvidia driver version on the host.
	nvidiaDriverVersion string
}

// Name implements subcommands.Command.Name.
func (*Boot) Name() string {
	return "boot"
}

// Synopsis implements subcommands.Command.Synopsis.
func (*Boot) Synopsis() string {
	return "launch a sandbox process"
}

// Usage implements subcommands.Command.Usage.
func (*Boot) Usage() string {
	return `boot [flags] <container id>`
}

// SetFlags implements subcommands.Command.SetFlags.
func (b *Boot) SetFlags(f *flag.FlagSet) {
	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
	f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
	f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted")
	f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.")
	f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
	f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo")
	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
	f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name")
	f.StringVar(&b.nvidiaDriverVersion, "nvidia-driver-version", "", "Nvidia driver version on the host")

	// Open FDs that are donated to the sandbox.
	f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
	f.Var(&b.ioFDs, "io-fds", "list of image FDs and/or socket FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec")
	f.IntVar(&b.devIoFD, "dev-io-fd", -1, "FD to connect dev gofer client")
	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
	f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.")
	f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.")
	f.Var(&b.goferFilestoreFDs, "gofer-filestore-fds", "FDs to the regular files that will back the overlayfs or tmpfs mount if a gofer mount is to be overlaid.")
	f.Var(&b.goferMountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured.")
	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD used to synchronize sandbox startup")
	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is an optional file descriptor to read the list of mounts after they have been resolved (direct paths, no symlinks).")
	f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.")
	f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.")

	// Profiling flags.
	b.profileFDs.SetFromFlags(f)
	f.IntVar(&b.profilingMetricsFD, "profiling-metrics-fd", -1, "file descriptor to write sentry profiling metrics.")
	f.BoolVar(&b.profilingMetricsLossy, "profiling-metrics-fd-lossy", false, "if true, treat the sentry profiling metrics FD as lossy and write a checksum to it.")
}
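
// The FD flags above only make sense together with the FD donation convention
// used by the parent runsc process: files appended to exec.Cmd.ExtraFiles show
// up in the child as FDs 3, 4, 5, ... (in that order), and the corresponding
// --*-fd flags carry those numbers. The function below is an illustrative
// sketch of that convention only; it is not the actual runsc sandbox code, and
// the exampleDonateBootFDs name and the reduced flag list are assumptions made
// for the example.
func exampleDonateBootFDs(specFile, controllerSock, startSync *os.File, containerID string) (*exec.Cmd, error) {
	cmd := exec.Command(specutils.ExePath, "boot")
	// The first ExtraFiles entry becomes FD 3 in the child, the second FD 4,
	// and so on (FDs 0, 1, and 2 are stdin/stdout/stderr).
	nextFD := 3
	donate := func(flagName string, f *os.File) {
		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
		cmd.Args = append(cmd.Args, fmt.Sprintf("--%s=%d", flagName, nextFD))
		nextFD++
	}
	donate("spec-fd", specFile)
	donate("controller-fd", controllerSock)
	donate("start-sync-fd", startSync)
	cmd.Args = append(cmd.Args, containerID)
	return cmd, cmd.Start()
}
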
// Execute implements subcommands.Command.Execute. It starts a sandbox in a
// waiting state.
func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level.
	debug.SetTraceback(conf.Traceback)

	// Initialize CPUID information.
	cpuid.Initialize()

	// Initialize ring0 library.
	ring0.InitDefault()

	argOverride := make(map[string]string)
	if len(b.productName) == 0 {
		// Do this before chroot takes effect, otherwise we can't read /sys.
		if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil {
			log.Warningf("Not setting product_name: %v", err)
		} else {
			b.productName = strings.TrimSpace(string(product))
			log.Infof("Setting product_name: %q", b.productName)
			argOverride["product-name"] = b.productName
		}
	}

	if b.attached {
		// Ensure this process is killed after the parent process terminates when
		// attached mode is enabled. In the unfortunate event that the parent
		// terminates before this point, this process leaks.
		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
			util.Fatalf("error setting parent death signal: %v", err)
		}
	}

	if b.syncUsernsFD >= 0 {
		syncUsernsForRootless(b.syncUsernsFD)
		argOverride["sync-userns-fd"] = "-1"
	}

	// Get the spec from the specFD. We *must* keep this os.File alive past
	// the call to setCapsAndCallSelf, otherwise the FD will be closed and the
	// child process cannot read it.
	specFile := os.NewFile(uintptr(b.specFD), "spec file")
	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	if b.setUpRoot {
		if err := setUpChroot(b.pidns, spec, conf); err != nil {
			util.Fatalf("error setting up chroot: %v", err)
		}
		argOverride["setup-root"] = "false"

		if !conf.Rootless {
			// /proc is umounted from a forked process, because the
			// current one is going to re-execute itself without
			// capabilities.
			cmd, w := execProcUmounter()
			defer cmd.Wait()
			defer w.Close()
			if b.procMountSyncFD != -1 {
				panic("procMountSyncFD is set")
			}
			b.procMountSyncFD = int(w.Fd())
			argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD)

			// Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be
			// re-executed. procMountSyncFD should remain open.
			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
				util.Fatalf("error clearing CLOEXEC: %v", errno)
			}

			if !b.applyCaps {
				// Remove the args that have already been handled before calling self.
				args := prepareArgs(b.Name(), f, argOverride)

				// Note that we've already read the spec from the spec FD, and
				// we will read it again after the exec call. This works
				// because the ReadSpecFromFile function seeks to the beginning
				// of the file before reading.
				util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args))

				// This prevents the specFile finalizer from running and closing
				// the specFD, which we have passed to ourselves when
				// re-execing.
				runtime.KeepAlive(specFile)
				panic("unreachable")
			}
		}
	}

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	if b.applyCaps {
		caps := spec.Process.Capabilities
		if caps == nil {
			caps = &specs.LinuxCapabilities{}
		}

		gPlatform, err := platform.Lookup(conf.Platform)
		if err != nil {
			util.Fatalf("loading platform: %v", err)
		}
		if gPlatform.Requirements().RequiresCapSysPtrace {
			// Ptrace platform requires extra capabilities.
			const c = "CAP_SYS_PTRACE"
			caps.Bounding = append(caps.Bounding, c)
			caps.Effective = append(caps.Effective, c)
			caps.Permitted = append(caps.Permitted, c)
		}

		if conf.DirectFS {
			caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps)
		}
		argOverride["apply-caps"] = "false"

		// Remove the args that have already been handled before calling self.
		args := prepareArgs(b.Name(), f, argOverride)

		// Note that we've already read the spec from the spec FD, and
		// we will read it again after the exec call. This works
		// because the ReadSpecFromFile function seeks to the beginning
		// of the file before reading.
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps))

		// This prevents the specFile finalizer from running and closing
		// the specFD, which we have passed to ourselves when
		// re-execing.
		runtime.KeepAlive(specFile)
		panic("unreachable")
	}

	if b.syncUsernsFD >= 0 {
		// syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID.
		// We expect that setCapsAndCallSelf has to be called in this case.
		panic("unreachable")
	}

	// Close specFile to avoid exposing it to the sandbox.
	if err := specFile.Close(); err != nil {
		util.Fatalf("closing specFile: %v", err)
	}

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// When mountsFD is not provided, there is no cleaning required.
	if b.mountsFD >= 0 {
		// Read the resolved mount list and replace the original one from the spec.
		mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
		cleanMounts, err := specutils.ReadMounts(mountsFile)
		if err != nil {
			mountsFile.Close()
			util.Fatalf("Error reading mounts file: %v", err)
		}
		mountsFile.Close()
		spec.Mounts = cleanMounts
	}

	if conf.DirectFS {
		// The sandbox should run with a umask of 0, because we want to preserve file
		// modes exactly as sent by the sentry, which would have already applied
		// the application umask.
		unix.Umask(0)
	}

	if conf.EnableCoreTags {
		if err := coretag.Enable(); err != nil {
			util.Fatalf("Failed to core tag sentry: %v", err)
		}

		// Verify that all sentry threads are properly core tagged, and log
		// the current core tag.
		coreTags, err := coretag.GetAllCoreTags(os.Getpid())
		if err != nil {
			util.Fatalf("Failed to read current core tags: %v", err)
		}
		if len(coreTags) != 1 {
			util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags)
		}
		log.Infof("Core tag enabled (core tag=%d)", coreTags[0])
	}

	// Create the loader.
	bootArgs := boot.Args{
		ID:                  f.Arg(0),
		Spec:                spec,
		Conf:                conf,
		ControllerFD:        b.controllerFD,
		Device:              fd.New(b.deviceFD),
		GoferFDs:            b.ioFDs.GetArray(),
		DevGoferFD:          b.devIoFD,
		StdioFDs:            b.stdioFDs.GetArray(),
		PassFDs:             b.passFDs.GetArray(),
		ExecFD:              b.execFD,
		GoferFilestoreFDs:   b.goferFilestoreFDs.GetArray(),
		GoferMountConfs:     b.goferMountConfs.GetArray(),
		NumCPU:              b.cpuNum,
		TotalMem:            b.totalMem,
		TotalHostMem:        b.totalHostMem,
		UserLogFD:           b.userLogFD,
		ProductName:         b.productName,
		PodInitConfigFD:     b.podInitConfigFD,
		SinkFDs:             b.sinkFDs.GetArray(),
		ProfileOpts:         b.profileFDs.ToOpts(),
		NvidiaDriverVersion: b.nvidiaDriverVersion,
	}
	l, err := boot.New(bootArgs)
	if err != nil {
		util.Fatalf("creating loader: %v", err)
	}

	// Fatalf exits the process and doesn't run defers.
	// 'l' must be destroyed explicitly after this point!

	if b.procMountSyncFD != -1 {
		l.PreSeccompCallback = func() {
			// Call validateOpenFDs() before umounting /proc.
			validateOpenFDs(bootArgs.PassFDs)
			// Umount /proc right before installing seccomp filters.
			umountProc(b.procMountSyncFD)
		}
	}

	if conf.TestOnlyAutosaveImagePath != "" {
		fName := filepath.Join(conf.TestOnlyAutosaveImagePath, boot.CheckpointStateFileName)
		f, err := os.OpenFile(fName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
		if err != nil {
			util.Fatalf("error creating state file: %v", err)
		}
		defer f.Close()

		boot.EnableAutosave(l, f, conf.TestOnlyAutosaveResume)
	}

	// Prepare metrics.
	// This needs to happen after the kernel is initialized (such that all metrics are registered)
	// but before the start-sync file is notified, as the parent process needs to query for
	// registered metrics prior to sending the start signal.
	metric.Initialize()
	if b.profilingMetricsFD != -1 {
		if err := metric.StartProfilingMetrics(metric.ProfilingMetricsOptions[*os.File]{
			Sink:    os.NewFile(uintptr(b.profilingMetricsFD), "metrics file"),
			Lossy:   b.profilingMetricsLossy,
			Metrics: conf.ProfilingMetrics,
			Rate:    time.Duration(conf.ProfilingMetricsRate) * time.Microsecond,
		}); err != nil {
			l.Destroy()
			util.Fatalf("unable to start profiling metrics: %v", err)
		}
		defer metric.StopProfilingMetrics()
	}

	// Notify the parent process that the sandbox has booted (and that the
	// controller is up).
	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
	buf := make([]byte, 1)
	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
		l.Destroy()
		util.Fatalf("unable to write into the start-sync descriptor: %v", err)
	}
	// Close startSyncFile because 'l.Run()' only returns when the sandbox exits.
	startSyncFile.Close()

	// Wait for the start signal from runsc.
	l.WaitForStartSignal()

	// Run the application and wait for it to finish.
	if err := l.Run(); err != nil {
		l.Destroy()
		util.Fatalf("running sandbox: %v", err)
	}

	ws := l.WaitExit()
	log.Infof("application exiting with %+v", ws)
	waitStatus := args[1].(*unix.WaitStatus)
	*waitStatus = unix.WaitStatus(ws)
	l.Destroy()
	return subcommands.ExitSuccess
}
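
// The start-sync FD used above implements a one-byte handshake: once the
// control server is up, the sandbox writes a single byte and closes its end,
// and the parent unblocks from a read on the other end. The function below is
// an illustrative sketch of the parent side of that handshake only; the
// waitForSandboxBoot name is an assumption and this is not the actual
// runsc sandbox code.
func waitForSandboxBoot(startSyncParentEnd *os.File) error {
	// Block until the sandbox writes its "booted" byte (or closes the FD).
	buf := make([]byte, 1)
	if n, err := startSyncParentEnd.Read(buf); err != nil || n != 1 {
		return fmt.Errorf("sandbox did not signal boot completion: read %d bytes: %v", n, err)
	}
	// After this point the control server is ready and the parent can issue
	// the start command over the controller socket.
	return nil
}
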
// prepareArgs returns the args that can be used to re-execute the current
// program. It manipulates the flags of the subcommands.Command identified by
// subCmdName; fSet is the flag.FlagSet of this subcommand. It applies the
// flags specified by the override map. In case of conflict, the flag is
// overridden.
//
// Postcondition: prepareArgs() takes ownership of the override map.
func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string {
	var args []string
	// Add all args up until (and including) the sub command.
	for _, arg := range os.Args {
		args = append(args, arg)
		if arg == subCmdName {
			break
		}
	}
	// Set sub command flags. Iterate through all the explicitly set flags.
	fSet.Visit(func(gf *flag.Flag) {
		// If a conflict is found with override, then prefer the override flag.
		if ov, ok := override[gf.Name]; ok {
			args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov))
			delete(override, gf.Name)
			return
		}
		// Otherwise pass through the original flag.
		args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value))
	})
	// Apply remaining override flags (that didn't conflict above).
	for of, ov := range override {
		args = append(args, fmt.Sprintf("--%s=%s", of, ov))
	}
	// Add the non-flag arguments at the end.
	args = append(args, fSet.Args()...)
	return args
}

// execProcUmounter executes a child process that umounts /proc when the
// returned pipe is closed.
func execProcUmounter() (*exec.Cmd, *os.File) {
	r, w, err := os.Pipe()
	if err != nil {
		util.Fatalf("error creating a pipe: %v", err)
	}
	defer r.Close()

	cmd := exec.Command(specutils.ExePath)
	cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc")
	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		util.Fatalf("error executing umounter: %v", err)
	}
	return cmd, w
}

// umountProc writes to syncFD, signalling the process started by
// execProcUmounter() to umount /proc.
func umountProc(syncFD int) {
	syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD")
	buf := make([]byte, 1)
	if w, err := syncFile.Write(buf); err != nil || w != 1 {
		util.Fatalf("unable to write into the proc umounter descriptor: %v", err)
	}
	syncFile.Close()

	var waitStatus unix.WaitStatus
	if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil {
		util.Fatalf("error waiting for the proc umounter process: %v", err)
	}
	if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 {
		util.Fatalf("the proc umounter process failed: %v", waitStatus)
	}
	if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT {
		util.Fatalf("/proc is still accessible")
	}
}
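
// The child launched by execProcUmounter runs the separate "umount" subcommand
// (implemented elsewhere in runsc) with the read end of the pipe donated as
// FD 3, hence --sync-fd=3. The function below is a minimal sketch of the child
// side of that protocol, written under the assumption that the real subcommand
// behaves equivalently; the umountWhenSignalled name and the use of MNT_DETACH
// are illustrative assumptions, not the actual implementation.
func umountWhenSignalled(syncFD int, target string) error {
	syncFile := os.NewFile(uintptr(syncFD), "umount sync FD")
	defer syncFile.Close()
	// Block until the parent writes a byte (umountProc above) or closes its
	// end of the pipe (the deferred w.Close() in Execute).
	buf := make([]byte, 1)
	if _, err := syncFile.Read(buf); err != nil {
		// An EOF here just means the parent closed the pipe without writing;
		// either way, the signal to unmount has arrived.
		log.Infof("umount sync FD closed: %v", err)
	}
	// Lazily detach the mount so the unmount cannot fail because it is busy.
	return unix.Unmount(target, unix.MNT_DETACH)
}
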
// validateOpenFDs checks that the sandbox process does not have any open
// directory FDs.
func validateOpenFDs(passFDs []boot.FDMapping) {
	passHostFDs := make(map[int]struct{})
	for _, passFD := range passFDs {
		passHostFDs[passFD.Host] = struct{}{}
	}
	const selfFDDir = "/proc/self/fd"
	if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if d.Type() != os.ModeSymlink {
			// All entries are symlinks. Ignore the callback for the fd directory itself.
			return nil
		}
		if fdInfo, err := os.Stat(path); err != nil {
			if os.IsNotExist(err) {
				// Ignore FDs that are now closed. For example, the FD to selfFDDir that
				// was opened by filepath.WalkDir() to read dirents.
				return nil
			}
			return fmt.Errorf("os.Stat(%s) failed: %v", path, err)
		} else if !fdInfo.IsDir() {
			return nil
		}
		// Uh-oh. This is a directory FD.
		fdNo, err := strconv.Atoi(d.Name())
		if err != nil {
			return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err)
		}
		dirLink, err := os.Readlink(path)
		if err != nil {
			return fmt.Errorf("os.Readlink(%s) failed: %v", path, err)
		}
		if _, ok := passHostFDs[fdNo]; ok {
			// Passed FDs are allowed to be directories. The user must know what
			// they are doing. Log a warning regardless.
			log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink)
			return nil
		}
		return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink)
	}); err != nil {
		util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err)
	}
}
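
// For illustration only: a hypothetical sketch of how validateOpenFDs treats a
// directory FD that was explicitly declared via --pass-fd versus one that was
// not. This function is not part of runsc; its name, the use of /tmp, and the
// single-field FDMapping literal are assumptions made for the example.
func exampleValidateOpenFDs() {
	// Open a directory FD; in the sandbox process this would normally be a leak.
	dir, err := os.Open("/tmp")
	if err != nil {
		log.Warningf("could not open /tmp: %v", err)
		return
	}
	defer dir.Close()

	// With no pass-FD mappings, the directory FD would be reported as a leak
	// and the process would abort via util.Fatalf:
	//
	//	validateOpenFDs(nil)
	//
	// Declaring the same host FD as a passed FD downgrades the failure to a
	// log warning, so this call returns normally.
	validateOpenFDs([]boot.FDMapping{{Host: int(dir.Fd())}})
}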