github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/cmd/boot.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cmd
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io/ioutil"
    21  	"os"
    22  	"os/exec"
    23  	"path/filepath"
    24  	"runtime"
    25  	"runtime/debug"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	"github.com/MerlinKodo/gvisor/pkg/coretag"
    31  	"github.com/MerlinKodo/gvisor/pkg/cpuid"
    32  	"github.com/MerlinKodo/gvisor/pkg/log"
    33  	"github.com/MerlinKodo/gvisor/pkg/metric"
    34  	"github.com/MerlinKodo/gvisor/pkg/ring0"
    35  	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
    36  	"github.com/MerlinKodo/gvisor/runsc/boot"
    37  	"github.com/MerlinKodo/gvisor/runsc/cmd/util"
    38  	"github.com/MerlinKodo/gvisor/runsc/config"
    39  	"github.com/MerlinKodo/gvisor/runsc/flag"
    40  	"github.com/MerlinKodo/gvisor/runsc/profile"
    41  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    42  	"github.com/google/subcommands"
    43  	specs "github.com/opencontainers/runtime-spec/specs-go"
    44  	"golang.org/x/sys/unix"
    45  )
    46  
    47  // Note that directfsSandboxCaps is the same as caps defined in gofer.go
    48  // except CAP_SYS_CHROOT because we don't need to chroot in directfs mode.
    49  var directfsSandboxCaps = []string{
    50  	"CAP_CHOWN",
    51  	"CAP_DAC_OVERRIDE",
    52  	"CAP_DAC_READ_SEARCH",
    53  	"CAP_FOWNER",
    54  	"CAP_FSETID",
    55  }
    56  
    57  // directfsSandboxLinuxCaps is the minimal set of capabilities needed by the
    58  // sandbox to operate on files in directfs mode.
    59  var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{
    60  	Bounding:  directfsSandboxCaps,
    61  	Effective: directfsSandboxCaps,
    62  	Permitted: directfsSandboxCaps,
    63  }
    64  
// Boot implements subcommands.Command for the "boot" command which starts a
// new sandbox. It should not be called directly.
//
// All fields are populated from command line flags registered in SetFlags.
type Boot struct {
	// bundleDir is the directory containing the OCI spec.
	bundleDir string

	// specFD is the file descriptor that the spec will be read from.
	specFD int

	// controllerFD is the file descriptor of a stream socket for the
	// control server that is donated to this process.
	controllerFD int

	// deviceFD is the file descriptor for the platform device file.
	deviceFD int

	// ioFDs is the list of FDs used to connect to FS gofers.
	ioFDs intFlags

	// overlayFilestoreFDs are FDs to the regular files that will back the tmpfs
	// upper mount in the overlay mounts.
	overlayFilestoreFDs intFlags

	// overlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in Spec.Mounts (in the same order).
	overlayMediums boot.OverlayMediumFlags

	// stdioFDs are the fds for stdin, stdout, and stderr. They must be
	// provided in that order.
	stdioFDs intFlags

	// passFDs are mappings of user-supplied host to guest file descriptors.
	passFDs fdMappings

	// execFD is the host file descriptor used for program execution.
	execFD int

	// applyCaps determines if capabilities defined in the spec should be applied
	// to the process.
	applyCaps bool

	// setUpRoot is set to true if the sandbox is started in an empty root.
	setUpRoot bool

	// cpuNum number of CPUs to create inside the sandbox.
	cpuNum int

	// totalMem sets the initial amount of total memory to report back to the
	// container.
	totalMem uint64

	// totalHostMem is the total memory reported by host /proc/meminfo.
	totalHostMem uint64

	// userLogFD is the file descriptor to write user logs to. The flag
	// defaults to 0, which means no logging.
	userLogFD int

	// startSyncFD is the file descriptor to synchronize runsc and sandbox.
	startSyncFD int

	// mountsFD is the file descriptor to read list of mounts after they have
	// been resolved (direct paths, no symlinks). They are resolved outside the
	// sandbox (e.g. gofer) and sent through this FD.
	mountsFD int

	// podInitConfigFD is the file descriptor to the pod init configuration
	// file.
	podInitConfigFD int

	// sinkFDs is the ordered list of file descriptors to be used by the sinks
	// defined in the pod init configuration.
	sinkFDs intFlags

	// pidns is set if the sandbox is in its own pid namespace.
	pidns bool

	// attached is set to true to kill the sandbox process when the parent process
	// terminates. This flag is set when the command execve's itself because
	// parent death signal doesn't propagate through execve when uid/gid changes.
	attached bool

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// FDs for profile data.
	profileFDs profile.FDArgs

	// procMountSyncFD is a file descriptor that is written to (and then
	// closed) when the procfs mount isn't needed anymore and can be unmounted.
	procMountSyncFD int

	// syncUsernsFD is the file descriptor that has to be closed when the
	// boot process should invoke setuid/setgid for root user. This is mainly
	// used to synchronize rootless user namespace initialization.
	syncUsernsFD int

	// nvidiaDevMinors is a list of device minors for Nvidia GPU devices exposed
	// to the sandbox.
	nvidiaDevMinors boot.NvidiaDevMinors
}
   163  
   164  // Name implements subcommands.Command.Name.
   165  func (*Boot) Name() string {
   166  	return "boot"
   167  }
   168  
   169  // Synopsis implements subcommands.Command.Synopsis.
   170  func (*Boot) Synopsis() string {
   171  	return "launch a sandbox process"
   172  }
   173  
   174  // Usage implements subcommands.Command.Usage.
   175  func (*Boot) Usage() string {
   176  	return `boot [flags] <container id>`
   177  }
   178  
   179  // SetFlags implements subcommands.Command.SetFlags.
   180  func (b *Boot) SetFlags(f *flag.FlagSet) {
   181  	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
   182  	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
   183  	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
   184  	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
   185  	f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
   186  	f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted")
   187  	f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.")
   188  	f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
   189  	f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo")
   190  	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
   191  	f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name")
   192  
   193  	// Open FDs that are donated to the sandbox.
   194  	f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
   195  	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
   196  	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
   197  	f.Var(&b.ioFDs, "io-fds", "list of FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec")
   198  	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
   199  	f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.")
   200  	f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.")
   201  	f.Var(&b.overlayFilestoreFDs, "overlay-filestore-fds", "FDs to the regular files that will back the tmpfs upper mount in the overlay mounts.")
   202  	f.Var(&b.overlayMediums, "overlay-mediums", "information about how the gofer mounts have been overlaid.")
   203  	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
   204  	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
   205  	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
   206  	f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.")
   207  	f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.")
   208  	f.Var(&b.nvidiaDevMinors, "nvidia-dev-minors", "list of device minors for Nvidia GPU devices exposed to the sandbox.")
   209  
   210  	// Profiling flags.
   211  	b.profileFDs.SetFromFlags(f)
   212  }
   213  
// Execute implements subcommands.Command.Execute. It starts a sandbox in a
// waiting state.
//
// args[0] must be the *config.Config to boot with and args[1] must be a
// *unix.WaitStatus, which receives the application's wait status once the
// sandbox has exited.
//
// It returns subcommands.ExitUsageError if spec-fd, controller-fd or
// start-sync-fd were not provided, or if there isn't exactly one positional
// argument (the container ID). Other failures terminate the process through
// util.Fatalf.
//
// Before the sandbox is created, the process may re-execute itself (through
// callSelfAsNobody or setCapsAndCallSelf). Work that was already done is
// recorded in argOverride so the re-executed process doesn't repeat it.
func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level
	debug.SetTraceback(conf.Traceback)

	// Initialize CPUID information.
	cpuid.Initialize()

	// Initialize ring0 library.
	ring0.InitDefault()

	// argOverride collects flag values that must replace (or be added to) the
	// original command line if this process re-executes itself.
	argOverride := make(map[string]string)
	if len(b.productName) == 0 {
		// Do this before chroot takes effect, otherwise we can't read /sys.
		if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil {
			log.Warningf("Not setting product_name: %v", err)
		} else {
			b.productName = strings.TrimSpace(string(product))
			log.Infof("Setting product_name: %q", b.productName)
			argOverride["product-name"] = b.productName
		}
	}

	if b.attached {
		// Ensure this process is killed after parent process terminates when
		// attached mode is enabled. In the unfortunate event that the parent
		// terminates before this point, this process leaks.
		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
			util.Fatalf("error setting parent death signal: %v", err)
		}
	}

	if b.syncUsernsFD >= 0 {
		syncUsernsForRootless(b.syncUsernsFD)
		// The synchronization is done; the re-executed process must not
		// attempt it again.
		argOverride["sync-userns-fd"] = "-1"
	}

	// Get the spec from the specFD. We *must* keep this os.File alive past
	// the call setCapsAndCallSelf, otherwise the FD will be closed and the
	// child process cannot read it
	specFile := os.NewFile(uintptr(b.specFD), "spec file")
	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	if b.setUpRoot {
		if err := setUpChroot(b.pidns, spec, conf, b.nvidiaDevMinors); err != nil {
			util.Fatalf("error setting up chroot: %v", err)
		}
		argOverride["setup-root"] = "false"

		if !conf.Rootless {
			// /proc is umounted from a forked process, because the
			// current one is going to re-execute itself without
			// capabilities.
			cmd, w := execProcUmounter()
			defer cmd.Wait()
			defer w.Close()
			if b.procMountSyncFD != -1 {
				panic("procMountSyncFD is set")
			}
			b.procMountSyncFD = int(w.Fd())
			argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD)

			// Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be
			// re-executed. procMountSyncFD should remain open.
			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
				util.Fatalf("error clearing CLOEXEC: %v", errno)
			}

			if !b.applyCaps {
				// Remove the args that have already been done before calling self.
				args := prepareArgs(b.Name(), f, argOverride)

				// Note that we've already read the spec from the spec FD, and
				// we will read it again after the exec call. This works
				// because the ReadSpecFromFile function seeks to the beginning
				// of the file before reading.
				util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args))

				// This prevents the specFile finalizer from running and closing
				// the specFD, which we have passed to ourselves when
				// re-execing.
				runtime.KeepAlive(specFile)
				panic("unreachable")
			}
		}
	}

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	if b.applyCaps {
		caps := spec.Process.Capabilities
		if caps == nil {
			caps = &specs.LinuxCapabilities{}
		}

		gPlatform, err := platform.Lookup(conf.Platform)
		if err != nil {
			util.Fatalf("loading platform: %v", err)
		}
		if gPlatform.Requirements().RequiresCapSysPtrace {
			// Ptrace platform requires extra capabilities.
			const c = "CAP_SYS_PTRACE"
			caps.Bounding = append(caps.Bounding, c)
			caps.Effective = append(caps.Effective, c)
			caps.Permitted = append(caps.Permitted, c)
		}

		if conf.DirectFS {
			caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps)
		}
		argOverride["apply-caps"] = "false"

		// Remove the args that have already been done before calling self.
		args := prepareArgs(b.Name(), f, argOverride)

		// Note that we've already read the spec from the spec FD, and
		// we will read it again after the exec call. This works
		// because the ReadSpecFromFile function seeks to the beginning
		// of the file before reading.
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps))

		// This prevents the specFile finalizer from running and closing
		// the specFD, which we have passed to ourselves when
		// re-execing.
		runtime.KeepAlive(specFile)
		panic("unreachable")
	}

	if b.syncUsernsFD >= 0 {
		// syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID.
		// We expect that setCapsAndCallSelf has to be called in this case.
		panic("unreachable")
	}

	// Close specFile to avoid exposing it to the sandbox.
	if err := specFile.Close(); err != nil {
		util.Fatalf("closing specFile: %v", err)
	}

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			// Not fatal: keep going with the limits currently in effect.
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Read resolved mount list and replace the original one from the spec.
	mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
	cleanMounts, err := specutils.ReadMounts(mountsFile)
	if err != nil {
		mountsFile.Close()
		util.Fatalf("Error reading mounts file: %v", err)
	}
	mountsFile.Close()
	spec.Mounts = cleanMounts

	if conf.DirectFS {
		// sandbox should run with a umask of 0, because we want to preserve file
		// modes exactly as sent by the sentry, which would have already applied
		// the application umask.
		unix.Umask(0)
	}

	if conf.EnableCoreTags {
		if err := coretag.Enable(); err != nil {
			util.Fatalf("Failed to core tag sentry: %v", err)
		}

		// Verify that all sentry threads are properly core tagged, and log
		// current core tag.
		coreTags, err := coretag.GetAllCoreTags(os.Getpid())
		if err != nil {
			util.Fatalf("Failed read current core tags: %v", err)
		}
		if len(coreTags) != 1 {
			util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags)
		}
		log.Infof("Core tag enabled (core tag=%d)", coreTags[0])
	}

	// Create the loader.
	bootArgs := boot.Args{
		ID:                  f.Arg(0),
		Spec:                spec,
		Conf:                conf,
		ControllerFD:        b.controllerFD,
		Device:              os.NewFile(uintptr(b.deviceFD), "platform device"),
		GoferFDs:            b.ioFDs.GetArray(),
		StdioFDs:            b.stdioFDs.GetArray(),
		PassFDs:             b.passFDs.GetArray(),
		ExecFD:              b.execFD,
		OverlayFilestoreFDs: b.overlayFilestoreFDs.GetArray(),
		OverlayMediums:      b.overlayMediums.GetArray(),
		NumCPU:              b.cpuNum,
		TotalMem:            b.totalMem,
		TotalHostMem:        b.totalHostMem,
		UserLogFD:           b.userLogFD,
		ProductName:         b.productName,
		PodInitConfigFD:     b.podInitConfigFD,
		SinkFDs:             b.sinkFDs.GetArray(),
		ProfileOpts:         b.profileFDs.ToOpts(),
	}
	l, err := boot.New(bootArgs)
	if err != nil {
		util.Fatalf("creating loader: %v", err)
	}

	// Fatalf exits the process and doesn't run defers.
	// 'l' must be destroyed explicitly after this point!

	if b.procMountSyncFD != -1 {
		l.PreSeccompCallback = func() {
			// Call validateOpenFDs() before umounting /proc.
			validateOpenFDs(bootArgs.PassFDs)
			// Umount /proc right before installing seccomp filters.
			umountProc(b.procMountSyncFD)
		}
	}

	// Prepare metrics.
	// This needs to happen after the kernel is initialized (such that all metrics are registered)
	// but before the start-sync file is notified, as the parent process needs to query for
	// registered metrics prior to sending the start signal.
	metric.Initialize()
	if metric.ProfilingMetricWriter != nil {
		if err := metric.StartProfilingMetrics(conf.ProfilingMetrics, time.Duration(conf.ProfilingMetricsRate)*time.Microsecond); err != nil {
			l.Destroy()
			util.Fatalf("unable to start profiling metrics: %v", err)
		}
		defer metric.StopProfilingMetrics()
	}

	// Notify the parent process the sandbox has booted (and that the controller
	// is up). The notification is a single byte written to the start-sync FD.
	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
	buf := make([]byte, 1)
	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
		l.Destroy()
		util.Fatalf("unable to write into the start-sync descriptor: %v", err)
	}
	// Closes startSyncFile because 'l.Run()' only returns when the sandbox exits.
	startSyncFile.Close()

	// Wait for the start signal from runsc.
	l.WaitForStartSignal()

	// Run the application and wait for it to finish.
	if err := l.Run(); err != nil {
		l.Destroy()
		util.Fatalf("running sandbox: %v", err)
	}

	// Hand the application's wait status back to the caller through args[1].
	ws := l.WaitExit()
	log.Infof("application exiting with %+v", ws)
	waitStatus := args[1].(*unix.WaitStatus)
	*waitStatus = unix.WaitStatus(ws)
	l.Destroy()
	return subcommands.ExitSuccess
}
   494  
   495  // prepareArgs returns the args that can be used to re-execute the current
   496  // program. It manipulates the flags of the subcommands.Command identified by
   497  // subCmdName and fSet is the flag.FlagSet of this subcommand. It applies the
   498  // flags specified by override map. In case of conflict, flag is overriden.
   499  //
   500  // Postcondition: prepareArgs() takes ownership of override map.
   501  func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string {
   502  	var args []string
   503  	// Add all args up until (and including) the sub command.
   504  	for _, arg := range os.Args {
   505  		args = append(args, arg)
   506  		if arg == subCmdName {
   507  			break
   508  		}
   509  	}
   510  	// Set sub command flags. Iterate through all the explicitly set flags.
   511  	fSet.Visit(func(gf *flag.Flag) {
   512  		// If a conflict is found with override, then prefer override flag.
   513  		if ov, ok := override[gf.Name]; ok {
   514  			args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov))
   515  			delete(override, gf.Name)
   516  			return
   517  		}
   518  		// Otherwise pass through the original flag.
   519  		args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value))
   520  	})
   521  	// Apply remaining override flags (that didn't conflict above).
   522  	for of, ov := range override {
   523  		args = append(args, fmt.Sprintf("--%s=%s", of, ov))
   524  	}
   525  	// Add the non-flag arguments at the end.
   526  	args = append(args, fSet.Args()...)
   527  	return args
   528  }
   529  
   530  // execProcUmounter execute a child process that umounts /proc when the
   531  // returned pipe is closed.
   532  func execProcUmounter() (*exec.Cmd, *os.File) {
   533  	r, w, err := os.Pipe()
   534  	if err != nil {
   535  		util.Fatalf("error creating a pipe: %v", err)
   536  	}
   537  	defer r.Close()
   538  
   539  	cmd := exec.Command(specutils.ExePath)
   540  	cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc")
   541  	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
   542  	cmd.Stdin = os.Stdin
   543  	cmd.Stdout = os.Stdout
   544  	cmd.Stderr = os.Stderr
   545  	if err := cmd.Start(); err != nil {
   546  		util.Fatalf("error executing umounter: %v", err)
   547  	}
   548  	return cmd, w
   549  }
   550  
   551  // umountProc writes to syncFD signalling the process started by
   552  // execProcUmounter() to umount /proc.
   553  func umountProc(syncFD int) {
   554  	syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD")
   555  	buf := make([]byte, 1)
   556  	if w, err := syncFile.Write(buf); err != nil || w != 1 {
   557  		util.Fatalf("unable to write into the proc umounter descriptor: %v", err)
   558  	}
   559  	syncFile.Close()
   560  
   561  	var waitStatus unix.WaitStatus
   562  	if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil {
   563  		util.Fatalf("error waiting for the proc umounter process: %v", err)
   564  	}
   565  	if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 {
   566  		util.Fatalf("the proc umounter process failed: %v", waitStatus)
   567  	}
   568  	if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT {
   569  		util.Fatalf("/proc is still accessible")
   570  	}
   571  }
   572  
   573  // validateOpenFDs checks that the sandbox process does not have any open
   574  // directory FDs.
   575  func validateOpenFDs(passFDs []boot.FDMapping) {
   576  	passHostFDs := make(map[int]struct{})
   577  	for _, passFD := range passFDs {
   578  		passHostFDs[passFD.Host] = struct{}{}
   579  	}
   580  	const selfFDDir = "/proc/self/fd"
   581  	if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error {
   582  		if err != nil {
   583  			return err
   584  		}
   585  		if d.Type() != os.ModeSymlink {
   586  			// All entries are symlinks. Ignore the callback for fd directory itself.
   587  			return nil
   588  		}
   589  		if fdInfo, err := os.Stat(path); err != nil {
   590  			if os.IsNotExist(err) {
   591  				// Ignore FDs that are now closed. For example, the FD to selfFDDir that
   592  				// was opened by filepath.WalkDir() to read dirents.
   593  				return nil
   594  			}
   595  			return fmt.Errorf("os.Stat(%s) failed: %v", path, err)
   596  		} else if !fdInfo.IsDir() {
   597  			return nil
   598  		}
   599  		// Uh-oh. This is a directory FD.
   600  		fdNo, err := strconv.Atoi(d.Name())
   601  		if err != nil {
   602  			return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err)
   603  		}
   604  		dirLink, err := os.Readlink(path)
   605  		if err != nil {
   606  			return fmt.Errorf("os.Readlink(%s) failed: %v", path, err)
   607  		}
   608  		if _, ok := passHostFDs[fdNo]; ok {
   609  			// Passed FDs are allowed to be directories. The user must be knowing
   610  			// what they are doing. Log a warning regardless.
   611  			log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink)
   612  			return nil
   613  		}
   614  		return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink)
   615  	}); err != nil {
   616  		util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err)
   617  	}
   618  }