github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/cmd/boot.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cmd
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io/ioutil"
    21  	"os"
    22  	"os/exec"
    23  	"path/filepath"
    24  	"runtime"
    25  	"runtime/debug"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	"github.com/google/subcommands"
    31  	specs "github.com/opencontainers/runtime-spec/specs-go"
    32  	"golang.org/x/sys/unix"
    33  	"github.com/metacubex/gvisor/pkg/coretag"
    34  	"github.com/metacubex/gvisor/pkg/cpuid"
    35  	"github.com/metacubex/gvisor/pkg/log"
    36  	"github.com/metacubex/gvisor/pkg/metric"
    37  	"github.com/metacubex/gvisor/pkg/ring0"
    38  	"github.com/metacubex/gvisor/pkg/sentry/platform"
    39  	"github.com/metacubex/gvisor/runsc/boot"
    40  	"github.com/metacubex/gvisor/runsc/cmd/util"
    41  	"github.com/metacubex/gvisor/runsc/config"
    42  	"github.com/metacubex/gvisor/runsc/flag"
    43  	"github.com/metacubex/gvisor/runsc/profile"
    44  	"github.com/metacubex/gvisor/runsc/specutils"
    45  )
    46  
    47  // Note that directfsSandboxCaps is the same as caps defined in gofer.go
    48  // except CAP_SYS_CHROOT because we don't need to chroot in directfs mode.
    49  var directfsSandboxCaps = []string{
    50  	"CAP_CHOWN",
    51  	"CAP_DAC_OVERRIDE",
    52  	"CAP_DAC_READ_SEARCH",
    53  	"CAP_FOWNER",
    54  	"CAP_FSETID",
    55  }
    56  
    57  // directfsSandboxLinuxCaps is the minimal set of capabilities needed by the
    58  // sandbox to operate on files in directfs mode.
    59  var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{
    60  	Bounding:  directfsSandboxCaps,
    61  	Effective: directfsSandboxCaps,
    62  	Permitted: directfsSandboxCaps,
    63  }
    64  
// Boot implements subcommands.Command for the "boot" command which starts a
// new sandbox. It should not be called directly.
//
// Every field is populated from a command line flag registered in SetFlags.
type Boot struct {
	// bundleDir is the directory containing the OCI spec.
	bundleDir string

	// specFD is the file descriptor that the spec will be read from.
	specFD int

	// controllerFD is the file descriptor of a stream socket for the
	// control server that is donated to this process.
	controllerFD int

	// deviceFD is the file descriptor for the platform device file.
	deviceFD int

	// ioFDs is the list of FDs used to connect to FS gofers.
	ioFDs intFlags

	// devIoFD is the FD to connect to dev gofer.
	devIoFD int

	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
	// overlayfs mount for certain gofer mounts.
	goferFilestoreFDs intFlags

	// goferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	goferMountConfs boot.GoferMountConfFlags

	// stdioFDs are the fds for stdin, stdout, and stderr. They must be
	// provided in that order.
	stdioFDs intFlags

	// passFDs are mappings of user-supplied host to guest file descriptors.
	passFDs fdMappings

	// execFD is the host file descriptor used for program execution.
	execFD int

	// applyCaps determines if capabilities defined in the spec should be applied
	// to the process.
	applyCaps bool

	// setUpRoot is set to true if the sandbox is started in an empty root.
	setUpRoot bool

	// cpuNum is the number of CPUs to create inside the sandbox.
	cpuNum int

	// totalMem sets the initial amount of total memory to report back to the
	// container.
	totalMem uint64

	// totalHostMem is the total memory reported by host /proc/meminfo.
	totalHostMem uint64

	// userLogFD is the file descriptor to write user logs to. 0 means no
	// logging.
	userLogFD int

	// startSyncFD is the file descriptor to synchronize runsc and sandbox.
	startSyncFD int

	// mountsFD is the file descriptor to read list of mounts after they have
	// been resolved (direct paths, no symlinks). They are resolved outside the
	// sandbox (e.g. gofer) and sent through this FD. When mountsFD is not
	// provided, there is no cleaning required for mounts and the mounts in
	// the spec can be used as is.
	mountsFD int

	// podInitConfigFD is the file descriptor to the pod init configuration
	// file.
	podInitConfigFD int

	// sinkFDs is the ordered list of file descriptors to be used by the sinks
	// defined in --pod-init-config.
	sinkFDs intFlags

	// pidns is set if the sandbox is in its own pid namespace.
	pidns bool

	// attached is set to true to kill the sandbox process when the parent process
	// terminates. This flag is set when the command execve's itself because
	// parent death signal doesn't propagate through execve when uid/gid changes.
	attached bool

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// profileFDs are the FDs for profile data.
	profileFDs profile.FDArgs

	// procMountSyncFD is a file descriptor that has to be written to and
	// closed when the procfs mount isn't needed anymore (see umountProc).
	procMountSyncFD int

	// syncUsernsFD is the file descriptor that has to be closed when the
	// boot process should invoke setuid/setgid for root user. This is mainly
	// used to synchronize rootless user namespace initialization.
	syncUsernsFD int

	// nvidiaDriverVersion is the Nvidia driver version on the host.
	nvidiaDriverVersion string
}
   167  
   168  // Name implements subcommands.Command.Name.
   169  func (*Boot) Name() string {
   170  	return "boot"
   171  }
   172  
   173  // Synopsis implements subcommands.Command.Synopsis.
   174  func (*Boot) Synopsis() string {
   175  	return "launch a sandbox process"
   176  }
   177  
   178  // Usage implements subcommands.Command.Usage.
   179  func (*Boot) Usage() string {
   180  	return `boot [flags] <container id>`
   181  }
   182  
   183  // SetFlags implements subcommands.Command.SetFlags.
   184  func (b *Boot) SetFlags(f *flag.FlagSet) {
   185  	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
   186  	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
   187  	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
   188  	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
   189  	f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
   190  	f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted")
   191  	f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.")
   192  	f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
   193  	f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo")
   194  	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
   195  	f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name")
   196  	f.StringVar(&b.nvidiaDriverVersion, "nvidia-driver-version", "", "Nvidia driver version on the host")
   197  
   198  	// Open FDs that are donated to the sandbox.
   199  	f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
   200  	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
   201  	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
   202  	f.Var(&b.ioFDs, "io-fds", "list of image FDs and/or socket FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec")
   203  	f.IntVar(&b.devIoFD, "dev-io-fd", -1, "FD to connect dev gofer client")
   204  	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
   205  	f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.")
   206  	f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.")
   207  	f.Var(&b.goferFilestoreFDs, "gofer-filestore-fds", "FDs to the regular files that will back the overlayfs or tmpfs mount if a gofer mount is to be overlaid.")
   208  	f.Var(&b.goferMountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured.")
   209  	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
   210  	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
   211  	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is an optional file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
   212  	f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.")
   213  	f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.")
   214  
   215  	// Profiling flags.
   216  	b.profileFDs.SetFromFlags(f)
   217  }
   218  
   219  // Execute implements subcommands.Command.Execute.  It starts a sandbox in a
   220  // waiting state.
   221  func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
   222  	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
   223  		f.Usage()
   224  		return subcommands.ExitUsageError
   225  	}
   226  
   227  	conf := args[0].(*config.Config)
   228  
   229  	// Set traceback level
   230  	debug.SetTraceback(conf.Traceback)
   231  
   232  	// Initialize CPUID information.
   233  	cpuid.Initialize()
   234  
   235  	// Initialize ring0 library.
   236  	ring0.InitDefault()
   237  
   238  	argOverride := make(map[string]string)
   239  	if len(b.productName) == 0 {
   240  		// Do this before chroot takes effect, otherwise we can't read /sys.
   241  		if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil {
   242  			log.Warningf("Not setting product_name: %v", err)
   243  		} else {
   244  			b.productName = strings.TrimSpace(string(product))
   245  			log.Infof("Setting product_name: %q", b.productName)
   246  			argOverride["product-name"] = b.productName
   247  		}
   248  	}
   249  
   250  	if b.attached {
   251  		// Ensure this process is killed after parent process terminates when
   252  		// attached mode is enabled. In the unfortunate event that the parent
   253  		// terminates before this point, this process leaks.
   254  		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
   255  			util.Fatalf("error setting parent death signal: %v", err)
   256  		}
   257  	}
   258  
   259  	if b.syncUsernsFD >= 0 {
   260  		syncUsernsForRootless(b.syncUsernsFD)
   261  		argOverride["sync-userns-fd"] = "-1"
   262  	}
   263  
   264  	// Get the spec from the specFD. We *must* keep this os.File alive past
   265  	// the call setCapsAndCallSelf, otherwise the FD will be closed and the
   266  	// child process cannot read it
   267  	specFile := os.NewFile(uintptr(b.specFD), "spec file")
   268  	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
   269  	if err != nil {
   270  		util.Fatalf("reading spec: %v", err)
   271  	}
   272  
   273  	if b.setUpRoot {
   274  		if err := setUpChroot(b.pidns, spec, conf); err != nil {
   275  			util.Fatalf("error setting up chroot: %v", err)
   276  		}
   277  		argOverride["setup-root"] = "false"
   278  
   279  		if !conf.Rootless {
   280  			// /proc is umounted from a forked process, because the
   281  			// current one is going to re-execute itself without
   282  			// capabilities.
   283  			cmd, w := execProcUmounter()
   284  			defer cmd.Wait()
   285  			defer w.Close()
   286  			if b.procMountSyncFD != -1 {
   287  				panic("procMountSyncFD is set")
   288  			}
   289  			b.procMountSyncFD = int(w.Fd())
   290  			argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD)
   291  
   292  			// Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be
   293  			// re-executed. procMountSyncFD should remain open.
   294  			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
   295  				util.Fatalf("error clearing CLOEXEC: %v", errno)
   296  			}
   297  
   298  			if !b.applyCaps {
   299  				// Remove the args that have already been done before calling self.
   300  				args := prepareArgs(b.Name(), f, argOverride)
   301  
   302  				// Note that we've already read the spec from the spec FD, and
   303  				// we will read it again after the exec call. This works
   304  				// because the ReadSpecFromFile function seeks to the beginning
   305  				// of the file before reading.
   306  				util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args))
   307  
   308  				// This prevents the specFile finalizer from running and closed
   309  				// the specFD, which we have passed to ourselves when
   310  				// re-execing.
   311  				runtime.KeepAlive(specFile)
   312  				panic("unreachable")
   313  			}
   314  		}
   315  	}
   316  
   317  	specutils.LogSpecDebug(spec, conf.OCISeccomp)
   318  
   319  	if b.applyCaps {
   320  		caps := spec.Process.Capabilities
   321  		if caps == nil {
   322  			caps = &specs.LinuxCapabilities{}
   323  		}
   324  
   325  		gPlatform, err := platform.Lookup(conf.Platform)
   326  		if err != nil {
   327  			util.Fatalf("loading platform: %v", err)
   328  		}
   329  		if gPlatform.Requirements().RequiresCapSysPtrace {
   330  			// Ptrace platform requires extra capabilities.
   331  			const c = "CAP_SYS_PTRACE"
   332  			caps.Bounding = append(caps.Bounding, c)
   333  			caps.Effective = append(caps.Effective, c)
   334  			caps.Permitted = append(caps.Permitted, c)
   335  		}
   336  
   337  		if conf.DirectFS {
   338  			caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps)
   339  		}
   340  		argOverride["apply-caps"] = "false"
   341  
   342  		// Remove the args that have already been done before calling self.
   343  		args := prepareArgs(b.Name(), f, argOverride)
   344  
   345  		// Note that we've already read the spec from the spec FD, and
   346  		// we will read it again after the exec call. This works
   347  		// because the ReadSpecFromFile function seeks to the beginning
   348  		// of the file before reading.
   349  		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps))
   350  
   351  		// This prevents the specFile finalizer from running and closed
   352  		// the specFD, which we have passed to ourselves when
   353  		// re-execing.
   354  		runtime.KeepAlive(specFile)
   355  		panic("unreachable")
   356  	}
   357  
   358  	if b.syncUsernsFD >= 0 {
   359  		// syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID.
   360  		// We expect that setCapsAndCallSelf has to be called in this case.
   361  		panic("unreachable")
   362  	}
   363  
   364  	// Close specFile to avoid exposing it to the sandbox.
   365  	if err := specFile.Close(); err != nil {
   366  		util.Fatalf("closing specFile: %v", err)
   367  	}
   368  
   369  	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
   370  	// limit >= 0 works. If the limit is lower than the current number of open
   371  	// files, then Setrlimit will succeed, and the next open will fail.
   372  	if conf.FDLimit > -1 {
   373  		rlimit := unix.Rlimit{
   374  			Cur: uint64(conf.FDLimit),
   375  			Max: uint64(conf.FDLimit),
   376  		}
   377  		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
   378  		case nil:
   379  		case unix.EPERM:
   380  			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
   381  		default:
   382  			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
   383  		}
   384  	}
   385  
   386  	// When mountsFD is not provided, there is no cleaning required.
   387  	if b.mountsFD >= 0 {
   388  		// Read resolved mount list and replace the original one from the spec.
   389  		mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
   390  		cleanMounts, err := specutils.ReadMounts(mountsFile)
   391  		if err != nil {
   392  			mountsFile.Close()
   393  			util.Fatalf("Error reading mounts file: %v", err)
   394  		}
   395  		mountsFile.Close()
   396  		spec.Mounts = cleanMounts
   397  	}
   398  
   399  	if conf.DirectFS {
   400  		// sandbox should run with a umask of 0, because we want to preserve file
   401  		// modes exactly as sent by the sentry, which would have already applied
   402  		// the application umask.
   403  		unix.Umask(0)
   404  	}
   405  
   406  	if conf.EnableCoreTags {
   407  		if err := coretag.Enable(); err != nil {
   408  			util.Fatalf("Failed to core tag sentry: %v", err)
   409  		}
   410  
   411  		// Verify that all sentry threads are properly core tagged, and log
   412  		// current core tag.
   413  		coreTags, err := coretag.GetAllCoreTags(os.Getpid())
   414  		if err != nil {
   415  			util.Fatalf("Failed read current core tags: %v", err)
   416  		}
   417  		if len(coreTags) != 1 {
   418  			util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags)
   419  		}
   420  		log.Infof("Core tag enabled (core tag=%d)", coreTags[0])
   421  	}
   422  
   423  	// Create the loader.
   424  	bootArgs := boot.Args{
   425  		ID:                  f.Arg(0),
   426  		Spec:                spec,
   427  		Conf:                conf,
   428  		ControllerFD:        b.controllerFD,
   429  		Device:              os.NewFile(uintptr(b.deviceFD), "platform device"),
   430  		GoferFDs:            b.ioFDs.GetArray(),
   431  		DevGoferFD:          b.devIoFD,
   432  		StdioFDs:            b.stdioFDs.GetArray(),
   433  		PassFDs:             b.passFDs.GetArray(),
   434  		ExecFD:              b.execFD,
   435  		GoferFilestoreFDs:   b.goferFilestoreFDs.GetArray(),
   436  		GoferMountConfs:     b.goferMountConfs.GetArray(),
   437  		NumCPU:              b.cpuNum,
   438  		TotalMem:            b.totalMem,
   439  		TotalHostMem:        b.totalHostMem,
   440  		UserLogFD:           b.userLogFD,
   441  		ProductName:         b.productName,
   442  		PodInitConfigFD:     b.podInitConfigFD,
   443  		SinkFDs:             b.sinkFDs.GetArray(),
   444  		ProfileOpts:         b.profileFDs.ToOpts(),
   445  		NvidiaDriverVersion: b.nvidiaDriverVersion,
   446  	}
   447  	l, err := boot.New(bootArgs)
   448  	if err != nil {
   449  		util.Fatalf("creating loader: %v", err)
   450  	}
   451  
   452  	// Fatalf exits the process and doesn't run defers.
   453  	// 'l' must be destroyed explicitly after this point!
   454  
   455  	if b.procMountSyncFD != -1 {
   456  		l.PreSeccompCallback = func() {
   457  			// Call validateOpenFDs() before umounting /proc.
   458  			validateOpenFDs(bootArgs.PassFDs)
   459  			// Umount /proc right before installing seccomp filters.
   460  			umountProc(b.procMountSyncFD)
   461  		}
   462  	}
   463  
   464  	if conf.TestOnlyAutosaveImagePath != "" {
   465  		fName := filepath.Join(conf.TestOnlyAutosaveImagePath, checkpointFileName)
   466  		f, err := os.OpenFile(fName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
   467  		if err != nil {
   468  			util.Fatalf("error in creating state file %v", err)
   469  		}
   470  		defer f.Close()
   471  		boot.EnableAutosave(l, f)
   472  	}
   473  
   474  	// Prepare metrics.
   475  	// This needs to happen after the kernel is initialized (such that all metrics are registered)
   476  	// but before the start-sync file is notified, as the parent process needs to query for
   477  	// registered metrics prior to sending the start signal.
   478  	metric.Initialize()
   479  	if metric.ProfilingMetricWriter != nil {
   480  		if err := metric.StartProfilingMetrics(conf.ProfilingMetrics, time.Duration(conf.ProfilingMetricsRate)*time.Microsecond); err != nil {
   481  			l.Destroy()
   482  			util.Fatalf("unable to start profiling metrics: %v", err)
   483  		}
   484  		defer metric.StopProfilingMetrics()
   485  	}
   486  
   487  	// Notify the parent process the sandbox has booted (and that the controller
   488  	// is up).
   489  	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
   490  	buf := make([]byte, 1)
   491  	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
   492  		l.Destroy()
   493  		util.Fatalf("unable to write into the start-sync descriptor: %v", err)
   494  	}
   495  	// Closes startSyncFile because 'l.Run()' only returns when the sandbox exits.
   496  	startSyncFile.Close()
   497  
   498  	// Wait for the start signal from runsc.
   499  	l.WaitForStartSignal()
   500  
   501  	// Run the application and wait for it to finish.
   502  	if err := l.Run(); err != nil {
   503  		l.Destroy()
   504  		util.Fatalf("running sandbox: %v", err)
   505  	}
   506  
   507  	ws := l.WaitExit()
   508  	log.Infof("application exiting with %+v", ws)
   509  	waitStatus := args[1].(*unix.WaitStatus)
   510  	*waitStatus = unix.WaitStatus(ws)
   511  	l.Destroy()
   512  	return subcommands.ExitSuccess
   513  }
   514  
   515  // prepareArgs returns the args that can be used to re-execute the current
   516  // program. It manipulates the flags of the subcommands.Command identified by
   517  // subCmdName and fSet is the flag.FlagSet of this subcommand. It applies the
   518  // flags specified by override map. In case of conflict, flag is overriden.
   519  //
   520  // Postcondition: prepareArgs() takes ownership of override map.
   521  func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string {
   522  	var args []string
   523  	// Add all args up until (and including) the sub command.
   524  	for _, arg := range os.Args {
   525  		args = append(args, arg)
   526  		if arg == subCmdName {
   527  			break
   528  		}
   529  	}
   530  	// Set sub command flags. Iterate through all the explicitly set flags.
   531  	fSet.Visit(func(gf *flag.Flag) {
   532  		// If a conflict is found with override, then prefer override flag.
   533  		if ov, ok := override[gf.Name]; ok {
   534  			args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov))
   535  			delete(override, gf.Name)
   536  			return
   537  		}
   538  		// Otherwise pass through the original flag.
   539  		args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value))
   540  	})
   541  	// Apply remaining override flags (that didn't conflict above).
   542  	for of, ov := range override {
   543  		args = append(args, fmt.Sprintf("--%s=%s", of, ov))
   544  	}
   545  	// Add the non-flag arguments at the end.
   546  	args = append(args, fSet.Args()...)
   547  	return args
   548  }
   549  
   550  // execProcUmounter execute a child process that umounts /proc when the
   551  // returned pipe is closed.
   552  func execProcUmounter() (*exec.Cmd, *os.File) {
   553  	r, w, err := os.Pipe()
   554  	if err != nil {
   555  		util.Fatalf("error creating a pipe: %v", err)
   556  	}
   557  	defer r.Close()
   558  
   559  	cmd := exec.Command(specutils.ExePath)
   560  	cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc")
   561  	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
   562  	cmd.Stdin = os.Stdin
   563  	cmd.Stdout = os.Stdout
   564  	cmd.Stderr = os.Stderr
   565  	if err := cmd.Start(); err != nil {
   566  		util.Fatalf("error executing umounter: %v", err)
   567  	}
   568  	return cmd, w
   569  }
   570  
   571  // umountProc writes to syncFD signalling the process started by
   572  // execProcUmounter() to umount /proc.
   573  func umountProc(syncFD int) {
   574  	syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD")
   575  	buf := make([]byte, 1)
   576  	if w, err := syncFile.Write(buf); err != nil || w != 1 {
   577  		util.Fatalf("unable to write into the proc umounter descriptor: %v", err)
   578  	}
   579  	syncFile.Close()
   580  
   581  	var waitStatus unix.WaitStatus
   582  	if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil {
   583  		util.Fatalf("error waiting for the proc umounter process: %v", err)
   584  	}
   585  	if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 {
   586  		util.Fatalf("the proc umounter process failed: %v", waitStatus)
   587  	}
   588  	if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT {
   589  		util.Fatalf("/proc is still accessible")
   590  	}
   591  }
   592  
   593  // validateOpenFDs checks that the sandbox process does not have any open
   594  // directory FDs.
   595  func validateOpenFDs(passFDs []boot.FDMapping) {
   596  	passHostFDs := make(map[int]struct{})
   597  	for _, passFD := range passFDs {
   598  		passHostFDs[passFD.Host] = struct{}{}
   599  	}
   600  	const selfFDDir = "/proc/self/fd"
   601  	if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error {
   602  		if err != nil {
   603  			return err
   604  		}
   605  		if d.Type() != os.ModeSymlink {
   606  			// All entries are symlinks. Ignore the callback for fd directory itself.
   607  			return nil
   608  		}
   609  		if fdInfo, err := os.Stat(path); err != nil {
   610  			if os.IsNotExist(err) {
   611  				// Ignore FDs that are now closed. For example, the FD to selfFDDir that
   612  				// was opened by filepath.WalkDir() to read dirents.
   613  				return nil
   614  			}
   615  			return fmt.Errorf("os.Stat(%s) failed: %v", path, err)
   616  		} else if !fdInfo.IsDir() {
   617  			return nil
   618  		}
   619  		// Uh-oh. This is a directory FD.
   620  		fdNo, err := strconv.Atoi(d.Name())
   621  		if err != nil {
   622  			return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err)
   623  		}
   624  		dirLink, err := os.Readlink(path)
   625  		if err != nil {
   626  			return fmt.Errorf("os.Readlink(%s) failed: %v", path, err)
   627  		}
   628  		if _, ok := passHostFDs[fdNo]; ok {
   629  			// Passed FDs are allowed to be directories. The user must be knowing
   630  			// what they are doing. Log a warning regardless.
   631  			log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink)
   632  			return nil
   633  		}
   634  		return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink)
   635  	}); err != nil {
   636  		util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err)
   637  	}
   638  }