gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/cmd/boot.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cmd
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io/ioutil"
    21  	"os"
    22  	"os/exec"
    23  	"path/filepath"
    24  	"runtime"
    25  	"runtime/debug"
    26  	"strconv"
    27  	"strings"
    28  	"time"
    29  
    30  	"github.com/google/subcommands"
    31  	specs "github.com/opencontainers/runtime-spec/specs-go"
    32  	"golang.org/x/sys/unix"
    33  	"gvisor.dev/gvisor/pkg/coretag"
    34  	"gvisor.dev/gvisor/pkg/cpuid"
    35  	"gvisor.dev/gvisor/pkg/fd"
    36  	"gvisor.dev/gvisor/pkg/log"
    37  	"gvisor.dev/gvisor/pkg/metric"
    38  	"gvisor.dev/gvisor/pkg/ring0"
    39  	"gvisor.dev/gvisor/pkg/sentry/platform"
    40  	"gvisor.dev/gvisor/runsc/boot"
    41  	"gvisor.dev/gvisor/runsc/cmd/util"
    42  	"gvisor.dev/gvisor/runsc/config"
    43  	"gvisor.dev/gvisor/runsc/flag"
    44  	"gvisor.dev/gvisor/runsc/profile"
    45  	"gvisor.dev/gvisor/runsc/specutils"
    46  )
    47  
// directfsSandboxCaps lists the Linux capability names used to build
// directfsSandboxLinuxCaps, which Execute merges into the sandbox process
// capabilities when directfs is enabled.
//
// Note that directfsSandboxCaps is the same as caps defined in gofer.go
// except CAP_SYS_CHROOT because we don't need to chroot in directfs mode.
var directfsSandboxCaps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
}
    57  
// directfsSandboxLinuxCaps is the minimal set of capabilities needed by the
// sandbox to operate on files in directfs mode. The same list
// (directfsSandboxCaps) is used for the bounding, effective and permitted
// sets; the inheritable and ambient sets are left empty.
var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{
	Bounding:  directfsSandboxCaps,
	Effective: directfsSandboxCaps,
	Permitted: directfsSandboxCaps,
}
    65  
// Boot implements subcommands.Command for the "boot" command which starts a
// new sandbox. It should not be called directly.
//
// Every field is populated from a command line flag registered in SetFlags.
// Execute may re-execute the binary with adjusted flag values (see
// prepareArgs), so a field may hold a different value in each stage.
type Boot struct {
	// bundleDir is the directory containing the OCI spec.
	bundleDir string

	// specFD is the file descriptor that the spec will be read from. It is
	// required; -1 means unset.
	specFD int

	// controllerFD is the file descriptor of a stream socket for the
	// control server that is donated to this process. It is required; -1
	// means unset.
	controllerFD int

	// deviceFD is the file descriptor for the platform device file.
	deviceFD int

	// ioFDs is the list of FDs used to connect to FS gofers. They must be
	// ordered root first, then mounts as defined in the spec.
	ioFDs intFlags

	// devIoFD is the FD to connect to dev gofer.
	devIoFD int

	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
	// overlayfs mount for certain gofer mounts.
	goferFilestoreFDs intFlags

	// goferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	goferMountConfs boot.GoferMountConfFlags

	// stdioFDs are the fds for stdin, stdout, and stderr. They must be
	// provided in that order.
	stdioFDs intFlags

	// passFDs are mappings of user-supplied host to guest file descriptors.
	passFDs fdMappings

	// execFD is the host file descriptor used for program execution.
	execFD int

	// applyCaps determines if capabilities defined in the spec should be applied
	// to the process. Execute resets the flag to false when it re-executes
	// itself with the capabilities applied.
	applyCaps bool

	// setUpRoot is set to true if the sandbox should set up an empty root
	// (chroot) for itself. Execute resets the flag to false once the root
	// has been set up.
	setUpRoot bool

	// cpuNum number of CPUs to create inside the sandbox.
	cpuNum int

	// totalMem sets the initial amount of total memory to report back to the
	// container.
	totalMem uint64

	// totalHostMem is the total memory reported by host /proc/meminfo.
	totalHostMem uint64

	// userLogFD is the file descriptor to write user logs to. 0 means no
	// logging.
	userLogFD int

	// startSyncFD is the file descriptor to synchronize runsc and sandbox.
	// It is required; -1 means unset. Execute writes a single byte to it
	// once the sandbox has booted and then closes it.
	startSyncFD int

	// mountsFD is the file descriptor to read list of mounts after they have
	// been resolved (direct paths, no symlinks). They are resolved outside the
	// sandbox (e.g. gofer) and sent through this FD. When mountsFD is not
	// provided, there is no cleaning required for mounts and the mounts in
	// the spec can be used as is.
	mountsFD int

	// podInitConfigFD is the file descriptor to the pod init configuration
	// file. -1 means unset.
	podInitConfigFD int

	// sinkFDs is the ordered list of file descriptors to be used by the
	// sinks defined in the pod init configuration.
	sinkFDs intFlags

	// pidns is set if the sandbox is in its own pid namespace.
	pidns bool

	// attached is set to true to kill the sandbox process when the parent process
	// terminates. This flag is set when the command execve's itself because
	// parent death signal doesn't propagate through execve when uid/gid changes.
	attached bool

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name. When empty, Execute tries to
	// read it from the host.
	productName string

	// FDs for profile data.
	profileFDs profile.FDArgs

	// profilingMetricsFD is a file descriptor to write Sentry metrics data to.
	// -1 disables profiling metrics.
	profilingMetricsFD int

	// profilingMetricsLossy sets whether profilingMetricsFD is a lossy channel.
	// If so, the format used to write to it will contain a checksum.
	profilingMetricsLossy bool

	// procMountSyncFD is the file descriptor used to tell the /proc umounter
	// process (see execProcUmounter) that the procfs mount isn't needed
	// anymore. umountProc writes a single byte to it and then closes it.
	// -1 means that there is no umounter process.
	procMountSyncFD int

	// syncUsernsFD is the file descriptor that has to be closed when the
	// boot process should invoke setuid/setgid for root user. This is mainly
	// used to synchronize rootless user namespace initialization.
	syncUsernsFD int

	// nvidiaDriverVersion is the Nvidia driver version on the host.
	nvidiaDriverVersion string
}
   175  
   176  // Name implements subcommands.Command.Name.
   177  func (*Boot) Name() string {
   178  	return "boot"
   179  }
   180  
   181  // Synopsis implements subcommands.Command.Synopsis.
   182  func (*Boot) Synopsis() string {
   183  	return "launch a sandbox process"
   184  }
   185  
   186  // Usage implements subcommands.Command.Usage.
   187  func (*Boot) Usage() string {
   188  	return `boot [flags] <container id>`
   189  }
   190  
   191  // SetFlags implements subcommands.Command.SetFlags.
   192  func (b *Boot) SetFlags(f *flag.FlagSet) {
   193  	f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory")
   194  	f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process")
   195  	f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process")
   196  	f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace")
   197  	f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox")
   198  	f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted")
   199  	f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.")
   200  	f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container")
   201  	f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo")
   202  	f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates")
   203  	f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name")
   204  	f.StringVar(&b.nvidiaDriverVersion, "nvidia-driver-version", "", "Nvidia driver version on the host")
   205  
   206  	// Open FDs that are donated to the sandbox.
   207  	f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec")
   208  	f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process")
   209  	f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file")
   210  	f.Var(&b.ioFDs, "io-fds", "list of image FDs and/or socket FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec")
   211  	f.IntVar(&b.devIoFD, "dev-io-fd", -1, "FD to connect dev gofer client")
   212  	f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order")
   213  	f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.")
   214  	f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.")
   215  	f.Var(&b.goferFilestoreFDs, "gofer-filestore-fds", "FDs to the regular files that will back the overlayfs or tmpfs mount if a gofer mount is to be overlaid.")
   216  	f.Var(&b.goferMountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured.")
   217  	f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.")
   218  	f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup")
   219  	f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is an optional file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).")
   220  	f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.")
   221  	f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.")
   222  
   223  	// Profiling flags.
   224  	b.profileFDs.SetFromFlags(f)
   225  	f.IntVar(&b.profilingMetricsFD, "profiling-metrics-fd", -1, "file descriptor to write sentry profiling metrics.")
   226  	f.BoolVar(&b.profilingMetricsLossy, "profiling-metrics-fd-lossy", false, "if true, treat the sentry profiling metrics FD as lossy and write a checksum to it.")
   227  }
   228  
   229  // Execute implements subcommands.Command.Execute.  It starts a sandbox in a
   230  // waiting state.
   231  func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
   232  	if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 {
   233  		f.Usage()
   234  		return subcommands.ExitUsageError
   235  	}
   236  
   237  	conf := args[0].(*config.Config)
   238  
   239  	// Set traceback level
   240  	debug.SetTraceback(conf.Traceback)
   241  
   242  	// Initialize CPUID information.
   243  	cpuid.Initialize()
   244  
   245  	// Initialize ring0 library.
   246  	ring0.InitDefault()
   247  
   248  	argOverride := make(map[string]string)
   249  	if len(b.productName) == 0 {
   250  		// Do this before chroot takes effect, otherwise we can't read /sys.
   251  		if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil {
   252  			log.Warningf("Not setting product_name: %v", err)
   253  		} else {
   254  			b.productName = strings.TrimSpace(string(product))
   255  			log.Infof("Setting product_name: %q", b.productName)
   256  			argOverride["product-name"] = b.productName
   257  		}
   258  	}
   259  
   260  	if b.attached {
   261  		// Ensure this process is killed after parent process terminates when
   262  		// attached mode is enabled. In the unfortunate event that the parent
   263  		// terminates before this point, this process leaks.
   264  		if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil {
   265  			util.Fatalf("error setting parent death signal: %v", err)
   266  		}
   267  	}
   268  
   269  	if b.syncUsernsFD >= 0 {
   270  		syncUsernsForRootless(b.syncUsernsFD)
   271  		argOverride["sync-userns-fd"] = "-1"
   272  	}
   273  
   274  	// Get the spec from the specFD. We *must* keep this os.File alive past
   275  	// the call setCapsAndCallSelf, otherwise the FD will be closed and the
   276  	// child process cannot read it
   277  	specFile := os.NewFile(uintptr(b.specFD), "spec file")
   278  	spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf)
   279  	if err != nil {
   280  		util.Fatalf("reading spec: %v", err)
   281  	}
   282  
   283  	if b.setUpRoot {
   284  		if err := setUpChroot(b.pidns, spec, conf); err != nil {
   285  			util.Fatalf("error setting up chroot: %v", err)
   286  		}
   287  		argOverride["setup-root"] = "false"
   288  
   289  		if !conf.Rootless {
   290  			// /proc is umounted from a forked process, because the
   291  			// current one is going to re-execute itself without
   292  			// capabilities.
   293  			cmd, w := execProcUmounter()
   294  			defer cmd.Wait()
   295  			defer w.Close()
   296  			if b.procMountSyncFD != -1 {
   297  				panic("procMountSyncFD is set")
   298  			}
   299  			b.procMountSyncFD = int(w.Fd())
   300  			argOverride["proc-mount-sync-fd"] = strconv.Itoa(b.procMountSyncFD)
   301  
   302  			// Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be
   303  			// re-executed. procMountSyncFD should remain open.
   304  			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
   305  				util.Fatalf("error clearing CLOEXEC: %v", errno)
   306  			}
   307  
   308  			if !b.applyCaps {
   309  				// Remove the args that have already been done before calling self.
   310  				args := prepareArgs(b.Name(), f, argOverride)
   311  
   312  				// Note that we've already read the spec from the spec FD, and
   313  				// we will read it again after the exec call. This works
   314  				// because the ReadSpecFromFile function seeks to the beginning
   315  				// of the file before reading.
   316  				util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args))
   317  
   318  				// This prevents the specFile finalizer from running and closed
   319  				// the specFD, which we have passed to ourselves when
   320  				// re-execing.
   321  				runtime.KeepAlive(specFile)
   322  				panic("unreachable")
   323  			}
   324  		}
   325  	}
   326  
   327  	specutils.LogSpecDebug(spec, conf.OCISeccomp)
   328  
   329  	if b.applyCaps {
   330  		caps := spec.Process.Capabilities
   331  		if caps == nil {
   332  			caps = &specs.LinuxCapabilities{}
   333  		}
   334  
   335  		gPlatform, err := platform.Lookup(conf.Platform)
   336  		if err != nil {
   337  			util.Fatalf("loading platform: %v", err)
   338  		}
   339  		if gPlatform.Requirements().RequiresCapSysPtrace {
   340  			// Ptrace platform requires extra capabilities.
   341  			const c = "CAP_SYS_PTRACE"
   342  			caps.Bounding = append(caps.Bounding, c)
   343  			caps.Effective = append(caps.Effective, c)
   344  			caps.Permitted = append(caps.Permitted, c)
   345  		}
   346  
   347  		if conf.DirectFS {
   348  			caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps)
   349  		}
   350  		argOverride["apply-caps"] = "false"
   351  
   352  		// Remove the args that have already been done before calling self.
   353  		args := prepareArgs(b.Name(), f, argOverride)
   354  
   355  		// Note that we've already read the spec from the spec FD, and
   356  		// we will read it again after the exec call. This works
   357  		// because the ReadSpecFromFile function seeks to the beginning
   358  		// of the file before reading.
   359  		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps))
   360  
   361  		// This prevents the specFile finalizer from running and closed
   362  		// the specFD, which we have passed to ourselves when
   363  		// re-execing.
   364  		runtime.KeepAlive(specFile)
   365  		panic("unreachable")
   366  	}
   367  
   368  	if b.syncUsernsFD >= 0 {
   369  		// syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID.
   370  		// We expect that setCapsAndCallSelf has to be called in this case.
   371  		panic("unreachable")
   372  	}
   373  
   374  	// Close specFile to avoid exposing it to the sandbox.
   375  	if err := specFile.Close(); err != nil {
   376  		util.Fatalf("closing specFile: %v", err)
   377  	}
   378  
   379  	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
   380  	// limit >= 0 works. If the limit is lower than the current number of open
   381  	// files, then Setrlimit will succeed, and the next open will fail.
   382  	if conf.FDLimit > -1 {
   383  		rlimit := unix.Rlimit{
   384  			Cur: uint64(conf.FDLimit),
   385  			Max: uint64(conf.FDLimit),
   386  		}
   387  		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
   388  		case nil:
   389  		case unix.EPERM:
   390  			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
   391  		default:
   392  			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
   393  		}
   394  	}
   395  
   396  	// When mountsFD is not provided, there is no cleaning required.
   397  	if b.mountsFD >= 0 {
   398  		// Read resolved mount list and replace the original one from the spec.
   399  		mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file")
   400  		cleanMounts, err := specutils.ReadMounts(mountsFile)
   401  		if err != nil {
   402  			mountsFile.Close()
   403  			util.Fatalf("Error reading mounts file: %v", err)
   404  		}
   405  		mountsFile.Close()
   406  		spec.Mounts = cleanMounts
   407  	}
   408  
   409  	if conf.DirectFS {
   410  		// sandbox should run with a umask of 0, because we want to preserve file
   411  		// modes exactly as sent by the sentry, which would have already applied
   412  		// the application umask.
   413  		unix.Umask(0)
   414  	}
   415  
   416  	if conf.EnableCoreTags {
   417  		if err := coretag.Enable(); err != nil {
   418  			util.Fatalf("Failed to core tag sentry: %v", err)
   419  		}
   420  
   421  		// Verify that all sentry threads are properly core tagged, and log
   422  		// current core tag.
   423  		coreTags, err := coretag.GetAllCoreTags(os.Getpid())
   424  		if err != nil {
   425  			util.Fatalf("Failed read current core tags: %v", err)
   426  		}
   427  		if len(coreTags) != 1 {
   428  			util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags)
   429  		}
   430  		log.Infof("Core tag enabled (core tag=%d)", coreTags[0])
   431  	}
   432  
   433  	// Create the loader.
   434  	bootArgs := boot.Args{
   435  		ID:                  f.Arg(0),
   436  		Spec:                spec,
   437  		Conf:                conf,
   438  		ControllerFD:        b.controllerFD,
   439  		Device:              fd.New(b.deviceFD),
   440  		GoferFDs:            b.ioFDs.GetArray(),
   441  		DevGoferFD:          b.devIoFD,
   442  		StdioFDs:            b.stdioFDs.GetArray(),
   443  		PassFDs:             b.passFDs.GetArray(),
   444  		ExecFD:              b.execFD,
   445  		GoferFilestoreFDs:   b.goferFilestoreFDs.GetArray(),
   446  		GoferMountConfs:     b.goferMountConfs.GetArray(),
   447  		NumCPU:              b.cpuNum,
   448  		TotalMem:            b.totalMem,
   449  		TotalHostMem:        b.totalHostMem,
   450  		UserLogFD:           b.userLogFD,
   451  		ProductName:         b.productName,
   452  		PodInitConfigFD:     b.podInitConfigFD,
   453  		SinkFDs:             b.sinkFDs.GetArray(),
   454  		ProfileOpts:         b.profileFDs.ToOpts(),
   455  		NvidiaDriverVersion: b.nvidiaDriverVersion,
   456  	}
   457  	l, err := boot.New(bootArgs)
   458  	if err != nil {
   459  		util.Fatalf("creating loader: %v", err)
   460  	}
   461  
   462  	// Fatalf exits the process and doesn't run defers.
   463  	// 'l' must be destroyed explicitly after this point!
   464  
   465  	if b.procMountSyncFD != -1 {
   466  		l.PreSeccompCallback = func() {
   467  			// Call validateOpenFDs() before umounting /proc.
   468  			validateOpenFDs(bootArgs.PassFDs)
   469  			// Umount /proc right before installing seccomp filters.
   470  			umountProc(b.procMountSyncFD)
   471  		}
   472  	}
   473  
   474  	if conf.TestOnlyAutosaveImagePath != "" {
   475  		fName := filepath.Join(conf.TestOnlyAutosaveImagePath, boot.CheckpointStateFileName)
   476  		f, err := os.OpenFile(fName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
   477  		if err != nil {
   478  			util.Fatalf("error in creating state file %v", err)
   479  		}
   480  		defer f.Close()
   481  
   482  		boot.EnableAutosave(l, f, conf.TestOnlyAutosaveResume)
   483  	}
   484  
   485  	// Prepare metrics.
   486  	// This needs to happen after the kernel is initialized (such that all metrics are registered)
   487  	// but before the start-sync file is notified, as the parent process needs to query for
   488  	// registered metrics prior to sending the start signal.
   489  	metric.Initialize()
   490  	if b.profilingMetricsFD != -1 {
   491  		if err := metric.StartProfilingMetrics(metric.ProfilingMetricsOptions[*os.File]{
   492  			Sink:    os.NewFile(uintptr(b.profilingMetricsFD), "metrics file"),
   493  			Lossy:   b.profilingMetricsLossy,
   494  			Metrics: conf.ProfilingMetrics,
   495  			Rate:    time.Duration(conf.ProfilingMetricsRate) * time.Microsecond,
   496  		}); err != nil {
   497  			l.Destroy()
   498  			util.Fatalf("unable to start profiling metrics: %v", err)
   499  		}
   500  		defer metric.StopProfilingMetrics()
   501  	}
   502  
   503  	// Notify the parent process the sandbox has booted (and that the controller
   504  	// is up).
   505  	startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file")
   506  	buf := make([]byte, 1)
   507  	if w, err := startSyncFile.Write(buf); err != nil || w != 1 {
   508  		l.Destroy()
   509  		util.Fatalf("unable to write into the start-sync descriptor: %v", err)
   510  	}
   511  	// Closes startSyncFile because 'l.Run()' only returns when the sandbox exits.
   512  	startSyncFile.Close()
   513  
   514  	// Wait for the start signal from runsc.
   515  	l.WaitForStartSignal()
   516  
   517  	// Run the application and wait for it to finish.
   518  	if err := l.Run(); err != nil {
   519  		l.Destroy()
   520  		util.Fatalf("running sandbox: %v", err)
   521  	}
   522  
   523  	ws := l.WaitExit()
   524  	log.Infof("application exiting with %+v", ws)
   525  	waitStatus := args[1].(*unix.WaitStatus)
   526  	*waitStatus = unix.WaitStatus(ws)
   527  	l.Destroy()
   528  	return subcommands.ExitSuccess
   529  }
   530  
   531  // prepareArgs returns the args that can be used to re-execute the current
   532  // program. It manipulates the flags of the subcommands.Command identified by
   533  // subCmdName and fSet is the flag.FlagSet of this subcommand. It applies the
   534  // flags specified by override map. In case of conflict, flag is overriden.
   535  //
   536  // Postcondition: prepareArgs() takes ownership of override map.
   537  func prepareArgs(subCmdName string, fSet *flag.FlagSet, override map[string]string) []string {
   538  	var args []string
   539  	// Add all args up until (and including) the sub command.
   540  	for _, arg := range os.Args {
   541  		args = append(args, arg)
   542  		if arg == subCmdName {
   543  			break
   544  		}
   545  	}
   546  	// Set sub command flags. Iterate through all the explicitly set flags.
   547  	fSet.Visit(func(gf *flag.Flag) {
   548  		// If a conflict is found with override, then prefer override flag.
   549  		if ov, ok := override[gf.Name]; ok {
   550  			args = append(args, fmt.Sprintf("--%s=%s", gf.Name, ov))
   551  			delete(override, gf.Name)
   552  			return
   553  		}
   554  		// Otherwise pass through the original flag.
   555  		args = append(args, fmt.Sprintf("--%s=%s", gf.Name, gf.Value))
   556  	})
   557  	// Apply remaining override flags (that didn't conflict above).
   558  	for of, ov := range override {
   559  		args = append(args, fmt.Sprintf("--%s=%s", of, ov))
   560  	}
   561  	// Add the non-flag arguments at the end.
   562  	args = append(args, fSet.Args()...)
   563  	return args
   564  }
   565  
   566  // execProcUmounter execute a child process that umounts /proc when the
   567  // returned pipe is closed.
   568  func execProcUmounter() (*exec.Cmd, *os.File) {
   569  	r, w, err := os.Pipe()
   570  	if err != nil {
   571  		util.Fatalf("error creating a pipe: %v", err)
   572  	}
   573  	defer r.Close()
   574  
   575  	cmd := exec.Command(specutils.ExePath)
   576  	cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc")
   577  	cmd.ExtraFiles = append(cmd.ExtraFiles, r)
   578  	cmd.Stdin = os.Stdin
   579  	cmd.Stdout = os.Stdout
   580  	cmd.Stderr = os.Stderr
   581  	if err := cmd.Start(); err != nil {
   582  		util.Fatalf("error executing umounter: %v", err)
   583  	}
   584  	return cmd, w
   585  }
   586  
   587  // umountProc writes to syncFD signalling the process started by
   588  // execProcUmounter() to umount /proc.
   589  func umountProc(syncFD int) {
   590  	syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD")
   591  	buf := make([]byte, 1)
   592  	if w, err := syncFile.Write(buf); err != nil || w != 1 {
   593  		util.Fatalf("unable to write into the proc umounter descriptor: %v", err)
   594  	}
   595  	syncFile.Close()
   596  
   597  	var waitStatus unix.WaitStatus
   598  	if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil {
   599  		util.Fatalf("error waiting for the proc umounter process: %v", err)
   600  	}
   601  	if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 {
   602  		util.Fatalf("the proc umounter process failed: %v", waitStatus)
   603  	}
   604  	if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT {
   605  		util.Fatalf("/proc is still accessible")
   606  	}
   607  }
   608  
   609  // validateOpenFDs checks that the sandbox process does not have any open
   610  // directory FDs.
   611  func validateOpenFDs(passFDs []boot.FDMapping) {
   612  	passHostFDs := make(map[int]struct{})
   613  	for _, passFD := range passFDs {
   614  		passHostFDs[passFD.Host] = struct{}{}
   615  	}
   616  	const selfFDDir = "/proc/self/fd"
   617  	if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error {
   618  		if err != nil {
   619  			return err
   620  		}
   621  		if d.Type() != os.ModeSymlink {
   622  			// All entries are symlinks. Ignore the callback for fd directory itself.
   623  			return nil
   624  		}
   625  		if fdInfo, err := os.Stat(path); err != nil {
   626  			if os.IsNotExist(err) {
   627  				// Ignore FDs that are now closed. For example, the FD to selfFDDir that
   628  				// was opened by filepath.WalkDir() to read dirents.
   629  				return nil
   630  			}
   631  			return fmt.Errorf("os.Stat(%s) failed: %v", path, err)
   632  		} else if !fdInfo.IsDir() {
   633  			return nil
   634  		}
   635  		// Uh-oh. This is a directory FD.
   636  		fdNo, err := strconv.Atoi(d.Name())
   637  		if err != nil {
   638  			return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err)
   639  		}
   640  		dirLink, err := os.Readlink(path)
   641  		if err != nil {
   642  			return fmt.Errorf("os.Readlink(%s) failed: %v", path, err)
   643  		}
   644  		if _, ok := passHostFDs[fdNo]; ok {
   645  			// Passed FDs are allowed to be directories. The user must be knowing
   646  			// what they are doing. Log a warning regardless.
   647  			log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink)
   648  			return nil
   649  		}
   650  		return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink)
   651  	}); err != nil {
   652  		util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err)
   653  	}
   654  }