gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/config/config.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package config provides basic infrastructure to set configuration settings
    16  // for runsc. The configuration is set by flags to the command line. They can
    17  // also propagate to a different process using the same flags.
    18  package config
    19  
    20  import (
    21  	"fmt"
    22  	"path/filepath"
    23  	"reflect"
    24  	"runtime"
    25  	"strconv"
    26  	"strings"
    27  
    28  	"gvisor.dev/gvisor/pkg/log"
    29  	"gvisor.dev/gvisor/pkg/refs"
    30  	"gvisor.dev/gvisor/pkg/sentry/watchdog"
    31  	"gvisor.dev/gvisor/runsc/flag"
    32  	"gvisor.dev/gvisor/runsc/version"
    33  )
    34  
// Config holds configuration that is not part of the runtime spec.
//
// Each setting is a struct field whose `flag` tag names the command-line flag
// it is bound to.
//
// Follow these steps to add a new flag:
//  1. Create a new field in Config.
//  2. Add a field tag with the flag name.
//  3. Register a new flag in flags.go, with same name and add a description.
//  4. Add any necessary validation into validate().
//  5. If adding an enum, follow the same pattern as FileAccessType.
//  6. Evaluate if the flag can be changed with OCI annotations. See
//     overrideAllowlist for more details.
type Config struct {
	// RootDir is the runtime root directory.
	RootDir string `flag:"root"`

	// Traceback changes the Go runtime's traceback level.
	Traceback string `flag:"traceback"`

	// Debug indicates that debug logging should be enabled.
	Debug bool `flag:"debug"`

	// LogFilename is the filename to log to, if not empty.
	LogFilename string `flag:"log"`

	// LogFormat is the log format.
	LogFormat string `flag:"log-format"`

	// DebugLog is the path to log debug information to, if not empty.
	// If specified together with `DebugToUserLog`, debug logs are emitted
	// to both.
	DebugLog string `flag:"debug-log"`

	// DebugToUserLog indicates that Sentry debug logs should be emitted
	// to user-visible logs.
	// If specified together with `DebugLog`, debug logs are emitted
	// to both.
	DebugToUserLog bool `flag:"debug-to-user-log"`

	// DebugCommand is a comma-separated list of commands to be debugged if
	// --debug-log is also set. Empty means debug all. "!" negates the expression.
	// E.g. "create,start" or "!boot,events".
	DebugCommand string `flag:"debug-command"`

	// PanicLog is the path to log Go's runtime messages, if not empty.
	PanicLog string `flag:"panic-log"`

	// CoverageReport is the path to write Go coverage information, if not empty.
	CoverageReport string `flag:"coverage-report"`

	// DebugLogFormat is the log format for debug.
	DebugLogFormat string `flag:"debug-log-format"`

	// FileAccess indicates how the root filesystem is accessed.
	FileAccess FileAccessType `flag:"file-access"`

	// FileAccessMounts indicates how non-root volumes are accessed.
	FileAccessMounts FileAccessType `flag:"file-access-mounts"`

	// Overlay is whether to wrap all mounts in an overlay. The upper tmpfs layer
	// will be backed by application memory.
	//
	// Deprecated: replaced by Overlay2. validate() rejects configurations that
	// set both; GetOverlay2() still honors this flag when it is set alone.
	Overlay bool `flag:"overlay"`

	// Overlay2 holds configuration about wrapping mounts in overlayfs.
	// DO NOT call it directly, use GetOverlay2() instead.
	Overlay2 Overlay2 `flag:"overlay2"`

	// FSGoferHostUDS is deprecated: use host-uds=all.
	// validate() rejects configurations that set it together with HostUDS.
	// NOTE(review): GetHostUDS() maps this flag to HostUDSOpen rather than
	// HostUDSAll — confirm which replacement value the comment should name.
	FSGoferHostUDS bool `flag:"fsgofer-host-uds"`

	// HostUDS controls permission to access host Unix-domain sockets.
	// DO NOT call it directly, use GetHostUDS() instead.
	HostUDS HostUDS `flag:"host-uds"`

	// HostFifo controls permission to access host FIFO (or named pipes).
	HostFifo HostFifo `flag:"host-fifo"`

	// Network indicates what type of network to use.
	Network NetworkType `flag:"network"`

	// EnableRaw indicates whether raw sockets should be enabled. Raw
	// sockets are disabled by stripping CAP_NET_RAW from the list of
	// capabilities.
	EnableRaw bool `flag:"net-raw"`

	// AllowPacketEndpointWrite enables write operations on packet endpoints.
	AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"`

	// HostGSO indicates that host segmentation offload is enabled.
	HostGSO bool `flag:"gso"`

	// GVisorGSO indicates that gVisor segmentation offload is enabled. The flag
	// retains its old name of "software" GSO for API consistency.
	GVisorGSO bool `flag:"software-gso"`

	// GVisorGRO enables gVisor's generic receive offload.
	GVisorGRO bool `flag:"gvisor-gro"`

	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
	TXChecksumOffload bool `flag:"tx-checksum-offload"`

	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
	RXChecksumOffload bool `flag:"rx-checksum-offload"`

	// QDisc indicates the type of queueing discipline to use by default
	// for non-loopback interfaces.
	QDisc QueueingDiscipline `flag:"qdisc"`

	// LogPackets indicates that all network packets should be logged.
	LogPackets bool `flag:"log-packets"`

	// PCAP is a file to which network packets should be logged in PCAP format.
	PCAP string `flag:"pcap-log"`

	// Platform is the platform to run on.
	Platform string `flag:"platform"`

	// PlatformDevicePath is the path to the device file used by the platform.
	// e.g. "/dev/kvm" for the KVM platform.
	// If unset, a sane platform-specific default will be used.
	PlatformDevicePath string `flag:"platform_device_path"`

	// MetricServer, if set, indicates that metrics should be exported on this address.
	// This may either be 1) "addr:port" to export metrics on a specific network interface address,
	// 2) ":port" for exporting metrics on all addresses, or 3) an absolute path to a Unix Domain
	// Socket.
	// The substring "%ID%" will be replaced by the container ID, and "%RUNTIME_ROOT%" by the root.
	// This flag must be specified *both* as part of the `runsc metric-server` arguments (so that the
	// metric server knows which address to bind to), and as part of the `runsc create` arguments (as
	// an indication that the container being created wishes that its metrics should be exported).
	// The value of this flag must also match across the two command lines.
	MetricServer string `flag:"metric-server"`

	// ProfilingMetrics is a comma separated list of metric names which are
	// going to be written to the ProfilingMetricsLog file from within the
	// sentry in CSV format. ProfilingMetrics will be snapshotted at a rate
	// specified by ProfilingMetricsRate. Requires ProfilingMetricsLog to be
	// set.
	ProfilingMetrics string `flag:"profiling-metrics"`

	// ProfilingMetricsLog is the file name to use for ProfilingMetrics
	// output.
	ProfilingMetricsLog string `flag:"profiling-metrics-log"`

	// ProfilingMetricsRate is the target rate (in microseconds) at which
	// profiling metrics will be snapshotted.
	ProfilingMetricsRate int `flag:"profiling-metrics-rate-us"`

	// Strace indicates that strace should be enabled.
	Strace bool `flag:"strace"`

	// StraceSyscalls is the set of syscalls to trace (comma-separated values).
	// If Strace is true and this string is empty, then all syscalls will
	// be traced.
	StraceSyscalls string `flag:"strace-syscalls"`

	// StraceLogSize is the max size of data blobs to display.
	StraceLogSize uint `flag:"strace-log-size"`

	// StraceEvent indicates sending strace to events if true. Strace is
	// sent to log if false.
	StraceEvent bool `flag:"strace-event"`

	// DisableSeccomp indicates whether seccomp syscall filters should be
	// disabled. Pardon the double negation, but default to enabled is important.
	// Unlike the other settings in this struct, it carries no `flag` tag.
	DisableSeccomp bool

	// EnableCoreTags indicates whether the Sentry process and children will be
	// run in a core tagged process. This isolates the sentry from sharing
	// physical cores with other core tagged processes. This is useful as a
	// mitigation for hyperthreading side channel based attacks. Requires host
	// linux kernel >= 5.14.
	EnableCoreTags bool `flag:"enable-core-tags"`

	// WatchdogAction sets what action the watchdog takes when triggered.
	WatchdogAction watchdog.Action `flag:"watchdog-action"`

	// PanicSignal registers signal handling that panics. Usually set to
	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
	PanicSignal int `flag:"panic-signal"`

	// ProfileEnable is set to prepare the sandbox to be profiled.
	ProfileEnable bool `flag:"profile"`

	// ProfileBlock collects a block profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileBlock string `flag:"profile-block"`

	// ProfileCPU collects a CPU profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileCPU string `flag:"profile-cpu"`

	// ProfileHeap collects a heap profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileHeap string `flag:"profile-heap"`

	// ProfileMutex collects a mutex profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileMutex string `flag:"profile-mutex"`

	// TraceFile collects a Go runtime execution trace to the passed file
	// for the duration of the container execution.
	TraceFile string `flag:"trace"`

	// NumNetworkChannels controls the number of AF_PACKET sockets that map
	// to the same underlying network device. This allows netstack to better
	// scale for high throughput use cases. validate() requires it to be > 0.
	NumNetworkChannels int `flag:"num-network-channels"`

	// NetworkProcessorsPerChannel controls the number of goroutines used to
	// handle packets on a single network channel. A higher number can help handle
	// many simultaneous connections. If this is 0, runsc will divide GOMAXPROCS
	// evenly among each network channel.
	NetworkProcessorsPerChannel int `flag:"network-processors-per-channel"`

	// Rootless allows the sandbox to be started with a user that is not root.
	// Defense in depth measures are weaker in rootless mode. Specifically, the
	// sandbox and Gofer process run as root inside a user namespace with root
	// mapped to the caller's user. When using rootless, the container root path
	// should not have a symlink.
	Rootless bool `flag:"rootless"`

	// AlsoLogToStderr allows to send log messages to stderr.
	AlsoLogToStderr bool `flag:"alsologtostderr"`

	// ReferenceLeak sets the reference leak check mode.
	ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`

	// CPUNumFromQuota sets CPU number count to available CPU quota, using
	// least integer value greater than or equal to quota.
	//
	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
	CPUNumFromQuota bool `flag:"cpu-num-from-quota"`

	// AllowFlagOverride allows overriding of flags in OCI annotations.
	AllowFlagOverride bool `flag:"allow-flag-override"`

	// OCISeccomp enables seccomp inside the sandbox.
	OCISeccomp bool `flag:"oci-seccomp"`

	// IgnoreCgroups indicates that cgroups should not be configured.
	IgnoreCgroups bool `flag:"ignore-cgroups"`

	// SystemdCgroup indicates that systemd should be used to configure cgroups.
	SystemdCgroup bool `flag:"systemd-cgroup"`

	// PodInitConfig is the path to configuration file with additional steps to
	// take during pod creation.
	PodInitConfig string `flag:"pod-init-config"`

	// BufferPooling indicates that pools, instead of the heap, should be used
	// to manage buffer memory.
	BufferPooling bool `flag:"buffer-pooling"`

	// XDP controls whether and how to use XDP.
	XDP XDP `flag:"EXPERIMENTAL-xdp"`

	// AFXDPUseNeedWakeup determines whether XDP_USE_NEED_WAKEUP is set
	// when using AF_XDP sockets.
	AFXDPUseNeedWakeup bool `flag:"EXPERIMENTAL-xdp-need-wakeup"`

	// FDLimit specifies a limit on the number of host file descriptors that can
	// be open simultaneously by the sentry and gofer. It applies separately to
	// each.
	FDLimit int `flag:"fdlimit"`

	// DCache sets the global dirent cache size. If negative, per-mount caches are
	// used.
	DCache int `flag:"dcache"`

	// IOUring enables support for the IO_URING API calls to perform
	// asynchronous I/O operations.
	IOUring bool `flag:"iouring"`

	// DirectFS sets up the sandbox to directly access/mutate the filesystem from
	// the sentry. Sentry runs with escalated privileges. Gofer process still
	// exists, but is mostly idle. Not supported in rootless mode.
	DirectFS bool `flag:"directfs"`

	// NVProxy enables support for Nvidia GPUs.
	NVProxy bool `flag:"nvproxy"`

	// NVProxyDocker is deprecated. Please use nvidia-container-runtime or
	// `docker run --gpus` directly. For backward compatibility, this has the
	// effect of injecting nvidia-container-runtime-hook as a prestart hook.
	NVProxyDocker bool `flag:"nvproxy-docker"`

	// NVProxyDriverVersion is the version of the NVIDIA driver ABI to use.
	// If empty, it is autodetected from the installed NVIDIA driver.
	// It can also be set to the special value "latest" to force the use of
	// the latest supported NVIDIA driver ABI.
	NVProxyDriverVersion string `flag:"nvproxy-driver-version"`

	// TPUProxy enables support for TPUs.
	TPUProxy bool `flag:"tpuproxy"`

	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
	// tests. It allows runsc to start the sandbox process as the current
	// user, and without chrooting the sandbox process. This can be
	// necessary in test environments that have limited capabilities. When
	// disabling chroot, the container root path should not have a symlink.
	TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`

	// TestOnlyTestNameEnv should only be used in tests. It looks up for the
	// test name in the container environment variables and adds it to the debug
	// log file name. This is done to help identify the log with the test when
	// multiple tests are run in parallel, since there is no way to pass
	// parameters to the runtime from docker.
	TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`

	// TestOnlyAFSSyscallPanic should only be used in tests. It enables the
	// alternate behaviour for afs_syscall to trigger a Go-runtime panic upon being
	// called. This is useful for tests exercising gVisor panic-reporting.
	TestOnlyAFSSyscallPanic bool `flag:"TESTONLY-afs-syscall-panic"`

	// explicitlySet contains whether a flag was explicitly set on the command-line from which this
	// Config was constructed. Nil when the Config was not initialized from a FlagSet.
	explicitlySet map[string]struct{}

	// ReproduceNAT, when true, tells runsc to scrape the host network
	// namespace's NAT iptables and reproduce it inside the sandbox.
	ReproduceNAT bool `flag:"reproduce-nat"`

	// ReproduceNftables attempts to scrape nftables routing rules if
	// present, and reproduce them in the sandbox.
	ReproduceNftables bool `flag:"reproduce-nftables"`

	// NetDisconnectOk indicates whether the link endpoint capability
	// CapabilityDisconnectOk should be set. This allows open connections to be
	// disconnected upon save.
	NetDisconnectOk bool `flag:"net-disconnect-ok"`

	// TestOnlyAutosaveImagePath if not empty enables auto save for syscall tests
	// and stores the directory path to the saved state file.
	TestOnlyAutosaveImagePath string `flag:"TESTONLY-autosave-image-path"`

	// TestOnlyAutosaveResume indicates save resume for syscall tests.
	TestOnlyAutosaveResume bool `flag:"TESTONLY-autosave-resume"`
}
   371  
   372  func (c *Config) validate() error {
   373  	if c.Overlay && c.Overlay2.Enabled() {
   374  		// Deprecated flag was used together with flag that replaced it.
   375  		return fmt.Errorf("overlay flag has been replaced with overlay2 flag")
   376  	}
   377  	if overlay2 := c.GetOverlay2(); c.FileAccess == FileAccessShared && overlay2.Enabled() {
   378  		return fmt.Errorf("overlay flag is incompatible with shared file access for rootfs")
   379  	}
   380  	if c.NumNetworkChannels <= 0 {
   381  		return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
   382  	}
   383  	// Require profile flags to explicitly opt-in to profiling with
   384  	// -profile rather than implying it since these options have security
   385  	// implications.
   386  	if c.ProfileBlock != "" && !c.ProfileEnable {
   387  		return fmt.Errorf("profile-block flag requires enabling profiling with profile flag")
   388  	}
   389  	if c.ProfileCPU != "" && !c.ProfileEnable {
   390  		return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag")
   391  	}
   392  	if c.ProfileHeap != "" && !c.ProfileEnable {
   393  		return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag")
   394  	}
   395  	if c.ProfileMutex != "" && !c.ProfileEnable {
   396  		return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag")
   397  	}
   398  	if c.FSGoferHostUDS && c.HostUDS != HostUDSNone {
   399  		// Deprecated flag was used together with flag that replaced it.
   400  		return fmt.Errorf("fsgofer-host-uds has been replaced with host-uds flag")
   401  	}
   402  	if len(c.ProfilingMetrics) > 0 && len(c.ProfilingMetricsLog) == 0 {
   403  		return fmt.Errorf("profiling-metrics flag requires defining a profiling-metrics-log for output")
   404  	}
   405  	return nil
   406  }
   407  
   408  // Log logs important aspects of the configuration to the given log function.
   409  func (c *Config) Log() {
   410  	log.Infof("Platform: %v", c.Platform)
   411  	log.Infof("RootDir: %s", c.RootDir)
   412  	log.Infof("FileAccess: %v / Directfs: %t / Overlay: %v", c.FileAccess, c.DirectFS, c.GetOverlay2())
   413  	log.Infof("Network: %v", c.Network)
   414  	if c.Debug || c.Strace {
   415  		log.Infof("Debug: %t. Strace: %t, max size: %d, syscalls: %s", c.Debug, c.Strace, c.StraceLogSize, c.StraceSyscalls)
   416  	}
   417  	if c.Debug {
   418  		obj := reflect.ValueOf(c).Elem()
   419  		st := obj.Type()
   420  		for i := 0; i < st.NumField(); i++ {
   421  			f := st.Field(i)
   422  			var val any
   423  			if strVal := obj.Field(i).String(); strVal == "" {
   424  				val = "(empty)"
   425  			} else if !f.IsExported() {
   426  				// Cannot convert to `interface{}` for non-exported fields,
   427  				// so just use `strVal`.
   428  				val = fmt.Sprintf("%s (unexported)", strVal)
   429  			} else {
   430  				val = obj.Field(i).Interface()
   431  			}
   432  			if flagName, hasFlag := f.Tag.Lookup("flag"); hasFlag {
   433  				log.Debugf("Config.%s (--%s): %v", f.Name, flagName, val)
   434  			} else {
   435  				log.Debugf("Config.%s: %v", f.Name, val)
   436  			}
   437  		}
   438  	}
   439  }
   440  
   441  // GetHostUDS returns the FS gofer communication that is allowed, taking into
   442  // consideration all flags what affect the result.
   443  func (c *Config) GetHostUDS() HostUDS {
   444  	if c.FSGoferHostUDS {
   445  		if c.HostUDS != HostUDSNone {
   446  			panic(fmt.Sprintf("HostUDS cannot be set when --fsgofer-host-uds=true"))
   447  		}
   448  		// Using deprecated flag, honor it to avoid breaking users.
   449  		return HostUDSOpen
   450  	}
   451  	return c.HostUDS
   452  }
   453  
   454  // GetOverlay2 returns the overlay configuration, taking into consideration all
   455  // flags that affect the result.
   456  func (c *Config) GetOverlay2() Overlay2 {
   457  	if c.Overlay {
   458  		if c.Overlay2.Enabled() {
   459  			panic(fmt.Sprintf("Overlay2 cannot be set when --overlay=true"))
   460  		}
   461  		// Using a deprecated flag, honor it to avoid breaking users.
   462  		return Overlay2{rootMount: true, subMounts: true, medium: "memory"}
   463  	}
   464  	return c.Overlay2
   465  }
   466  
// Bundle is a set of flag name-value pairs. Keys are flag names and values are
// the string form of the value to assign to that flag; see Validate.
type Bundle map[string]string

// BundleName is a human-friendly name for a Bundle.
// It is used as part of an annotation to specify that the user wants to apply a Bundle.
type BundleName string
   473  
   474  // Validate validates that given flag string values map to actual flags in runsc.
   475  func (b Bundle) Validate() error {
   476  	flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError)
   477  	RegisterFlags(flagSet)
   478  	for key, val := range b {
   479  		flag := flagSet.Lookup(key)
   480  		if flag == nil {
   481  			return fmt.Errorf("unknown flag %q", key)
   482  		}
   483  		if err := flagSet.Set(key, val); err != nil {
   484  			return err
   485  		}
   486  	}
   487  	return nil
   488  }
   489  
// MetricMetadataKeys is the set of keys of metric metadata labels
// as returned by `Config.MetricMetadata`.
// The two must be kept in sync: every key listed here is expected to appear in
// the map built by `Config.MetricMetadata`, and vice versa.
var MetricMetadataKeys = []string{
	"version",
	"platform",
	"network",
	"numcores",
	"coretags",
	"overlay",
	"fsmode",
	"cpuarch",
	"go",
	"experiment",
}
   504  
   505  // MetricMetadata returns key-value pairs that are useful to include in metrics
   506  // exported about the sandbox this config represents.
   507  // It must return the same set of labels as listed in `MetricMetadataKeys`.
   508  func (c *Config) MetricMetadata() map[string]string {
   509  	var fsMode = "goferfs"
   510  	if c.DirectFS {
   511  		fsMode = "directfs"
   512  	}
   513  	return map[string]string{
   514  		"version":  version.Version(),
   515  		"platform": c.Platform,
   516  		"network":  c.Network.String(),
   517  		"numcores": strconv.Itoa(runtime.NumCPU()),
   518  		"coretags": strconv.FormatBool(c.EnableCoreTags),
   519  		"overlay":  c.Overlay2.String(),
   520  		"fsmode":   fsMode,
   521  		"cpuarch":  runtime.GOARCH,
   522  		"go":       runtime.Version(),
   523  		// The "experiment" label is currently unused, but may be used to contain
   524  		// extra information about e.g. an experiment that may be enabled.
   525  		"experiment": "",
   526  	}
   527  }
   528  
   529  // FileAccessType tells how the filesystem is accessed.
   530  type FileAccessType int
   531  
   532  const (
   533  	// FileAccessExclusive gives the sandbox exclusive access over files and
   534  	// directories in the filesystem. No external modifications are permitted and
   535  	// can lead to undefined behavior.
   536  	//
   537  	// Exclusive filesystem access enables more aggressive caching and offers
   538  	// significantly better performance. This is the default mode for the root
   539  	// volume.
   540  	FileAccessExclusive FileAccessType = iota
   541  
   542  	// FileAccessShared is used for volumes that can have external changes. It
   543  	// requires revalidation on every filesystem access to detect external
   544  	// changes, and reduces the amount of caching that can be done. This is the
   545  	// default mode for non-root volumes.
   546  	FileAccessShared
   547  )
   548  
   549  func fileAccessTypePtr(v FileAccessType) *FileAccessType {
   550  	return &v
   551  }
   552  
   553  // Set implements flag.Value. Set(String()) should be idempotent.
   554  func (f *FileAccessType) Set(v string) error {
   555  	switch v {
   556  	case "shared":
   557  		*f = FileAccessShared
   558  	case "exclusive":
   559  		*f = FileAccessExclusive
   560  	default:
   561  		return fmt.Errorf("invalid file access type %q", v)
   562  	}
   563  	return nil
   564  }
   565  
   566  // Get implements flag.Value.
   567  func (f *FileAccessType) Get() any {
   568  	return *f
   569  }
   570  
   571  // String implements flag.Value.
   572  func (f FileAccessType) String() string {
   573  	switch f {
   574  	case FileAccessShared:
   575  		return "shared"
   576  	case FileAccessExclusive:
   577  		return "exclusive"
   578  	}
   579  	panic(fmt.Sprintf("Invalid file access type %d", f))
   580  }
   581  
   582  // NetworkType tells which network stack to use.
   583  type NetworkType int
   584  
   585  const (
   586  	// NetworkSandbox uses internal network stack, isolated from the host.
   587  	NetworkSandbox NetworkType = iota
   588  
   589  	// NetworkHost redirects network related syscalls to the host network.
   590  	NetworkHost
   591  
   592  	// NetworkNone sets up just loopback using netstack.
   593  	NetworkNone
   594  )
   595  
   596  func networkTypePtr(v NetworkType) *NetworkType {
   597  	return &v
   598  }
   599  
   600  // Set implements flag.Value. Set(String()) should be idempotent.
   601  func (n *NetworkType) Set(v string) error {
   602  	switch v {
   603  	case "sandbox":
   604  		*n = NetworkSandbox
   605  	case "host":
   606  		*n = NetworkHost
   607  	case "none":
   608  		*n = NetworkNone
   609  	default:
   610  		return fmt.Errorf("invalid network type %q", v)
   611  	}
   612  	return nil
   613  }
   614  
   615  // Get implements flag.Value.
   616  func (n *NetworkType) Get() any {
   617  	return *n
   618  }
   619  
   620  // String implements flag.Value.
   621  func (n NetworkType) String() string {
   622  	switch n {
   623  	case NetworkSandbox:
   624  		return "sandbox"
   625  	case NetworkHost:
   626  		return "host"
   627  	case NetworkNone:
   628  		return "none"
   629  	}
   630  	panic(fmt.Sprintf("Invalid network type %d", n))
   631  }
   632  
   633  // QueueingDiscipline is used to specify the kind of Queueing Discipline to
   634  // apply for a give FDBasedLink.
   635  type QueueingDiscipline int
   636  
   637  const (
   638  	// QDiscNone disables any queueing for the underlying FD.
   639  	QDiscNone QueueingDiscipline = iota
   640  
   641  	// QDiscFIFO applies a simple fifo based queue to the underlying FD.
   642  	QDiscFIFO
   643  )
   644  
   645  func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline {
   646  	return &v
   647  }
   648  
   649  // Set implements flag.Value. Set(String()) should be idempotent.
   650  func (q *QueueingDiscipline) Set(v string) error {
   651  	switch v {
   652  	case "none":
   653  		*q = QDiscNone
   654  	case "fifo":
   655  		*q = QDiscFIFO
   656  	default:
   657  		return fmt.Errorf("invalid qdisc %q", v)
   658  	}
   659  	return nil
   660  }
   661  
   662  // Get implements flag.Value.
   663  func (q *QueueingDiscipline) Get() any {
   664  	return *q
   665  }
   666  
   667  // String implements flag.Value.
   668  func (q QueueingDiscipline) String() string {
   669  	switch q {
   670  	case QDiscNone:
   671  		return "none"
   672  	case QDiscFIFO:
   673  		return "fifo"
   674  	}
   675  	panic(fmt.Sprintf("Invalid qdisc %d", q))
   676  }
   677  
// leakModePtr returns a pointer to a fresh copy of v. Because v is passed by
// value, the result never aliases the caller's variable.
// NOTE(review): presumably used to supply a default when registering the
// ref-leak-mode flag — confirm against flags.go.
func leakModePtr(v refs.LeakMode) *refs.LeakMode {
	return &v
}
   681  
// watchdogActionPtr returns a pointer to a fresh copy of v. Because v is
// passed by value, the result never aliases the caller's variable.
// NOTE(review): presumably used to supply a default when registering the
// watchdog-action flag — confirm against flags.go.
func watchdogActionPtr(v watchdog.Action) *watchdog.Action {
	return &v
}
   685  
   686  // HostUDS tells how much of the host UDS the file system has access to.
   687  type HostUDS int
   688  
   689  const (
   690  	// HostUDSNone doesn't allows UDS from the host to be manipulated.
   691  	HostUDSNone HostUDS = 0x0
   692  
   693  	// HostUDSOpen allows UDS from the host to be opened, e.g. connect(2).
   694  	HostUDSOpen HostUDS = 0x1
   695  
   696  	// HostUDSCreate allows UDS from the host to be created, e.g. bind(2).
   697  	HostUDSCreate HostUDS = 0x2
   698  
   699  	// HostUDSAll allows all form of communication with the host through UDS.
   700  	HostUDSAll = HostUDSOpen | HostUDSCreate
   701  )
   702  
   703  func hostUDSPtr(v HostUDS) *HostUDS {
   704  	return &v
   705  }
   706  
   707  // Set implements flag.Value. Set(String()) should be idempotent.
   708  func (g *HostUDS) Set(v string) error {
   709  	switch v {
   710  	case "", "none":
   711  		*g = HostUDSNone
   712  	case "open":
   713  		*g = HostUDSOpen
   714  	case "create":
   715  		*g = HostUDSCreate
   716  	case "all":
   717  		*g = HostUDSAll
   718  	default:
   719  		return fmt.Errorf("invalid host UDS type %q", v)
   720  	}
   721  	return nil
   722  }
   723  
   724  // Get implements flag.Value.
   725  func (g *HostUDS) Get() any {
   726  	return *g
   727  }
   728  
   729  // String implements flag.Value.
   730  func (g HostUDS) String() string {
   731  	switch g {
   732  	case HostUDSNone:
   733  		return "none"
   734  	case HostUDSOpen:
   735  		return "open"
   736  	case HostUDSCreate:
   737  		return "create"
   738  	case HostUDSAll:
   739  		return "all"
   740  	default:
   741  		panic(fmt.Sprintf("Invalid host UDS type %d", g))
   742  	}
   743  }
   744  
   745  // AllowOpen returns true if it can consume UDS from the host.
   746  func (g HostUDS) AllowOpen() bool {
   747  	return g&HostUDSOpen != 0
   748  }
   749  
   750  // AllowCreate returns true if it can create UDS in the host.
   751  func (g HostUDS) AllowCreate() bool {
   752  	return g&HostUDSCreate != 0
   753  }
   754  
   755  // HostFifo tells how much of the host FIFO (or named pipes) the file system has
   756  // access to.
   757  type HostFifo int
   758  
   759  const (
   760  	// HostFifoNone doesn't allow FIFO from the host to be manipulated.
   761  	HostFifoNone HostFifo = 0x0
   762  
   763  	// HostFifoOpen allows FIFOs from the host to be opened.
   764  	HostFifoOpen HostFifo = 0x1
   765  )
   766  
   767  func hostFifoPtr(v HostFifo) *HostFifo {
   768  	return &v
   769  }
   770  
   771  // Set implements flag.Value. Set(String()) should be idempotent.
   772  func (g *HostFifo) Set(v string) error {
   773  	switch v {
   774  	case "", "none":
   775  		*g = HostFifoNone
   776  	case "open":
   777  		*g = HostFifoOpen
   778  	default:
   779  		return fmt.Errorf("invalid host fifo type %q", v)
   780  	}
   781  	return nil
   782  }
   783  
   784  // Get implements flag.Value.
   785  func (g *HostFifo) Get() any {
   786  	return *g
   787  }
   788  
   789  // String implements flag.Value.
   790  func (g HostFifo) String() string {
   791  	switch g {
   792  	case HostFifoNone:
   793  		return "none"
   794  	case HostFifoOpen:
   795  		return "open"
   796  	default:
   797  		panic(fmt.Sprintf("Invalid host fifo type %d", g))
   798  	}
   799  }
   800  
   801  // AllowOpen returns true if it can consume FIFOs from the host.
   802  func (g HostFifo) AllowOpen() bool {
   803  	return g&HostFifoOpen != 0
   804  }
   805  
   806  // OverlayMedium describes how overlay medium is configured.
   807  type OverlayMedium string
   808  
   809  const (
   810  	// NoOverlay indicates that no overlay will be applied.
   811  	NoOverlay = OverlayMedium("")
   812  
   813  	// MemoryOverlay indicates that the overlay is backed by app memory.
   814  	MemoryOverlay = OverlayMedium("memory")
   815  
   816  	// SelfOverlay indicates that the overlaid mount is backed by itself.
   817  	SelfOverlay = OverlayMedium("self")
   818  
   819  	// AnonOverlayPrefix is the prefix that users should specify in the
   820  	// config for the anonymous overlay.
   821  	AnonOverlayPrefix = "dir="
   822  )
   823  
   824  // String returns a human-readable string representing the overlay medium config.
   825  func (m OverlayMedium) String() string {
   826  	return string(m)
   827  }
   828  
   829  // Set sets the value. Set(String()) should be idempotent.
   830  func (m *OverlayMedium) Set(v string) error {
   831  	switch OverlayMedium(v) {
   832  	case NoOverlay, MemoryOverlay, SelfOverlay: // OK
   833  	default:
   834  		if !strings.HasPrefix(v, AnonOverlayPrefix) {
   835  			return fmt.Errorf("unexpected medium: %q", v)
   836  		}
   837  		if hostFileDir := strings.TrimPrefix(v, AnonOverlayPrefix); !filepath.IsAbs(hostFileDir) {
   838  			return fmt.Errorf("overlay host file directory should be an absolute path, got %q", hostFileDir)
   839  		}
   840  	}
   841  	*m = OverlayMedium(v)
   842  	return nil
   843  }
   844  
   845  // IsBackedByAnon indicates whether the overlaid mount is backed by a host file
   846  // in an anonymous directory.
   847  func (m OverlayMedium) IsBackedByAnon() bool {
   848  	return strings.HasPrefix(string(m), AnonOverlayPrefix)
   849  }
   850  
   851  // HostFileDir indicates the directory in which the overlay-backing host file
   852  // should be created.
   853  //
   854  // Precondition: m.IsBackedByAnon().
   855  func (m OverlayMedium) HostFileDir() string {
   856  	if !m.IsBackedByAnon() {
   857  		panic(fmt.Sprintf("anonymous overlay medium = %q does not have %v prefix", m, AnonOverlayPrefix))
   858  	}
   859  	return strings.TrimPrefix(string(m), AnonOverlayPrefix)
   860  }
   861  
// Overlay2 holds the configuration for setting up overlay filesystems for the
// container. Its flag form, as parsed by Set and produced by String, is either
// "none" or "{mount}:{medium}".
type Overlay2 struct {
	// rootMount is true when the overlay applies to the root mount. Set turns
	// it on for both the "root" and "all" mount specifiers.
	rootMount bool
	// subMounts is true when the overlay also applies to submounts. Set turns
	// it on only for the "all" mount specifier.
	subMounts bool
	// medium is the backing medium shared by every overlaid mount.
	medium    OverlayMedium
}
   869  
   870  func defaultOverlay2() *Overlay2 {
   871  	// Rootfs overlay is enabled by default and backed by a file in rootfs itself.
   872  	return &Overlay2{rootMount: true, subMounts: false, medium: SelfOverlay}
   873  }
   874  
   875  // Set implements flag.Value. Set(String()) should be idempotent.
   876  func (o *Overlay2) Set(v string) error {
   877  	if v == "none" {
   878  		o.rootMount = false
   879  		o.subMounts = false
   880  		o.medium = NoOverlay
   881  		return nil
   882  	}
   883  	vs := strings.Split(v, ":")
   884  	if len(vs) != 2 {
   885  		return fmt.Errorf("expected format is --overlay2={mount}:{medium}, got %q", v)
   886  	}
   887  
   888  	switch mount := vs[0]; mount {
   889  	case "root":
   890  		o.rootMount = true
   891  	case "all":
   892  		o.rootMount = true
   893  		o.subMounts = true
   894  	default:
   895  		return fmt.Errorf("unexpected mount specifier for --overlay2: %q", mount)
   896  	}
   897  
   898  	return o.medium.Set(vs[1])
   899  }
   900  
   901  // Get implements flag.Value.
   902  func (o *Overlay2) Get() any {
   903  	return *o
   904  }
   905  
   906  // String implements flag.Value.
   907  func (o Overlay2) String() string {
   908  	if !o.rootMount && !o.subMounts {
   909  		return "none"
   910  	}
   911  	res := ""
   912  	switch {
   913  	case o.rootMount && o.subMounts:
   914  		res = "all"
   915  	case o.rootMount:
   916  		res = "root"
   917  	default:
   918  		panic("invalid state of subMounts = true and rootMount = false")
   919  	}
   920  	return res + ":" + o.medium.String()
   921  }
   922  
   923  // Enabled returns true if the overlay option is enabled for any mounts.
   924  func (o *Overlay2) Enabled() bool {
   925  	return o.medium != NoOverlay
   926  }
   927  
   928  // RootOverlayMedium returns the overlay medium config of the root mount.
   929  func (o *Overlay2) RootOverlayMedium() OverlayMedium {
   930  	if !o.rootMount {
   931  		return NoOverlay
   932  	}
   933  	return o.medium
   934  }
   935  
   936  // SubMountOverlayMedium returns the overlay medium config of submounts.
   937  func (o *Overlay2) SubMountOverlayMedium() OverlayMedium {
   938  	if !o.subMounts {
   939  		return NoOverlay
   940  	}
   941  	return o.medium
   942  }
   943  
// XDP holds configuration for whether and how to use XDP. Its flag form, as
// parsed by Set and produced by String, is "off", "ns", "redirect:<iface>" or
// "tunnel:<iface>".
type XDP struct {
	// Mode selects how XDP is used.
	Mode      XDPMode
	// IfaceName is the interface name given after the ":" in the redirect and
	// tunnel modes. Set leaves it empty for the off and ns modes.
	IfaceName string
}
   949  
   950  // XDPMode specifies a particular use of XDP.
   951  type XDPMode int
   952  
   953  const (
   954  	// XDPModeOff doesn't use XDP.
   955  	XDPModeOff XDPMode = iota
   956  
   957  	// XDPModeNS uses an AF_XDP socket to read from the VETH device inside
   958  	// the container's network namespace.
   959  	XDPModeNS
   960  
   961  	// XDPModeRedirect uses an AF_XDP socket on the host NIC to bypass the
   962  	// Linux network stack.
   963  	XDPModeRedirect
   964  
   965  	// XDPModeTunnel uses XDP_REDIRECT to redirect packets directy from the
   966  	// host NIC to the VETH device inside the container's network
   967  	// namespace. Packets are read from the VETH via AF_XDP, as in
   968  	// XDPModeNS.
   969  	XDPModeTunnel
   970  )
   971  
// Textual names of the XDP modes as they appear in the flag value handled by
// XDP.Set and XDP.String. The redirect and tunnel names are followed by
// ":<interface name>" in the full flag value.
const (
	xdpModeStrOff      = "off"
	xdpModeStrNS       = "ns"
	xdpModeStrRedirect = "redirect"
	xdpModeStrTunnel   = "tunnel"
)
   978  
// xdpConfig is a package-level XDP value; its zero value has Mode XDPModeOff.
// NOTE(review): presumably the storage the --xdp flag is registered against;
// its users are outside this part of the file, so confirm before relying on it.
var xdpConfig XDP
   980  
   981  // Get implements flag.Getter.
   982  func (xd *XDP) Get() any {
   983  	return *xd
   984  }
   985  
   986  // String implements flag.Getter.
   987  func (xd *XDP) String() string {
   988  	switch xd.Mode {
   989  	case XDPModeOff:
   990  		return xdpModeStrOff
   991  	case XDPModeNS:
   992  		return xdpModeStrNS
   993  	case XDPModeRedirect:
   994  		return fmt.Sprintf("%s:%s", xdpModeStrRedirect, xd.IfaceName)
   995  	case XDPModeTunnel:
   996  		return fmt.Sprintf("%s:%s", xdpModeStrTunnel, xd.IfaceName)
   997  	default:
   998  		panic(fmt.Sprintf("unknown mode %d", xd.Mode))
   999  	}
  1000  }
  1001  
  1002  // Set implements flag.Getter.
  1003  func (xd *XDP) Set(input string) error {
  1004  	parts := strings.Split(input, ":")
  1005  	if len(parts) > 2 {
  1006  		return fmt.Errorf("invalid --xdp value: %q", input)
  1007  	}
  1008  
  1009  	switch {
  1010  	case input == xdpModeStrOff:
  1011  		xd.Mode = XDPModeOff
  1012  		xd.IfaceName = ""
  1013  	case input == xdpModeStrNS:
  1014  		xd.Mode = XDPModeNS
  1015  		xd.IfaceName = ""
  1016  	case len(parts) == 2 && parts[0] == xdpModeStrRedirect && parts[1] != "":
  1017  		xd.Mode = XDPModeRedirect
  1018  		xd.IfaceName = parts[1]
  1019  	case len(parts) == 2 && parts[0] == xdpModeStrTunnel && parts[1] != "":
  1020  		xd.Mode = XDPModeTunnel
  1021  		xd.IfaceName = parts[1]
  1022  	default:
  1023  		return fmt.Errorf("invalid --xdp value: %q", input)
  1024  	}
  1025  	return nil
  1026  }