github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/config/config.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package config provides basic infrastructure to set configuration settings
    16  // for runsc. The configuration is set by flags to the command line. They can
    17  // also propagate to a different process using the same flags.
    18  package config
    19  
    20  import (
    21  	"fmt"
    22  	"path/filepath"
    23  	"reflect"
    24  	"runtime"
    25  	"strconv"
    26  	"strings"
    27  	"time"
    28  
    29  	"github.com/metacubex/gvisor/pkg/log"
    30  	"github.com/metacubex/gvisor/pkg/refs"
    31  	"github.com/metacubex/gvisor/pkg/sentry/watchdog"
    32  	"github.com/metacubex/gvisor/runsc/flag"
    33  	"github.com/metacubex/gvisor/runsc/version"
    34  )
    35  
// Config holds configuration that is not part of the runtime spec.
//
// Each field that is settable from the command line carries a `flag:"..."`
// struct tag holding the name of the corresponding flag. Fields without a tag
// (e.g. DisableSeccomp, RestoreFile, explicitlySet) have no flag name
// attached to them in this struct.
//
// Follow these steps to add a new flag:
//  1. Create a new field in Config.
//  2. Add a field tag with the flag name
//  3. Register a new flag in flags.go, with same name and add a description
//  4. Add any necessary validation into validate()
//  5. If adding an enum, follow the same pattern as FileAccessType
//  6. Evaluate if the flag can be changed with OCI annotations. See
//     overrideAllowlist for more details
type Config struct {
	// RootDir is the runtime root directory.
	RootDir string `flag:"root"`

	// Traceback changes the Go runtime's traceback level.
	Traceback string `flag:"traceback"`

	// Debug indicates that debug logging should be enabled.
	Debug bool `flag:"debug"`

	// LogFilename is the filename to log to, if not empty.
	LogFilename string `flag:"log"`

	// LogFormat is the log format.
	LogFormat string `flag:"log-format"`

	// DebugLog is the path to log debug information to, if not empty.
	// If specified together with `DebugToUserLog`, debug logs are emitted
	// to both.
	DebugLog string `flag:"debug-log"`

	// DebugToUserLog indicates that Sentry debug logs should be emitted
	// to user-visible logs.
	// If specified together with `DebugLog`, debug logs are emitted
	// to both.
	DebugToUserLog bool `flag:"debug-to-user-log"`

	// DebugCommand is a comma-separated list of commands to be debugged if
	// --debug-log is also set. Empty means debug all. "!" negates the expression.
	// E.g. "create,start" or "!boot,events".
	DebugCommand string `flag:"debug-command"`

	// PanicLog is the path to log Go's runtime messages, if not empty.
	PanicLog string `flag:"panic-log"`

	// CoverageReport is the path to write Go coverage information, if not empty.
	CoverageReport string `flag:"coverage-report"`

	// DebugLogFormat is the log format for debug.
	DebugLogFormat string `flag:"debug-log-format"`

	// FileAccess indicates how the root filesystem is accessed.
	FileAccess FileAccessType `flag:"file-access"`

	// FileAccessMounts indicates how non-root volumes are accessed.
	FileAccessMounts FileAccessType `flag:"file-access-mounts"`

	// Overlay is whether to wrap all mounts in an overlay. The upper tmpfs layer
	// will be backed by application memory.
	// This flag is deprecated in favor of Overlay2; validate() rejects
	// configurations that set both.
	Overlay bool `flag:"overlay"`

	// Overlay2 holds configuration about wrapping mounts in overlayfs.
	// DO NOT call it directly, use GetOverlay2() instead.
	Overlay2 Overlay2 `flag:"overlay2"`

	// FSGoferHostUDS is deprecated: use host-uds=all.
	FSGoferHostUDS bool `flag:"fsgofer-host-uds"`

	// HostUDS controls permission to access host Unix-domain sockets.
	// DO NOT call it directly, use GetHostUDS() instead.
	HostUDS HostUDS `flag:"host-uds"`

	// HostFifo controls permission to access host FIFO (or named pipes).
	HostFifo HostFifo `flag:"host-fifo"`

	// Network indicates what type of network to use.
	Network NetworkType `flag:"network"`

	// EnableRaw indicates whether raw sockets should be enabled. Raw
	// sockets are disabled by stripping CAP_NET_RAW from the list of
	// capabilities.
	EnableRaw bool `flag:"net-raw"`

	// AllowPacketEndpointWrite enables write operations on packet endpoints.
	AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"`

	// HostGSO indicates that host segmentation offload is enabled.
	HostGSO bool `flag:"gso"`

	// GvisorGSO indicates that gVisor segmentation offload is enabled. The flag
	// retains its old name of "software" GSO for API consistency.
	GvisorGSO bool `flag:"software-gso"`

	// GvisorGROTimeout sets gVisor's generic receive offload timeout. Zero
	// bypasses GRO.
	GvisorGROTimeout time.Duration `flag:"gvisor-gro"`

	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
	TXChecksumOffload bool `flag:"tx-checksum-offload"`

	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
	RXChecksumOffload bool `flag:"rx-checksum-offload"`

	// QDisc indicates the type of queueing discipline to use by default
	// for non-loopback interfaces.
	QDisc QueueingDiscipline `flag:"qdisc"`

	// LogPackets indicates that all network packets should be logged.
	LogPackets bool `flag:"log-packets"`

	// PCAP is a file to which network packets should be logged in PCAP format.
	PCAP string `flag:"pcap-log"`

	// Platform is the platform to run on.
	Platform string `flag:"platform"`

	// PlatformDevicePath is the path to the device file used by the platform.
	// e.g. "/dev/kvm" for the KVM platform.
	// If unset, a sane platform-specific default will be used.
	PlatformDevicePath string `flag:"platform_device_path"`

	// MetricServer, if set, indicates that metrics should be exported on this address.
	// This may either be 1) "addr:port" to export metrics on a specific network interface address,
	// 2) ":port" for exporting metrics on all addresses, or 3) an absolute path to a Unix Domain
	// Socket.
	// The substring "%ID%" will be replaced by the container ID, and "%RUNTIME_ROOT%" by the root.
	// This flag must be specified *both* as part of the `runsc metric-server` arguments (so that the
	// metric server knows which address to bind to), and as part of the `runsc create` arguments (as
	// an indication that the container being created wishes that its metrics should be exported).
	// The value of this flag must also match across the two command lines.
	MetricServer string `flag:"metric-server"`

	// ProfilingMetrics is a comma separated list of metric names which are
	// going to be written to the ProfilingMetricsLog file from within the
	// sentry in CSV format. ProfilingMetrics will be snapshotted at a rate
	// specified by ProfilingMetricsRate. Requires ProfilingMetricsLog to be
	// set.
	ProfilingMetrics string `flag:"profiling-metrics"`

	// ProfilingMetricsLog is the file name to use for ProfilingMetrics
	// output.
	ProfilingMetricsLog string `flag:"profiling-metrics-log"`

	// ProfilingMetricsRate is the target rate (in microseconds) at which
	// profiling metrics will be snapshotted.
	ProfilingMetricsRate int `flag:"profiling-metrics-rate-us"`

	// Strace indicates that strace should be enabled.
	Strace bool `flag:"strace"`

	// StraceSyscalls is the set of syscalls to trace (comma-separated values).
	// If Strace is true and this string is empty, then all syscalls will
	// be traced.
	StraceSyscalls string `flag:"strace-syscalls"`

	// StraceLogSize is the max size of data blobs to display.
	StraceLogSize uint `flag:"strace-log-size"`

	// StraceEvent indicates sending strace to events if true. Strace is
	// sent to log if false.
	StraceEvent bool `flag:"strace-event"`

	// DisableSeccomp indicates whether seccomp syscall filters should be
	// disabled. Pardon the double negation, but default to enabled is important.
	// Note that this field carries no flag tag.
	DisableSeccomp bool

	// EnableCoreTags indicates whether the Sentry process and children will be
	// run in a core tagged process. This isolates the sentry from sharing
	// physical cores with other core tagged processes. This is useful as a
	// mitigation for hyperthreading side channel based attacks. Requires host
	// linux kernel >= 5.14.
	EnableCoreTags bool `flag:"enable-core-tags"`

	// WatchdogAction sets what action the watchdog takes when triggered.
	WatchdogAction watchdog.Action `flag:"watchdog-action"`

	// PanicSignal registers signal handling that panics. Usually set to
	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
	PanicSignal int `flag:"panic-signal"`

	// ProfileEnable is set to prepare the sandbox to be profiled.
	ProfileEnable bool `flag:"profile"`

	// ProfileBlock collects a block profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileBlock string `flag:"profile-block"`

	// ProfileCPU collects a CPU profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileCPU string `flag:"profile-cpu"`

	// ProfileHeap collects a heap profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileHeap string `flag:"profile-heap"`

	// ProfileMutex collects a mutex profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileMutex string `flag:"profile-mutex"`

	// TraceFile collects a Go runtime execution trace to the passed file
	// for the duration of the container execution.
	TraceFile string `flag:"trace"`

	// RestoreFile is the path to the saved container image.
	// Note that this field carries no flag tag.
	RestoreFile string

	// NumNetworkChannels controls the number of AF_PACKET sockets that map
	// to the same underlying network device. This allows netstack to better
	// scale for high throughput use cases. validate() requires it to be > 0.
	NumNetworkChannels int `flag:"num-network-channels"`

	// Rootless allows the sandbox to be started with a user that is not root.
	// Defense in depth measures are weaker in rootless mode. Specifically, the
	// sandbox and Gofer process run as root inside a user namespace with root
	// mapped to the caller's user. When using rootless, the container root path
	// should not have a symlink.
	Rootless bool `flag:"rootless"`

	// AlsoLogToStderr allows to send log messages to stderr.
	AlsoLogToStderr bool `flag:"alsologtostderr"`

	// ReferenceLeak sets the reference leak check mode.
	ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`

	// CPUNumFromQuota sets CPU number count to available CPU quota, using
	// least integer value greater than or equal to quota.
	//
	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
	CPUNumFromQuota bool `flag:"cpu-num-from-quota"`

	// AllowFlagOverride allows overriding of flags in OCI annotations.
	AllowFlagOverride bool `flag:"allow-flag-override"`

	// OCISeccomp enables seccomp inside the sandbox.
	OCISeccomp bool `flag:"oci-seccomp"`

	// IgnoreCgroups indicates that cgroups should not be configured.
	IgnoreCgroups bool `flag:"ignore-cgroups"`

	// SystemdCgroup indicates that systemd should be used to configure cgroups.
	SystemdCgroup bool `flag:"systemd-cgroup"`

	// PodInitConfig is the path to configuration file with additional steps to
	// take during pod creation.
	PodInitConfig string `flag:"pod-init-config"`

	// BufferPooling indicates that pools should be used to manage buffer memory
	// instead of the heap.
	BufferPooling bool `flag:"buffer-pooling"`

	// XDP controls whether and how to use XDP.
	XDP XDP `flag:"EXPERIMENTAL-xdp"`

	// AFXDPUseNeedWakeup determines whether XDP_USE_NEED_WAKEUP is set
	// when using AF_XDP sockets.
	AFXDPUseNeedWakeup bool `flag:"EXPERIMENTAL-xdp-need-wakeup"`

	// FDLimit specifies a limit on the number of host file descriptors that can
	// be open simultaneously by the sentry and gofer. It applies separately to
	// each.
	FDLimit int `flag:"fdlimit"`

	// DCache sets the global dirent cache size. If zero, per-mount caches are
	// used.
	DCache int `flag:"dcache"`

	// IOUring enables support for the IO_URING API calls to perform
	// asynchronous I/O operations.
	IOUring bool `flag:"iouring"`

	// DirectFS sets up the sandbox to directly access/mutate the filesystem from
	// the sentry. Sentry runs with escalated privileges. Gofer process still
	// exists, but is mostly idle. Not supported in rootless mode.
	DirectFS bool `flag:"directfs"`

	// NVProxy enables support for Nvidia GPUs.
	NVProxy bool `flag:"nvproxy"`

	// NVProxyDocker is deprecated. Please use nvidia-container-runtime or
	// `docker run --gpus` directly. For backward compatibility, this has the
	// effect of injecting nvidia-container-runtime-hook as a prestart hook.
	NVProxyDocker bool `flag:"nvproxy-docker"`

	// TPUProxy enables support for TPUs.
	TPUProxy bool `flag:"tpuproxy"`

	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
	// tests. It allows runsc to start the sandbox process as the current
	// user, and without chrooting the sandbox process. This can be
	// necessary in test environments that have limited capabilities. When
	// disabling chroot, the container root path should not have a symlink.
	TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`

	// TestOnlyTestNameEnv should only be used in tests. It looks up the
	// test name in the container environment variables and adds it to the debug
	// log file name. This is done to help identify the log with the test when
	// multiple tests are run in parallel, since there is no way to pass
	// parameters to the runtime from docker.
	TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`

	// TestOnlyAFSSyscallPanic should only be used in tests. It enables the
	// alternate behaviour for afs_syscall to trigger a Go-runtime panic upon being
	// called. This is useful for tests exercising gVisor panic-reporting.
	TestOnlyAFSSyscallPanic bool `flag:"TESTONLY-afs-syscall-panic"`

	// explicitlySet contains whether a flag was explicitly set on the command-line from which this
	// Config was constructed. Nil when the Config was not initialized from a FlagSet.
	explicitlySet map[string]struct{}

	// ReproduceNAT, when true, tells runsc to scrape the host network
	// namespace's NAT iptables and reproduce it inside the sandbox.
	ReproduceNAT bool `flag:"reproduce-nat"`

	// ReproduceNftables attempts to scrape nftables routing rules if
	// present, and reproduce them in the sandbox.
	ReproduceNftables bool `flag:"reproduce-nftables"`

	// TestOnlyAutosaveImagePath if not empty enables auto save for syscall tests
	// and stores the directory path to the saved state file.
	TestOnlyAutosaveImagePath string `flag:"TESTONLY-autosave-image-path"`
}
   356  
   357  func (c *Config) validate() error {
   358  	if c.Overlay && c.Overlay2.Enabled() {
   359  		// Deprecated flag was used together with flag that replaced it.
   360  		return fmt.Errorf("overlay flag has been replaced with overlay2 flag")
   361  	}
   362  	if overlay2 := c.GetOverlay2(); c.FileAccess == FileAccessShared && overlay2.Enabled() {
   363  		return fmt.Errorf("overlay flag is incompatible with shared file access for rootfs")
   364  	}
   365  	if c.NumNetworkChannels <= 0 {
   366  		return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
   367  	}
   368  	// Require profile flags to explicitly opt-in to profiling with
   369  	// -profile rather than implying it since these options have security
   370  	// implications.
   371  	if c.ProfileBlock != "" && !c.ProfileEnable {
   372  		return fmt.Errorf("profile-block flag requires enabling profiling with profile flag")
   373  	}
   374  	if c.ProfileCPU != "" && !c.ProfileEnable {
   375  		return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag")
   376  	}
   377  	if c.ProfileHeap != "" && !c.ProfileEnable {
   378  		return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag")
   379  	}
   380  	if c.ProfileMutex != "" && !c.ProfileEnable {
   381  		return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag")
   382  	}
   383  	if c.FSGoferHostUDS && c.HostUDS != HostUDSNone {
   384  		// Deprecated flag was used together with flag that replaced it.
   385  		return fmt.Errorf("fsgofer-host-uds has been replaced with host-uds flag")
   386  	}
   387  	if len(c.ProfilingMetrics) > 0 && len(c.ProfilingMetricsLog) == 0 {
   388  		return fmt.Errorf("profiling-metrics flag requires defining a profiling-metrics-log for output")
   389  	}
   390  	return nil
   391  }
   392  
   393  // Log logs important aspects of the configuration to the given log function.
   394  func (c *Config) Log() {
   395  	log.Infof("Platform: %v", c.Platform)
   396  	log.Infof("RootDir: %s", c.RootDir)
   397  	log.Infof("FileAccess: %v / Directfs: %t / Overlay: %v", c.FileAccess, c.DirectFS, c.GetOverlay2())
   398  	log.Infof("Network: %v", c.Network)
   399  	if c.Debug || c.Strace {
   400  		log.Infof("Debug: %t. Strace: %t, max size: %d, syscalls: %s", c.Debug, c.Strace, c.StraceLogSize, c.StraceSyscalls)
   401  	}
   402  	if c.Debug {
   403  		obj := reflect.ValueOf(c).Elem()
   404  		st := obj.Type()
   405  		for i := 0; i < st.NumField(); i++ {
   406  			f := st.Field(i)
   407  			var val any
   408  			if strVal := obj.Field(i).String(); strVal == "" {
   409  				val = "(empty)"
   410  			} else if !f.IsExported() {
   411  				// Cannot convert to `interface{}` for non-exported fields,
   412  				// so just use `strVal`.
   413  				val = fmt.Sprintf("%s (unexported)", strVal)
   414  			} else {
   415  				val = obj.Field(i).Interface()
   416  			}
   417  			if flagName, hasFlag := f.Tag.Lookup("flag"); hasFlag {
   418  				log.Debugf("Config.%s (--%s): %v", f.Name, flagName, val)
   419  			} else {
   420  				log.Debugf("Config.%s: %v", f.Name, val)
   421  			}
   422  		}
   423  	}
   424  }
   425  
   426  // GetHostUDS returns the FS gofer communication that is allowed, taking into
   427  // consideration all flags what affect the result.
   428  func (c *Config) GetHostUDS() HostUDS {
   429  	if c.FSGoferHostUDS {
   430  		if c.HostUDS != HostUDSNone {
   431  			panic(fmt.Sprintf("HostUDS cannot be set when --fsgofer-host-uds=true"))
   432  		}
   433  		// Using deprecated flag, honor it to avoid breaking users.
   434  		return HostUDSOpen
   435  	}
   436  	return c.HostUDS
   437  }
   438  
   439  // GetOverlay2 returns the overlay configuration, taking into consideration all
   440  // flags that affect the result.
   441  func (c *Config) GetOverlay2() Overlay2 {
   442  	if c.Overlay {
   443  		if c.Overlay2.Enabled() {
   444  			panic(fmt.Sprintf("Overlay2 cannot be set when --overlay=true"))
   445  		}
   446  		// Using a deprecated flag, honor it to avoid breaking users.
   447  		return Overlay2{rootMount: true, subMounts: true, medium: "memory"}
   448  	}
   449  	return c.Overlay2
   450  }
   451  
// Bundle is a set of flag name-value pairs.
// Keys are flag names and values are the string form of the value to assign
// to that flag; see Validate for how they are checked.
type Bundle map[string]string

// BundleName is a human-friendly name for a Bundle.
// It is used as part of an annotation to specify that the user wants to apply a Bundle.
type BundleName string
   458  
   459  // Validate validates that given flag string values map to actual flags in runsc.
   460  func (b Bundle) Validate() error {
   461  	flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError)
   462  	RegisterFlags(flagSet)
   463  	for key, val := range b {
   464  		flag := flagSet.Lookup(key)
   465  		if flag == nil {
   466  			return fmt.Errorf("unknown flag %q", key)
   467  		}
   468  		if err := flagSet.Set(key, val); err != nil {
   469  			return err
   470  		}
   471  	}
   472  	return nil
   473  }
   474  
// MetricMetadataKeys is the set of keys of metric metadata labels
// as returned by `Config.MetricMetadata`.
// Keep this list in sync with the keys of the map built by MetricMetadata.
var MetricMetadataKeys = []string{
	"version",
	"platform",
	"network",
	"numcores",
	"coretags",
	"overlay",
	"fsmode",
	"cpuarch",
	"go",
	"experiment",
}
   489  
   490  // MetricMetadata returns key-value pairs that are useful to include in metrics
   491  // exported about the sandbox this config represents.
   492  // It must return the same set of labels as listed in `MetricMetadataKeys`.
   493  func (c *Config) MetricMetadata() map[string]string {
   494  	var fsMode = "goferfs"
   495  	if c.DirectFS {
   496  		fsMode = "directfs"
   497  	}
   498  	return map[string]string{
   499  		"version":  version.Version(),
   500  		"platform": c.Platform,
   501  		"network":  c.Network.String(),
   502  		"numcores": strconv.Itoa(runtime.NumCPU()),
   503  		"coretags": strconv.FormatBool(c.EnableCoreTags),
   504  		"overlay":  c.Overlay2.String(),
   505  		"fsmode":   fsMode,
   506  		"cpuarch":  runtime.GOARCH,
   507  		"go":       runtime.Version(),
   508  		// The "experiment" label is currently unused, but may be used to contain
   509  		// extra information about e.g. an experiment that may be enabled.
   510  		"experiment": "",
   511  	}
   512  }
   513  
   514  // FileAccessType tells how the filesystem is accessed.
   515  type FileAccessType int
   516  
   517  const (
   518  	// FileAccessExclusive gives the sandbox exclusive access over files and
   519  	// directories in the filesystem. No external modifications are permitted and
   520  	// can lead to undefined behavior.
   521  	//
   522  	// Exclusive filesystem access enables more aggressive caching and offers
   523  	// significantly better performance. This is the default mode for the root
   524  	// volume.
   525  	FileAccessExclusive FileAccessType = iota
   526  
   527  	// FileAccessShared is used for volumes that can have external changes. It
   528  	// requires revalidation on every filesystem access to detect external
   529  	// changes, and reduces the amount of caching that can be done. This is the
   530  	// default mode for non-root volumes.
   531  	FileAccessShared
   532  )
   533  
   534  func fileAccessTypePtr(v FileAccessType) *FileAccessType {
   535  	return &v
   536  }
   537  
   538  // Set implements flag.Value. Set(String()) should be idempotent.
   539  func (f *FileAccessType) Set(v string) error {
   540  	switch v {
   541  	case "shared":
   542  		*f = FileAccessShared
   543  	case "exclusive":
   544  		*f = FileAccessExclusive
   545  	default:
   546  		return fmt.Errorf("invalid file access type %q", v)
   547  	}
   548  	return nil
   549  }
   550  
   551  // Get implements flag.Value.
   552  func (f *FileAccessType) Get() any {
   553  	return *f
   554  }
   555  
   556  // String implements flag.Value.
   557  func (f FileAccessType) String() string {
   558  	switch f {
   559  	case FileAccessShared:
   560  		return "shared"
   561  	case FileAccessExclusive:
   562  		return "exclusive"
   563  	}
   564  	panic(fmt.Sprintf("Invalid file access type %d", f))
   565  }
   566  
   567  // NetworkType tells which network stack to use.
   568  type NetworkType int
   569  
   570  const (
   571  	// NetworkSandbox uses internal network stack, isolated from the host.
   572  	NetworkSandbox NetworkType = iota
   573  
   574  	// NetworkHost redirects network related syscalls to the host network.
   575  	NetworkHost
   576  
   577  	// NetworkNone sets up just loopback using netstack.
   578  	NetworkNone
   579  )
   580  
   581  func networkTypePtr(v NetworkType) *NetworkType {
   582  	return &v
   583  }
   584  
   585  // Set implements flag.Value. Set(String()) should be idempotent.
   586  func (n *NetworkType) Set(v string) error {
   587  	switch v {
   588  	case "sandbox":
   589  		*n = NetworkSandbox
   590  	case "host":
   591  		*n = NetworkHost
   592  	case "none":
   593  		*n = NetworkNone
   594  	default:
   595  		return fmt.Errorf("invalid network type %q", v)
   596  	}
   597  	return nil
   598  }
   599  
   600  // Get implements flag.Value.
   601  func (n *NetworkType) Get() any {
   602  	return *n
   603  }
   604  
   605  // String implements flag.Value.
   606  func (n NetworkType) String() string {
   607  	switch n {
   608  	case NetworkSandbox:
   609  		return "sandbox"
   610  	case NetworkHost:
   611  		return "host"
   612  	case NetworkNone:
   613  		return "none"
   614  	}
   615  	panic(fmt.Sprintf("Invalid network type %d", n))
   616  }
   617  
   618  // QueueingDiscipline is used to specify the kind of Queueing Discipline to
   619  // apply for a give FDBasedLink.
   620  type QueueingDiscipline int
   621  
   622  const (
   623  	// QDiscNone disables any queueing for the underlying FD.
   624  	QDiscNone QueueingDiscipline = iota
   625  
   626  	// QDiscFIFO applies a simple fifo based queue to the underlying FD.
   627  	QDiscFIFO
   628  )
   629  
   630  func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline {
   631  	return &v
   632  }
   633  
   634  // Set implements flag.Value. Set(String()) should be idempotent.
   635  func (q *QueueingDiscipline) Set(v string) error {
   636  	switch v {
   637  	case "none":
   638  		*q = QDiscNone
   639  	case "fifo":
   640  		*q = QDiscFIFO
   641  	default:
   642  		return fmt.Errorf("invalid qdisc %q", v)
   643  	}
   644  	return nil
   645  }
   646  
   647  // Get implements flag.Value.
   648  func (q *QueueingDiscipline) Get() any {
   649  	return *q
   650  }
   651  
   652  // String implements flag.Value.
   653  func (q QueueingDiscipline) String() string {
   654  	switch q {
   655  	case QDiscNone:
   656  		return "none"
   657  	case QDiscFIFO:
   658  		return "fifo"
   659  	}
   660  	panic(fmt.Sprintf("Invalid qdisc %d", q))
   661  }
   662  
   663  func leakModePtr(v refs.LeakMode) *refs.LeakMode {
   664  	return &v
   665  }
   666  
   667  func watchdogActionPtr(v watchdog.Action) *watchdog.Action {
   668  	return &v
   669  }
   670  
   671  // HostUDS tells how much of the host UDS the file system has access to.
   672  type HostUDS int
   673  
   674  const (
   675  	// HostUDSNone doesn't allows UDS from the host to be manipulated.
   676  	HostUDSNone HostUDS = 0x0
   677  
   678  	// HostUDSOpen allows UDS from the host to be opened, e.g. connect(2).
   679  	HostUDSOpen HostUDS = 0x1
   680  
   681  	// HostUDSCreate allows UDS from the host to be created, e.g. bind(2).
   682  	HostUDSCreate HostUDS = 0x2
   683  
   684  	// HostUDSAll allows all form of communication with the host through UDS.
   685  	HostUDSAll = HostUDSOpen | HostUDSCreate
   686  )
   687  
   688  func hostUDSPtr(v HostUDS) *HostUDS {
   689  	return &v
   690  }
   691  
   692  // Set implements flag.Value. Set(String()) should be idempotent.
   693  func (g *HostUDS) Set(v string) error {
   694  	switch v {
   695  	case "", "none":
   696  		*g = HostUDSNone
   697  	case "open":
   698  		*g = HostUDSOpen
   699  	case "create":
   700  		*g = HostUDSCreate
   701  	case "all":
   702  		*g = HostUDSAll
   703  	default:
   704  		return fmt.Errorf("invalid host UDS type %q", v)
   705  	}
   706  	return nil
   707  }
   708  
   709  // Get implements flag.Value.
   710  func (g *HostUDS) Get() any {
   711  	return *g
   712  }
   713  
   714  // String implements flag.Value.
   715  func (g HostUDS) String() string {
   716  	switch g {
   717  	case HostUDSNone:
   718  		return "none"
   719  	case HostUDSOpen:
   720  		return "open"
   721  	case HostUDSCreate:
   722  		return "create"
   723  	case HostUDSAll:
   724  		return "all"
   725  	default:
   726  		panic(fmt.Sprintf("Invalid host UDS type %d", g))
   727  	}
   728  }
   729  
   730  // AllowOpen returns true if it can consume UDS from the host.
   731  func (g HostUDS) AllowOpen() bool {
   732  	return g&HostUDSOpen != 0
   733  }
   734  
   735  // AllowCreate returns true if it can create UDS in the host.
   736  func (g HostUDS) AllowCreate() bool {
   737  	return g&HostUDSCreate != 0
   738  }
   739  
   740  // HostFifo tells how much of the host FIFO (or named pipes) the file system has
   741  // access to.
   742  type HostFifo int
   743  
   744  const (
   745  	// HostFifoNone doesn't allow FIFO from the host to be manipulated.
   746  	HostFifoNone HostFifo = 0x0
   747  
   748  	// HostFifoOpen allows FIFOs from the host to be opened.
   749  	HostFifoOpen HostFifo = 0x1
   750  )
   751  
   752  func hostFifoPtr(v HostFifo) *HostFifo {
   753  	return &v
   754  }
   755  
   756  // Set implements flag.Value. Set(String()) should be idempotent.
   757  func (g *HostFifo) Set(v string) error {
   758  	switch v {
   759  	case "", "none":
   760  		*g = HostFifoNone
   761  	case "open":
   762  		*g = HostFifoOpen
   763  	default:
   764  		return fmt.Errorf("invalid host fifo type %q", v)
   765  	}
   766  	return nil
   767  }
   768  
   769  // Get implements flag.Value.
   770  func (g *HostFifo) Get() any {
   771  	return *g
   772  }
   773  
   774  // String implements flag.Value.
   775  func (g HostFifo) String() string {
   776  	switch g {
   777  	case HostFifoNone:
   778  		return "none"
   779  	case HostFifoOpen:
   780  		return "open"
   781  	default:
   782  		panic(fmt.Sprintf("Invalid host fifo type %d", g))
   783  	}
   784  }
   785  
// AllowOpen returns true if it can consume FIFOs from the host.
//
// The check is a bitwise test against HostFifoOpen, so it reports true
// whenever that bit is set in g.
func (g HostFifo) AllowOpen() bool {
	return g&HostFifoOpen != 0
}
   790  
// OverlayMedium describes how overlay medium is configured.
//
// The value is the raw flag text. Set accepts the empty string, "memory",
// "self", or AnonOverlayPrefix followed by an absolute host directory path.
type OverlayMedium string

const (
	// NoOverlay indicates that no overlay will be applied.
	// It is the zero value of OverlayMedium.
	NoOverlay = OverlayMedium("")

	// MemoryOverlay indicates that the overlay is backed by app memory.
	MemoryOverlay = OverlayMedium("memory")

	// SelfOverlay indicates that the overlaid mount is backed by itself.
	SelfOverlay = OverlayMedium("self")

	// AnonOverlayPrefix is the prefix that users should specify in the
	// config for the anonymous overlay. The text after the prefix is the
	// host directory returned by HostFileDir.
	AnonOverlayPrefix = "dir="
)
   808  
// String returns a human-readable string representing the overlay medium config.
//
// The result is the underlying string unchanged, which is the same text Set
// accepts, so the value round-trips through Set(String()).
func (m OverlayMedium) String() string {
	return string(m)
}
   813  
   814  // Set sets the value. Set(String()) should be idempotent.
   815  func (m *OverlayMedium) Set(v string) error {
   816  	switch OverlayMedium(v) {
   817  	case NoOverlay, MemoryOverlay, SelfOverlay: // OK
   818  	default:
   819  		if !strings.HasPrefix(v, AnonOverlayPrefix) {
   820  			return fmt.Errorf("unexpected medium: %q", v)
   821  		}
   822  		if hostFileDir := strings.TrimPrefix(v, AnonOverlayPrefix); !filepath.IsAbs(hostFileDir) {
   823  			return fmt.Errorf("overlay host file directory should be an absolute path, got %q", hostFileDir)
   824  		}
   825  	}
   826  	*m = OverlayMedium(v)
   827  	return nil
   828  }
   829  
// IsBackedByAnon indicates whether the overlaid mount is backed by a host file
// in an anonymous directory.
//
// This is a purely textual check: it reports true when the value starts with
// AnonOverlayPrefix and does not validate the path that follows.
func (m OverlayMedium) IsBackedByAnon() bool {
	return strings.HasPrefix(string(m), AnonOverlayPrefix)
}
   835  
   836  // HostFileDir indicates the directory in which the overlay-backing host file
   837  // should be created.
   838  //
   839  // Precondition: m.IsBackedByAnon().
   840  func (m OverlayMedium) HostFileDir() string {
   841  	if !m.IsBackedByAnon() {
   842  		panic(fmt.Sprintf("anonymous overlay medium = %q does not have %v prefix", m, AnonOverlayPrefix))
   843  	}
   844  	return strings.TrimPrefix(string(m), AnonOverlayPrefix)
   845  }
   846  
// Overlay2 holds the configuration for setting up overlay filesystems for the
// container.
//
// Its flag syntax, as parsed by Set, is either "none" or "{mount}:{medium}",
// where mount is "root" or "all".
type Overlay2 struct {
	// rootMount reports whether the root mount gets an overlay; see
	// RootOverlayMedium.
	rootMount bool
	// subMounts reports whether submounts get an overlay; see
	// SubMountOverlayMedium.
	subMounts bool
	// medium is the backing medium shared by every overlaid mount.
	medium OverlayMedium
}
   854  
// defaultOverlay2 returns a freshly allocated Overlay2 holding the default
// setting, which String renders as "root:self".
func defaultOverlay2() *Overlay2 {
	// Rootfs overlay is enabled by default and backed by a file in rootfs itself.
	return &Overlay2{rootMount: true, subMounts: false, medium: SelfOverlay}
}
   859  
   860  // Set implements flag.Value. Set(String()) should be idempotent.
   861  func (o *Overlay2) Set(v string) error {
   862  	if v == "none" {
   863  		o.rootMount = false
   864  		o.subMounts = false
   865  		o.medium = NoOverlay
   866  		return nil
   867  	}
   868  	vs := strings.Split(v, ":")
   869  	if len(vs) != 2 {
   870  		return fmt.Errorf("expected format is --overlay2={mount}:{medium}, got %q", v)
   871  	}
   872  
   873  	switch mount := vs[0]; mount {
   874  	case "root":
   875  		o.rootMount = true
   876  	case "all":
   877  		o.rootMount = true
   878  		o.subMounts = true
   879  	default:
   880  		return fmt.Errorf("unexpected mount specifier for --overlay2: %q", mount)
   881  	}
   882  
   883  	return o.medium.Set(vs[1])
   884  }
   885  
// Get implements flag.Value.
//
// It dereferences the receiver and returns the Overlay2 struct by value boxed
// in an any, so the caller receives a copy rather than the pointer itself.
//
// NOTE(review): in the standard library Get is declared by flag.Getter, not
// flag.Value — confirm which name the project's flag package expects here.
func (o *Overlay2) Get() any {
	return *o
}
   890  
   891  // String implements flag.Value.
   892  func (o Overlay2) String() string {
   893  	if !o.rootMount && !o.subMounts {
   894  		return "none"
   895  	}
   896  	res := ""
   897  	switch {
   898  	case o.rootMount && o.subMounts:
   899  		res = "all"
   900  	case o.rootMount:
   901  		res = "root"
   902  	default:
   903  		panic("invalid state of subMounts = true and rootMount = false")
   904  	}
   905  	return res + ":" + o.medium.String()
   906  }
   907  
// Enabled returns true if the overlay option is enabled for any mounts.
//
// The decision is based solely on the medium: it reports true whenever the
// medium is not NoOverlay and does not consult the rootMount or subMounts
// flags.
func (o *Overlay2) Enabled() bool {
	return o.medium != NoOverlay
}
   912  
   913  // RootOverlayMedium returns the overlay medium config of the root mount.
   914  func (o *Overlay2) RootOverlayMedium() OverlayMedium {
   915  	if !o.rootMount {
   916  		return NoOverlay
   917  	}
   918  	return o.medium
   919  }
   920  
   921  // SubMountOverlayMedium returns the overlay medium config of submounts.
   922  func (o *Overlay2) SubMountOverlayMedium() OverlayMedium {
   923  	if !o.subMounts {
   924  		return NoOverlay
   925  	}
   926  	return o.medium
   927  }
   928  
// XDP holds configuration for whether and how to use XDP.
//
// Its flag syntax, as parsed by Set, is "off", "ns", "redirect:<iface>" or
// "tunnel:<iface>".
type XDP struct {
	// Mode selects how XDP is used.
	Mode XDPMode
	// IfaceName is the interface given after the colon for the redirect and
	// tunnel modes. Set clears it for the off and ns modes.
	IfaceName string
}
   934  
// XDPMode specifies a particular use of XDP.
type XDPMode int

const (
	// XDPModeOff doesn't use XDP. It is the zero value.
	XDPModeOff XDPMode = iota

	// XDPModeNS uses an AF_XDP socket to read from the VETH device inside
	// the container's network namespace.
	XDPModeNS

	// XDPModeRedirect uses an AF_XDP socket on the host NIC to bypass the
	// Linux network stack.
	XDPModeRedirect

	// XDPModeTunnel uses XDP_REDIRECT to redirect packets directly from the
	// host NIC to the VETH device inside the container's network
	// namespace. Packets are read from the VETH via AF_XDP, as in
	// XDPModeNS.
	XDPModeTunnel
)
   956  
// Flag spellings of the XDP modes, shared by (*XDP).String and (*XDP).Set so
// that the two stay in sync. The redirect and tunnel spellings are followed
// by ":<iface>" on the command line.
const (
	xdpModeStrOff      = "off"
	xdpModeStrNS       = "ns"
	xdpModeStrRedirect = "redirect"
	xdpModeStrTunnel   = "tunnel"
)

// xdpConfig is a package-level XDP value; its zero value is the "off" mode.
// NOTE(review): presumably the storage the --xdp flag is registered against
// elsewhere in the package — confirm against the flag registration code.
var xdpConfig XDP
   965  
// Get implements flag.Getter.
//
// It dereferences the receiver and returns the XDP struct by value boxed in
// an any, so the caller receives a copy rather than the pointer itself.
func (xd *XDP) Get() any {
	return *xd
}
   970  
   971  // String implements flag.Getter.
   972  func (xd *XDP) String() string {
   973  	switch xd.Mode {
   974  	case XDPModeOff:
   975  		return xdpModeStrOff
   976  	case XDPModeNS:
   977  		return xdpModeStrNS
   978  	case XDPModeRedirect:
   979  		return fmt.Sprintf("%s:%s", xdpModeStrRedirect, xd.IfaceName)
   980  	case XDPModeTunnel:
   981  		return fmt.Sprintf("%s:%s", xdpModeStrTunnel, xd.IfaceName)
   982  	default:
   983  		panic(fmt.Sprintf("unknown mode %d", xd.Mode))
   984  	}
   985  }
   986  
   987  // Set implements flag.Getter.
   988  func (xd *XDP) Set(input string) error {
   989  	parts := strings.Split(input, ":")
   990  	if len(parts) > 2 {
   991  		return fmt.Errorf("invalid --xdp value: %q", input)
   992  	}
   993  
   994  	switch {
   995  	case input == xdpModeStrOff:
   996  		xd.Mode = XDPModeOff
   997  		xd.IfaceName = ""
   998  	case input == xdpModeStrNS:
   999  		xd.Mode = XDPModeNS
  1000  		xd.IfaceName = ""
  1001  	case len(parts) == 2 && parts[0] == xdpModeStrRedirect && parts[1] != "":
  1002  		xd.Mode = XDPModeRedirect
  1003  		xd.IfaceName = parts[1]
  1004  	case len(parts) == 2 && parts[0] == xdpModeStrTunnel && parts[1] != "":
  1005  		xd.Mode = XDPModeTunnel
  1006  		xd.IfaceName = parts[1]
  1007  	default:
  1008  		return fmt.Errorf("invalid --xdp value: %q", input)
  1009  	}
  1010  	return nil
  1011  }