// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package config provides basic infrastructure to set configuration settings
// for runsc. The configuration is set by flags to the command line. They can
// also propagate to a different process using the same flags.
package config

import (
	"fmt"
	"path/filepath"
	"reflect"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/watchdog"
	"github.com/metacubex/gvisor/runsc/flag"
	"github.com/metacubex/gvisor/runsc/version"
)

// Config holds configuration that is not part of the runtime spec.
//
// Follow these steps to add a new flag:
//  1. Create a new field in Config.
//  2. Add a field tag with the flag name.
//  3. Register a new flag in flags.go, with the same name, and add a
//     description.
//  4. Add any necessary validation into validate().
//  5. If adding an enum, follow the same pattern as FileAccessType.
//  6. Evaluate if the flag can be changed with OCI annotations. See
//     overrideAllowlist for more details.
//
// Log walks these fields by reflection (in declaration order) and reads the
// `flag` tag of each one, so the order and tags are part of the debug output.
type Config struct {
	// RootDir is the runtime root directory.
	RootDir string `flag:"root"`

	// Traceback changes the Go runtime's traceback level.
	Traceback string `flag:"traceback"`

	// Debug indicates that debug logging should be enabled.
	Debug bool `flag:"debug"`

	// LogFilename is the filename to log to, if not empty.
	LogFilename string `flag:"log"`

	// LogFormat is the log format.
	LogFormat string `flag:"log-format"`

	// DebugLog is the path to log debug information to, if not empty.
	// If specified together with `DebugToUserLog`, debug logs are emitted
	// to both.
	DebugLog string `flag:"debug-log"`

	// DebugToUserLog indicates that Sentry debug logs should be emitted
	// to user-visible logs.
	// If specified together with `DebugLog`, debug logs are emitted
	// to both.
	DebugToUserLog bool `flag:"debug-to-user-log"`

	// DebugCommand is a comma-separated list of commands to be debugged if
	// --debug-log is also set. Empty means debug all. "!" negates the expression.
	// E.g. "create,start" or "!boot,events".
	DebugCommand string `flag:"debug-command"`

	// PanicLog is the path to log Go's runtime messages, if not empty.
	PanicLog string `flag:"panic-log"`

	// CoverageReport is the path to write Go coverage information, if not empty.
	CoverageReport string `flag:"coverage-report"`

	// DebugLogFormat is the log format for debug.
	DebugLogFormat string `flag:"debug-log-format"`

	// FileAccess indicates how the root filesystem is accessed.
	FileAccess FileAccessType `flag:"file-access"`

	// FileAccessMounts indicates how non-root volumes are accessed.
	FileAccessMounts FileAccessType `flag:"file-access-mounts"`

	// Overlay is whether to wrap all mounts in an overlay. The upper tmpfs layer
	// will be backed by application memory.
	Overlay bool `flag:"overlay"`

	// Overlay2 holds configuration about wrapping mounts in overlayfs.
	// DO NOT call it directly, use GetOverlay2() instead.
	Overlay2 Overlay2 `flag:"overlay2"`

	// FSGoferHostUDS is deprecated: use host-uds=all.
	FSGoferHostUDS bool `flag:"fsgofer-host-uds"`

	// HostUDS controls permission to access host Unix-domain sockets.
	// DO NOT call it directly, use GetHostUDS() instead.
	HostUDS HostUDS `flag:"host-uds"`

	// HostFifo controls permission to access host FIFO (or named pipes).
	HostFifo HostFifo `flag:"host-fifo"`

	// Network indicates what type of network to use.
	Network NetworkType `flag:"network"`

	// EnableRaw indicates whether raw sockets should be enabled. Raw
	// sockets are disabled by stripping CAP_NET_RAW from the list of
	// capabilities.
	EnableRaw bool `flag:"net-raw"`

	// AllowPacketEndpointWrite enables write operations on packet endpoints.
	AllowPacketEndpointWrite bool `flag:"TESTONLY-allow-packet-endpoint-write"`

	// HostGSO indicates that host segmentation offload is enabled.
	HostGSO bool `flag:"gso"`

	// GvisorGSO indicates that gVisor segmentation offload is enabled. The flag
	// retains its old name of "software" GSO for API consistency.
	GvisorGSO bool `flag:"software-gso"`

	// GvisorGROTimeout sets gVisor's generic receive offload timeout. Zero
	// bypasses GRO.
	GvisorGROTimeout time.Duration `flag:"gvisor-gro"`

	// TXChecksumOffload indicates that TX Checksum Offload is enabled.
	TXChecksumOffload bool `flag:"tx-checksum-offload"`

	// RXChecksumOffload indicates that RX Checksum Offload is enabled.
	RXChecksumOffload bool `flag:"rx-checksum-offload"`

	// QDisc indicates the type of queueing discipline to use by default
	// for non-loopback interfaces.
	QDisc QueueingDiscipline `flag:"qdisc"`

	// LogPackets indicates that all network packets should be logged.
	LogPackets bool `flag:"log-packets"`

	// PCAP is a file to which network packets should be logged in PCAP format.
	PCAP string `flag:"pcap-log"`

	// Platform is the platform to run on.
	Platform string `flag:"platform"`

	// PlatformDevicePath is the path to the device file used by the platform.
	// e.g. "/dev/kvm" for the KVM platform.
	// If unset, a sane platform-specific default will be used.
	PlatformDevicePath string `flag:"platform_device_path"`

	// MetricServer, if set, indicates that metrics should be exported on this address.
	// This may either be 1) "addr:port" to export metrics on a specific network interface address,
	// 2) ":port" for exporting metrics on all addresses, or 3) an absolute path to a Unix Domain
	// Socket.
	// The substring "%ID%" will be replaced by the container ID, and "%RUNTIME_ROOT%" by the root.
	// This flag must be specified *both* as part of the `runsc metric-server` arguments (so that the
	// metric server knows which address to bind to), and as part of the `runsc create` arguments (as
	// an indication that the container being created wishes that its metrics should be exported).
	// The value of this flag must also match across the two command lines.
	MetricServer string `flag:"metric-server"`

	// ProfilingMetrics is a comma separated list of metric names which are
	// going to be written to the ProfilingMetricsLog file from within the
	// sentry in CSV format. ProfilingMetrics will be snapshotted at a rate
	// specified by ProfilingMetricsRate. Requires ProfilingMetricsLog to be
	// set.
	ProfilingMetrics string `flag:"profiling-metrics"`

	// ProfilingMetricsLog is the file name to use for ProfilingMetrics
	// output.
	ProfilingMetricsLog string `flag:"profiling-metrics-log"`

	// ProfilingMetricsRate is the target rate (in microseconds) at which
	// profiling metrics will be snapshotted.
	ProfilingMetricsRate int `flag:"profiling-metrics-rate-us"`

	// Strace indicates that strace should be enabled.
	Strace bool `flag:"strace"`

	// StraceSyscalls is the set of syscalls to trace (comma-separated values).
	// If Strace is true and this string is empty, then all syscalls will
	// be traced.
	StraceSyscalls string `flag:"strace-syscalls"`

	// StraceLogSize is the max size of data blobs to display.
	StraceLogSize uint `flag:"strace-log-size"`

	// StraceEvent indicates sending strace to events if true. Strace is
	// sent to log if false.
	StraceEvent bool `flag:"strace-event"`

	// DisableSeccomp indicates whether seccomp syscall filters should be
	// disabled. Pardon the double negation, but default to enabled is important.
	// This field carries no `flag` tag.
	DisableSeccomp bool

	// EnableCoreTags indicates whether the Sentry process and children will be
	// run in a core tagged process. This isolates the sentry from sharing
	// physical cores with other core tagged processes. This is useful as a
	// mitigation for hyperthreading side channel based attacks. Requires host
	// linux kernel >= 5.14.
	EnableCoreTags bool `flag:"enable-core-tags"`

	// WatchdogAction sets what action the watchdog takes when triggered.
	WatchdogAction watchdog.Action `flag:"watchdog-action"`

	// PanicSignal registers signal handling that panics. Usually set to
	// SIGUSR2(12) to troubleshoot hangs. -1 disables it.
	PanicSignal int `flag:"panic-signal"`

	// ProfileEnable is set to prepare the sandbox to be profiled.
	ProfileEnable bool `flag:"profile"`

	// ProfileBlock collects a block profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileBlock string `flag:"profile-block"`

	// ProfileCPU collects a CPU profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileCPU string `flag:"profile-cpu"`

	// ProfileHeap collects a heap profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileHeap string `flag:"profile-heap"`

	// ProfileMutex collects a mutex profile to the passed file for the
	// duration of the container execution. Requires ProfileEnable.
	ProfileMutex string `flag:"profile-mutex"`

	// TraceFile collects a Go runtime execution trace to the passed file
	// for the duration of the container execution.
	TraceFile string `flag:"trace"`

	// RestoreFile is the path to the saved container image.
	// This field carries no `flag` tag.
	RestoreFile string

	// NumNetworkChannels controls the number of AF_PACKET sockets that map
	// to the same underlying network device. This allows netstack to better
	// scale for high throughput use cases.
	NumNetworkChannels int `flag:"num-network-channels"`

	// Rootless allows the sandbox to be started with a user that is not root.
	// Defense in depth measures are weaker in rootless mode. Specifically, the
	// sandbox and Gofer process run as root inside a user namespace with root
	// mapped to the caller's user. When using rootless, the container root path
	// should not have a symlink.
	Rootless bool `flag:"rootless"`

	// AlsoLogToStderr allows to send log messages to stderr.
	AlsoLogToStderr bool `flag:"alsologtostderr"`

	// ReferenceLeak sets the reference leak check mode.
	ReferenceLeak refs.LeakMode `flag:"ref-leak-mode"`

	// CPUNumFromQuota sets CPU number count to available CPU quota, using
	// least integer value greater than or equal to quota.
	//
	// E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
	CPUNumFromQuota bool `flag:"cpu-num-from-quota"`

	// AllowFlagOverride allows overriding of flags in OCI annotations.
	AllowFlagOverride bool `flag:"allow-flag-override"`

	// OCISeccomp enables seccomp inside the sandbox.
	OCISeccomp bool `flag:"oci-seccomp"`

	// IgnoreCgroups, when set, means cgroups are not configured.
	IgnoreCgroups bool `flag:"ignore-cgroups"`

	// SystemdCgroup selects systemd to configure cgroups.
	SystemdCgroup bool `flag:"systemd-cgroup"`

	// PodInitConfig is the path to configuration file with additional steps to
	// take during pod creation.
	PodInitConfig string `flag:"pod-init-config"`

	// BufferPooling uses pools to manage buffer memory instead of heap.
	BufferPooling bool `flag:"buffer-pooling"`

	// XDP controls whether and how to use XDP.
	XDP XDP `flag:"EXPERIMENTAL-xdp"`

	// AFXDPUseNeedWakeup determines whether XDP_USE_NEED_WAKEUP is set
	// when using AF_XDP sockets.
	AFXDPUseNeedWakeup bool `flag:"EXPERIMENTAL-xdp-need-wakeup"`

	// FDLimit specifies a limit on the number of host file descriptors that can
	// be open simultaneously by the sentry and gofer. It applies separately to
	// each.
	FDLimit int `flag:"fdlimit"`

	// DCache sets the global dirent cache size. If zero, per-mount caches are
	// used.
	DCache int `flag:"dcache"`

	// IOUring enables support for the IO_URING API calls to perform
	// asynchronous I/O operations.
	IOUring bool `flag:"iouring"`

	// DirectFS sets up the sandbox to directly access/mutate the filesystem from
	// the sentry. Sentry runs with escalated privileges. Gofer process still
	// exists, but is mostly idle. Not supported in rootless mode.
	DirectFS bool `flag:"directfs"`

	// NVProxy enables support for Nvidia GPUs.
	NVProxy bool `flag:"nvproxy"`

	// NVProxyDocker is deprecated. Please use nvidia-container-runtime or
	// `docker run --gpus` directly. For backward compatibility, this has the
	// effect of injecting nvidia-container-runtime-hook as a prestart hook.
	NVProxyDocker bool `flag:"nvproxy-docker"`

	// TPUProxy enables support for TPUs.
	TPUProxy bool `flag:"tpuproxy"`

	// TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in
	// tests. It allows runsc to start the sandbox process as the current
	// user, and without chrooting the sandbox process. This can be
	// necessary in test environments that have limited capabilities. When
	// disabling chroot, the container root path should not have a symlink.
	TestOnlyAllowRunAsCurrentUserWithoutChroot bool `flag:"TESTONLY-unsafe-nonroot"`

	// TestOnlyTestNameEnv should only be used in tests. It looks up the
	// test name in the container environment variables and adds it to the debug
	// log file name. This is done to help identify the log with the test when
	// multiple tests are run in parallel, since there is no way to pass
	// parameters to the runtime from docker.
	TestOnlyTestNameEnv string `flag:"TESTONLY-test-name-env"`

	// TestOnlyAFSSyscallPanic should only be used in tests. It enables the
	// alternate behaviour for afs_syscall to trigger a Go-runtime panic upon being
	// called. This is useful for tests exercising gVisor panic-reporting.
	TestOnlyAFSSyscallPanic bool `flag:"TESTONLY-afs-syscall-panic"`

	// explicitlySet contains whether a flag was explicitly set on the command-line from which this
	// Config was constructed. Nil when the Config was not initialized from a FlagSet.
	explicitlySet map[string]struct{}

	// ReproduceNAT, when true, tells runsc to scrape the host network
	// namespace's NAT iptables and reproduce it inside the sandbox.
	ReproduceNAT bool `flag:"reproduce-nat"`

	// ReproduceNftables attempts to scrape nftables routing rules if
	// present, and reproduce them in the sandbox.
	ReproduceNftables bool `flag:"reproduce-nftables"`

	// TestOnlyAutosaveImagePath if not empty enables auto save for syscall tests
	// and stores the directory path to the saved state file.
	TestOnlyAutosaveImagePath string `flag:"TESTONLY-autosave-image-path"`
}

// validate checks the configuration for conflicting or incomplete flag
// combinations. It returns an error describing the first violation found, in
// the order the checks appear below, or nil when every check passes.
func (c *Config) validate() error {
	if c.Overlay && c.Overlay2.Enabled() {
		// Deprecated flag was used together with flag that replaced it.
		return fmt.Errorf("overlay flag has been replaced with overlay2 flag")
	}
	// GetOverlay2 is safe to call here: the combination it panics on was
	// rejected by the check above.
	if overlay2 := c.GetOverlay2(); c.FileAccess == FileAccessShared && overlay2.Enabled() {
		return fmt.Errorf("overlay flag is incompatible with shared file access for rootfs")
	}
	if c.NumNetworkChannels <= 0 {
		return fmt.Errorf("num_network_channels must be > 0, got: %d", c.NumNetworkChannels)
	}
	// Require profile flags to explicitly opt-in to profiling with
	// -profile rather than implying it since these options have security
	// implications.
	if c.ProfileBlock != "" && !c.ProfileEnable {
		return fmt.Errorf("profile-block flag requires enabling profiling with profile flag")
	}
	if c.ProfileCPU != "" && !c.ProfileEnable {
		return fmt.Errorf("profile-cpu flag requires enabling profiling with profile flag")
	}
	if c.ProfileHeap != "" && !c.ProfileEnable {
		return fmt.Errorf("profile-heap flag requires enabling profiling with profile flag")
	}
	if c.ProfileMutex != "" && !c.ProfileEnable {
		return fmt.Errorf("profile-mutex flag requires enabling profiling with profile flag")
	}
	if c.FSGoferHostUDS && c.HostUDS != HostUDSNone {
		// Deprecated flag was used together with flag that replaced it.
		return fmt.Errorf("fsgofer-host-uds has been replaced with host-uds flag")
	}
	if len(c.ProfilingMetrics) > 0 && len(c.ProfilingMetricsLog) == 0 {
		return fmt.Errorf("profiling-metrics flag requires defining a profiling-metrics-log for output")
	}
	return nil
}

// Log logs important aspects of the configuration using the log package.
394 func (c *Config) Log() { 395 log.Infof("Platform: %v", c.Platform) 396 log.Infof("RootDir: %s", c.RootDir) 397 log.Infof("FileAccess: %v / Directfs: %t / Overlay: %v", c.FileAccess, c.DirectFS, c.GetOverlay2()) 398 log.Infof("Network: %v", c.Network) 399 if c.Debug || c.Strace { 400 log.Infof("Debug: %t. Strace: %t, max size: %d, syscalls: %s", c.Debug, c.Strace, c.StraceLogSize, c.StraceSyscalls) 401 } 402 if c.Debug { 403 obj := reflect.ValueOf(c).Elem() 404 st := obj.Type() 405 for i := 0; i < st.NumField(); i++ { 406 f := st.Field(i) 407 var val any 408 if strVal := obj.Field(i).String(); strVal == "" { 409 val = "(empty)" 410 } else if !f.IsExported() { 411 // Cannot convert to `interface{}` for non-exported fields, 412 // so just use `strVal`. 413 val = fmt.Sprintf("%s (unexported)", strVal) 414 } else { 415 val = obj.Field(i).Interface() 416 } 417 if flagName, hasFlag := f.Tag.Lookup("flag"); hasFlag { 418 log.Debugf("Config.%s (--%s): %v", f.Name, flagName, val) 419 } else { 420 log.Debugf("Config.%s: %v", f.Name, val) 421 } 422 } 423 } 424 } 425 426 // GetHostUDS returns the FS gofer communication that is allowed, taking into 427 // consideration all flags what affect the result. 428 func (c *Config) GetHostUDS() HostUDS { 429 if c.FSGoferHostUDS { 430 if c.HostUDS != HostUDSNone { 431 panic(fmt.Sprintf("HostUDS cannot be set when --fsgofer-host-uds=true")) 432 } 433 // Using deprecated flag, honor it to avoid breaking users. 434 return HostUDSOpen 435 } 436 return c.HostUDS 437 } 438 439 // GetOverlay2 returns the overlay configuration, taking into consideration all 440 // flags that affect the result. 441 func (c *Config) GetOverlay2() Overlay2 { 442 if c.Overlay { 443 if c.Overlay2.Enabled() { 444 panic(fmt.Sprintf("Overlay2 cannot be set when --overlay=true")) 445 } 446 // Using a deprecated flag, honor it to avoid breaking users. 
447 return Overlay2{rootMount: true, subMounts: true, medium: "memory"} 448 } 449 return c.Overlay2 450 } 451 452 // Bundle is a set of flag name-value pairs. 453 type Bundle map[string]string 454 455 // BundleName is a human-friendly name for a Bundle. 456 // It is used as part of an annotation to specify that the user wants to apply a Bundle. 457 type BundleName string 458 459 // Validate validates that given flag string values map to actual flags in runsc. 460 func (b Bundle) Validate() error { 461 flagSet := flag.NewFlagSet("tmp", flag.ContinueOnError) 462 RegisterFlags(flagSet) 463 for key, val := range b { 464 flag := flagSet.Lookup(key) 465 if flag == nil { 466 return fmt.Errorf("unknown flag %q", key) 467 } 468 if err := flagSet.Set(key, val); err != nil { 469 return err 470 } 471 } 472 return nil 473 } 474 475 // MetricMetadataKeys is the set of keys of metric metadata labels 476 // as returned by `Config.MetricMetadata`. 477 var MetricMetadataKeys = []string{ 478 "version", 479 "platform", 480 "network", 481 "numcores", 482 "coretags", 483 "overlay", 484 "fsmode", 485 "cpuarch", 486 "go", 487 "experiment", 488 } 489 490 // MetricMetadata returns key-value pairs that are useful to include in metrics 491 // exported about the sandbox this config represents. 492 // It must return the same set of labels as listed in `MetricMetadataKeys`. 493 func (c *Config) MetricMetadata() map[string]string { 494 var fsMode = "goferfs" 495 if c.DirectFS { 496 fsMode = "directfs" 497 } 498 return map[string]string{ 499 "version": version.Version(), 500 "platform": c.Platform, 501 "network": c.Network.String(), 502 "numcores": strconv.Itoa(runtime.NumCPU()), 503 "coretags": strconv.FormatBool(c.EnableCoreTags), 504 "overlay": c.Overlay2.String(), 505 "fsmode": fsMode, 506 "cpuarch": runtime.GOARCH, 507 "go": runtime.Version(), 508 // The "experiment" label is currently unused, but may be used to contain 509 // extra information about e.g. an experiment that may be enabled. 
510 "experiment": "", 511 } 512 } 513 514 // FileAccessType tells how the filesystem is accessed. 515 type FileAccessType int 516 517 const ( 518 // FileAccessExclusive gives the sandbox exclusive access over files and 519 // directories in the filesystem. No external modifications are permitted and 520 // can lead to undefined behavior. 521 // 522 // Exclusive filesystem access enables more aggressive caching and offers 523 // significantly better performance. This is the default mode for the root 524 // volume. 525 FileAccessExclusive FileAccessType = iota 526 527 // FileAccessShared is used for volumes that can have external changes. It 528 // requires revalidation on every filesystem access to detect external 529 // changes, and reduces the amount of caching that can be done. This is the 530 // default mode for non-root volumes. 531 FileAccessShared 532 ) 533 534 func fileAccessTypePtr(v FileAccessType) *FileAccessType { 535 return &v 536 } 537 538 // Set implements flag.Value. Set(String()) should be idempotent. 539 func (f *FileAccessType) Set(v string) error { 540 switch v { 541 case "shared": 542 *f = FileAccessShared 543 case "exclusive": 544 *f = FileAccessExclusive 545 default: 546 return fmt.Errorf("invalid file access type %q", v) 547 } 548 return nil 549 } 550 551 // Get implements flag.Value. 552 func (f *FileAccessType) Get() any { 553 return *f 554 } 555 556 // String implements flag.Value. 557 func (f FileAccessType) String() string { 558 switch f { 559 case FileAccessShared: 560 return "shared" 561 case FileAccessExclusive: 562 return "exclusive" 563 } 564 panic(fmt.Sprintf("Invalid file access type %d", f)) 565 } 566 567 // NetworkType tells which network stack to use. 568 type NetworkType int 569 570 const ( 571 // NetworkSandbox uses internal network stack, isolated from the host. 572 NetworkSandbox NetworkType = iota 573 574 // NetworkHost redirects network related syscalls to the host network. 
575 NetworkHost 576 577 // NetworkNone sets up just loopback using netstack. 578 NetworkNone 579 ) 580 581 func networkTypePtr(v NetworkType) *NetworkType { 582 return &v 583 } 584 585 // Set implements flag.Value. Set(String()) should be idempotent. 586 func (n *NetworkType) Set(v string) error { 587 switch v { 588 case "sandbox": 589 *n = NetworkSandbox 590 case "host": 591 *n = NetworkHost 592 case "none": 593 *n = NetworkNone 594 default: 595 return fmt.Errorf("invalid network type %q", v) 596 } 597 return nil 598 } 599 600 // Get implements flag.Value. 601 func (n *NetworkType) Get() any { 602 return *n 603 } 604 605 // String implements flag.Value. 606 func (n NetworkType) String() string { 607 switch n { 608 case NetworkSandbox: 609 return "sandbox" 610 case NetworkHost: 611 return "host" 612 case NetworkNone: 613 return "none" 614 } 615 panic(fmt.Sprintf("Invalid network type %d", n)) 616 } 617 618 // QueueingDiscipline is used to specify the kind of Queueing Discipline to 619 // apply for a give FDBasedLink. 620 type QueueingDiscipline int 621 622 const ( 623 // QDiscNone disables any queueing for the underlying FD. 624 QDiscNone QueueingDiscipline = iota 625 626 // QDiscFIFO applies a simple fifo based queue to the underlying FD. 627 QDiscFIFO 628 ) 629 630 func queueingDisciplinePtr(v QueueingDiscipline) *QueueingDiscipline { 631 return &v 632 } 633 634 // Set implements flag.Value. Set(String()) should be idempotent. 635 func (q *QueueingDiscipline) Set(v string) error { 636 switch v { 637 case "none": 638 *q = QDiscNone 639 case "fifo": 640 *q = QDiscFIFO 641 default: 642 return fmt.Errorf("invalid qdisc %q", v) 643 } 644 return nil 645 } 646 647 // Get implements flag.Value. 648 func (q *QueueingDiscipline) Get() any { 649 return *q 650 } 651 652 // String implements flag.Value. 
653 func (q QueueingDiscipline) String() string { 654 switch q { 655 case QDiscNone: 656 return "none" 657 case QDiscFIFO: 658 return "fifo" 659 } 660 panic(fmt.Sprintf("Invalid qdisc %d", q)) 661 } 662 663 func leakModePtr(v refs.LeakMode) *refs.LeakMode { 664 return &v 665 } 666 667 func watchdogActionPtr(v watchdog.Action) *watchdog.Action { 668 return &v 669 } 670 671 // HostUDS tells how much of the host UDS the file system has access to. 672 type HostUDS int 673 674 const ( 675 // HostUDSNone doesn't allows UDS from the host to be manipulated. 676 HostUDSNone HostUDS = 0x0 677 678 // HostUDSOpen allows UDS from the host to be opened, e.g. connect(2). 679 HostUDSOpen HostUDS = 0x1 680 681 // HostUDSCreate allows UDS from the host to be created, e.g. bind(2). 682 HostUDSCreate HostUDS = 0x2 683 684 // HostUDSAll allows all form of communication with the host through UDS. 685 HostUDSAll = HostUDSOpen | HostUDSCreate 686 ) 687 688 func hostUDSPtr(v HostUDS) *HostUDS { 689 return &v 690 } 691 692 // Set implements flag.Value. Set(String()) should be idempotent. 693 func (g *HostUDS) Set(v string) error { 694 switch v { 695 case "", "none": 696 *g = HostUDSNone 697 case "open": 698 *g = HostUDSOpen 699 case "create": 700 *g = HostUDSCreate 701 case "all": 702 *g = HostUDSAll 703 default: 704 return fmt.Errorf("invalid host UDS type %q", v) 705 } 706 return nil 707 } 708 709 // Get implements flag.Value. 710 func (g *HostUDS) Get() any { 711 return *g 712 } 713 714 // String implements flag.Value. 715 func (g HostUDS) String() string { 716 switch g { 717 case HostUDSNone: 718 return "none" 719 case HostUDSOpen: 720 return "open" 721 case HostUDSCreate: 722 return "create" 723 case HostUDSAll: 724 return "all" 725 default: 726 panic(fmt.Sprintf("Invalid host UDS type %d", g)) 727 } 728 } 729 730 // AllowOpen returns true if it can consume UDS from the host. 
731 func (g HostUDS) AllowOpen() bool { 732 return g&HostUDSOpen != 0 733 } 734 735 // AllowCreate returns true if it can create UDS in the host. 736 func (g HostUDS) AllowCreate() bool { 737 return g&HostUDSCreate != 0 738 } 739 740 // HostFifo tells how much of the host FIFO (or named pipes) the file system has 741 // access to. 742 type HostFifo int 743 744 const ( 745 // HostFifoNone doesn't allow FIFO from the host to be manipulated. 746 HostFifoNone HostFifo = 0x0 747 748 // HostFifoOpen allows FIFOs from the host to be opened. 749 HostFifoOpen HostFifo = 0x1 750 ) 751 752 func hostFifoPtr(v HostFifo) *HostFifo { 753 return &v 754 } 755 756 // Set implements flag.Value. Set(String()) should be idempotent. 757 func (g *HostFifo) Set(v string) error { 758 switch v { 759 case "", "none": 760 *g = HostFifoNone 761 case "open": 762 *g = HostFifoOpen 763 default: 764 return fmt.Errorf("invalid host fifo type %q", v) 765 } 766 return nil 767 } 768 769 // Get implements flag.Value. 770 func (g *HostFifo) Get() any { 771 return *g 772 } 773 774 // String implements flag.Value. 775 func (g HostFifo) String() string { 776 switch g { 777 case HostFifoNone: 778 return "none" 779 case HostFifoOpen: 780 return "open" 781 default: 782 panic(fmt.Sprintf("Invalid host fifo type %d", g)) 783 } 784 } 785 786 // AllowOpen returns true if it can consume FIFOs from the host. 787 func (g HostFifo) AllowOpen() bool { 788 return g&HostFifoOpen != 0 789 } 790 791 // OverlayMedium describes how overlay medium is configured. 792 type OverlayMedium string 793 794 const ( 795 // NoOverlay indicates that no overlay will be applied. 796 NoOverlay = OverlayMedium("") 797 798 // MemoryOverlay indicates that the overlay is backed by app memory. 799 MemoryOverlay = OverlayMedium("memory") 800 801 // SelfOverlay indicates that the overlaid mount is backed by itself. 
802 SelfOverlay = OverlayMedium("self") 803 804 // AnonOverlayPrefix is the prefix that users should specify in the 805 // config for the anonymous overlay. 806 AnonOverlayPrefix = "dir=" 807 ) 808 809 // String returns a human-readable string representing the overlay medium config. 810 func (m OverlayMedium) String() string { 811 return string(m) 812 } 813 814 // Set sets the value. Set(String()) should be idempotent. 815 func (m *OverlayMedium) Set(v string) error { 816 switch OverlayMedium(v) { 817 case NoOverlay, MemoryOverlay, SelfOverlay: // OK 818 default: 819 if !strings.HasPrefix(v, AnonOverlayPrefix) { 820 return fmt.Errorf("unexpected medium: %q", v) 821 } 822 if hostFileDir := strings.TrimPrefix(v, AnonOverlayPrefix); !filepath.IsAbs(hostFileDir) { 823 return fmt.Errorf("overlay host file directory should be an absolute path, got %q", hostFileDir) 824 } 825 } 826 *m = OverlayMedium(v) 827 return nil 828 } 829 830 // IsBackedByAnon indicates whether the overlaid mount is backed by a host file 831 // in an anonymous directory. 832 func (m OverlayMedium) IsBackedByAnon() bool { 833 return strings.HasPrefix(string(m), AnonOverlayPrefix) 834 } 835 836 // HostFileDir indicates the directory in which the overlay-backing host file 837 // should be created. 838 // 839 // Precondition: m.IsBackedByAnon(). 840 func (m OverlayMedium) HostFileDir() string { 841 if !m.IsBackedByAnon() { 842 panic(fmt.Sprintf("anonymous overlay medium = %q does not have %v prefix", m, AnonOverlayPrefix)) 843 } 844 return strings.TrimPrefix(string(m), AnonOverlayPrefix) 845 } 846 847 // Overlay2 holds the configuration for setting up overlay filesystems for the 848 // container. 849 type Overlay2 struct { 850 rootMount bool 851 subMounts bool 852 medium OverlayMedium 853 } 854 855 func defaultOverlay2() *Overlay2 { 856 // Rootfs overlay is enabled by default and backed by a file in rootfs itself. 
857 return &Overlay2{rootMount: true, subMounts: false, medium: SelfOverlay} 858 } 859 860 // Set implements flag.Value. Set(String()) should be idempotent. 861 func (o *Overlay2) Set(v string) error { 862 if v == "none" { 863 o.rootMount = false 864 o.subMounts = false 865 o.medium = NoOverlay 866 return nil 867 } 868 vs := strings.Split(v, ":") 869 if len(vs) != 2 { 870 return fmt.Errorf("expected format is --overlay2={mount}:{medium}, got %q", v) 871 } 872 873 switch mount := vs[0]; mount { 874 case "root": 875 o.rootMount = true 876 case "all": 877 o.rootMount = true 878 o.subMounts = true 879 default: 880 return fmt.Errorf("unexpected mount specifier for --overlay2: %q", mount) 881 } 882 883 return o.medium.Set(vs[1]) 884 } 885 886 // Get implements flag.Value. 887 func (o *Overlay2) Get() any { 888 return *o 889 } 890 891 // String implements flag.Value. 892 func (o Overlay2) String() string { 893 if !o.rootMount && !o.subMounts { 894 return "none" 895 } 896 res := "" 897 switch { 898 case o.rootMount && o.subMounts: 899 res = "all" 900 case o.rootMount: 901 res = "root" 902 default: 903 panic("invalid state of subMounts = true and rootMount = false") 904 } 905 return res + ":" + o.medium.String() 906 } 907 908 // Enabled returns true if the overlay option is enabled for any mounts. 909 func (o *Overlay2) Enabled() bool { 910 return o.medium != NoOverlay 911 } 912 913 // RootOverlayMedium returns the overlay medium config of the root mount. 914 func (o *Overlay2) RootOverlayMedium() OverlayMedium { 915 if !o.rootMount { 916 return NoOverlay 917 } 918 return o.medium 919 } 920 921 // SubMountOverlayMedium returns the overlay medium config of submounts. 922 func (o *Overlay2) SubMountOverlayMedium() OverlayMedium { 923 if !o.subMounts { 924 return NoOverlay 925 } 926 return o.medium 927 } 928 929 // XDP holds configuration for whether and how to use XDP. 
type XDP struct {
	// Mode selects which XDP setup, if any, is used.
	Mode XDPMode
	// IfaceName is the interface name given after the ':' for the redirect
	// and tunnel modes.
	IfaceName string
}

// XDPMode specifies a particular use of XDP.
type XDPMode int

const (
	// XDPModeOff doesn't use XDP.
	XDPModeOff XDPMode = iota

	// XDPModeNS uses an AF_XDP socket to read from the VETH device inside
	// the container's network namespace.
	XDPModeNS

	// XDPModeRedirect uses an AF_XDP socket on the host NIC to bypass the
	// Linux network stack.
	XDPModeRedirect

	// XDPModeTunnel uses XDP_REDIRECT to redirect packets directly from the
	// host NIC to the VETH device inside the container's network
	// namespace. Packets are read from the VETH via AF_XDP, as in
	// XDPModeNS.
	XDPModeTunnel
)

// String forms of the XDP modes as they appear in the flag value.
const (
	xdpModeStrOff      = "off"
	xdpModeStrNS       = "ns"
	xdpModeStrRedirect = "redirect"
	xdpModeStrTunnel   = "tunnel"
)

var xdpConfig XDP

// Get implements flag.Getter.
func (xd *XDP) Get() any {
	current := *xd
	return current
}

// String implements flag.Getter. The redirect and tunnel modes are rendered
// as "{mode}:{interface}"; it panics on an unknown mode.
func (xd *XDP) String() string {
	mode := xd.Mode
	if mode == XDPModeOff {
		return xdpModeStrOff
	}
	if mode == XDPModeNS {
		return xdpModeStrNS
	}
	if mode == XDPModeRedirect {
		return xdpModeStrRedirect + ":" + xd.IfaceName
	}
	if mode == XDPModeTunnel {
		return xdpModeStrTunnel + ":" + xd.IfaceName
	}
	panic(fmt.Sprintf("unknown mode %d", mode))
}

// Set implements flag.Getter.
988 func (xd *XDP) Set(input string) error { 989 parts := strings.Split(input, ":") 990 if len(parts) > 2 { 991 return fmt.Errorf("invalid --xdp value: %q", input) 992 } 993 994 switch { 995 case input == xdpModeStrOff: 996 xd.Mode = XDPModeOff 997 xd.IfaceName = "" 998 case input == xdpModeStrNS: 999 xd.Mode = XDPModeNS 1000 xd.IfaceName = "" 1001 case len(parts) == 2 && parts[0] == xdpModeStrRedirect && parts[1] != "": 1002 xd.Mode = XDPModeRedirect 1003 xd.IfaceName = parts[1] 1004 case len(parts) == 2 && parts[0] == xdpModeStrTunnel && parts[1] != "": 1005 xd.Mode = XDPModeTunnel 1006 xd.IfaceName = parts[1] 1007 default: 1008 return fmt.Errorf("invalid --xdp value: %q", input) 1009 } 1010 return nil 1011 }