// Source: github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/specutils/specutils.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package specutils contains utility functions for working with OCI runtime
// specs.
package specutils

import (
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/cenkalti/backoff"
	"github.com/mohae/deepcopy"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/bits"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/sentry/devices/tpuproxy"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/runsc/config"
	"github.com/metacubex/gvisor/runsc/flag"
)

const (
	// annotationFlagPrefix is the prefix of annotations that override a
	// runsc flag; the rest of the annotation key is the flag name (see
	// fixSpec).
	annotationFlagPrefix = "dev.gvisor.flag."
	// annotationSeccomp is the prefix of the per-container seccomp
	// annotation; fixSpec appends the container name to form the full key.
	annotationSeccomp = "dev.gvisor.internal.seccomp."
	// annotationTPU is the annotation whose boolean value enables tpuproxy
	// (see TPUProxyIsEnabled).
	annotationTPU = "dev.gvisor.internal.tpuproxy"
	// annotationSeccompRuntimeDefault is the seccomp annotation value that
	// makes fixSpec drop the seccomp rules from the spec.
	annotationSeccompRuntimeDefault = "RuntimeDefault"

	// annotationContainerName is the annotation read by ContainerName to
	// obtain the name of the container.
	annotationContainerName = "io.kubernetes.cri.container-name"
)

// ExePath must point to runsc binary, which is normally the same binary. It's
// changed in tests that aren't linked in the same binary.
var ExePath = "/proc/self/exe"

// Version is the supported spec version.
58 var Version = specs.Version 59 60 // LogSpecDebug writes the spec in a human-friendly format to the debug log. 61 func LogSpecDebug(orig *specs.Spec, logSeccomp bool) { 62 if !log.IsLogging(log.Debug) { 63 return 64 } 65 66 // Strip down parts of the spec that are not interesting. 67 spec := deepcopy.Copy(orig).(*specs.Spec) 68 if spec.Process != nil { 69 spec.Process.Capabilities = nil 70 } 71 if spec.Linux != nil { 72 if !logSeccomp { 73 spec.Linux.Seccomp = nil 74 } 75 spec.Linux.MaskedPaths = nil 76 spec.Linux.ReadonlyPaths = nil 77 if spec.Linux.Resources != nil { 78 spec.Linux.Resources.Devices = nil 79 } 80 } 81 82 out, err := json.MarshalIndent(spec, "", " ") 83 if err != nil { 84 log.Debugf("Failed to marshal spec: %v", err) 85 return 86 } 87 log.Debugf("Spec:\n%s", out) 88 } 89 90 // ValidateSpec validates that the spec is compatible with runsc. 91 func ValidateSpec(spec *specs.Spec) error { 92 // Mandatory fields. 93 if spec.Process == nil { 94 return fmt.Errorf("Spec.Process must be defined: %+v", spec) 95 } 96 if len(spec.Process.Args) == 0 { 97 return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) 98 } 99 if spec.Root == nil { 100 return fmt.Errorf("Spec.Root must be defined: %+v", spec) 101 } 102 if len(spec.Root.Path) == 0 { 103 return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) 104 } 105 106 // Unsupported fields. 107 if spec.Solaris != nil { 108 return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) 109 } 110 if spec.Windows != nil { 111 return fmt.Errorf("Spec.Windows is not supported: %+v", spec) 112 } 113 if len(spec.Process.SelinuxLabel) != 0 { 114 return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) 115 } 116 117 // Docker uses AppArmor by default, so just log that it's being ignored. 
118 if spec.Process.ApparmorProfile != "" { 119 log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) 120 } 121 122 // PR_SET_NO_NEW_PRIVS is assumed to always be set. 123 // See kernel.Task.updateCredsForExecLocked. 124 if !spec.Process.NoNewPrivileges { 125 log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") 126 } 127 128 if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { 129 if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { 130 return err 131 } 132 } 133 for _, m := range spec.Mounts { 134 if err := validateMount(&m); err != nil { 135 return err 136 } 137 } 138 139 // CRI specifies whether a container should start a new sandbox, or run 140 // another container in an existing sandbox. 141 switch SpecContainerType(spec) { 142 case ContainerTypeContainer: 143 // When starting a container in an existing sandbox, the 144 // sandbox ID must be set. 145 if _, ok := SandboxID(spec); !ok { 146 return fmt.Errorf("spec has container-type of container, but no sandbox ID set") 147 } 148 case ContainerTypeUnknown: 149 return fmt.Errorf("unknown container-type") 150 default: 151 } 152 153 return nil 154 } 155 156 // absPath turns the given path into an absolute path (if it is not already 157 // absolute) by prepending the base path. 158 func absPath(base, rel string) string { 159 if filepath.IsAbs(rel) { 160 return rel 161 } 162 return filepath.Join(base, rel) 163 } 164 165 // OpenSpec opens an OCI runtime spec from the given bundle directory. 166 func OpenSpec(bundleDir string) (*os.File, error) { 167 // The spec file must be named "config.json" inside the bundle directory. 168 return os.Open(filepath.Join(bundleDir, "config.json")) 169 } 170 171 // ReadSpec reads an OCI runtime spec from the given bundle directory. 172 // ReadSpec also normalizes all potential relative paths into absolute 173 // path, e.g. spec.Root.Path, mount.Source. 
174 func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { 175 specFile, err := OpenSpec(bundleDir) 176 if err != nil { 177 return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) 178 } 179 defer specFile.Close() 180 return ReadSpecFromFile(bundleDir, specFile, conf) 181 } 182 183 // ReadSpecFromFile reads an OCI runtime spec from the given file. It also fixes 184 // up the spec so that the rest of the code doesn't need to worry about it. 185 // 1. Normalizes all relative paths into absolute by prepending the bundle 186 // dir to them. 187 // 2. Looks for flag overrides and applies them if any. 188 // 3. Removes seccomp rules if `RuntimeDefault` was used. 189 func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { 190 if _, err := specFile.Seek(0, io.SeekStart); err != nil { 191 return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) 192 } 193 specBytes, err := ioutil.ReadAll(specFile) 194 if err != nil { 195 return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) 196 } 197 var spec specs.Spec 198 if err := json.Unmarshal(specBytes, &spec); err != nil { 199 return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) 200 } 201 if err := ValidateSpec(&spec); err != nil { 202 return nil, err 203 } 204 if err := fixSpec(&spec, bundleDir, conf); err != nil { 205 return nil, err 206 } 207 return &spec, nil 208 } 209 210 func fixSpec(spec *specs.Spec, bundleDir string, conf *config.Config) error { 211 // Turn any relative paths in the spec to absolute by prepending the bundleDir. 212 spec.Root.Path = absPath(bundleDir, spec.Root.Path) 213 for i := range spec.Mounts { 214 m := &spec.Mounts[i] 215 if m.Source != "" { 216 m.Source = absPath(bundleDir, m.Source) 217 } 218 } 219 // Look for config bundle annotations and verify that they exist. 
220 const configBundlePrefix = "dev.gvisor.bundle." 221 var bundles []config.BundleName 222 for annotation, val := range spec.Annotations { 223 if !strings.HasPrefix(annotation, configBundlePrefix) { 224 continue 225 } 226 if val != "true" { 227 return fmt.Errorf("invalid value %q for annotation %q (must be set to 'true' or removed entirely)", val, annotation) 228 } 229 bundleName := config.BundleName(annotation[len(configBundlePrefix):]) 230 if _, exists := config.Bundles[bundleName]; !exists { 231 log.Warningf("Bundle name %q (from annotation %q=%q) does not exist; this bundle may have been deprecated. Skipping.", bundleName, annotation, val) 232 continue 233 } 234 bundles = append(bundles, bundleName) 235 } 236 237 // Apply config bundles, if any. 238 if len(bundles) > 0 { 239 log.Infof("Applying config bundles: %v", bundles) 240 if err := conf.ApplyBundles(flag.CommandLine, bundles...); err != nil { 241 return err 242 } 243 } 244 245 containerName := ContainerName(spec) 246 for annotation, val := range spec.Annotations { 247 if strings.HasPrefix(annotation, annotationFlagPrefix) { 248 // Override flags using annotation to allow customization per sandbox 249 // instance. 250 name := annotation[len(annotationFlagPrefix):] 251 log.Infof("Overriding flag from flag annotation: --%s=%q", name, val) 252 if err := conf.Override(flag.CommandLine, name, val /* force= */, false); err != nil { 253 return err 254 } 255 } else if len(containerName) > 0 { 256 // If we know the container name, then check to see if seccomp 257 // instructions were given to the the container. 258 if annotation == annotationSeccomp+containerName && val == annotationSeccompRuntimeDefault { 259 // Container seccomp rules are redundant when using gVisor, so remove 260 // them when seccomp is set to RuntimeDefault. 
261 if spec.Linux != nil && spec.Linux.Seccomp != nil { 262 log.Debugf("Seccomp is being ignored because annotation %q is set to default.", annotationSeccomp) 263 spec.Linux.Seccomp = nil 264 } 265 } 266 } 267 } 268 return nil 269 } 270 271 // ReadMounts reads mount list from a file. 272 func ReadMounts(f *os.File) ([]specs.Mount, error) { 273 bytes, err := ioutil.ReadAll(f) 274 if err != nil { 275 return nil, fmt.Errorf("error reading mounts: %v", err) 276 } 277 var mounts []specs.Mount 278 if err := json.Unmarshal(bytes, &mounts); err != nil { 279 return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes)) 280 } 281 return mounts, nil 282 } 283 284 // ChangeMountType changes m.Type to the specified type. It may do necessary 285 // amends to m.Options. 286 func ChangeMountType(m *specs.Mount, newType string) { 287 m.Type = newType 288 289 // OCI spec allows bind mounts to be specified in options only. So if new type 290 // is not bind, remove bind/rbind from options. 291 // 292 // "For bind mounts (when options include either bind or rbind), the type is 293 // a dummy, often "none" (not listed in /proc/filesystems)." 294 if newType != "bind" { 295 newOpts := make([]string, 0, len(m.Options)) 296 for _, opt := range m.Options { 297 if opt != "rbind" && opt != "bind" { 298 newOpts = append(newOpts, opt) 299 } 300 } 301 m.Options = newOpts 302 } 303 } 304 305 // Capabilities takes in spec and returns a TaskCapabilities corresponding to 306 // the spec. 307 func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { 308 // Strip CAP_NET_RAW from all capability sets if necessary. 
309 skipSet := map[linux.Capability]struct{}{} 310 if !enableRaw { 311 skipSet[linux.CAP_NET_RAW] = struct{}{} 312 } 313 314 var caps auth.TaskCapabilities 315 if specCaps != nil { 316 var err error 317 if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { 318 return nil, err 319 } 320 if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { 321 return nil, err 322 } 323 if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { 324 return nil, err 325 } 326 if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { 327 return nil, err 328 } 329 // TODO(gvisor.dev/issue/3166): Support ambient capabilities. 330 } 331 return &caps, nil 332 } 333 334 // AllCapabilities returns a LinuxCapabilities struct with all capabilities. 335 func AllCapabilities() *specs.LinuxCapabilities { 336 var names []string 337 for n := range capFromName { 338 names = append(names, n) 339 } 340 return &specs.LinuxCapabilities{ 341 Bounding: names, 342 Effective: names, 343 Inheritable: names, 344 Permitted: names, 345 Ambient: names, 346 } 347 } 348 349 // AllCapabilitiesUint64 returns a bitmask containing all capabilities set. 350 func AllCapabilitiesUint64() uint64 { 351 var rv uint64 352 for _, cap := range capFromName { 353 rv |= bits.MaskOf64(int(cap)) 354 } 355 return rv 356 } 357 358 // MergeCapabilities merges the capabilites from first and second. 
359 func MergeCapabilities(first, second *specs.LinuxCapabilities) *specs.LinuxCapabilities { 360 return &specs.LinuxCapabilities{ 361 Bounding: mergeUnique(first.Bounding, second.Bounding), 362 Effective: mergeUnique(first.Effective, second.Effective), 363 Inheritable: mergeUnique(first.Inheritable, second.Inheritable), 364 Permitted: mergeUnique(first.Permitted, second.Permitted), 365 Ambient: mergeUnique(first.Ambient, second.Ambient), 366 } 367 } 368 369 // DropCapability removes the specified capability from all capability sets. 370 func DropCapability(caps *specs.LinuxCapabilities, drop string) { 371 caps.Bounding = remove(caps.Bounding, drop) 372 caps.Effective = remove(caps.Effective, drop) 373 caps.Inheritable = remove(caps.Inheritable, drop) 374 caps.Permitted = remove(caps.Permitted, drop) 375 caps.Ambient = remove(caps.Ambient, drop) 376 } 377 378 func mergeUnique(strSlices ...[]string) []string { 379 common := make(map[string]struct{}) 380 for _, strSlice := range strSlices { 381 for _, s := range strSlice { 382 common[s] = struct{}{} 383 } 384 } 385 386 res := make([]string, 0, len(common)) 387 for s := range common { 388 res = append(res, s) 389 } 390 return res 391 } 392 393 func remove(ss []string, rem string) []string { 394 var out []string 395 for _, s := range ss { 396 if s == rem { 397 continue 398 } 399 out = append(out, s) 400 } 401 return out 402 } 403 404 var capFromName = map[string]linux.Capability{ 405 "CAP_CHOWN": linux.CAP_CHOWN, 406 "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, 407 "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, 408 "CAP_FOWNER": linux.CAP_FOWNER, 409 "CAP_FSETID": linux.CAP_FSETID, 410 "CAP_KILL": linux.CAP_KILL, 411 "CAP_SETGID": linux.CAP_SETGID, 412 "CAP_SETUID": linux.CAP_SETUID, 413 "CAP_SETPCAP": linux.CAP_SETPCAP, 414 "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, 415 "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, 416 "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, 417 "CAP_NET_ADMIN": 
linux.CAP_NET_ADMIN, 418 "CAP_NET_RAW": linux.CAP_NET_RAW, 419 "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, 420 "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, 421 "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, 422 "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, 423 "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, 424 "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, 425 "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, 426 "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, 427 "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, 428 "CAP_SYS_NICE": linux.CAP_SYS_NICE, 429 "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, 430 "CAP_SYS_TIME": linux.CAP_SYS_TIME, 431 "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, 432 "CAP_MKNOD": linux.CAP_MKNOD, 433 "CAP_LEASE": linux.CAP_LEASE, 434 "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, 435 "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, 436 "CAP_SETFCAP": linux.CAP_SETFCAP, 437 "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, 438 "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, 439 "CAP_SYSLOG": linux.CAP_SYSLOG, 440 "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, 441 "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, 442 "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, 443 "CAP_PERFMON": linux.CAP_PERFMON, 444 "CAP_BPF": linux.CAP_BPF, 445 "CAP_CHECKPOINT_RESTORE": linux.CAP_CHECKPOINT_RESTORE, 446 } 447 448 func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { 449 var caps []linux.Capability 450 for _, n := range names { 451 c, ok := capFromName[n] 452 if !ok { 453 return 0, fmt.Errorf("unknown capability %q", n) 454 } 455 // Should we skip this capabilty? 456 if _, ok := skipSet[c]; ok { 457 continue 458 } 459 caps = append(caps, c) 460 } 461 return auth.CapabilitySetOfMany(caps), nil 462 } 463 464 // IsGoferMount returns true if the given mount can be mounted as an external 465 // gofer. 
466 func IsGoferMount(m specs.Mount) bool { 467 MaybeConvertToBindMount(&m) 468 return m.Type == "bind" && m.Source != "" 469 } 470 471 // MaybeConvertToBindMount converts mount type to "bind" in case any of the 472 // mount options are either "bind" or "rbind" as required by the OCI spec. 473 // 474 // "For bind mounts (when options include either bind or rbind), the type is a 475 // dummy, often "none" (not listed in /proc/filesystems)." 476 func MaybeConvertToBindMount(m *specs.Mount) { 477 if m.Type == "bind" { 478 return 479 } 480 for _, opt := range m.Options { 481 if opt == "bind" || opt == "rbind" { 482 m.Type = "bind" 483 return 484 } 485 } 486 } 487 488 // WaitForReady waits for a process to become ready. The process is ready when 489 // the 'ready' function returns true. It continues to wait if 'ready' returns 490 // false. It returns error on timeout, if the process stops or if 'ready' fails. 491 func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { 492 b := backoff.NewExponentialBackOff() 493 b.InitialInterval = 1 * time.Millisecond 494 b.MaxInterval = 1 * time.Second 495 b.MaxElapsedTime = timeout 496 497 op := func() error { 498 if ok, err := ready(); err != nil { 499 return backoff.Permanent(err) 500 } else if ok { 501 return nil 502 } 503 504 // Check if the process is still running. 505 // If the process is alive, child is 0 because of the NOHANG option. 506 // If the process has terminated, child equals the process id. 507 var ws unix.WaitStatus 508 var ru unix.Rusage 509 child, err := unix.Wait4(pid, &ws, unix.WNOHANG, &ru) 510 if err != nil { 511 return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) 512 } else if child == pid { 513 return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) 514 } 515 return fmt.Errorf("process %d not running yet", pid) 516 } 517 return backoff.Retry(op, b) 518 } 519 520 // DebugLogFile opens a log file using 'logPattern' as location. 
If 'logPattern' 521 // ends with '/', it's used as a directory with default file name. 522 // 'logPattern' can contain variables that are substituted: 523 // - %TIMESTAMP%: is replaced with a timestamp using the following format: 524 // <yyyymmdd-hhmmss.uuuuuu> 525 // - %COMMAND%: is replaced with 'command' 526 // - %TEST%: is replaced with 'test' (omitted by default) 527 func DebugLogFile(logPattern, command, test string) (*os.File, error) { 528 if strings.HasSuffix(logPattern, "/") { 529 // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>.txt 530 logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%.txt" 531 } 532 logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1) 533 logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) 534 logPattern = strings.Replace(logPattern, "%TEST%", test, -1) 535 536 dir := filepath.Dir(logPattern) 537 if err := os.MkdirAll(dir, 0775); err != nil { 538 return nil, fmt.Errorf("error creating dir %q: %v", dir, err) 539 } 540 return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) 541 } 542 543 // IsDebugCommand returns true if the command should be debugged or not, based 544 // on the current configuration. 545 func IsDebugCommand(conf *config.Config, command string) bool { 546 if len(conf.DebugCommand) == 0 { 547 // Debug everything by default. 548 return true 549 } 550 filter := conf.DebugCommand 551 rv := true 552 if filter[0] == '!' { 553 // Negate the match, e.g. !boot should log all, but "boot". 554 filter = filter[1:] 555 rv = false 556 } 557 for _, cmd := range strings.Split(filter, ",") { 558 if cmd == command { 559 return rv 560 } 561 } 562 return !rv 563 } 564 565 // TPUProxyIsEnabled checks if tpuproxy is enabled in the config or annotations. 
566 func TPUProxyIsEnabled(spec *specs.Spec, conf *config.Config) bool { 567 if conf.TPUProxy { 568 return true 569 } 570 val, ok := spec.Annotations[annotationTPU] 571 if !ok { 572 return false 573 } 574 ret, err := strconv.ParseBool(val) 575 if err != nil { 576 log.Warningf("tpuproxy annotation set to invalid value %q: %w. Skipping.", val, err) 577 } 578 return ret 579 } 580 581 // VFIOFunctionalityRequested returns true if the container should have access 582 // to VFIO functionality. 583 func VFIOFunctionalityRequested(dev *specs.LinuxDevice) bool { 584 return strings.HasPrefix(dev.Path, filepath.Dir(tpuproxy.VFIOPath)) 585 } 586 587 // AcceleratorFunctionalityRequested returns true if the container should have 588 // access to compute accelerators. Compute accelerators are different from GPUs 589 // by using a different major number and different device char files. 590 func AcceleratorFunctionalityRequested(dev *specs.LinuxDevice) bool { 591 return strings.HasPrefix(dev.Path, "/dev/accel") 592 } 593 594 // TPUFunctionalityRequested returns true if the container should have access 595 // to TPU functionality. 596 func TPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool { 597 if !TPUProxyIsEnabled(spec, conf) { 598 return false 599 } 600 if spec.Linux != nil { 601 for _, dev := range spec.Linux.Devices { 602 if AcceleratorFunctionalityRequested(&dev) || VFIOFunctionalityRequested(&dev) { 603 return true 604 } 605 } 606 } 607 return false 608 } 609 610 // SafeSetupAndMount creates the mount point and calls Mount with the given 611 // flags. procPath is the path to procfs. If it is "", procfs is assumed to be 612 // mounted at /proc. 613 func SafeSetupAndMount(src, dst, typ string, flags uint32, procPath string) error { 614 // Create the mount point inside. The type must be the same as the source 615 // (file or directory). 616 var isDir bool 617 if typ == "proc" { 618 // Special case, as there is no source directory for proc mounts. 
619 isDir = true 620 } else if fi, err := os.Stat(src); err != nil { 621 return fmt.Errorf("stat(%q) failed: %v", src, err) 622 } else { 623 isDir = fi.IsDir() 624 } 625 626 if isDir { 627 // Create the destination directory. 628 if err := os.MkdirAll(dst, 0777); err != nil { 629 return fmt.Errorf("mkdir(%q) failed: %v", dst, err) 630 } 631 } else { 632 // Create the parent destination directory. 633 parent := path.Dir(dst) 634 if err := os.MkdirAll(parent, 0777); err != nil { 635 return fmt.Errorf("mkdir(%q) failed: %v", parent, err) 636 } 637 // Create the destination file if it does not exist. 638 f, err := os.OpenFile(dst, unix.O_CREAT, 0777) 639 if err != nil { 640 return fmt.Errorf("open(%q) failed: %v", dst, err) 641 } 642 f.Close() 643 } 644 645 // Do the mount. 646 if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil { 647 return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err) 648 } 649 return nil 650 } 651 652 // ErrSymlinkMount is returned by SafeMount when the mount destination is found 653 // to be a symlink. 654 type ErrSymlinkMount struct { 655 error 656 } 657 658 // SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is 659 // the path to procfs. If it is "", procfs is assumed to be mounted at /proc. 660 // 661 // SafeMount can fail when dst contains a symlink. However, it is called in the 662 // normal case with a destination consisting of a known root (/proc/root) and 663 // symlink-free path (from resolveSymlink). 664 func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error { 665 // Open the destination. 666 fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0) 667 if err != nil { 668 return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err) 669 } 670 defer unix.Close(fd) 671 672 // Use /proc/self/fd/ to verify that we opened the intended destination. 
This 673 // guards against dst being a symlink, in which case we could accidentally 674 // mount over the symlink's target. 675 if procPath == "" { 676 procPath = "/proc" 677 } 678 safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd)) 679 target, err := os.Readlink(safePath) 680 if err != nil { 681 return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err) 682 } 683 if dst != target { 684 return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)} 685 } 686 687 return unix.Mount(src, safePath, fstype, flags, data) 688 } 689 690 // ContainsStr returns true if 'str' is inside 'strs'. 691 func ContainsStr(strs []string, str string) bool { 692 for _, s := range strs { 693 if s == str { 694 return true 695 } 696 } 697 return false 698 } 699 700 // RetryEintr retries the function until an error different than EINTR is 701 // returned. 702 func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { 703 for { 704 r1, r2, err := f() 705 if err != unix.EINTR { 706 return r1, r2, err 707 } 708 } 709 } 710 711 // GetOOMScoreAdj reads the given process' oom_score_adj 712 func GetOOMScoreAdj(pid int) (int, error) { 713 data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid)) 714 if err != nil { 715 return 0, err 716 } 717 return strconv.Atoi(strings.TrimSpace(string(data))) 718 } 719 720 // EnvVar looks for a variable value in the env slice assuming the following 721 // format: "NAME=VALUE". If a variable is defined multiple times, the last 722 // value is used. 
723 func EnvVar(env []string, name string) (string, bool) { 724 var err error 725 env, err = ResolveEnvs(env) 726 if err != nil { 727 return "", false 728 } 729 prefix := name + "=" 730 for _, e := range env { 731 if strings.HasPrefix(e, prefix) { 732 return strings.TrimPrefix(e, prefix), true 733 } 734 } 735 return "", false 736 } 737 738 // ResolveEnvs transforms lists of environment variables into a single list of 739 // environment variables. If a variable is defined multiple times, the last 740 // value is used. 741 func ResolveEnvs(envs ...[]string) ([]string, error) { 742 // First create a map of variable names to values. This removes any 743 // duplicates. 744 envMap := make(map[string]string) 745 for _, env := range envs { 746 for _, str := range env { 747 parts := strings.SplitN(str, "=", 2) 748 if len(parts) != 2 { 749 return nil, fmt.Errorf("invalid variable: %s", str) 750 } 751 envMap[parts[0]] = parts[1] 752 } 753 } 754 // Reassemble envMap into a list of environment variables of the form 755 // NAME=VALUE. 756 env := make([]string, 0, len(envMap)) 757 for k, v := range envMap { 758 env = append(env, fmt.Sprintf("%s=%s", k, v)) 759 } 760 return env, nil 761 } 762 763 // FaqErrorMsg returns an error message pointing to the FAQ. 764 func FaqErrorMsg(anchor, msg string) string { 765 return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor) 766 } 767 768 // ContainerName looks for an annotation in the spec with the container name. Returns empty string 769 // if no annotation is found. 770 func ContainerName(spec *specs.Spec) string { 771 return spec.Annotations[annotationContainerName] 772 }