// Source: gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/specutils/specutils.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package specutils contains utility functions for working with OCI runtime
// specs.
package specutils

import (
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/cenkalti/backoff"
	"github.com/mohae/deepcopy"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/flag"
)

const (
	// annotationFlagPrefix is the prefix of spec annotations that override a
	// runsc flag; the rest of the annotation key is the flag name and the
	// annotation value is the flag value (see fixSpec).
	annotationFlagPrefix = "dev.gvisor.flag."
	// annotationSeccomp is the prefix of the per-container seccomp annotation;
	// fixSpec appends the container name to it to build the full key.
	annotationSeccomp = "dev.gvisor.internal.seccomp."
	// annotationSeccompRuntimeDefault is the seccomp annotation value that
	// makes fixSpec drop the seccomp rules from the spec.
	annotationSeccompRuntimeDefault = "RuntimeDefault"

	// annotationContainerName is the annotation key that ContainerName reads
	// to find the container's name.
	annotationContainerName = "io.kubernetes.cri.container-name"
)

const (
	// AnnotationTPU is the annotation used to enable TPU proxy on a pod.
	AnnotationTPU = "dev.gvisor.internal.tpuproxy"
)

// ExePath must point to runsc binary, which is normally the same binary. It's
// changed in tests that aren't linked in the same binary.
59 var ExePath = "/proc/self/exe" 60 61 // Version is the supported spec version. 62 var Version = specs.Version 63 64 // LogSpecDebug writes the spec in a human-friendly format to the debug log. 65 func LogSpecDebug(orig *specs.Spec, logSeccomp bool) { 66 if !log.IsLogging(log.Debug) { 67 return 68 } 69 70 // Strip down parts of the spec that are not interesting. 71 spec := deepcopy.Copy(orig).(*specs.Spec) 72 if spec.Process != nil { 73 spec.Process.Capabilities = nil 74 } 75 if spec.Linux != nil { 76 if !logSeccomp { 77 spec.Linux.Seccomp = nil 78 } 79 spec.Linux.MaskedPaths = nil 80 spec.Linux.ReadonlyPaths = nil 81 if spec.Linux.Resources != nil { 82 spec.Linux.Resources.Devices = nil 83 } 84 } 85 86 out, err := json.MarshalIndent(spec, "", " ") 87 if err != nil { 88 log.Debugf("Failed to marshal spec: %v", err) 89 return 90 } 91 log.Debugf("Spec:\n%s", out) 92 } 93 94 // ValidateSpec validates that the spec is compatible with runsc. 95 func ValidateSpec(spec *specs.Spec) error { 96 // Mandatory fields. 97 if spec.Process == nil { 98 return fmt.Errorf("Spec.Process must be defined: %+v", spec) 99 } 100 if len(spec.Process.Args) == 0 { 101 return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process) 102 } 103 if spec.Root == nil { 104 return fmt.Errorf("Spec.Root must be defined: %+v", spec) 105 } 106 if len(spec.Root.Path) == 0 { 107 return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root) 108 } 109 110 // Unsupported fields. 111 if spec.Solaris != nil { 112 return fmt.Errorf("Spec.Solaris is not supported: %+v", spec) 113 } 114 if spec.Windows != nil { 115 return fmt.Errorf("Spec.Windows is not supported: %+v", spec) 116 } 117 if len(spec.Process.SelinuxLabel) != 0 { 118 return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel) 119 } 120 121 // Docker uses AppArmor by default, so just log that it's being ignored. 
122 if spec.Process.ApparmorProfile != "" { 123 log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile) 124 } 125 126 // PR_SET_NO_NEW_PRIVS is assumed to always be set. 127 // See kernel.Task.updateCredsForExecLocked. 128 if !spec.Process.NoNewPrivileges { 129 log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.") 130 } 131 132 if spec.Linux != nil && spec.Linux.RootfsPropagation != "" { 133 if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil { 134 return err 135 } 136 } 137 for _, m := range spec.Mounts { 138 if err := validateMount(&m); err != nil { 139 return err 140 } 141 } 142 143 // CRI specifies whether a container should start a new sandbox, or run 144 // another container in an existing sandbox. 145 switch SpecContainerType(spec) { 146 case ContainerTypeContainer: 147 // When starting a container in an existing sandbox, the 148 // sandbox ID must be set. 149 if _, ok := SandboxID(spec); !ok { 150 return fmt.Errorf("spec has container-type of container, but no sandbox ID set") 151 } 152 case ContainerTypeUnknown: 153 return fmt.Errorf("unknown container-type") 154 default: 155 } 156 157 return nil 158 } 159 160 // absPath turns the given path into an absolute path (if it is not already 161 // absolute) by prepending the base path. 162 func absPath(base, rel string) string { 163 if filepath.IsAbs(rel) { 164 return rel 165 } 166 return filepath.Join(base, rel) 167 } 168 169 // OpenSpec opens an OCI runtime spec from the given bundle directory. 170 func OpenSpec(bundleDir string) (*os.File, error) { 171 // The spec file must be named "config.json" inside the bundle directory. 172 return os.Open(filepath.Join(bundleDir, "config.json")) 173 } 174 175 // ReadSpec reads an OCI runtime spec from the given bundle directory. 176 // ReadSpec also normalizes all potential relative paths into absolute 177 // path, e.g. spec.Root.Path, mount.Source. 
178 func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) { 179 specFile, err := OpenSpec(bundleDir) 180 if err != nil { 181 return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err) 182 } 183 defer specFile.Close() 184 return ReadSpecFromFile(bundleDir, specFile, conf) 185 } 186 187 // ReadSpecFromFile reads an OCI runtime spec from the given file. It also fixes 188 // up the spec so that the rest of the code doesn't need to worry about it. 189 // 1. Normalizes all relative paths into absolute by prepending the bundle 190 // dir to them. 191 // 2. Looks for flag overrides and applies them if any. 192 // 3. Removes seccomp rules if `RuntimeDefault` was used. 193 func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) { 194 if _, err := specFile.Seek(0, io.SeekStart); err != nil { 195 return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err) 196 } 197 specBytes, err := ioutil.ReadAll(specFile) 198 if err != nil { 199 return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err) 200 } 201 var spec specs.Spec 202 if err := json.Unmarshal(specBytes, &spec); err != nil { 203 return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes)) 204 } 205 if err := ValidateSpec(&spec); err != nil { 206 return nil, err 207 } 208 if err := fixSpec(&spec, bundleDir, conf); err != nil { 209 return nil, err 210 } 211 return &spec, nil 212 } 213 214 func fixSpec(spec *specs.Spec, bundleDir string, conf *config.Config) error { 215 // Turn any relative paths in the spec to absolute by prepending the bundleDir. 216 spec.Root.Path = absPath(bundleDir, spec.Root.Path) 217 for i := range spec.Mounts { 218 m := &spec.Mounts[i] 219 if m.Source != "" { 220 m.Source = absPath(bundleDir, m.Source) 221 } 222 } 223 // Look for config bundle annotations and verify that they exist. 
224 const configBundlePrefix = "dev.gvisor.bundle." 225 var bundles []config.BundleName 226 for annotation, val := range spec.Annotations { 227 if !strings.HasPrefix(annotation, configBundlePrefix) { 228 continue 229 } 230 if val != "true" { 231 return fmt.Errorf("invalid value %q for annotation %q (must be set to 'true' or removed entirely)", val, annotation) 232 } 233 bundleName := config.BundleName(annotation[len(configBundlePrefix):]) 234 if _, exists := config.Bundles[bundleName]; !exists { 235 log.Warningf("Bundle name %q (from annotation %q=%q) does not exist; this bundle may have been deprecated. Skipping.", bundleName, annotation, val) 236 continue 237 } 238 bundles = append(bundles, bundleName) 239 } 240 241 // Apply config bundles, if any. 242 if len(bundles) > 0 { 243 log.Infof("Applying config bundles: %v", bundles) 244 if err := conf.ApplyBundles(flag.CommandLine, bundles...); err != nil { 245 return err 246 } 247 } 248 249 containerName := ContainerName(spec) 250 for annotation, val := range spec.Annotations { 251 if strings.HasPrefix(annotation, annotationFlagPrefix) { 252 // Override flags using annotation to allow customization per sandbox 253 // instance. 254 name := annotation[len(annotationFlagPrefix):] 255 log.Infof("Overriding flag from flag annotation: --%s=%q", name, val) 256 if err := conf.Override(flag.CommandLine, name, val /* force= */, false); err != nil { 257 return err 258 } 259 } else if len(containerName) > 0 { 260 // If we know the container name, then check to see if seccomp 261 // instructions were given to the container. 262 if annotation == annotationSeccomp+containerName && val == annotationSeccompRuntimeDefault { 263 // Container seccomp rules are redundant when using gVisor, so remove 264 // them when seccomp is set to RuntimeDefault. 
265 if spec.Linux != nil && spec.Linux.Seccomp != nil { 266 log.Debugf("Seccomp is being ignored because annotation %q is set to default.", annotationSeccomp) 267 spec.Linux.Seccomp = nil 268 } 269 } 270 } 271 } 272 return nil 273 } 274 275 // ReadMounts reads mount list from a file. 276 func ReadMounts(f *os.File) ([]specs.Mount, error) { 277 bytes, err := ioutil.ReadAll(f) 278 if err != nil { 279 return nil, fmt.Errorf("error reading mounts: %v", err) 280 } 281 var mounts []specs.Mount 282 if err := json.Unmarshal(bytes, &mounts); err != nil { 283 return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes)) 284 } 285 return mounts, nil 286 } 287 288 // ChangeMountType changes m.Type to the specified type. It may do necessary 289 // amends to m.Options. 290 func ChangeMountType(m *specs.Mount, newType string) { 291 m.Type = newType 292 293 // OCI spec allows bind mounts to be specified in options only. So if new type 294 // is not bind, remove bind/rbind from options. 295 // 296 // "For bind mounts (when options include either bind or rbind), the type is 297 // a dummy, often "none" (not listed in /proc/filesystems)." 298 if newType != "bind" { 299 newOpts := make([]string, 0, len(m.Options)) 300 for _, opt := range m.Options { 301 if opt != "rbind" && opt != "bind" { 302 newOpts = append(newOpts, opt) 303 } 304 } 305 m.Options = newOpts 306 } 307 } 308 309 // Capabilities takes in spec and returns a TaskCapabilities corresponding to 310 // the spec. 311 func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) { 312 // Strip CAP_NET_RAW from all capability sets if necessary. 
313 skipSet := map[linux.Capability]struct{}{} 314 if !enableRaw { 315 skipSet[linux.CAP_NET_RAW] = struct{}{} 316 } 317 318 var caps auth.TaskCapabilities 319 if specCaps != nil { 320 var err error 321 if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil { 322 return nil, err 323 } 324 if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil { 325 return nil, err 326 } 327 if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil { 328 return nil, err 329 } 330 if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil { 331 return nil, err 332 } 333 // TODO(gvisor.dev/issue/3166): Support ambient capabilities. 334 } 335 return &caps, nil 336 } 337 338 // AllCapabilities returns a LinuxCapabilities struct with all capabilities. 339 func AllCapabilities() *specs.LinuxCapabilities { 340 var names []string 341 for n := range capFromName { 342 names = append(names, n) 343 } 344 return &specs.LinuxCapabilities{ 345 Bounding: names, 346 Effective: names, 347 Inheritable: names, 348 Permitted: names, 349 Ambient: names, 350 } 351 } 352 353 // AllCapabilitiesUint64 returns a bitmask containing all capabilities set. 354 func AllCapabilitiesUint64() uint64 { 355 var rv uint64 356 for _, cap := range capFromName { 357 rv |= bits.MaskOf64(int(cap)) 358 } 359 return rv 360 } 361 362 // MergeCapabilities merges the capabilites from first and second. 
363 func MergeCapabilities(first, second *specs.LinuxCapabilities) *specs.LinuxCapabilities { 364 return &specs.LinuxCapabilities{ 365 Bounding: mergeUnique(first.Bounding, second.Bounding), 366 Effective: mergeUnique(first.Effective, second.Effective), 367 Inheritable: mergeUnique(first.Inheritable, second.Inheritable), 368 Permitted: mergeUnique(first.Permitted, second.Permitted), 369 Ambient: mergeUnique(first.Ambient, second.Ambient), 370 } 371 } 372 373 // DropCapability removes the specified capability from all capability sets. 374 func DropCapability(caps *specs.LinuxCapabilities, drop string) { 375 caps.Bounding = remove(caps.Bounding, drop) 376 caps.Effective = remove(caps.Effective, drop) 377 caps.Inheritable = remove(caps.Inheritable, drop) 378 caps.Permitted = remove(caps.Permitted, drop) 379 caps.Ambient = remove(caps.Ambient, drop) 380 } 381 382 func mergeUnique(strSlices ...[]string) []string { 383 common := make(map[string]struct{}) 384 for _, strSlice := range strSlices { 385 for _, s := range strSlice { 386 common[s] = struct{}{} 387 } 388 } 389 390 res := make([]string, 0, len(common)) 391 for s := range common { 392 res = append(res, s) 393 } 394 return res 395 } 396 397 func remove(ss []string, rem string) []string { 398 var out []string 399 for _, s := range ss { 400 if s == rem { 401 continue 402 } 403 out = append(out, s) 404 } 405 return out 406 } 407 408 var capFromName = map[string]linux.Capability{ 409 "CAP_CHOWN": linux.CAP_CHOWN, 410 "CAP_DAC_OVERRIDE": linux.CAP_DAC_OVERRIDE, 411 "CAP_DAC_READ_SEARCH": linux.CAP_DAC_READ_SEARCH, 412 "CAP_FOWNER": linux.CAP_FOWNER, 413 "CAP_FSETID": linux.CAP_FSETID, 414 "CAP_KILL": linux.CAP_KILL, 415 "CAP_SETGID": linux.CAP_SETGID, 416 "CAP_SETUID": linux.CAP_SETUID, 417 "CAP_SETPCAP": linux.CAP_SETPCAP, 418 "CAP_LINUX_IMMUTABLE": linux.CAP_LINUX_IMMUTABLE, 419 "CAP_NET_BIND_SERVICE": linux.CAP_NET_BIND_SERVICE, 420 "CAP_NET_BROADCAST": linux.CAP_NET_BROADCAST, 421 "CAP_NET_ADMIN": 
linux.CAP_NET_ADMIN, 422 "CAP_NET_RAW": linux.CAP_NET_RAW, 423 "CAP_IPC_LOCK": linux.CAP_IPC_LOCK, 424 "CAP_IPC_OWNER": linux.CAP_IPC_OWNER, 425 "CAP_SYS_MODULE": linux.CAP_SYS_MODULE, 426 "CAP_SYS_RAWIO": linux.CAP_SYS_RAWIO, 427 "CAP_SYS_CHROOT": linux.CAP_SYS_CHROOT, 428 "CAP_SYS_PTRACE": linux.CAP_SYS_PTRACE, 429 "CAP_SYS_PACCT": linux.CAP_SYS_PACCT, 430 "CAP_SYS_ADMIN": linux.CAP_SYS_ADMIN, 431 "CAP_SYS_BOOT": linux.CAP_SYS_BOOT, 432 "CAP_SYS_NICE": linux.CAP_SYS_NICE, 433 "CAP_SYS_RESOURCE": linux.CAP_SYS_RESOURCE, 434 "CAP_SYS_TIME": linux.CAP_SYS_TIME, 435 "CAP_SYS_TTY_CONFIG": linux.CAP_SYS_TTY_CONFIG, 436 "CAP_MKNOD": linux.CAP_MKNOD, 437 "CAP_LEASE": linux.CAP_LEASE, 438 "CAP_AUDIT_WRITE": linux.CAP_AUDIT_WRITE, 439 "CAP_AUDIT_CONTROL": linux.CAP_AUDIT_CONTROL, 440 "CAP_SETFCAP": linux.CAP_SETFCAP, 441 "CAP_MAC_OVERRIDE": linux.CAP_MAC_OVERRIDE, 442 "CAP_MAC_ADMIN": linux.CAP_MAC_ADMIN, 443 "CAP_SYSLOG": linux.CAP_SYSLOG, 444 "CAP_WAKE_ALARM": linux.CAP_WAKE_ALARM, 445 "CAP_BLOCK_SUSPEND": linux.CAP_BLOCK_SUSPEND, 446 "CAP_AUDIT_READ": linux.CAP_AUDIT_READ, 447 "CAP_PERFMON": linux.CAP_PERFMON, 448 "CAP_BPF": linux.CAP_BPF, 449 "CAP_CHECKPOINT_RESTORE": linux.CAP_CHECKPOINT_RESTORE, 450 } 451 452 func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) { 453 var caps []linux.Capability 454 for _, n := range names { 455 c, ok := capFromName[n] 456 if !ok { 457 return 0, fmt.Errorf("unknown capability %q", n) 458 } 459 // Should we skip this capabilty? 460 if _, ok := skipSet[c]; ok { 461 continue 462 } 463 caps = append(caps, c) 464 } 465 return auth.CapabilitySetOfMany(caps), nil 466 } 467 468 // IsGoferMount returns true if the given mount can be mounted as an external 469 // gofer. 
470 func IsGoferMount(m specs.Mount) bool { 471 MaybeConvertToBindMount(&m) 472 return m.Type == "bind" && m.Source != "" 473 } 474 475 // MaybeConvertToBindMount converts mount type to "bind" in case any of the 476 // mount options are either "bind" or "rbind" as required by the OCI spec. 477 // 478 // "For bind mounts (when options include either bind or rbind), the type is a 479 // dummy, often "none" (not listed in /proc/filesystems)." 480 func MaybeConvertToBindMount(m *specs.Mount) { 481 if m.Type == "bind" { 482 return 483 } 484 for _, opt := range m.Options { 485 if opt == "bind" || opt == "rbind" { 486 m.Type = "bind" 487 return 488 } 489 } 490 } 491 492 // WaitForReady waits for a process to become ready. The process is ready when 493 // the 'ready' function returns true. It continues to wait if 'ready' returns 494 // false. It returns error on timeout, if the process stops or if 'ready' fails. 495 func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error { 496 b := backoff.NewExponentialBackOff() 497 b.InitialInterval = 1 * time.Millisecond 498 b.MaxInterval = 1 * time.Second 499 b.MaxElapsedTime = timeout 500 501 op := func() error { 502 if ok, err := ready(); err != nil { 503 return backoff.Permanent(err) 504 } else if ok { 505 return nil 506 } 507 508 // Check if the process is still running. 509 // If the process is alive, child is 0 because of the NOHANG option. 510 // If the process has terminated, child equals the process id. 511 var ws unix.WaitStatus 512 var ru unix.Rusage 513 child, err := unix.Wait4(pid, &ws, unix.WNOHANG, &ru) 514 if err != nil { 515 return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err)) 516 } else if child == pid { 517 return backoff.Permanent(fmt.Errorf("process %d has terminated", pid)) 518 } 519 return fmt.Errorf("process %d not running yet", pid) 520 } 521 return backoff.Retry(op, b) 522 } 523 524 // DebugLogFile opens a log file using 'logPattern' as location. 
If 'logPattern' 525 // ends with '/', it's used as a directory with default file name. 526 // 'logPattern' can contain variables that are substituted: 527 // - %TIMESTAMP%: is replaced with a timestamp using the following format: 528 // <yyyymmdd-hhmmss.uuuuuu> 529 // - %COMMAND%: is replaced with 'command' 530 // - %TEST%: is replaced with 'test' (omitted by default) 531 func DebugLogFile(logPattern, command, test string) (*os.File, error) { 532 if strings.HasSuffix(logPattern, "/") { 533 // Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>.txt 534 logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%.txt" 535 } 536 logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1) 537 logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1) 538 logPattern = strings.Replace(logPattern, "%TEST%", test, -1) 539 540 dir := filepath.Dir(logPattern) 541 if err := os.MkdirAll(dir, 0775); err != nil { 542 return nil, fmt.Errorf("error creating dir %q: %v", dir, err) 543 } 544 return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664) 545 } 546 547 // IsDebugCommand returns true if the command should be debugged or not, based 548 // on the current configuration. 549 func IsDebugCommand(conf *config.Config, command string) bool { 550 if len(conf.DebugCommand) == 0 { 551 // Debug everything by default. 552 return true 553 } 554 filter := conf.DebugCommand 555 rv := true 556 if filter[0] == '!' { 557 // Negate the match, e.g. !boot should log all, but "boot". 558 filter = filter[1:] 559 rv = false 560 } 561 for _, cmd := range strings.Split(filter, ",") { 562 if cmd == command { 563 return rv 564 } 565 } 566 return !rv 567 } 568 569 // TPUProxyIsEnabled checks if tpuproxy is enabled in the config or annotations. 
570 func TPUProxyIsEnabled(spec *specs.Spec, conf *config.Config) bool { 571 if conf.TPUProxy { 572 return true 573 } 574 val, ok := spec.Annotations[AnnotationTPU] 575 if !ok { 576 return false 577 } 578 ret, err := strconv.ParseBool(val) 579 if err != nil { 580 log.Warningf("tpuproxy annotation set to invalid value %q: %w. Skipping.", val, err) 581 } 582 return ret 583 } 584 585 // VFIOFunctionalityRequested returns true if the container should have access 586 // to VFIO functionality. 587 func VFIOFunctionalityRequested(dev *specs.LinuxDevice) bool { 588 return strings.HasPrefix(dev.Path, filepath.Dir(tpuproxy.VFIOPath)) 589 } 590 591 // AcceleratorFunctionalityRequested returns true if the container should have 592 // access to compute accelerators. Compute accelerators are different from GPUs 593 // by using a different major number and different device char files. 594 func AcceleratorFunctionalityRequested(dev *specs.LinuxDevice) bool { 595 return strings.HasPrefix(dev.Path, "/dev/accel") 596 } 597 598 // TPUFunctionalityRequested returns true if the container should have access 599 // to TPU functionality. 600 func TPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool { 601 if !TPUProxyIsEnabled(spec, conf) { 602 return false 603 } 604 if spec.Linux != nil { 605 for _, dev := range spec.Linux.Devices { 606 if AcceleratorFunctionalityRequested(&dev) || VFIOFunctionalityRequested(&dev) { 607 return true 608 } 609 } 610 } 611 return false 612 } 613 614 // SafeSetupAndMount creates the mount point and calls Mount with the given 615 // flags. procPath is the path to procfs. If it is "", procfs is assumed to be 616 // mounted at /proc. 617 func SafeSetupAndMount(src, dst, typ string, flags uint32, procPath string) error { 618 // Create the mount point inside. The type must be the same as the source 619 // (file or directory). 620 var isDir bool 621 if typ == "proc" { 622 // Special case, as there is no source directory for proc mounts. 
623 isDir = true 624 } else if fi, err := os.Stat(src); err != nil { 625 return fmt.Errorf("stat(%q) failed: %v", src, err) 626 } else { 627 isDir = fi.IsDir() 628 } 629 630 if isDir { 631 // Create the destination directory. 632 if err := os.MkdirAll(dst, 0777); err != nil { 633 return fmt.Errorf("mkdir(%q) failed: %v", dst, err) 634 } 635 } else { 636 // Create the parent destination directory. 637 parent := path.Dir(dst) 638 if err := os.MkdirAll(parent, 0777); err != nil { 639 return fmt.Errorf("mkdir(%q) failed: %v", parent, err) 640 } 641 // Create the destination file if it does not exist. 642 f, err := os.OpenFile(dst, unix.O_CREAT, 0777) 643 if err != nil { 644 return fmt.Errorf("open(%q) failed: %v", dst, err) 645 } 646 f.Close() 647 } 648 649 // Do the mount. 650 if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil { 651 return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err) 652 } 653 return nil 654 } 655 656 // ErrSymlinkMount is returned by SafeMount when the mount destination is found 657 // to be a symlink. 658 type ErrSymlinkMount struct { 659 error 660 } 661 662 // SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is 663 // the path to procfs. If it is "", procfs is assumed to be mounted at /proc. 664 // 665 // SafeMount can fail when dst contains a symlink. However, it is called in the 666 // normal case with a destination consisting of a known root (/proc/root) and 667 // symlink-free path (from resolveSymlink). 668 func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error { 669 // Open the destination. 670 fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0) 671 if err != nil { 672 return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err) 673 } 674 defer unix.Close(fd) 675 676 // Use /proc/self/fd/ to verify that we opened the intended destination. 
This 677 // guards against dst being a symlink, in which case we could accidentally 678 // mount over the symlink's target. 679 if procPath == "" { 680 procPath = "/proc" 681 } 682 safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd)) 683 target, err := os.Readlink(safePath) 684 if err != nil { 685 return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err) 686 } 687 if dst != target { 688 return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)} 689 } 690 691 return unix.Mount(src, safePath, fstype, flags, data) 692 } 693 694 // RetryEintr retries the function until an error different than EINTR is 695 // returned. 696 func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) { 697 for { 698 r1, r2, err := f() 699 if err != unix.EINTR { 700 return r1, r2, err 701 } 702 } 703 } 704 705 // GetOOMScoreAdj reads the given process' oom_score_adj 706 func GetOOMScoreAdj(pid int) (int, error) { 707 data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid)) 708 if err != nil { 709 return 0, err 710 } 711 return strconv.Atoi(strings.TrimSpace(string(data))) 712 } 713 714 // EnvVar looks for a variable value in the env slice assuming the following 715 // format: "NAME=VALUE". If a variable is defined multiple times, the last 716 // value is used. 717 func EnvVar(env []string, name string) (string, bool) { 718 var err error 719 env, err = ResolveEnvs(env) 720 if err != nil { 721 return "", false 722 } 723 prefix := name + "=" 724 for _, e := range env { 725 if strings.HasPrefix(e, prefix) { 726 return strings.TrimPrefix(e, prefix), true 727 } 728 } 729 return "", false 730 } 731 732 // ResolveEnvs transforms lists of environment variables into a single list of 733 // environment variables. If a variable is defined multiple times, the last 734 // value is used. 
735 func ResolveEnvs(envs ...[]string) ([]string, error) { 736 // First create a map of variable names to values. This removes any 737 // duplicates. 738 envMap := make(map[string]string) 739 for _, env := range envs { 740 for _, str := range env { 741 parts := strings.SplitN(str, "=", 2) 742 if len(parts) != 2 { 743 return nil, fmt.Errorf("invalid variable: %s", str) 744 } 745 envMap[parts[0]] = parts[1] 746 } 747 } 748 // Reassemble envMap into a list of environment variables of the form 749 // NAME=VALUE. 750 env := make([]string, 0, len(envMap)) 751 for k, v := range envMap { 752 env = append(env, fmt.Sprintf("%s=%s", k, v)) 753 } 754 return env, nil 755 } 756 757 // FaqErrorMsg returns an error message pointing to the FAQ. 758 func FaqErrorMsg(anchor, msg string) string { 759 return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor) 760 } 761 762 // ContainerName looks for an annotation in the spec with the container name. Returns empty string 763 // if no annotation is found. 764 func ContainerName(spec *specs.Spec) string { 765 return spec.Annotations[annotationContainerName] 766 }