gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/specutils/specutils.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package specutils contains utility functions for working with OCI runtime
    16  // specs.
    17  package specutils
    18  
    19  import (
    20  	"encoding/json"
    21  	"fmt"
    22  	"io"
    23  	"io/ioutil"
    24  	"os"
    25  	"path"
    26  	"path/filepath"
    27  	"strconv"
    28  	"strings"
    29  	"time"
    30  
    31  	"github.com/cenkalti/backoff"
    32  	"github.com/mohae/deepcopy"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  	"golang.org/x/sys/unix"
    35  	"gvisor.dev/gvisor/pkg/abi/linux"
    36  	"gvisor.dev/gvisor/pkg/bits"
    37  	"gvisor.dev/gvisor/pkg/log"
    38  	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy"
    39  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    40  	"gvisor.dev/gvisor/runsc/config"
    41  	"gvisor.dev/gvisor/runsc/flag"
    42  )
    43  
const (
	// annotationFlagPrefix prefixes annotations that override a runsc flag;
	// the text following the prefix is the flag name. annotationSeccomp is
	// suffixed with the container name to form the per-container seccomp
	// annotation, and annotationSeccompRuntimeDefault is the value of that
	// annotation which causes the spec's seccomp rules to be removed (see
	// fixSpec).
	annotationFlagPrefix            = "dev.gvisor.flag."
	annotationSeccomp               = "dev.gvisor.internal.seccomp."
	annotationSeccompRuntimeDefault = "RuntimeDefault"

	// annotationContainerName is the annotation holding the container name,
	// as returned by ContainerName.
	annotationContainerName = "io.kubernetes.cri.container-name"
)

const (
	// AnnotationTPU is the annotation used to enable TPU proxy on a pod. Its
	// value is parsed as a boolean by TPUProxyIsEnabled.
	AnnotationTPU = "dev.gvisor.internal.tpuproxy"
)

// ExePath must point to runsc binary, which is normally the same binary. It's
// changed in tests that aren't linked in the same binary.
var ExePath = "/proc/self/exe"

// Version is the supported spec version.
var Version = specs.Version
    63  
    64  // LogSpecDebug writes the spec in a human-friendly format to the debug log.
    65  func LogSpecDebug(orig *specs.Spec, logSeccomp bool) {
    66  	if !log.IsLogging(log.Debug) {
    67  		return
    68  	}
    69  
    70  	// Strip down parts of the spec that are not interesting.
    71  	spec := deepcopy.Copy(orig).(*specs.Spec)
    72  	if spec.Process != nil {
    73  		spec.Process.Capabilities = nil
    74  	}
    75  	if spec.Linux != nil {
    76  		if !logSeccomp {
    77  			spec.Linux.Seccomp = nil
    78  		}
    79  		spec.Linux.MaskedPaths = nil
    80  		spec.Linux.ReadonlyPaths = nil
    81  		if spec.Linux.Resources != nil {
    82  			spec.Linux.Resources.Devices = nil
    83  		}
    84  	}
    85  
    86  	out, err := json.MarshalIndent(spec, "", "  ")
    87  	if err != nil {
    88  		log.Debugf("Failed to marshal spec: %v", err)
    89  		return
    90  	}
    91  	log.Debugf("Spec:\n%s", out)
    92  }
    93  
    94  // ValidateSpec validates that the spec is compatible with runsc.
    95  func ValidateSpec(spec *specs.Spec) error {
    96  	// Mandatory fields.
    97  	if spec.Process == nil {
    98  		return fmt.Errorf("Spec.Process must be defined: %+v", spec)
    99  	}
   100  	if len(spec.Process.Args) == 0 {
   101  		return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process)
   102  	}
   103  	if spec.Root == nil {
   104  		return fmt.Errorf("Spec.Root must be defined: %+v", spec)
   105  	}
   106  	if len(spec.Root.Path) == 0 {
   107  		return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root)
   108  	}
   109  
   110  	// Unsupported fields.
   111  	if spec.Solaris != nil {
   112  		return fmt.Errorf("Spec.Solaris is not supported: %+v", spec)
   113  	}
   114  	if spec.Windows != nil {
   115  		return fmt.Errorf("Spec.Windows is not supported: %+v", spec)
   116  	}
   117  	if len(spec.Process.SelinuxLabel) != 0 {
   118  		return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel)
   119  	}
   120  
   121  	// Docker uses AppArmor by default, so just log that it's being ignored.
   122  	if spec.Process.ApparmorProfile != "" {
   123  		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
   124  	}
   125  
   126  	// PR_SET_NO_NEW_PRIVS is assumed to always be set.
   127  	// See kernel.Task.updateCredsForExecLocked.
   128  	if !spec.Process.NoNewPrivileges {
   129  		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
   130  	}
   131  
   132  	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
   133  		if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
   134  			return err
   135  		}
   136  	}
   137  	for _, m := range spec.Mounts {
   138  		if err := validateMount(&m); err != nil {
   139  			return err
   140  		}
   141  	}
   142  
   143  	// CRI specifies whether a container should start a new sandbox, or run
   144  	// another container in an existing sandbox.
   145  	switch SpecContainerType(spec) {
   146  	case ContainerTypeContainer:
   147  		// When starting a container in an existing sandbox, the
   148  		// sandbox ID must be set.
   149  		if _, ok := SandboxID(spec); !ok {
   150  			return fmt.Errorf("spec has container-type of container, but no sandbox ID set")
   151  		}
   152  	case ContainerTypeUnknown:
   153  		return fmt.Errorf("unknown container-type")
   154  	default:
   155  	}
   156  
   157  	return nil
   158  }
   159  
   160  // absPath turns the given path into an absolute path (if it is not already
   161  // absolute) by prepending the base path.
   162  func absPath(base, rel string) string {
   163  	if filepath.IsAbs(rel) {
   164  		return rel
   165  	}
   166  	return filepath.Join(base, rel)
   167  }
   168  
   169  // OpenSpec opens an OCI runtime spec from the given bundle directory.
   170  func OpenSpec(bundleDir string) (*os.File, error) {
   171  	// The spec file must be named "config.json" inside the bundle directory.
   172  	return os.Open(filepath.Join(bundleDir, "config.json"))
   173  }
   174  
   175  // ReadSpec reads an OCI runtime spec from the given bundle directory.
   176  // ReadSpec also normalizes all potential relative paths into absolute
   177  // path, e.g. spec.Root.Path, mount.Source.
   178  func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) {
   179  	specFile, err := OpenSpec(bundleDir)
   180  	if err != nil {
   181  		return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
   182  	}
   183  	defer specFile.Close()
   184  	return ReadSpecFromFile(bundleDir, specFile, conf)
   185  }
   186  
   187  // ReadSpecFromFile reads an OCI runtime spec from the given file. It also fixes
   188  // up the spec so that the rest of the code doesn't need to worry about it.
   189  //  1. Normalizes all relative paths into absolute by prepending the bundle
   190  //     dir to them.
   191  //  2. Looks for flag overrides and applies them if any.
   192  //  3. Removes seccomp rules if `RuntimeDefault` was used.
   193  func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) {
   194  	if _, err := specFile.Seek(0, io.SeekStart); err != nil {
   195  		return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
   196  	}
   197  	specBytes, err := ioutil.ReadAll(specFile)
   198  	if err != nil {
   199  		return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err)
   200  	}
   201  	var spec specs.Spec
   202  	if err := json.Unmarshal(specBytes, &spec); err != nil {
   203  		return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes))
   204  	}
   205  	if err := ValidateSpec(&spec); err != nil {
   206  		return nil, err
   207  	}
   208  	if err := fixSpec(&spec, bundleDir, conf); err != nil {
   209  		return nil, err
   210  	}
   211  	return &spec, nil
   212  }
   213  
   214  func fixSpec(spec *specs.Spec, bundleDir string, conf *config.Config) error {
   215  	// Turn any relative paths in the spec to absolute by prepending the bundleDir.
   216  	spec.Root.Path = absPath(bundleDir, spec.Root.Path)
   217  	for i := range spec.Mounts {
   218  		m := &spec.Mounts[i]
   219  		if m.Source != "" {
   220  			m.Source = absPath(bundleDir, m.Source)
   221  		}
   222  	}
   223  	// Look for config bundle annotations and verify that they exist.
   224  	const configBundlePrefix = "dev.gvisor.bundle."
   225  	var bundles []config.BundleName
   226  	for annotation, val := range spec.Annotations {
   227  		if !strings.HasPrefix(annotation, configBundlePrefix) {
   228  			continue
   229  		}
   230  		if val != "true" {
   231  			return fmt.Errorf("invalid value %q for annotation %q (must be set to 'true' or removed entirely)", val, annotation)
   232  		}
   233  		bundleName := config.BundleName(annotation[len(configBundlePrefix):])
   234  		if _, exists := config.Bundles[bundleName]; !exists {
   235  			log.Warningf("Bundle name %q (from annotation %q=%q) does not exist; this bundle may have been deprecated. Skipping.", bundleName, annotation, val)
   236  			continue
   237  		}
   238  		bundles = append(bundles, bundleName)
   239  	}
   240  
   241  	// Apply config bundles, if any.
   242  	if len(bundles) > 0 {
   243  		log.Infof("Applying config bundles: %v", bundles)
   244  		if err := conf.ApplyBundles(flag.CommandLine, bundles...); err != nil {
   245  			return err
   246  		}
   247  	}
   248  
   249  	containerName := ContainerName(spec)
   250  	for annotation, val := range spec.Annotations {
   251  		if strings.HasPrefix(annotation, annotationFlagPrefix) {
   252  			// Override flags using annotation to allow customization per sandbox
   253  			// instance.
   254  			name := annotation[len(annotationFlagPrefix):]
   255  			log.Infof("Overriding flag from flag annotation: --%s=%q", name, val)
   256  			if err := conf.Override(flag.CommandLine, name, val /* force= */, false); err != nil {
   257  				return err
   258  			}
   259  		} else if len(containerName) > 0 {
   260  			// If we know the container name, then check to see if seccomp
   261  			// instructions were given to the container.
   262  			if annotation == annotationSeccomp+containerName && val == annotationSeccompRuntimeDefault {
   263  				// Container seccomp rules are redundant when using gVisor, so remove
   264  				// them when seccomp is set to RuntimeDefault.
   265  				if spec.Linux != nil && spec.Linux.Seccomp != nil {
   266  					log.Debugf("Seccomp is being ignored because annotation %q is set to default.", annotationSeccomp)
   267  					spec.Linux.Seccomp = nil
   268  				}
   269  			}
   270  		}
   271  	}
   272  	return nil
   273  }
   274  
   275  // ReadMounts reads mount list from a file.
   276  func ReadMounts(f *os.File) ([]specs.Mount, error) {
   277  	bytes, err := ioutil.ReadAll(f)
   278  	if err != nil {
   279  		return nil, fmt.Errorf("error reading mounts: %v", err)
   280  	}
   281  	var mounts []specs.Mount
   282  	if err := json.Unmarshal(bytes, &mounts); err != nil {
   283  		return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes))
   284  	}
   285  	return mounts, nil
   286  }
   287  
   288  // ChangeMountType changes m.Type to the specified type. It may do necessary
   289  // amends to m.Options.
   290  func ChangeMountType(m *specs.Mount, newType string) {
   291  	m.Type = newType
   292  
   293  	// OCI spec allows bind mounts to be specified in options only. So if new type
   294  	// is not bind, remove bind/rbind from options.
   295  	//
   296  	// "For bind mounts (when options include either bind or rbind), the type is
   297  	// a dummy, often "none" (not listed in /proc/filesystems)."
   298  	if newType != "bind" {
   299  		newOpts := make([]string, 0, len(m.Options))
   300  		for _, opt := range m.Options {
   301  			if opt != "rbind" && opt != "bind" {
   302  				newOpts = append(newOpts, opt)
   303  			}
   304  		}
   305  		m.Options = newOpts
   306  	}
   307  }
   308  
   309  // Capabilities takes in spec and returns a TaskCapabilities corresponding to
   310  // the spec.
   311  func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
   312  	// Strip CAP_NET_RAW from all capability sets if necessary.
   313  	skipSet := map[linux.Capability]struct{}{}
   314  	if !enableRaw {
   315  		skipSet[linux.CAP_NET_RAW] = struct{}{}
   316  	}
   317  
   318  	var caps auth.TaskCapabilities
   319  	if specCaps != nil {
   320  		var err error
   321  		if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil {
   322  			return nil, err
   323  		}
   324  		if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil {
   325  			return nil, err
   326  		}
   327  		if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil {
   328  			return nil, err
   329  		}
   330  		if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil {
   331  			return nil, err
   332  		}
   333  		// TODO(gvisor.dev/issue/3166): Support ambient capabilities.
   334  	}
   335  	return &caps, nil
   336  }
   337  
   338  // AllCapabilities returns a LinuxCapabilities struct with all capabilities.
   339  func AllCapabilities() *specs.LinuxCapabilities {
   340  	var names []string
   341  	for n := range capFromName {
   342  		names = append(names, n)
   343  	}
   344  	return &specs.LinuxCapabilities{
   345  		Bounding:    names,
   346  		Effective:   names,
   347  		Inheritable: names,
   348  		Permitted:   names,
   349  		Ambient:     names,
   350  	}
   351  }
   352  
   353  // AllCapabilitiesUint64 returns a bitmask containing all capabilities set.
   354  func AllCapabilitiesUint64() uint64 {
   355  	var rv uint64
   356  	for _, cap := range capFromName {
   357  		rv |= bits.MaskOf64(int(cap))
   358  	}
   359  	return rv
   360  }
   361  
   362  // MergeCapabilities merges the capabilites from first and second.
   363  func MergeCapabilities(first, second *specs.LinuxCapabilities) *specs.LinuxCapabilities {
   364  	return &specs.LinuxCapabilities{
   365  		Bounding:    mergeUnique(first.Bounding, second.Bounding),
   366  		Effective:   mergeUnique(first.Effective, second.Effective),
   367  		Inheritable: mergeUnique(first.Inheritable, second.Inheritable),
   368  		Permitted:   mergeUnique(first.Permitted, second.Permitted),
   369  		Ambient:     mergeUnique(first.Ambient, second.Ambient),
   370  	}
   371  }
   372  
   373  // DropCapability removes the specified capability from all capability sets.
   374  func DropCapability(caps *specs.LinuxCapabilities, drop string) {
   375  	caps.Bounding = remove(caps.Bounding, drop)
   376  	caps.Effective = remove(caps.Effective, drop)
   377  	caps.Inheritable = remove(caps.Inheritable, drop)
   378  	caps.Permitted = remove(caps.Permitted, drop)
   379  	caps.Ambient = remove(caps.Ambient, drop)
   380  }
   381  
   382  func mergeUnique(strSlices ...[]string) []string {
   383  	common := make(map[string]struct{})
   384  	for _, strSlice := range strSlices {
   385  		for _, s := range strSlice {
   386  			common[s] = struct{}{}
   387  		}
   388  	}
   389  
   390  	res := make([]string, 0, len(common))
   391  	for s := range common {
   392  		res = append(res, s)
   393  	}
   394  	return res
   395  }
   396  
   397  func remove(ss []string, rem string) []string {
   398  	var out []string
   399  	for _, s := range ss {
   400  		if s == rem {
   401  			continue
   402  		}
   403  		out = append(out, s)
   404  	}
   405  	return out
   406  }
   407  
// capFromName maps capability names as they are written in the spec (e.g.
// "CAP_CHOWN") to the corresponding linux.Capability value. Names that are
// missing from this table are rejected by capsFromNames.
var capFromName = map[string]linux.Capability{
	"CAP_CHOWN":              linux.CAP_CHOWN,
	"CAP_DAC_OVERRIDE":       linux.CAP_DAC_OVERRIDE,
	"CAP_DAC_READ_SEARCH":    linux.CAP_DAC_READ_SEARCH,
	"CAP_FOWNER":             linux.CAP_FOWNER,
	"CAP_FSETID":             linux.CAP_FSETID,
	"CAP_KILL":               linux.CAP_KILL,
	"CAP_SETGID":             linux.CAP_SETGID,
	"CAP_SETUID":             linux.CAP_SETUID,
	"CAP_SETPCAP":            linux.CAP_SETPCAP,
	"CAP_LINUX_IMMUTABLE":    linux.CAP_LINUX_IMMUTABLE,
	"CAP_NET_BIND_SERVICE":   linux.CAP_NET_BIND_SERVICE,
	"CAP_NET_BROADCAST":      linux.CAP_NET_BROADCAST,
	"CAP_NET_ADMIN":          linux.CAP_NET_ADMIN,
	"CAP_NET_RAW":            linux.CAP_NET_RAW,
	"CAP_IPC_LOCK":           linux.CAP_IPC_LOCK,
	"CAP_IPC_OWNER":          linux.CAP_IPC_OWNER,
	"CAP_SYS_MODULE":         linux.CAP_SYS_MODULE,
	"CAP_SYS_RAWIO":          linux.CAP_SYS_RAWIO,
	"CAP_SYS_CHROOT":         linux.CAP_SYS_CHROOT,
	"CAP_SYS_PTRACE":         linux.CAP_SYS_PTRACE,
	"CAP_SYS_PACCT":          linux.CAP_SYS_PACCT,
	"CAP_SYS_ADMIN":          linux.CAP_SYS_ADMIN,
	"CAP_SYS_BOOT":           linux.CAP_SYS_BOOT,
	"CAP_SYS_NICE":           linux.CAP_SYS_NICE,
	"CAP_SYS_RESOURCE":       linux.CAP_SYS_RESOURCE,
	"CAP_SYS_TIME":           linux.CAP_SYS_TIME,
	"CAP_SYS_TTY_CONFIG":     linux.CAP_SYS_TTY_CONFIG,
	"CAP_MKNOD":              linux.CAP_MKNOD,
	"CAP_LEASE":              linux.CAP_LEASE,
	"CAP_AUDIT_WRITE":        linux.CAP_AUDIT_WRITE,
	"CAP_AUDIT_CONTROL":      linux.CAP_AUDIT_CONTROL,
	"CAP_SETFCAP":            linux.CAP_SETFCAP,
	"CAP_MAC_OVERRIDE":       linux.CAP_MAC_OVERRIDE,
	"CAP_MAC_ADMIN":          linux.CAP_MAC_ADMIN,
	"CAP_SYSLOG":             linux.CAP_SYSLOG,
	"CAP_WAKE_ALARM":         linux.CAP_WAKE_ALARM,
	"CAP_BLOCK_SUSPEND":      linux.CAP_BLOCK_SUSPEND,
	"CAP_AUDIT_READ":         linux.CAP_AUDIT_READ,
	"CAP_PERFMON":            linux.CAP_PERFMON,
	"CAP_BPF":                linux.CAP_BPF,
	"CAP_CHECKPOINT_RESTORE": linux.CAP_CHECKPOINT_RESTORE,
}
   451  
   452  func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) {
   453  	var caps []linux.Capability
   454  	for _, n := range names {
   455  		c, ok := capFromName[n]
   456  		if !ok {
   457  			return 0, fmt.Errorf("unknown capability %q", n)
   458  		}
   459  		// Should we skip this capabilty?
   460  		if _, ok := skipSet[c]; ok {
   461  			continue
   462  		}
   463  		caps = append(caps, c)
   464  	}
   465  	return auth.CapabilitySetOfMany(caps), nil
   466  }
   467  
   468  // IsGoferMount returns true if the given mount can be mounted as an external
   469  // gofer.
   470  func IsGoferMount(m specs.Mount) bool {
   471  	MaybeConvertToBindMount(&m)
   472  	return m.Type == "bind" && m.Source != ""
   473  }
   474  
   475  // MaybeConvertToBindMount converts mount type to "bind" in case any of the
   476  // mount options are either "bind" or "rbind" as required by the OCI spec.
   477  //
   478  // "For bind mounts (when options include either bind or rbind), the type is a
   479  // dummy, often "none" (not listed in /proc/filesystems)."
   480  func MaybeConvertToBindMount(m *specs.Mount) {
   481  	if m.Type == "bind" {
   482  		return
   483  	}
   484  	for _, opt := range m.Options {
   485  		if opt == "bind" || opt == "rbind" {
   486  			m.Type = "bind"
   487  			return
   488  		}
   489  	}
   490  }
   491  
   492  // WaitForReady waits for a process to become ready. The process is ready when
   493  // the 'ready' function returns true. It continues to wait if 'ready' returns
   494  // false. It returns error on timeout, if the process stops or if 'ready' fails.
   495  func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error {
   496  	b := backoff.NewExponentialBackOff()
   497  	b.InitialInterval = 1 * time.Millisecond
   498  	b.MaxInterval = 1 * time.Second
   499  	b.MaxElapsedTime = timeout
   500  
   501  	op := func() error {
   502  		if ok, err := ready(); err != nil {
   503  			return backoff.Permanent(err)
   504  		} else if ok {
   505  			return nil
   506  		}
   507  
   508  		// Check if the process is still running.
   509  		// If the process is alive, child is 0 because of the NOHANG option.
   510  		// If the process has terminated, child equals the process id.
   511  		var ws unix.WaitStatus
   512  		var ru unix.Rusage
   513  		child, err := unix.Wait4(pid, &ws, unix.WNOHANG, &ru)
   514  		if err != nil {
   515  			return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err))
   516  		} else if child == pid {
   517  			return backoff.Permanent(fmt.Errorf("process %d has terminated", pid))
   518  		}
   519  		return fmt.Errorf("process %d not running yet", pid)
   520  	}
   521  	return backoff.Retry(op, b)
   522  }
   523  
   524  // DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern'
   525  // ends with '/', it's used as a directory with default file name.
   526  // 'logPattern' can contain variables that are substituted:
   527  //   - %TIMESTAMP%: is replaced with a timestamp using the following format:
   528  //     <yyyymmdd-hhmmss.uuuuuu>
   529  //   - %COMMAND%: is replaced with 'command'
   530  //   - %TEST%: is replaced with 'test' (omitted by default)
   531  func DebugLogFile(logPattern, command, test string) (*os.File, error) {
   532  	if strings.HasSuffix(logPattern, "/") {
   533  		// Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>.txt
   534  		logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%.txt"
   535  	}
   536  	logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
   537  	logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
   538  	logPattern = strings.Replace(logPattern, "%TEST%", test, -1)
   539  
   540  	dir := filepath.Dir(logPattern)
   541  	if err := os.MkdirAll(dir, 0775); err != nil {
   542  		return nil, fmt.Errorf("error creating dir %q: %v", dir, err)
   543  	}
   544  	return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
   545  }
   546  
   547  // IsDebugCommand returns true if the command should be debugged or not, based
   548  // on the current configuration.
   549  func IsDebugCommand(conf *config.Config, command string) bool {
   550  	if len(conf.DebugCommand) == 0 {
   551  		// Debug everything by default.
   552  		return true
   553  	}
   554  	filter := conf.DebugCommand
   555  	rv := true
   556  	if filter[0] == '!' {
   557  		// Negate the match, e.g. !boot should log all, but "boot".
   558  		filter = filter[1:]
   559  		rv = false
   560  	}
   561  	for _, cmd := range strings.Split(filter, ",") {
   562  		if cmd == command {
   563  			return rv
   564  		}
   565  	}
   566  	return !rv
   567  }
   568  
   569  // TPUProxyIsEnabled checks if tpuproxy is enabled in the config or annotations.
   570  func TPUProxyIsEnabled(spec *specs.Spec, conf *config.Config) bool {
   571  	if conf.TPUProxy {
   572  		return true
   573  	}
   574  	val, ok := spec.Annotations[AnnotationTPU]
   575  	if !ok {
   576  		return false
   577  	}
   578  	ret, err := strconv.ParseBool(val)
   579  	if err != nil {
   580  		log.Warningf("tpuproxy annotation set to invalid value %q: %w. Skipping.", val, err)
   581  	}
   582  	return ret
   583  }
   584  
   585  // VFIOFunctionalityRequested returns true if the container should have access
   586  // to VFIO functionality.
   587  func VFIOFunctionalityRequested(dev *specs.LinuxDevice) bool {
   588  	return strings.HasPrefix(dev.Path, filepath.Dir(tpuproxy.VFIOPath))
   589  }
   590  
   591  // AcceleratorFunctionalityRequested returns true if the container should have
   592  // access to compute accelerators. Compute accelerators are different from GPUs
   593  // by using a different major number and different device char files.
   594  func AcceleratorFunctionalityRequested(dev *specs.LinuxDevice) bool {
   595  	return strings.HasPrefix(dev.Path, "/dev/accel")
   596  }
   597  
   598  // TPUFunctionalityRequested returns true if the container should have access
   599  // to TPU functionality.
   600  func TPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool {
   601  	if !TPUProxyIsEnabled(spec, conf) {
   602  		return false
   603  	}
   604  	if spec.Linux != nil {
   605  		for _, dev := range spec.Linux.Devices {
   606  			if AcceleratorFunctionalityRequested(&dev) || VFIOFunctionalityRequested(&dev) {
   607  				return true
   608  			}
   609  		}
   610  	}
   611  	return false
   612  }
   613  
   614  // SafeSetupAndMount creates the mount point and calls Mount with the given
   615  // flags. procPath is the path to procfs. If it is "", procfs is assumed to be
   616  // mounted at /proc.
   617  func SafeSetupAndMount(src, dst, typ string, flags uint32, procPath string) error {
   618  	// Create the mount point inside. The type must be the same as the source
   619  	// (file or directory).
   620  	var isDir bool
   621  	if typ == "proc" {
   622  		// Special case, as there is no source directory for proc mounts.
   623  		isDir = true
   624  	} else if fi, err := os.Stat(src); err != nil {
   625  		return fmt.Errorf("stat(%q) failed: %v", src, err)
   626  	} else {
   627  		isDir = fi.IsDir()
   628  	}
   629  
   630  	if isDir {
   631  		// Create the destination directory.
   632  		if err := os.MkdirAll(dst, 0777); err != nil {
   633  			return fmt.Errorf("mkdir(%q) failed: %v", dst, err)
   634  		}
   635  	} else {
   636  		// Create the parent destination directory.
   637  		parent := path.Dir(dst)
   638  		if err := os.MkdirAll(parent, 0777); err != nil {
   639  			return fmt.Errorf("mkdir(%q) failed: %v", parent, err)
   640  		}
   641  		// Create the destination file if it does not exist.
   642  		f, err := os.OpenFile(dst, unix.O_CREAT, 0777)
   643  		if err != nil {
   644  			return fmt.Errorf("open(%q) failed: %v", dst, err)
   645  		}
   646  		f.Close()
   647  	}
   648  
   649  	// Do the mount.
   650  	if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil {
   651  		return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err)
   652  	}
   653  	return nil
   654  }
   655  
// ErrSymlinkMount is returned by SafeMount when the mount destination is found
// to be a symlink. It wraps the underlying error by embedding it, so its
// message is the message of that error. SafeMount returns it as a pointer
// (*ErrSymlinkMount).
type ErrSymlinkMount struct {
	error
}
   661  
// SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is
// the path to procfs. If it is "", procfs is assumed to be mounted at /proc.
//
// SafeMount can fail when dst contains a symlink. However, it is called in the
// normal case with a destination consisting of a known root (/proc/root) and
// symlink-free path (from resolveSymlink).
//
// The check compares dst with the link target of the opened file descriptor
// under <procPath>/self/fd, so dst must be spelled exactly as that link
// reports it. When they differ, an *ErrSymlinkMount is returned. Failures to
// open dst or to read the link are returned wrapped (%w).
func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error {
	// Open the destination.
	fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0)
	if err != nil {
		return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err)
	}
	defer unix.Close(fd)

	// Use /proc/self/fd/ to verify that we opened the intended destination. This
	// guards against dst being a symlink, in which case we could accidentally
	// mount over the symlink's target.
	if procPath == "" {
		procPath = "/proc"
	}
	safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd))
	target, err := os.Readlink(safePath)
	if err != nil {
		return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err)
	}
	if dst != target {
		return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)}
	}

	// Mount through the fd path rather than dst, so that the mount applies to
	// the file that was opened and verified above.
	return unix.Mount(src, safePath, fstype, flags, data)
}
   693  
   694  // RetryEintr retries the function until an error different than EINTR is
   695  // returned.
   696  func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
   697  	for {
   698  		r1, r2, err := f()
   699  		if err != unix.EINTR {
   700  			return r1, r2, err
   701  		}
   702  	}
   703  }
   704  
   705  // GetOOMScoreAdj reads the given process' oom_score_adj
   706  func GetOOMScoreAdj(pid int) (int, error) {
   707  	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
   708  	if err != nil {
   709  		return 0, err
   710  	}
   711  	return strconv.Atoi(strings.TrimSpace(string(data)))
   712  }
   713  
   714  // EnvVar looks for a variable value in the env slice assuming the following
   715  // format: "NAME=VALUE". If a variable is defined multiple times, the last
   716  // value is used.
   717  func EnvVar(env []string, name string) (string, bool) {
   718  	var err error
   719  	env, err = ResolveEnvs(env)
   720  	if err != nil {
   721  		return "", false
   722  	}
   723  	prefix := name + "="
   724  	for _, e := range env {
   725  		if strings.HasPrefix(e, prefix) {
   726  			return strings.TrimPrefix(e, prefix), true
   727  		}
   728  	}
   729  	return "", false
   730  }
   731  
   732  // ResolveEnvs transforms lists of environment variables into a single list of
   733  // environment variables. If a variable is defined multiple times, the last
   734  // value is used.
   735  func ResolveEnvs(envs ...[]string) ([]string, error) {
   736  	// First create a map of variable names to values. This removes any
   737  	// duplicates.
   738  	envMap := make(map[string]string)
   739  	for _, env := range envs {
   740  		for _, str := range env {
   741  			parts := strings.SplitN(str, "=", 2)
   742  			if len(parts) != 2 {
   743  				return nil, fmt.Errorf("invalid variable: %s", str)
   744  			}
   745  			envMap[parts[0]] = parts[1]
   746  		}
   747  	}
   748  	// Reassemble envMap into a list of environment variables of the form
   749  	// NAME=VALUE.
   750  	env := make([]string, 0, len(envMap))
   751  	for k, v := range envMap {
   752  		env = append(env, fmt.Sprintf("%s=%s", k, v))
   753  	}
   754  	return env, nil
   755  }
   756  
   757  // FaqErrorMsg returns an error message pointing to the FAQ.
   758  func FaqErrorMsg(anchor, msg string) string {
   759  	return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor)
   760  }
   761  
   762  // ContainerName looks for an annotation in the spec with the container name. Returns empty string
   763  // if no annotation is found.
   764  func ContainerName(spec *specs.Spec) string {
   765  	return spec.Annotations[annotationContainerName]
   766  }