github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/specutils/specutils.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package specutils contains utility functions for working with OCI runtime
    16  // specs.
    17  package specutils
    18  
    19  import (
    20  	"encoding/json"
    21  	"fmt"
    22  	"io"
    23  	"io/ioutil"
    24  	"os"
    25  	"path"
    26  	"path/filepath"
    27  	"strconv"
    28  	"strings"
    29  	"time"
    30  
    31  	"github.com/cenkalti/backoff"
    32  	"github.com/mohae/deepcopy"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  	"golang.org/x/sys/unix"
    35  	"github.com/metacubex/gvisor/pkg/abi/linux"
    36  	"github.com/metacubex/gvisor/pkg/bits"
    37  	"github.com/metacubex/gvisor/pkg/log"
    38  	"github.com/metacubex/gvisor/pkg/sentry/devices/tpuproxy"
    39  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    40  	"github.com/metacubex/gvisor/runsc/config"
    41  	"github.com/metacubex/gvisor/runsc/flag"
    42  )
    43  
    44  const (
    45  	annotationFlagPrefix            = "dev.gvisor.flag."
    46  	annotationSeccomp               = "dev.gvisor.internal.seccomp."
    47  	annotationTPU                   = "dev.gvisor.internal.tpuproxy"
    48  	annotationSeccompRuntimeDefault = "RuntimeDefault"
    49  
    50  	annotationContainerName = "io.kubernetes.cri.container-name"
    51  )
    52  
    53  // ExePath must point to runsc binary, which is normally the same binary. It's
    54  // changed in tests that aren't linked in the same binary.
    55  var ExePath = "/proc/self/exe"
    56  
// Version is the supported spec version. It mirrors specs.Version from the
// imported OCI runtime-spec package.
var Version = specs.Version
    59  
    60  // LogSpecDebug writes the spec in a human-friendly format to the debug log.
    61  func LogSpecDebug(orig *specs.Spec, logSeccomp bool) {
    62  	if !log.IsLogging(log.Debug) {
    63  		return
    64  	}
    65  
    66  	// Strip down parts of the spec that are not interesting.
    67  	spec := deepcopy.Copy(orig).(*specs.Spec)
    68  	if spec.Process != nil {
    69  		spec.Process.Capabilities = nil
    70  	}
    71  	if spec.Linux != nil {
    72  		if !logSeccomp {
    73  			spec.Linux.Seccomp = nil
    74  		}
    75  		spec.Linux.MaskedPaths = nil
    76  		spec.Linux.ReadonlyPaths = nil
    77  		if spec.Linux.Resources != nil {
    78  			spec.Linux.Resources.Devices = nil
    79  		}
    80  	}
    81  
    82  	out, err := json.MarshalIndent(spec, "", "  ")
    83  	if err != nil {
    84  		log.Debugf("Failed to marshal spec: %v", err)
    85  		return
    86  	}
    87  	log.Debugf("Spec:\n%s", out)
    88  }
    89  
    90  // ValidateSpec validates that the spec is compatible with runsc.
    91  func ValidateSpec(spec *specs.Spec) error {
    92  	// Mandatory fields.
    93  	if spec.Process == nil {
    94  		return fmt.Errorf("Spec.Process must be defined: %+v", spec)
    95  	}
    96  	if len(spec.Process.Args) == 0 {
    97  		return fmt.Errorf("Spec.Process.Arg must be defined: %+v", spec.Process)
    98  	}
    99  	if spec.Root == nil {
   100  		return fmt.Errorf("Spec.Root must be defined: %+v", spec)
   101  	}
   102  	if len(spec.Root.Path) == 0 {
   103  		return fmt.Errorf("Spec.Root.Path must be defined: %+v", spec.Root)
   104  	}
   105  
   106  	// Unsupported fields.
   107  	if spec.Solaris != nil {
   108  		return fmt.Errorf("Spec.Solaris is not supported: %+v", spec)
   109  	}
   110  	if spec.Windows != nil {
   111  		return fmt.Errorf("Spec.Windows is not supported: %+v", spec)
   112  	}
   113  	if len(spec.Process.SelinuxLabel) != 0 {
   114  		return fmt.Errorf("SELinux is not supported: %s", spec.Process.SelinuxLabel)
   115  	}
   116  
   117  	// Docker uses AppArmor by default, so just log that it's being ignored.
   118  	if spec.Process.ApparmorProfile != "" {
   119  		log.Warningf("AppArmor profile %q is being ignored", spec.Process.ApparmorProfile)
   120  	}
   121  
   122  	// PR_SET_NO_NEW_PRIVS is assumed to always be set.
   123  	// See kernel.Task.updateCredsForExecLocked.
   124  	if !spec.Process.NoNewPrivileges {
   125  		log.Warningf("noNewPrivileges ignored. PR_SET_NO_NEW_PRIVS is assumed to always be set.")
   126  	}
   127  
   128  	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
   129  		if err := validateRootfsPropagation(spec.Linux.RootfsPropagation); err != nil {
   130  			return err
   131  		}
   132  	}
   133  	for _, m := range spec.Mounts {
   134  		if err := validateMount(&m); err != nil {
   135  			return err
   136  		}
   137  	}
   138  
   139  	// CRI specifies whether a container should start a new sandbox, or run
   140  	// another container in an existing sandbox.
   141  	switch SpecContainerType(spec) {
   142  	case ContainerTypeContainer:
   143  		// When starting a container in an existing sandbox, the
   144  		// sandbox ID must be set.
   145  		if _, ok := SandboxID(spec); !ok {
   146  			return fmt.Errorf("spec has container-type of container, but no sandbox ID set")
   147  		}
   148  	case ContainerTypeUnknown:
   149  		return fmt.Errorf("unknown container-type")
   150  	default:
   151  	}
   152  
   153  	return nil
   154  }
   155  
   156  // absPath turns the given path into an absolute path (if it is not already
   157  // absolute) by prepending the base path.
   158  func absPath(base, rel string) string {
   159  	if filepath.IsAbs(rel) {
   160  		return rel
   161  	}
   162  	return filepath.Join(base, rel)
   163  }
   164  
   165  // OpenSpec opens an OCI runtime spec from the given bundle directory.
   166  func OpenSpec(bundleDir string) (*os.File, error) {
   167  	// The spec file must be named "config.json" inside the bundle directory.
   168  	return os.Open(filepath.Join(bundleDir, "config.json"))
   169  }
   170  
   171  // ReadSpec reads an OCI runtime spec from the given bundle directory.
   172  // ReadSpec also normalizes all potential relative paths into absolute
   173  // path, e.g. spec.Root.Path, mount.Source.
   174  func ReadSpec(bundleDir string, conf *config.Config) (*specs.Spec, error) {
   175  	specFile, err := OpenSpec(bundleDir)
   176  	if err != nil {
   177  		return nil, fmt.Errorf("error opening spec file %q: %v", filepath.Join(bundleDir, "config.json"), err)
   178  	}
   179  	defer specFile.Close()
   180  	return ReadSpecFromFile(bundleDir, specFile, conf)
   181  }
   182  
   183  // ReadSpecFromFile reads an OCI runtime spec from the given file. It also fixes
   184  // up the spec so that the rest of the code doesn't need to worry about it.
   185  //  1. Normalizes all relative paths into absolute by prepending the bundle
   186  //     dir to them.
   187  //  2. Looks for flag overrides and applies them if any.
   188  //  3. Removes seccomp rules if `RuntimeDefault` was used.
   189  func ReadSpecFromFile(bundleDir string, specFile *os.File, conf *config.Config) (*specs.Spec, error) {
   190  	if _, err := specFile.Seek(0, io.SeekStart); err != nil {
   191  		return nil, fmt.Errorf("error seeking to beginning of file %q: %v", specFile.Name(), err)
   192  	}
   193  	specBytes, err := ioutil.ReadAll(specFile)
   194  	if err != nil {
   195  		return nil, fmt.Errorf("error reading spec from file %q: %v", specFile.Name(), err)
   196  	}
   197  	var spec specs.Spec
   198  	if err := json.Unmarshal(specBytes, &spec); err != nil {
   199  		return nil, fmt.Errorf("error unmarshaling spec from file %q: %v\n %s", specFile.Name(), err, string(specBytes))
   200  	}
   201  	if err := ValidateSpec(&spec); err != nil {
   202  		return nil, err
   203  	}
   204  	if err := fixSpec(&spec, bundleDir, conf); err != nil {
   205  		return nil, err
   206  	}
   207  	return &spec, nil
   208  }
   209  
   210  func fixSpec(spec *specs.Spec, bundleDir string, conf *config.Config) error {
   211  	// Turn any relative paths in the spec to absolute by prepending the bundleDir.
   212  	spec.Root.Path = absPath(bundleDir, spec.Root.Path)
   213  	for i := range spec.Mounts {
   214  		m := &spec.Mounts[i]
   215  		if m.Source != "" {
   216  			m.Source = absPath(bundleDir, m.Source)
   217  		}
   218  	}
   219  	// Look for config bundle annotations and verify that they exist.
   220  	const configBundlePrefix = "dev.gvisor.bundle."
   221  	var bundles []config.BundleName
   222  	for annotation, val := range spec.Annotations {
   223  		if !strings.HasPrefix(annotation, configBundlePrefix) {
   224  			continue
   225  		}
   226  		if val != "true" {
   227  			return fmt.Errorf("invalid value %q for annotation %q (must be set to 'true' or removed entirely)", val, annotation)
   228  		}
   229  		bundleName := config.BundleName(annotation[len(configBundlePrefix):])
   230  		if _, exists := config.Bundles[bundleName]; !exists {
   231  			log.Warningf("Bundle name %q (from annotation %q=%q) does not exist; this bundle may have been deprecated. Skipping.", bundleName, annotation, val)
   232  			continue
   233  		}
   234  		bundles = append(bundles, bundleName)
   235  	}
   236  
   237  	// Apply config bundles, if any.
   238  	if len(bundles) > 0 {
   239  		log.Infof("Applying config bundles: %v", bundles)
   240  		if err := conf.ApplyBundles(flag.CommandLine, bundles...); err != nil {
   241  			return err
   242  		}
   243  	}
   244  
   245  	containerName := ContainerName(spec)
   246  	for annotation, val := range spec.Annotations {
   247  		if strings.HasPrefix(annotation, annotationFlagPrefix) {
   248  			// Override flags using annotation to allow customization per sandbox
   249  			// instance.
   250  			name := annotation[len(annotationFlagPrefix):]
   251  			log.Infof("Overriding flag from flag annotation: --%s=%q", name, val)
   252  			if err := conf.Override(flag.CommandLine, name, val /* force= */, false); err != nil {
   253  				return err
   254  			}
   255  		} else if len(containerName) > 0 {
   256  			// If we know the container name, then check to see if seccomp
   257  			// instructions were given to the the container.
   258  			if annotation == annotationSeccomp+containerName && val == annotationSeccompRuntimeDefault {
   259  				// Container seccomp rules are redundant when using gVisor, so remove
   260  				// them when seccomp is set to RuntimeDefault.
   261  				if spec.Linux != nil && spec.Linux.Seccomp != nil {
   262  					log.Debugf("Seccomp is being ignored because annotation %q is set to default.", annotationSeccomp)
   263  					spec.Linux.Seccomp = nil
   264  				}
   265  			}
   266  		}
   267  	}
   268  	return nil
   269  }
   270  
   271  // ReadMounts reads mount list from a file.
   272  func ReadMounts(f *os.File) ([]specs.Mount, error) {
   273  	bytes, err := ioutil.ReadAll(f)
   274  	if err != nil {
   275  		return nil, fmt.Errorf("error reading mounts: %v", err)
   276  	}
   277  	var mounts []specs.Mount
   278  	if err := json.Unmarshal(bytes, &mounts); err != nil {
   279  		return nil, fmt.Errorf("error unmarshaling mounts: %v\nJSON bytes:\n%s", err, string(bytes))
   280  	}
   281  	return mounts, nil
   282  }
   283  
   284  // ChangeMountType changes m.Type to the specified type. It may do necessary
   285  // amends to m.Options.
   286  func ChangeMountType(m *specs.Mount, newType string) {
   287  	m.Type = newType
   288  
   289  	// OCI spec allows bind mounts to be specified in options only. So if new type
   290  	// is not bind, remove bind/rbind from options.
   291  	//
   292  	// "For bind mounts (when options include either bind or rbind), the type is
   293  	// a dummy, often "none" (not listed in /proc/filesystems)."
   294  	if newType != "bind" {
   295  		newOpts := make([]string, 0, len(m.Options))
   296  		for _, opt := range m.Options {
   297  			if opt != "rbind" && opt != "bind" {
   298  				newOpts = append(newOpts, opt)
   299  			}
   300  		}
   301  		m.Options = newOpts
   302  	}
   303  }
   304  
   305  // Capabilities takes in spec and returns a TaskCapabilities corresponding to
   306  // the spec.
   307  func Capabilities(enableRaw bool, specCaps *specs.LinuxCapabilities) (*auth.TaskCapabilities, error) {
   308  	// Strip CAP_NET_RAW from all capability sets if necessary.
   309  	skipSet := map[linux.Capability]struct{}{}
   310  	if !enableRaw {
   311  		skipSet[linux.CAP_NET_RAW] = struct{}{}
   312  	}
   313  
   314  	var caps auth.TaskCapabilities
   315  	if specCaps != nil {
   316  		var err error
   317  		if caps.BoundingCaps, err = capsFromNames(specCaps.Bounding, skipSet); err != nil {
   318  			return nil, err
   319  		}
   320  		if caps.EffectiveCaps, err = capsFromNames(specCaps.Effective, skipSet); err != nil {
   321  			return nil, err
   322  		}
   323  		if caps.InheritableCaps, err = capsFromNames(specCaps.Inheritable, skipSet); err != nil {
   324  			return nil, err
   325  		}
   326  		if caps.PermittedCaps, err = capsFromNames(specCaps.Permitted, skipSet); err != nil {
   327  			return nil, err
   328  		}
   329  		// TODO(gvisor.dev/issue/3166): Support ambient capabilities.
   330  	}
   331  	return &caps, nil
   332  }
   333  
   334  // AllCapabilities returns a LinuxCapabilities struct with all capabilities.
   335  func AllCapabilities() *specs.LinuxCapabilities {
   336  	var names []string
   337  	for n := range capFromName {
   338  		names = append(names, n)
   339  	}
   340  	return &specs.LinuxCapabilities{
   341  		Bounding:    names,
   342  		Effective:   names,
   343  		Inheritable: names,
   344  		Permitted:   names,
   345  		Ambient:     names,
   346  	}
   347  }
   348  
   349  // AllCapabilitiesUint64 returns a bitmask containing all capabilities set.
   350  func AllCapabilitiesUint64() uint64 {
   351  	var rv uint64
   352  	for _, cap := range capFromName {
   353  		rv |= bits.MaskOf64(int(cap))
   354  	}
   355  	return rv
   356  }
   357  
   358  // MergeCapabilities merges the capabilites from first and second.
   359  func MergeCapabilities(first, second *specs.LinuxCapabilities) *specs.LinuxCapabilities {
   360  	return &specs.LinuxCapabilities{
   361  		Bounding:    mergeUnique(first.Bounding, second.Bounding),
   362  		Effective:   mergeUnique(first.Effective, second.Effective),
   363  		Inheritable: mergeUnique(first.Inheritable, second.Inheritable),
   364  		Permitted:   mergeUnique(first.Permitted, second.Permitted),
   365  		Ambient:     mergeUnique(first.Ambient, second.Ambient),
   366  	}
   367  }
   368  
   369  // DropCapability removes the specified capability from all capability sets.
   370  func DropCapability(caps *specs.LinuxCapabilities, drop string) {
   371  	caps.Bounding = remove(caps.Bounding, drop)
   372  	caps.Effective = remove(caps.Effective, drop)
   373  	caps.Inheritable = remove(caps.Inheritable, drop)
   374  	caps.Permitted = remove(caps.Permitted, drop)
   375  	caps.Ambient = remove(caps.Ambient, drop)
   376  }
   377  
   378  func mergeUnique(strSlices ...[]string) []string {
   379  	common := make(map[string]struct{})
   380  	for _, strSlice := range strSlices {
   381  		for _, s := range strSlice {
   382  			common[s] = struct{}{}
   383  		}
   384  	}
   385  
   386  	res := make([]string, 0, len(common))
   387  	for s := range common {
   388  		res = append(res, s)
   389  	}
   390  	return res
   391  }
   392  
   393  func remove(ss []string, rem string) []string {
   394  	var out []string
   395  	for _, s := range ss {
   396  		if s == rem {
   397  			continue
   398  		}
   399  		out = append(out, s)
   400  	}
   401  	return out
   402  }
   403  
// capFromName maps the textual capability names accepted in a spec (for
// example "CAP_CHOWN") to the corresponding linux.Capability value. It backs
// AllCapabilities, AllCapabilitiesUint64 and capsFromNames; a name that is not
// a key of this map is rejected by capsFromNames as unknown.
var capFromName = map[string]linux.Capability{
	"CAP_CHOWN":              linux.CAP_CHOWN,
	"CAP_DAC_OVERRIDE":       linux.CAP_DAC_OVERRIDE,
	"CAP_DAC_READ_SEARCH":    linux.CAP_DAC_READ_SEARCH,
	"CAP_FOWNER":             linux.CAP_FOWNER,
	"CAP_FSETID":             linux.CAP_FSETID,
	"CAP_KILL":               linux.CAP_KILL,
	"CAP_SETGID":             linux.CAP_SETGID,
	"CAP_SETUID":             linux.CAP_SETUID,
	"CAP_SETPCAP":            linux.CAP_SETPCAP,
	"CAP_LINUX_IMMUTABLE":    linux.CAP_LINUX_IMMUTABLE,
	"CAP_NET_BIND_SERVICE":   linux.CAP_NET_BIND_SERVICE,
	"CAP_NET_BROADCAST":      linux.CAP_NET_BROADCAST,
	"CAP_NET_ADMIN":          linux.CAP_NET_ADMIN,
	"CAP_NET_RAW":            linux.CAP_NET_RAW,
	"CAP_IPC_LOCK":           linux.CAP_IPC_LOCK,
	"CAP_IPC_OWNER":          linux.CAP_IPC_OWNER,
	"CAP_SYS_MODULE":         linux.CAP_SYS_MODULE,
	"CAP_SYS_RAWIO":          linux.CAP_SYS_RAWIO,
	"CAP_SYS_CHROOT":         linux.CAP_SYS_CHROOT,
	"CAP_SYS_PTRACE":         linux.CAP_SYS_PTRACE,
	"CAP_SYS_PACCT":          linux.CAP_SYS_PACCT,
	"CAP_SYS_ADMIN":          linux.CAP_SYS_ADMIN,
	"CAP_SYS_BOOT":           linux.CAP_SYS_BOOT,
	"CAP_SYS_NICE":           linux.CAP_SYS_NICE,
	"CAP_SYS_RESOURCE":       linux.CAP_SYS_RESOURCE,
	"CAP_SYS_TIME":           linux.CAP_SYS_TIME,
	"CAP_SYS_TTY_CONFIG":     linux.CAP_SYS_TTY_CONFIG,
	"CAP_MKNOD":              linux.CAP_MKNOD,
	"CAP_LEASE":              linux.CAP_LEASE,
	"CAP_AUDIT_WRITE":        linux.CAP_AUDIT_WRITE,
	"CAP_AUDIT_CONTROL":      linux.CAP_AUDIT_CONTROL,
	"CAP_SETFCAP":            linux.CAP_SETFCAP,
	"CAP_MAC_OVERRIDE":       linux.CAP_MAC_OVERRIDE,
	"CAP_MAC_ADMIN":          linux.CAP_MAC_ADMIN,
	"CAP_SYSLOG":             linux.CAP_SYSLOG,
	"CAP_WAKE_ALARM":         linux.CAP_WAKE_ALARM,
	"CAP_BLOCK_SUSPEND":      linux.CAP_BLOCK_SUSPEND,
	"CAP_AUDIT_READ":         linux.CAP_AUDIT_READ,
	"CAP_PERFMON":            linux.CAP_PERFMON,
	"CAP_BPF":                linux.CAP_BPF,
	"CAP_CHECKPOINT_RESTORE": linux.CAP_CHECKPOINT_RESTORE,
}
   447  
   448  func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth.CapabilitySet, error) {
   449  	var caps []linux.Capability
   450  	for _, n := range names {
   451  		c, ok := capFromName[n]
   452  		if !ok {
   453  			return 0, fmt.Errorf("unknown capability %q", n)
   454  		}
   455  		// Should we skip this capabilty?
   456  		if _, ok := skipSet[c]; ok {
   457  			continue
   458  		}
   459  		caps = append(caps, c)
   460  	}
   461  	return auth.CapabilitySetOfMany(caps), nil
   462  }
   463  
   464  // IsGoferMount returns true if the given mount can be mounted as an external
   465  // gofer.
   466  func IsGoferMount(m specs.Mount) bool {
   467  	MaybeConvertToBindMount(&m)
   468  	return m.Type == "bind" && m.Source != ""
   469  }
   470  
   471  // MaybeConvertToBindMount converts mount type to "bind" in case any of the
   472  // mount options are either "bind" or "rbind" as required by the OCI spec.
   473  //
   474  // "For bind mounts (when options include either bind or rbind), the type is a
   475  // dummy, often "none" (not listed in /proc/filesystems)."
   476  func MaybeConvertToBindMount(m *specs.Mount) {
   477  	if m.Type == "bind" {
   478  		return
   479  	}
   480  	for _, opt := range m.Options {
   481  		if opt == "bind" || opt == "rbind" {
   482  			m.Type = "bind"
   483  			return
   484  		}
   485  	}
   486  }
   487  
   488  // WaitForReady waits for a process to become ready. The process is ready when
   489  // the 'ready' function returns true. It continues to wait if 'ready' returns
   490  // false. It returns error on timeout, if the process stops or if 'ready' fails.
   491  func WaitForReady(pid int, timeout time.Duration, ready func() (bool, error)) error {
   492  	b := backoff.NewExponentialBackOff()
   493  	b.InitialInterval = 1 * time.Millisecond
   494  	b.MaxInterval = 1 * time.Second
   495  	b.MaxElapsedTime = timeout
   496  
   497  	op := func() error {
   498  		if ok, err := ready(); err != nil {
   499  			return backoff.Permanent(err)
   500  		} else if ok {
   501  			return nil
   502  		}
   503  
   504  		// Check if the process is still running.
   505  		// If the process is alive, child is 0 because of the NOHANG option.
   506  		// If the process has terminated, child equals the process id.
   507  		var ws unix.WaitStatus
   508  		var ru unix.Rusage
   509  		child, err := unix.Wait4(pid, &ws, unix.WNOHANG, &ru)
   510  		if err != nil {
   511  			return backoff.Permanent(fmt.Errorf("error waiting for process: %v", err))
   512  		} else if child == pid {
   513  			return backoff.Permanent(fmt.Errorf("process %d has terminated", pid))
   514  		}
   515  		return fmt.Errorf("process %d not running yet", pid)
   516  	}
   517  	return backoff.Retry(op, b)
   518  }
   519  
   520  // DebugLogFile opens a log file using 'logPattern' as location. If 'logPattern'
   521  // ends with '/', it's used as a directory with default file name.
   522  // 'logPattern' can contain variables that are substituted:
   523  //   - %TIMESTAMP%: is replaced with a timestamp using the following format:
   524  //     <yyyymmdd-hhmmss.uuuuuu>
   525  //   - %COMMAND%: is replaced with 'command'
   526  //   - %TEST%: is replaced with 'test' (omitted by default)
   527  func DebugLogFile(logPattern, command, test string) (*os.File, error) {
   528  	if strings.HasSuffix(logPattern, "/") {
   529  		// Default format: <debug-log>/runsc.log.<yyyymmdd-hhmmss.uuuuuu>.<command>.txt
   530  		logPattern += "runsc.log.%TIMESTAMP%.%COMMAND%.txt"
   531  	}
   532  	logPattern = strings.Replace(logPattern, "%TIMESTAMP%", time.Now().Format("20060102-150405.000000"), -1)
   533  	logPattern = strings.Replace(logPattern, "%COMMAND%", command, -1)
   534  	logPattern = strings.Replace(logPattern, "%TEST%", test, -1)
   535  
   536  	dir := filepath.Dir(logPattern)
   537  	if err := os.MkdirAll(dir, 0775); err != nil {
   538  		return nil, fmt.Errorf("error creating dir %q: %v", dir, err)
   539  	}
   540  	return os.OpenFile(logPattern, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
   541  }
   542  
   543  // IsDebugCommand returns true if the command should be debugged or not, based
   544  // on the current configuration.
   545  func IsDebugCommand(conf *config.Config, command string) bool {
   546  	if len(conf.DebugCommand) == 0 {
   547  		// Debug everything by default.
   548  		return true
   549  	}
   550  	filter := conf.DebugCommand
   551  	rv := true
   552  	if filter[0] == '!' {
   553  		// Negate the match, e.g. !boot should log all, but "boot".
   554  		filter = filter[1:]
   555  		rv = false
   556  	}
   557  	for _, cmd := range strings.Split(filter, ",") {
   558  		if cmd == command {
   559  			return rv
   560  		}
   561  	}
   562  	return !rv
   563  }
   564  
   565  // TPUProxyIsEnabled checks if tpuproxy is enabled in the config or annotations.
   566  func TPUProxyIsEnabled(spec *specs.Spec, conf *config.Config) bool {
   567  	if conf.TPUProxy {
   568  		return true
   569  	}
   570  	val, ok := spec.Annotations[annotationTPU]
   571  	if !ok {
   572  		return false
   573  	}
   574  	ret, err := strconv.ParseBool(val)
   575  	if err != nil {
   576  		log.Warningf("tpuproxy annotation set to invalid value %q: %w. Skipping.", val, err)
   577  	}
   578  	return ret
   579  }
   580  
   581  // VFIOFunctionalityRequested returns true if the container should have access
   582  // to VFIO functionality.
   583  func VFIOFunctionalityRequested(dev *specs.LinuxDevice) bool {
   584  	return strings.HasPrefix(dev.Path, filepath.Dir(tpuproxy.VFIOPath))
   585  }
   586  
   587  // AcceleratorFunctionalityRequested returns true if the container should have
   588  // access to compute accelerators. Compute accelerators are different from GPUs
   589  // by using a different major number and different device char files.
   590  func AcceleratorFunctionalityRequested(dev *specs.LinuxDevice) bool {
   591  	return strings.HasPrefix(dev.Path, "/dev/accel")
   592  }
   593  
   594  // TPUFunctionalityRequested returns true if the container should have access
   595  // to TPU functionality.
   596  func TPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool {
   597  	if !TPUProxyIsEnabled(spec, conf) {
   598  		return false
   599  	}
   600  	if spec.Linux != nil {
   601  		for _, dev := range spec.Linux.Devices {
   602  			if AcceleratorFunctionalityRequested(&dev) || VFIOFunctionalityRequested(&dev) {
   603  				return true
   604  			}
   605  		}
   606  	}
   607  	return false
   608  }
   609  
   610  // SafeSetupAndMount creates the mount point and calls Mount with the given
   611  // flags. procPath is the path to procfs. If it is "", procfs is assumed to be
   612  // mounted at /proc.
   613  func SafeSetupAndMount(src, dst, typ string, flags uint32, procPath string) error {
   614  	// Create the mount point inside. The type must be the same as the source
   615  	// (file or directory).
   616  	var isDir bool
   617  	if typ == "proc" {
   618  		// Special case, as there is no source directory for proc mounts.
   619  		isDir = true
   620  	} else if fi, err := os.Stat(src); err != nil {
   621  		return fmt.Errorf("stat(%q) failed: %v", src, err)
   622  	} else {
   623  		isDir = fi.IsDir()
   624  	}
   625  
   626  	if isDir {
   627  		// Create the destination directory.
   628  		if err := os.MkdirAll(dst, 0777); err != nil {
   629  			return fmt.Errorf("mkdir(%q) failed: %v", dst, err)
   630  		}
   631  	} else {
   632  		// Create the parent destination directory.
   633  		parent := path.Dir(dst)
   634  		if err := os.MkdirAll(parent, 0777); err != nil {
   635  			return fmt.Errorf("mkdir(%q) failed: %v", parent, err)
   636  		}
   637  		// Create the destination file if it does not exist.
   638  		f, err := os.OpenFile(dst, unix.O_CREAT, 0777)
   639  		if err != nil {
   640  			return fmt.Errorf("open(%q) failed: %v", dst, err)
   641  		}
   642  		f.Close()
   643  	}
   644  
   645  	// Do the mount.
   646  	if err := SafeMount(src, dst, typ, uintptr(flags), "", procPath); err != nil {
   647  		return fmt.Errorf("mount(%q, %q, %d) failed: %v", src, dst, flags, err)
   648  	}
   649  	return nil
   650  }
   651  
   652  // ErrSymlinkMount is returned by SafeMount when the mount destination is found
   653  // to be a symlink.
   654  type ErrSymlinkMount struct {
   655  	error
   656  }
   657  
   658  // SafeMount is like unix.Mount, but will fail if dst is a symlink. procPath is
   659  // the path to procfs. If it is "", procfs is assumed to be mounted at /proc.
   660  //
   661  // SafeMount can fail when dst contains a symlink. However, it is called in the
   662  // normal case with a destination consisting of a known root (/proc/root) and
   663  // symlink-free path (from resolveSymlink).
   664  func SafeMount(src, dst, fstype string, flags uintptr, data, procPath string) error {
   665  	// Open the destination.
   666  	fd, err := unix.Open(dst, unix.O_PATH|unix.O_CLOEXEC, 0)
   667  	if err != nil {
   668  		return fmt.Errorf("failed to safely mount: Open(%s, _, _): %w", dst, err)
   669  	}
   670  	defer unix.Close(fd)
   671  
   672  	// Use /proc/self/fd/ to verify that we opened the intended destination. This
   673  	// guards against dst being a symlink, in which case we could accidentally
   674  	// mount over the symlink's target.
   675  	if procPath == "" {
   676  		procPath = "/proc"
   677  	}
   678  	safePath := filepath.Join(procPath, "self/fd", strconv.Itoa(fd))
   679  	target, err := os.Readlink(safePath)
   680  	if err != nil {
   681  		return fmt.Errorf("failed to safely mount: Readlink(%s): %w", safePath, err)
   682  	}
   683  	if dst != target {
   684  		return &ErrSymlinkMount{fmt.Errorf("failed to safely mount: expected to open %s, but found %s", dst, target)}
   685  	}
   686  
   687  	return unix.Mount(src, safePath, fstype, flags, data)
   688  }
   689  
   690  // ContainsStr returns true if 'str' is inside 'strs'.
   691  func ContainsStr(strs []string, str string) bool {
   692  	for _, s := range strs {
   693  		if s == str {
   694  			return true
   695  		}
   696  	}
   697  	return false
   698  }
   699  
   700  // RetryEintr retries the function until an error different than EINTR is
   701  // returned.
   702  func RetryEintr(f func() (uintptr, uintptr, error)) (uintptr, uintptr, error) {
   703  	for {
   704  		r1, r2, err := f()
   705  		if err != unix.EINTR {
   706  			return r1, r2, err
   707  		}
   708  	}
   709  }
   710  
   711  // GetOOMScoreAdj reads the given process' oom_score_adj
   712  func GetOOMScoreAdj(pid int) (int, error) {
   713  	data, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid))
   714  	if err != nil {
   715  		return 0, err
   716  	}
   717  	return strconv.Atoi(strings.TrimSpace(string(data)))
   718  }
   719  
   720  // EnvVar looks for a variable value in the env slice assuming the following
   721  // format: "NAME=VALUE". If a variable is defined multiple times, the last
   722  // value is used.
   723  func EnvVar(env []string, name string) (string, bool) {
   724  	var err error
   725  	env, err = ResolveEnvs(env)
   726  	if err != nil {
   727  		return "", false
   728  	}
   729  	prefix := name + "="
   730  	for _, e := range env {
   731  		if strings.HasPrefix(e, prefix) {
   732  			return strings.TrimPrefix(e, prefix), true
   733  		}
   734  	}
   735  	return "", false
   736  }
   737  
   738  // ResolveEnvs transforms lists of environment variables into a single list of
   739  // environment variables. If a variable is defined multiple times, the last
   740  // value is used.
   741  func ResolveEnvs(envs ...[]string) ([]string, error) {
   742  	// First create a map of variable names to values. This removes any
   743  	// duplicates.
   744  	envMap := make(map[string]string)
   745  	for _, env := range envs {
   746  		for _, str := range env {
   747  			parts := strings.SplitN(str, "=", 2)
   748  			if len(parts) != 2 {
   749  				return nil, fmt.Errorf("invalid variable: %s", str)
   750  			}
   751  			envMap[parts[0]] = parts[1]
   752  		}
   753  	}
   754  	// Reassemble envMap into a list of environment variables of the form
   755  	// NAME=VALUE.
   756  	env := make([]string, 0, len(envMap))
   757  	for k, v := range envMap {
   758  		env = append(env, fmt.Sprintf("%s=%s", k, v))
   759  	}
   760  	return env, nil
   761  }
   762  
   763  // FaqErrorMsg returns an error message pointing to the FAQ.
   764  func FaqErrorMsg(anchor, msg string) string {
   765  	return fmt.Sprintf("%s; see https://gvisor.dev/faq#%s for more details", msg, anchor)
   766  }
   767  
   768  // ContainerName looks for an annotation in the spec with the container name. Returns empty string
   769  // if no annotation is found.
   770  func ContainerName(spec *specs.Spec) string {
   771  	return spec.Annotations[annotationContainerName]
   772  }