github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/configs/config.go (about)

     1  package configs
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/json"
     6  	"fmt"
     7  	"os/exec"
     8  	"time"
     9  
    10  	"github.com/sirupsen/logrus"
    11  	"golang.org/x/sys/unix"
    12  
    13  	"github.com/opencontainers/runc/libcontainer/devices"
    14  	"github.com/opencontainers/runtime-spec/specs-go"
    15  )
    16  
// Rlimit describes one resource limit as a limit type plus its hard and soft
// values. It is the element type of Config.Rlimits.
//
// NOTE(review): Type is presumably one of the RLIMIT_* resource numbers —
// confirm against the code that applies the limits.
type Rlimit struct {
	Type int    `json:"type"`
	Hard uint64 `json:"hard"`
	Soft uint64 `json:"soft"`
}
    22  
// IDMap represents UID/GID Mappings for User Namespaces.
//
// Each entry names a starting ID inside the container (ContainerID), a
// starting ID on the host (HostID) and the number of IDs covered (Size).
// It is the element type of Config.UIDMappings and Config.GIDMappings.
type IDMap struct {
	ContainerID int64 `json:"container_id"`
	HostID      int64 `json:"host_id"`
	Size        int64 `json:"size"`
}
    29  
// Seccomp represents syscall restrictions.
//
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
//
// Syscalls holds the individual rules and DefaultAction is the Action taken
// when no rule matches. DefaultErrnoRet is a pointer, so an unset value can be
// told apart from zero.
//
// NOTE(review): ListenerPath and ListenerMetadata presumably relate to the
// Notify action — confirm against the seccomp setup code.
type Seccomp struct {
	DefaultAction    Action                   `json:"default_action"`
	Architectures    []string                 `json:"architectures"`
	Flags            []specs.LinuxSeccompFlag `json:"flags"`
	Syscalls         []*Syscall               `json:"syscalls"`
	DefaultErrnoRet  *uint                    `json:"default_errno_ret"`
	ListenerPath     string                   `json:"listener_path,omitempty"`
	ListenerMetadata string                   `json:"listener_metadata,omitempty"`
}
    43  
    44  // Action is taken upon rule match in Seccomp
    45  type Action int
    46  
    47  const (
    48  	Kill Action = iota + 1
    49  	Errno
    50  	Trap
    51  	Allow
    52  	Trace
    53  	Log
    54  	Notify
    55  	KillThread
    56  	KillProcess
    57  )
    58  
    59  // Operator is a comparison operator to be used when matching syscall arguments in Seccomp
    60  type Operator int
    61  
    62  const (
    63  	EqualTo Operator = iota + 1
    64  	NotEqualTo
    65  	GreaterThan
    66  	GreaterThanOrEqualTo
    67  	LessThan
    68  	LessThanOrEqualTo
    69  	MaskEqualTo
    70  )
    71  
// Arg is a rule to match a specific syscall argument in Seccomp.
//
// Index selects the argument, Op is the comparison Operator, and Value is
// the operand it is compared against.
//
// NOTE(review): ValueTwo is presumably the second operand needed by
// MaskEqualTo — confirm against the code that builds the filter.
type Arg struct {
	Index    uint     `json:"index"`
	Value    uint64   `json:"value"`
	ValueTwo uint64   `json:"value_two"`
	Op       Operator `json:"op"`
}
    79  
// Syscall is a rule to match a syscall in Seccomp.
//
// Name identifies the syscall, Action is what to do when the rule matches,
// and Args optionally restricts the match to particular argument values.
// ErrnoRet is a pointer, so an unset value can be told apart from zero. Its
// JSON key is camelCase ("errnoRet"), unlike the snake_case keys used by the
// other types in this file.
type Syscall struct {
	Name     string `json:"name"`
	Action   Action `json:"action"`
	ErrnoRet *uint  `json:"errnoRet"`
	Args     []*Arg `json:"args"`
}
    87  
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs.
	// This is a common option when the container is running in ramdisk.
	NoPivotRoot bool `json:"no_pivot_root"`

	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
	// that the parent process dies.
	ParentDeathSignal int `json:"parent_death_signal"`

	// Rootfs is the path to a directory containing the container's root filesystem.
	Rootfs string `json:"rootfs"`

	// Umask is the umask to use inside of the container.
	Umask *uint32 `json:"umask"`

	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
	// bind mounts are writable.
	Readonlyfs bool `json:"readonlyfs"`

	// RootPropagation specifies the mount propagation flags to be applied to /.
	RootPropagation int `json:"rootPropagation"`

	// Mounts specify additional source and destination paths that will be mounted inside the container's
	// rootfs and mount namespace if specified.
	Mounts []*Mount `json:"mounts"`

	// Devices are the device nodes that should be automatically created within the container
	// upon container start. Note, make sure that the node is marked as allowed in the cgroup
	// as well!
	Devices []*devices.Device `json:"devices"`

	// MountLabel is a label associated with the container's mounts.
	// NOTE(review): presumably the SELinux mount label (compare ProcessLabel) — confirm.
	MountLabel string `json:"mount_label"`

	// Hostname optionally sets the container's hostname if provided.
	Hostname string `json:"hostname"`

	// Domainname optionally sets the container's domainname if provided.
	Domainname string `json:"domainname"`

	// Namespaces specifies the container's namespaces that it should setup when cloning the init process.
	// If a namespace is not provided that namespace is shared from the container's parent process.
	Namespaces Namespaces `json:"namespaces"`

	// Capabilities specify the capabilities to keep when executing the process inside the container.
	// All capabilities not specified will be dropped from the process's capability mask.
	Capabilities *Capabilities `json:"capabilities"`

	// Networks specifies the container's network setup to be created.
	Networks []*Network `json:"networks"`

	// Routes can be specified to create entries in the route table as the container is started.
	Routes []*Route `json:"routes"`

	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
	// placed into to limit the resources the container has available.
	Cgroups *Cgroup `json:"cgroups"`

	// AppArmorProfile specifies the profile to apply to the process running in the container and is
	// changed at the time the process is execed.
	AppArmorProfile string `json:"apparmor_profile,omitempty"`

	// ProcessLabel specifies the label to apply to the process running in the container.  It is
	// commonly used by selinux.
	ProcessLabel string `json:"process_label,omitempty"`

	// Rlimits specifies the resource limits, such as max open files, to set in the container.
	// If Rlimits are not set, the container will inherit rlimits from the parent process.
	Rlimits []Rlimit `json:"rlimits,omitempty"`

	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
	// for a process. Valid values are in the range [-1000, 1000], where processes with
	// higher scores are preferred for being killed. If it is unset then we don't touch the current
	// value.
	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
	OomScoreAdj *int `json:"oom_score_adj,omitempty"`

	// UIDMappings is an array of User ID mappings for User Namespaces.
	UIDMappings []IDMap `json:"uid_mappings"`

	// GIDMappings is an array of Group ID mappings for User Namespaces.
	GIDMappings []IDMap `json:"gid_mappings"`

	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
	// mount pointing to /dev/null as to prevent reads of the file.
	MaskPaths []string `json:"mask_paths"`

	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
	// so that these files prevent any writes.
	ReadonlyPaths []string `json:"readonly_paths"`

	// Sysctl is a map of properties and their values. It is the equivalent of using
	// sysctl -w my.property.name value in Linux.
	Sysctl map[string]string `json:"sysctl"`

	// Seccomp allows actions to be taken whenever a syscall is made within the container.
	// A number of rules are given, each having an action to be taken if a syscall matches it.
	// A default action to be taken if no rules match is also given.
	Seccomp *Seccomp `json:"seccomp"`

	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`

	// Hooks are a collection of actions to perform at various container lifecycle events.
	// CommandHooks are serialized to JSON, but other hooks are not.
	Hooks Hooks

	// Version is the version of opencontainer specification that is supported.
	Version string `json:"version"`

	// Labels are user defined metadata that is stored in the config and populated on the state.
	Labels []string `json:"labels"`

	// NoNewKeyring will not allocate a new session keyring for the container.  It will use the
	// caller's keyring in this case.
	NoNewKeyring bool `json:"no_new_keyring"`

	// IntelRdt specifies settings for Intel RDT group that the container is placed into
	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available.
	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`

	// RootlessEUID is set when the runc was launched with non-zero EUID.
	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
	// When RootlessEUID is set, runc creates a new userns for the container.
	// (config.json needs to contain userns settings)
	RootlessEUID bool `json:"rootless_euid,omitempty"`

	// RootlessCgroups is set when unlikely to have the full access to cgroups.
	// When RootlessCgroups is set, cgroups errors are ignored.
	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`

	// TimeOffsets specifies the offset for supporting time namespaces.
	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`

	// Scheduler represents the scheduling attributes for a process.
	Scheduler *Scheduler `json:"scheduler,omitempty"`

	// Personality contains configuration for the Linux personality syscall.
	Personality *LinuxPersonality `json:"personality,omitempty"`

	// IOPriority is the container's I/O priority.
	IOPriority *IOPriority `json:"io_priority,omitempty"`
}
   229  
// Scheduler is based on the Linux sched_setattr(2) syscall.
//
// It is a type alias for specs.Scheduler, so values of the two types are
// interchangeable. Use ToSchedAttr to convert one into a *unix.SchedAttr.
type Scheduler = specs.Scheduler
   232  
   233  // ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
   234  func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
   235  	var policy uint32
   236  	switch scheduler.Policy {
   237  	case specs.SchedOther:
   238  		policy = 0
   239  	case specs.SchedFIFO:
   240  		policy = 1
   241  	case specs.SchedRR:
   242  		policy = 2
   243  	case specs.SchedBatch:
   244  		policy = 3
   245  	case specs.SchedISO:
   246  		policy = 4
   247  	case specs.SchedIdle:
   248  		policy = 5
   249  	case specs.SchedDeadline:
   250  		policy = 6
   251  	default:
   252  		return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
   253  	}
   254  
   255  	var flags uint64
   256  	for _, flag := range scheduler.Flags {
   257  		switch flag {
   258  		case specs.SchedFlagResetOnFork:
   259  			flags |= 0x01
   260  		case specs.SchedFlagReclaim:
   261  			flags |= 0x02
   262  		case specs.SchedFlagDLOverrun:
   263  			flags |= 0x04
   264  		case specs.SchedFlagKeepPolicy:
   265  			flags |= 0x08
   266  		case specs.SchedFlagKeepParams:
   267  			flags |= 0x10
   268  		case specs.SchedFlagUtilClampMin:
   269  			flags |= 0x20
   270  		case specs.SchedFlagUtilClampMax:
   271  			flags |= 0x40
   272  		default:
   273  			return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
   274  		}
   275  	}
   276  
   277  	return &unix.SchedAttr{
   278  		Size:     unix.SizeofSchedAttr,
   279  		Policy:   policy,
   280  		Flags:    flags,
   281  		Nice:     scheduler.Nice,
   282  		Priority: uint32(scheduler.Priority),
   283  		Runtime:  scheduler.Runtime,
   284  		Deadline: scheduler.Deadline,
   285  		Period:   scheduler.Period,
   286  	}, nil
   287  }
   288  
// IOPrioClassMapping maps each I/O priority class defined by the runtime
// spec to an integer value.
//
// NOTE(review): the integers presumably mirror the kernel's IOPRIO_CLASS_*
// numbers used by ioprio_set(2) — confirm against the caller.
var IOPrioClassMapping = map[specs.IOPriorityClass]int{
	specs.IOPRIO_CLASS_RT:   1,
	specs.IOPRIO_CLASS_BE:   2,
	specs.IOPRIO_CLASS_IDLE: 3,
}
   294  
// IOPriority is a type alias for specs.LinuxIOPriority; it is the type of
// Config.IOPriority.
type IOPriority = specs.LinuxIOPriority
   296  
// Hook-related collection types:
//
//   - HookName names a container lifecycle event (see the HookName constants).
//   - HookList is an ordered list of hooks; they are run in slice order.
//   - Hooks maps each HookName to the HookList registered for it.
type (
	HookName string
	HookList []Hook
	Hooks    map[HookName]HookList
)
   302  
// The hook names known to this package; KnownHookNames returns them in this order.
const (
	// Prestart commands are executed after the container namespaces are created,
	// but before the user supplied command is executed from init.
	// Prestart commands are called in the Runtime namespace.
	// Note: this hook is now deprecated.
	Prestart HookName = "prestart"

	// CreateRuntime commands MUST be called as part of the create operation after
	// the runtime environment has been created but before the pivot_root has been executed.
	// CreateRuntime is called immediately after the deprecated Prestart hook.
	// CreateRuntime commands are called in the Runtime Namespace.
	CreateRuntime HookName = "createRuntime"

	// CreateContainer commands MUST be called as part of the create operation after
	// the runtime environment has been created but before the pivot_root has been executed.
	// CreateContainer commands are called in the Container namespace.
	CreateContainer HookName = "createContainer"

	// StartContainer commands MUST be called as part of the start operation and before
	// the container process is started.
	// StartContainer commands are called in the Container namespace.
	StartContainer HookName = "startContainer"

	// Poststart commands are executed after the container init process starts.
	// Poststart commands are called in the Runtime Namespace.
	Poststart HookName = "poststart"

	// Poststop commands are executed after the container init process exits.
	// Poststop commands are called in the Runtime Namespace.
	Poststop HookName = "poststop"
)
   334  
   335  // KnownHookNames returns the known hook names.
   336  // Used by `runc features`.
   337  func KnownHookNames() []string {
   338  	return []string{
   339  		string(Prestart), // deprecated
   340  		string(CreateRuntime),
   341  		string(CreateContainer),
   342  		string(StartContainer),
   343  		string(Poststart),
   344  		string(Poststop),
   345  	}
   346  }
   347  
// Capabilities holds the capability sets of a container process, each as a
// list of capability names. It is the type of Config.Capabilities.
type Capabilities struct {
	// Bounding is the set of capabilities checked by the kernel.
	Bounding []string
	// Effective is the set of capabilities checked by the kernel.
	Effective []string
	// Inheritable is the capabilities preserved across execve.
	Inheritable []string
	// Permitted is the limiting superset for effective capabilities.
	Permitted []string
	// Ambient is the ambient set of capabilities that are kept.
	Ambient []string
}
   360  
   361  // Deprecated: use (Hooks).Run instead.
   362  func (hooks HookList) RunHooks(state *specs.State) error {
   363  	for i, h := range hooks {
   364  		if err := h.Run(state); err != nil {
   365  			return fmt.Errorf("error running hook #%d: %w", i, err)
   366  		}
   367  	}
   368  
   369  	return nil
   370  }
   371  
   372  func (hooks *Hooks) UnmarshalJSON(b []byte) error {
   373  	var state map[HookName][]CommandHook
   374  
   375  	if err := json.Unmarshal(b, &state); err != nil {
   376  		return err
   377  	}
   378  
   379  	*hooks = Hooks{}
   380  	for n, commandHooks := range state {
   381  		if len(commandHooks) == 0 {
   382  			continue
   383  		}
   384  
   385  		(*hooks)[n] = HookList{}
   386  		for _, h := range commandHooks {
   387  			(*hooks)[n] = append((*hooks)[n], h)
   388  		}
   389  	}
   390  
   391  	return nil
   392  }
   393  
   394  func (hooks *Hooks) MarshalJSON() ([]byte, error) {
   395  	serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
   396  		for _, hook := range hooks {
   397  			switch chook := hook.(type) {
   398  			case CommandHook:
   399  				serializableHooks = append(serializableHooks, chook)
   400  			default:
   401  				logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
   402  			}
   403  		}
   404  
   405  		return serializableHooks
   406  	}
   407  
   408  	return json.Marshal(map[string]interface{}{
   409  		"prestart":        serialize((*hooks)[Prestart]),
   410  		"createRuntime":   serialize((*hooks)[CreateRuntime]),
   411  		"createContainer": serialize((*hooks)[CreateContainer]),
   412  		"startContainer":  serialize((*hooks)[StartContainer]),
   413  		"poststart":       serialize((*hooks)[Poststart]),
   414  		"poststop":        serialize((*hooks)[Poststop]),
   415  	})
   416  }
   417  
   418  // Run executes all hooks for the given hook name.
   419  func (hooks Hooks) Run(name HookName, state *specs.State) error {
   420  	list := hooks[name]
   421  	for i, h := range list {
   422  		if err := h.Run(state); err != nil {
   423  			return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
   424  		}
   425  	}
   426  
   427  	return nil
   428  }
   429  
// Hook is a single action that can be run at a container lifecycle event.
// Both FuncHook and CommandHook in this file implement it.
type Hook interface {
	// Run executes the hook with the provided state.
	Run(*specs.State) error
}
   434  
   435  // NewFunctionHook will call the provided function when the hook is run.
   436  func NewFunctionHook(f func(*specs.State) error) FuncHook {
   437  	return FuncHook{
   438  		run: f,
   439  	}
   440  }
   441  
// FuncHook is a Hook backed by an in-process Go function. Create one with
// NewFunctionHook. Hooks.MarshalJSON only serializes CommandHook values, so
// FuncHooks are skipped when hooks are written as JSON.
type FuncHook struct {
	// run is the function invoked by Run.
	run func(*specs.State) error
}
   445  
   446  func (f FuncHook) Run(s *specs.State) error {
   447  	return f.run(s)
   448  }
   449  
// Command describes an external program to execute as a hook.
//
// Path, Args and Env are handed to os/exec unchanged when the command is
// run. Timeout, when non-nil, bounds how long the command may run before it
// is killed; a nil Timeout means no limit.
//
// NOTE(review): Dir is presumably the working directory for the command —
// confirm how Run consumes it.
type Command struct {
	Path    string         `json:"path"`
	Args    []string       `json:"args"`
	Env     []string       `json:"env"`
	Dir     string         `json:"dir"`
	Timeout *time.Duration `json:"timeout"`
}
   457  
   458  // NewCommandHook will execute the provided command when the hook is run.
   459  func NewCommandHook(cmd Command) CommandHook {
   460  	return CommandHook{
   461  		Command: cmd,
   462  	}
   463  }
   464  
// CommandHook is a Hook that runs an external command. It embeds Command and
// so inherits its Run method and its JSON fields. It is the only hook type
// that Hooks.MarshalJSON serializes.
type CommandHook struct {
	Command
}
   468  
   469  func (c Command) Run(s *specs.State) error {
   470  	b, err := json.Marshal(s)
   471  	if err != nil {
   472  		return err
   473  	}
   474  	var stdout, stderr bytes.Buffer
   475  	cmd := exec.Cmd{
   476  		Path:   c.Path,
   477  		Args:   c.Args,
   478  		Env:    c.Env,
   479  		Stdin:  bytes.NewReader(b),
   480  		Stdout: &stdout,
   481  		Stderr: &stderr,
   482  	}
   483  	if err := cmd.Start(); err != nil {
   484  		return err
   485  	}
   486  	errC := make(chan error, 1)
   487  	go func() {
   488  		err := cmd.Wait()
   489  		if err != nil {
   490  			err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
   491  		}
   492  		errC <- err
   493  	}()
   494  	var timerCh <-chan time.Time
   495  	if c.Timeout != nil {
   496  		timer := time.NewTimer(*c.Timeout)
   497  		defer timer.Stop()
   498  		timerCh = timer.C
   499  	}
   500  	select {
   501  	case err := <-errC:
   502  		return err
   503  	case <-timerCh:
   504  		_ = cmd.Process.Kill()
   505  		<-errC
   506  		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
   507  	}
   508  }