github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/configs/config.go (about) 1 package configs 2 3 import ( 4 "bytes" 5 "encoding/json" 6 "fmt" 7 "os/exec" 8 "time" 9 10 "github.com/sirupsen/logrus" 11 "golang.org/x/sys/unix" 12 13 "github.com/opencontainers/runc/libcontainer/devices" 14 "github.com/opencontainers/runtime-spec/specs-go" 15 ) 16 17 type Rlimit struct { 18 Type int `json:"type"` 19 Hard uint64 `json:"hard"` 20 Soft uint64 `json:"soft"` 21 } 22 23 // IDMap represents UID/GID Mappings for User Namespaces. 24 type IDMap struct { 25 ContainerID int64 `json:"container_id"` 26 HostID int64 `json:"host_id"` 27 Size int64 `json:"size"` 28 } 29 30 // Seccomp represents syscall restrictions 31 // By default, only the native architecture of the kernel is allowed to be used 32 // for syscalls. Additional architectures can be added by specifying them in 33 // Architectures. 34 type Seccomp struct { 35 DefaultAction Action `json:"default_action"` 36 Architectures []string `json:"architectures"` 37 Flags []specs.LinuxSeccompFlag `json:"flags"` 38 Syscalls []*Syscall `json:"syscalls"` 39 DefaultErrnoRet *uint `json:"default_errno_ret"` 40 ListenerPath string `json:"listener_path,omitempty"` 41 ListenerMetadata string `json:"listener_metadata,omitempty"` 42 } 43 44 // Action is taken upon rule match in Seccomp 45 type Action int 46 47 const ( 48 Kill Action = iota + 1 49 Errno 50 Trap 51 Allow 52 Trace 53 Log 54 Notify 55 KillThread 56 KillProcess 57 ) 58 59 // Operator is a comparison operator to be used when matching syscall arguments in Seccomp 60 type Operator int 61 62 const ( 63 EqualTo Operator = iota + 1 64 NotEqualTo 65 GreaterThan 66 GreaterThanOrEqualTo 67 LessThan 68 LessThanOrEqualTo 69 MaskEqualTo 70 ) 71 72 // Arg is a rule to match a specific syscall argument in Seccomp 73 type Arg struct { 74 Index uint `json:"index"` 75 Value uint64 `json:"value"` 76 ValueTwo uint64 `json:"value_two"` 77 Op Operator `json:"op"` 78 } 79 80 // Syscall is a rule to match a syscall in Seccomp 81 type Syscall struct { 82 Name string `json:"name"` 83 Action Action `json:"action"` 84 ErrnoRet *uint `json:"errnoRet"` 85 Args []*Arg `json:"args"` 86 } 87 88 // Config defines configuration options for executing a process inside a contained environment. 89 type Config struct { 90 // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs 91 // This is a common option when the container is running in ramdisk 92 NoPivotRoot bool `json:"no_pivot_root"` 93 94 // ParentDeathSignal specifies the signal that is sent to the container's process in the case 95 // that the parent process dies. 96 ParentDeathSignal int `json:"parent_death_signal"` 97 98 // Path to a directory containing the container's root filesystem. 99 Rootfs string `json:"rootfs"` 100 101 // Umask is the umask to use inside of the container. 102 Umask *uint32 `json:"umask"` 103 104 // Readonlyfs will remount the container's rootfs as readonly where only externally mounted 105 // bind mounts are writtable. 106 Readonlyfs bool `json:"readonlyfs"` 107 108 // Specifies the mount propagation flags to be applied to /. 109 RootPropagation int `json:"rootPropagation"` 110 111 // Mounts specify additional source and destination paths that will be mounted inside the container's 112 // rootfs and mount namespace if specified 113 Mounts []*Mount `json:"mounts"` 114 115 // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! 116 Devices []*devices.Device `json:"devices"` 117 118 MountLabel string `json:"mount_label"` 119 120 // Hostname optionally sets the container's hostname if provided 121 Hostname string `json:"hostname"` 122 123 // Domainname optionally sets the container's domainname if provided 124 Domainname string `json:"domainname"` 125 126 // Namespaces specifies the container's namespaces that it should setup when cloning the init process 127 // If a namespace is not provided that namespace is shared from the container's parent process 128 Namespaces Namespaces `json:"namespaces"` 129 130 // Capabilities specify the capabilities to keep when executing the process inside the container 131 // All capabilities not specified will be dropped from the processes capability mask 132 Capabilities *Capabilities `json:"capabilities"` 133 134 // Networks specifies the container's network setup to be created 135 Networks []*Network `json:"networks"` 136 137 // Routes can be specified to create entries in the route table as the container is started 138 Routes []*Route `json:"routes"` 139 140 // Cgroups specifies specific cgroup settings for the various subsystems that the container is 141 // placed into to limit the resources the container has available 142 Cgroups *Cgroup `json:"cgroups"` 143 144 // AppArmorProfile specifies the profile to apply to the process running in the container and is 145 // change at the time the process is execed 146 AppArmorProfile string `json:"apparmor_profile,omitempty"` 147 148 // ProcessLabel specifies the label to apply to the process running in the container. It is 149 // commonly used by selinux 150 ProcessLabel string `json:"process_label,omitempty"` 151 152 // Rlimits specifies the resource limits, such as max open files, to set in the container 153 // If Rlimits are not set, the container will inherit rlimits from the parent process 154 Rlimits []Rlimit `json:"rlimits,omitempty"` 155 156 // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores 157 // for a process. Valid values are between the range [-1000, '1000'], where processes with 158 // higher scores are preferred for being killed. If it is unset then we don't touch the current 159 // value. 160 // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ 161 OomScoreAdj *int `json:"oom_score_adj,omitempty"` 162 163 // UIDMappings is an array of User ID mappings for User Namespaces 164 UIDMappings []IDMap `json:"uid_mappings"` 165 166 // GIDMappings is an array of Group ID mappings for User Namespaces 167 GIDMappings []IDMap `json:"gid_mappings"` 168 169 // MaskPaths specifies paths within the container's rootfs to mask over with a bind 170 // mount pointing to /dev/null as to prevent reads of the file. 171 MaskPaths []string `json:"mask_paths"` 172 173 // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only 174 // so that these files prevent any writes. 175 ReadonlyPaths []string `json:"readonly_paths"` 176 177 // Sysctl is a map of properties and their values. It is the equivalent of using 178 // sysctl -w my.property.name value in Linux. 179 Sysctl map[string]string `json:"sysctl"` 180 181 // Seccomp allows actions to be taken whenever a syscall is made within the container. 182 // A number of rules are given, each having an action to be taken if a syscall matches it. 183 // A default action to be taken if no rules match is also given. 184 Seccomp *Seccomp `json:"seccomp"` 185 186 // NoNewPrivileges controls whether processes in the container can gain additional privileges. 187 NoNewPrivileges bool `json:"no_new_privileges,omitempty"` 188 189 // Hooks are a collection of actions to perform at various container lifecycle events. 190 // CommandHooks are serialized to JSON, but other hooks are not. 191 Hooks Hooks 192 193 // Version is the version of opencontainer specification that is supported. 194 Version string `json:"version"` 195 196 // Labels are user defined metadata that is stored in the config and populated on the state 197 Labels []string `json:"labels"` 198 199 // NoNewKeyring will not allocated a new session keyring for the container. It will use the 200 // callers keyring in this case. 201 NoNewKeyring bool `json:"no_new_keyring"` 202 203 // IntelRdt specifies settings for Intel RDT group that the container is placed into 204 // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available 205 IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` 206 207 // RootlessEUID is set when the runc was launched with non-zero EUID. 208 // Note that RootlessEUID is set to false when launched with EUID=0 in userns. 209 // When RootlessEUID is set, runc creates a new userns for the container. 210 // (config.json needs to contain userns settings) 211 RootlessEUID bool `json:"rootless_euid,omitempty"` 212 213 // RootlessCgroups is set when unlikely to have the full access to cgroups. 214 // When RootlessCgroups is set, cgroups errors are ignored. 215 RootlessCgroups bool `json:"rootless_cgroups,omitempty"` 216 217 // TimeOffsets specifies the offset for supporting time namespaces. 218 TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"` 219 220 // Scheduler represents the scheduling attributes for a process. 221 Scheduler *Scheduler `json:"scheduler,omitempty"` 222 223 // Personality contains configuration for the Linux personality syscall. 224 Personality *LinuxPersonality `json:"personality,omitempty"` 225 226 // IOPriority is the container's I/O priority. 227 IOPriority *IOPriority `json:"io_priority,omitempty"` 228 } 229 230 // Scheduler is based on the Linux sched_setattr(2) syscall. 231 type Scheduler = specs.Scheduler 232 233 // ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr. 234 func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) { 235 var policy uint32 236 switch scheduler.Policy { 237 case specs.SchedOther: 238 policy = 0 239 case specs.SchedFIFO: 240 policy = 1 241 case specs.SchedRR: 242 policy = 2 243 case specs.SchedBatch: 244 policy = 3 245 case specs.SchedISO: 246 policy = 4 247 case specs.SchedIdle: 248 policy = 5 249 case specs.SchedDeadline: 250 policy = 6 251 default: 252 return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy) 253 } 254 255 var flags uint64 256 for _, flag := range scheduler.Flags { 257 switch flag { 258 case specs.SchedFlagResetOnFork: 259 flags |= 0x01 260 case specs.SchedFlagReclaim: 261 flags |= 0x02 262 case specs.SchedFlagDLOverrun: 263 flags |= 0x04 264 case specs.SchedFlagKeepPolicy: 265 flags |= 0x08 266 case specs.SchedFlagKeepParams: 267 flags |= 0x10 268 case specs.SchedFlagUtilClampMin: 269 flags |= 0x20 270 case specs.SchedFlagUtilClampMax: 271 flags |= 0x40 272 default: 273 return nil, fmt.Errorf("invalid scheduler flag: %s", flag) 274 } 275 } 276 277 return &unix.SchedAttr{ 278 Size: unix.SizeofSchedAttr, 279 Policy: policy, 280 Flags: flags, 281 Nice: scheduler.Nice, 282 Priority: uint32(scheduler.Priority), 283 Runtime: scheduler.Runtime, 284 Deadline: scheduler.Deadline, 285 Period: scheduler.Period, 286 }, nil 287 } 288 289 var IOPrioClassMapping = map[specs.IOPriorityClass]int{ 290 specs.IOPRIO_CLASS_RT: 1, 291 specs.IOPRIO_CLASS_BE: 2, 292 specs.IOPRIO_CLASS_IDLE: 3, 293 } 294 295 type IOPriority = specs.LinuxIOPriority 296 297 type ( 298 HookName string 299 HookList []Hook 300 Hooks map[HookName]HookList 301 ) 302 303 const ( 304 // Prestart commands are executed after the container namespaces are created, 305 // but before the user supplied command is executed from init. 306 // Note: This hook is now deprecated 307 // Prestart commands are called in the Runtime namespace. 308 Prestart HookName = "prestart" 309 310 // CreateRuntime commands MUST be called as part of the create operation after 311 // the runtime environment has been created but before the pivot_root has been executed. 312 // CreateRuntime is called immediately after the deprecated Prestart hook. 313 // CreateRuntime commands are called in the Runtime Namespace. 314 CreateRuntime HookName = "createRuntime" 315 316 // CreateContainer commands MUST be called as part of the create operation after 317 // the runtime environment has been created but before the pivot_root has been executed. 318 // CreateContainer commands are called in the Container namespace. 319 CreateContainer HookName = "createContainer" 320 321 // StartContainer commands MUST be called as part of the start operation and before 322 // the container process is started. 323 // StartContainer commands are called in the Container namespace. 324 StartContainer HookName = "startContainer" 325 326 // Poststart commands are executed after the container init process starts. 327 // Poststart commands are called in the Runtime Namespace. 328 Poststart HookName = "poststart" 329 330 // Poststop commands are executed after the container init process exits. 331 // Poststop commands are called in the Runtime Namespace. 332 Poststop HookName = "poststop" 333 ) 334 335 // KnownHookNames returns the known hook names. 336 // Used by `runc features`. 337 func KnownHookNames() []string { 338 return []string{ 339 string(Prestart), // deprecated 340 string(CreateRuntime), 341 string(CreateContainer), 342 string(StartContainer), 343 string(Poststart), 344 string(Poststop), 345 } 346 } 347 348 type Capabilities struct { 349 // Bounding is the set of capabilities checked by the kernel. 350 Bounding []string 351 // Effective is the set of capabilities checked by the kernel. 352 Effective []string 353 // Inheritable is the capabilities preserved across execve. 354 Inheritable []string 355 // Permitted is the limiting superset for effective capabilities. 356 Permitted []string 357 // Ambient is the ambient set of capabilities that are kept. 358 Ambient []string 359 } 360 361 // Deprecated: use (Hooks).Run instead. 362 func (hooks HookList) RunHooks(state *specs.State) error { 363 for i, h := range hooks { 364 if err := h.Run(state); err != nil { 365 return fmt.Errorf("error running hook #%d: %w", i, err) 366 } 367 } 368 369 return nil 370 } 371 372 func (hooks *Hooks) UnmarshalJSON(b []byte) error { 373 var state map[HookName][]CommandHook 374 375 if err := json.Unmarshal(b, &state); err != nil { 376 return err 377 } 378 379 *hooks = Hooks{} 380 for n, commandHooks := range state { 381 if len(commandHooks) == 0 { 382 continue 383 } 384 385 (*hooks)[n] = HookList{} 386 for _, h := range commandHooks { 387 (*hooks)[n] = append((*hooks)[n], h) 388 } 389 } 390 391 return nil 392 } 393 394 func (hooks *Hooks) MarshalJSON() ([]byte, error) { 395 serialize := func(hooks []Hook) (serializableHooks []CommandHook) { 396 for _, hook := range hooks { 397 switch chook := hook.(type) { 398 case CommandHook: 399 serializableHooks = append(serializableHooks, chook) 400 default: 401 logrus.Warnf("cannot serialize hook of type %T, skipping", hook) 402 } 403 } 404 405 return serializableHooks 406 } 407 408 return json.Marshal(map[string]interface{}{ 409 "prestart": serialize((*hooks)[Prestart]), 410 "createRuntime": serialize((*hooks)[CreateRuntime]), 411 "createContainer": serialize((*hooks)[CreateContainer]), 412 "startContainer": serialize((*hooks)[StartContainer]), 413 "poststart": serialize((*hooks)[Poststart]), 414 "poststop": serialize((*hooks)[Poststop]), 415 }) 416 } 417 418 // Run executes all hooks for the given hook name. 419 func (hooks Hooks) Run(name HookName, state *specs.State) error { 420 list := hooks[name] 421 for i, h := range list { 422 if err := h.Run(state); err != nil { 423 return fmt.Errorf("error running %s hook #%d: %w", name, i, err) 424 } 425 } 426 427 return nil 428 } 429 430 type Hook interface { 431 // Run executes the hook with the provided state. 432 Run(*specs.State) error 433 } 434 435 // NewFunctionHook will call the provided function when the hook is run. 436 func NewFunctionHook(f func(*specs.State) error) FuncHook { 437 return FuncHook{ 438 run: f, 439 } 440 } 441 442 type FuncHook struct { 443 run func(*specs.State) error 444 } 445 446 func (f FuncHook) Run(s *specs.State) error { 447 return f.run(s) 448 } 449 450 type Command struct { 451 Path string `json:"path"` 452 Args []string `json:"args"` 453 Env []string `json:"env"` 454 Dir string `json:"dir"` 455 Timeout *time.Duration `json:"timeout"` 456 } 457 458 // NewCommandHook will execute the provided command when the hook is run. 459 func NewCommandHook(cmd Command) CommandHook { 460 return CommandHook{ 461 Command: cmd, 462 } 463 } 464 465 type CommandHook struct { 466 Command 467 } 468 469 func (c Command) Run(s *specs.State) error { 470 b, err := json.Marshal(s) 471 if err != nil { 472 return err 473 } 474 var stdout, stderr bytes.Buffer 475 cmd := exec.Cmd{ 476 Path: c.Path, 477 Args: c.Args, 478 Env: c.Env, 479 Stdin: bytes.NewReader(b), 480 Stdout: &stdout, 481 Stderr: &stderr, 482 } 483 if err := cmd.Start(); err != nil { 484 return err 485 } 486 errC := make(chan error, 1) 487 go func() { 488 err := cmd.Wait() 489 if err != nil { 490 err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) 491 } 492 errC <- err 493 }() 494 var timerCh <-chan time.Time 495 if c.Timeout != nil { 496 timer := time.NewTimer(*c.Timeout) 497 defer timer.Stop() 498 timerCh = timer.C 499 } 500 select { 501 case err := <-errC: 502 return err 503 case <-timerCh: 504 _ = cmd.Process.Kill() 505 <-errC 506 return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) 507 } 508 }