github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/specconv/spec_linux.go (about) 1 // Package specconv implements conversion of specifications to libcontainer 2 // configurations 3 package specconv 4 5 import ( 6 "errors" 7 "fmt" 8 "os" 9 "path/filepath" 10 "sort" 11 "strings" 12 "sync" 13 "time" 14 15 systemdDbus "github.com/coreos/go-systemd/v22/dbus" 16 dbus "github.com/godbus/dbus/v5" 17 "github.com/opencontainers/runc/libcontainer/cgroups" 18 "github.com/opencontainers/runc/libcontainer/configs" 19 "github.com/opencontainers/runc/libcontainer/devices" 20 "github.com/opencontainers/runc/libcontainer/seccomp" 21 "github.com/opencontainers/runc/libcontainer/userns" 22 libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" 23 "github.com/opencontainers/runtime-spec/specs-go" 24 "github.com/sirupsen/logrus" 25 26 "golang.org/x/sys/unix" 27 ) 28 29 var ( 30 initMapsOnce sync.Once 31 namespaceMapping map[specs.LinuxNamespaceType]configs.NamespaceType 32 mountPropagationMapping map[string]int 33 recAttrFlags map[string]struct { 34 clear bool 35 flag uint64 36 } 37 mountFlags, extensionFlags map[string]struct { 38 clear bool 39 flag int 40 } 41 complexFlags map[string]func(*configs.Mount) 42 ) 43 44 func initMaps() { 45 initMapsOnce.Do(func() { 46 namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ 47 specs.PIDNamespace: configs.NEWPID, 48 specs.NetworkNamespace: configs.NEWNET, 49 specs.MountNamespace: configs.NEWNS, 50 specs.UserNamespace: configs.NEWUSER, 51 specs.IPCNamespace: configs.NEWIPC, 52 specs.UTSNamespace: configs.NEWUTS, 53 specs.CgroupNamespace: configs.NEWCGROUP, 54 specs.TimeNamespace: configs.NEWTIME, 55 } 56 57 mountPropagationMapping = map[string]int{ 58 "rprivate": unix.MS_PRIVATE | unix.MS_REC, 59 "private": unix.MS_PRIVATE, 60 "rslave": unix.MS_SLAVE | unix.MS_REC, 61 "slave": unix.MS_SLAVE, 62 "rshared": unix.MS_SHARED | unix.MS_REC, 63 "shared": unix.MS_SHARED, 64 "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, 65 "unbindable": unix.MS_UNBINDABLE, 66 } 67 68 mountFlags = map[string]struct { 69 clear bool 70 flag int 71 }{ 72 // "acl" cannot be mapped to MS_POSIXACL: https://github.com/opencontainers/runc/issues/3738 73 "async": {true, unix.MS_SYNCHRONOUS}, 74 "atime": {true, unix.MS_NOATIME}, 75 "bind": {false, unix.MS_BIND}, 76 "defaults": {false, 0}, 77 "dev": {true, unix.MS_NODEV}, 78 "diratime": {true, unix.MS_NODIRATIME}, 79 "dirsync": {false, unix.MS_DIRSYNC}, 80 "exec": {true, unix.MS_NOEXEC}, 81 "iversion": {false, unix.MS_I_VERSION}, 82 "lazytime": {false, unix.MS_LAZYTIME}, 83 "loud": {true, unix.MS_SILENT}, 84 "mand": {false, unix.MS_MANDLOCK}, 85 "noatime": {false, unix.MS_NOATIME}, 86 "nodev": {false, unix.MS_NODEV}, 87 "nodiratime": {false, unix.MS_NODIRATIME}, 88 "noexec": {false, unix.MS_NOEXEC}, 89 "noiversion": {true, unix.MS_I_VERSION}, 90 "nolazytime": {true, unix.MS_LAZYTIME}, 91 "nomand": {true, unix.MS_MANDLOCK}, 92 "norelatime": {true, unix.MS_RELATIME}, 93 "nostrictatime": {true, unix.MS_STRICTATIME}, 94 "nosuid": {false, unix.MS_NOSUID}, 95 "nosymfollow": {false, unix.MS_NOSYMFOLLOW}, // since kernel 5.10 96 "rbind": {false, unix.MS_BIND | unix.MS_REC}, 97 "relatime": {false, unix.MS_RELATIME}, 98 "remount": {false, unix.MS_REMOUNT}, 99 "ro": {false, unix.MS_RDONLY}, 100 "rw": {true, unix.MS_RDONLY}, 101 "silent": {false, unix.MS_SILENT}, 102 "strictatime": {false, unix.MS_STRICTATIME}, 103 "suid": {true, unix.MS_NOSUID}, 104 "sync": {false, unix.MS_SYNCHRONOUS}, 105 "symfollow": {true, unix.MS_NOSYMFOLLOW}, // since kernel 5.10 106 } 107 108 recAttrFlags = map[string]struct { 109 clear bool 110 flag uint64 111 }{ 112 "rro": {false, unix.MOUNT_ATTR_RDONLY}, 113 "rrw": {true, unix.MOUNT_ATTR_RDONLY}, 114 "rnosuid": {false, unix.MOUNT_ATTR_NOSUID}, 115 "rsuid": {true, unix.MOUNT_ATTR_NOSUID}, 116 "rnodev": {false, unix.MOUNT_ATTR_NODEV}, 117 "rdev": {true, unix.MOUNT_ATTR_NODEV}, 118 "rnoexec": {false, unix.MOUNT_ATTR_NOEXEC}, 119 "rexec": {true, unix.MOUNT_ATTR_NOEXEC}, 120 "rnodiratime": {false, unix.MOUNT_ATTR_NODIRATIME}, 121 "rdiratime": {true, unix.MOUNT_ATTR_NODIRATIME}, 122 "rrelatime": {false, unix.MOUNT_ATTR_RELATIME}, 123 "rnorelatime": {true, unix.MOUNT_ATTR_RELATIME}, 124 "rnoatime": {false, unix.MOUNT_ATTR_NOATIME}, 125 "ratime": {true, unix.MOUNT_ATTR_NOATIME}, 126 "rstrictatime": {false, unix.MOUNT_ATTR_STRICTATIME}, 127 "rnostrictatime": {true, unix.MOUNT_ATTR_STRICTATIME}, 128 "rnosymfollow": {false, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 129 "rsymfollow": {true, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 130 } 131 132 extensionFlags = map[string]struct { 133 clear bool 134 flag int 135 }{ 136 "tmpcopyup": {false, configs.EXT_COPYUP}, 137 } 138 139 complexFlags = map[string]func(*configs.Mount){ 140 "idmap": func(m *configs.Mount) { 141 m.IDMapping = new(configs.MountIDMapping) 142 m.IDMapping.Recursive = false // noop 143 }, 144 "ridmap": func(m *configs.Mount) { 145 m.IDMapping = new(configs.MountIDMapping) 146 m.IDMapping.Recursive = true 147 }, 148 } 149 }) 150 } 151 152 // KnownNamespaces returns the list of the known namespaces. 153 // Used by `runc features`. 154 func KnownNamespaces() []string { 155 initMaps() 156 var res []string 157 for k := range namespaceMapping { 158 res = append(res, string(k)) 159 } 160 sort.Strings(res) 161 return res 162 } 163 164 // KnownMountOptions returns the list of the known mount options. 165 // Used by `runc features`. 166 func KnownMountOptions() []string { 167 initMaps() 168 var res []string 169 for k := range mountFlags { 170 res = append(res, k) 171 } 172 for k := range mountPropagationMapping { 173 res = append(res, k) 174 } 175 for k := range recAttrFlags { 176 res = append(res, k) 177 } 178 for k := range extensionFlags { 179 res = append(res, k) 180 } 181 sort.Strings(res) 182 return res 183 } 184 185 // AllowedDevices is the set of devices which are automatically included for 186 // all containers. 187 // 188 // # XXX (cyphar) 189 // 190 // This behaviour is at the very least "questionable" (if not outright 191 // wrong) according to the runtime-spec. 192 // 193 // Yes, we have to include certain devices other than the ones the user 194 // specifies, but several devices listed here are not part of the spec 195 // (including "mknod for any device"?!). In addition, these rules are 196 // appended to the user-provided set which means that users *cannot disable 197 // this behaviour*. 198 // 199 // ... unfortunately I'm too scared to change this now because who knows how 200 // many people depend on this (incorrect and arguably insecure) behaviour. 201 var AllowedDevices = []*devices.Device{ 202 // allow mknod for any device 203 { 204 Rule: devices.Rule{ 205 Type: devices.CharDevice, 206 Major: devices.Wildcard, 207 Minor: devices.Wildcard, 208 Permissions: "m", 209 Allow: true, 210 }, 211 }, 212 { 213 Rule: devices.Rule{ 214 Type: devices.BlockDevice, 215 Major: devices.Wildcard, 216 Minor: devices.Wildcard, 217 Permissions: "m", 218 Allow: true, 219 }, 220 }, 221 { 222 Path: "/dev/null", 223 FileMode: 0o666, 224 Uid: 0, 225 Gid: 0, 226 Rule: devices.Rule{ 227 Type: devices.CharDevice, 228 Major: 1, 229 Minor: 3, 230 Permissions: "rwm", 231 Allow: true, 232 }, 233 }, 234 { 235 Path: "/dev/random", 236 FileMode: 0o666, 237 Uid: 0, 238 Gid: 0, 239 Rule: devices.Rule{ 240 Type: devices.CharDevice, 241 Major: 1, 242 Minor: 8, 243 Permissions: "rwm", 244 Allow: true, 245 }, 246 }, 247 { 248 Path: "/dev/full", 249 FileMode: 0o666, 250 Uid: 0, 251 Gid: 0, 252 Rule: devices.Rule{ 253 Type: devices.CharDevice, 254 Major: 1, 255 Minor: 7, 256 Permissions: "rwm", 257 Allow: true, 258 }, 259 }, 260 { 261 Path: "/dev/tty", 262 FileMode: 0o666, 263 Uid: 0, 264 Gid: 0, 265 Rule: devices.Rule{ 266 Type: devices.CharDevice, 267 Major: 5, 268 Minor: 0, 269 Permissions: "rwm", 270 Allow: true, 271 }, 272 }, 273 { 274 Path: "/dev/zero", 275 FileMode: 0o666, 276 Uid: 0, 277 Gid: 0, 278 Rule: devices.Rule{ 279 Type: devices.CharDevice, 280 Major: 1, 281 Minor: 5, 282 Permissions: "rwm", 283 Allow: true, 284 }, 285 }, 286 { 287 Path: "/dev/urandom", 288 FileMode: 0o666, 289 Uid: 0, 290 Gid: 0, 291 Rule: devices.Rule{ 292 Type: devices.CharDevice, 293 Major: 1, 294 Minor: 9, 295 Permissions: "rwm", 296 Allow: true, 297 }, 298 }, 299 // /dev/pts/ - pts namespaces are "coming soon" 300 { 301 Rule: devices.Rule{ 302 Type: devices.CharDevice, 303 Major: 136, 304 Minor: devices.Wildcard, 305 Permissions: "rwm", 306 Allow: true, 307 }, 308 }, 309 { 310 Rule: devices.Rule{ 311 Type: devices.CharDevice, 312 Major: 5, 313 Minor: 2, 314 Permissions: "rwm", 315 Allow: true, 316 }, 317 }, 318 } 319 320 type CreateOpts struct { 321 CgroupName string 322 UseSystemdCgroup bool 323 NoPivotRoot bool 324 NoNewKeyring bool 325 Spec *specs.Spec 326 RootlessEUID bool 327 RootlessCgroups bool 328 } 329 330 // getwd is a wrapper similar to os.Getwd, except it always gets 331 // the value from the kernel, which guarantees the returned value 332 // to be absolute and clean. 333 func getwd() (wd string, err error) { 334 for { 335 wd, err = unix.Getwd() 336 if err != unix.EINTR { 337 break 338 } 339 } 340 return wd, os.NewSyscallError("getwd", err) 341 } 342 343 // CreateLibcontainerConfig creates a new libcontainer configuration from a 344 // given specification and a cgroup name 345 func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { 346 // runc's cwd will always be the bundle path 347 cwd, err := getwd() 348 if err != nil { 349 return nil, err 350 } 351 spec := opts.Spec 352 if spec.Root == nil { 353 return nil, errors.New("root must be specified") 354 } 355 rootfsPath := spec.Root.Path 356 if !filepath.IsAbs(rootfsPath) { 357 rootfsPath = filepath.Join(cwd, rootfsPath) 358 } 359 labels := []string{} 360 for k, v := range spec.Annotations { 361 labels = append(labels, k+"="+v) 362 } 363 config := &configs.Config{ 364 Rootfs: rootfsPath, 365 NoPivotRoot: opts.NoPivotRoot, 366 Readonlyfs: spec.Root.Readonly, 367 Hostname: spec.Hostname, 368 Domainname: spec.Domainname, 369 Labels: append(labels, "bundle="+cwd), 370 NoNewKeyring: opts.NoNewKeyring, 371 RootlessEUID: opts.RootlessEUID, 372 RootlessCgroups: opts.RootlessCgroups, 373 } 374 375 for _, m := range spec.Mounts { 376 cm, err := createLibcontainerMount(cwd, m) 377 if err != nil { 378 return nil, fmt.Errorf("invalid mount %+v: %w", m, err) 379 } 380 config.Mounts = append(config.Mounts, cm) 381 } 382 383 defaultDevs, err := createDevices(spec, config) 384 if err != nil { 385 return nil, err 386 } 387 388 c, err := CreateCgroupConfig(opts, defaultDevs) 389 if err != nil { 390 return nil, err 391 } 392 393 config.Cgroups = c 394 // set linux-specific config 395 if spec.Linux != nil { 396 initMaps() 397 398 if spec.Linux.RootfsPropagation != "" { 399 var exists bool 400 if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { 401 return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) 402 } 403 if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { 404 return nil, errors.New("rootfsPropagation of [r]private is not safe without pivot_root") 405 } 406 } 407 408 for _, ns := range spec.Linux.Namespaces { 409 t, exists := namespaceMapping[ns.Type] 410 if !exists { 411 return nil, fmt.Errorf("namespace %q does not exist", ns) 412 } 413 if config.Namespaces.Contains(t) { 414 return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns) 415 } 416 config.Namespaces.Add(t, ns.Path) 417 } 418 if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { 419 config.Networks = []*configs.Network{ 420 { 421 Type: "loopback", 422 }, 423 } 424 } 425 if config.Namespaces.Contains(configs.NEWUSER) { 426 if err := setupUserNamespace(spec, config); err != nil { 427 return nil, err 428 } 429 // For idmap and ridmap mounts without explicit mappings, use the 430 // ones from the container's userns. If we are joining another 431 // userns, stash the path. 432 for _, m := range config.Mounts { 433 if m.IDMapping != nil && m.IDMapping.UIDMappings == nil && m.IDMapping.GIDMappings == nil { 434 if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { 435 m.IDMapping.UserNSPath = path 436 } else { 437 m.IDMapping.UIDMappings = config.UIDMappings 438 m.IDMapping.GIDMappings = config.GIDMappings 439 } 440 } 441 } 442 } 443 config.MaskPaths = spec.Linux.MaskedPaths 444 config.ReadonlyPaths = spec.Linux.ReadonlyPaths 445 config.MountLabel = spec.Linux.MountLabel 446 config.Sysctl = spec.Linux.Sysctl 447 config.TimeOffsets = spec.Linux.TimeOffsets 448 if spec.Linux.Seccomp != nil { 449 seccomp, err := SetupSeccomp(spec.Linux.Seccomp) 450 if err != nil { 451 return nil, err 452 } 453 config.Seccomp = seccomp 454 } 455 if spec.Linux.IntelRdt != nil { 456 config.IntelRdt = &configs.IntelRdt{ 457 ClosID: spec.Linux.IntelRdt.ClosID, 458 L3CacheSchema: spec.Linux.IntelRdt.L3CacheSchema, 459 MemBwSchema: spec.Linux.IntelRdt.MemBwSchema, 460 } 461 } 462 if spec.Linux.Personality != nil { 463 if len(spec.Linux.Personality.Flags) > 0 { 464 logrus.Warnf("ignoring unsupported personality flags: %+v because personality flag has not supported at this time", spec.Linux.Personality.Flags) 465 } 466 domain, err := getLinuxPersonalityFromStr(string(spec.Linux.Personality.Domain)) 467 if err != nil { 468 return nil, err 469 } 470 config.Personality = &configs.LinuxPersonality{ 471 Domain: domain, 472 } 473 } 474 475 } 476 477 // Set the host UID that should own the container's cgroup. 478 // This must be performed after setupUserNamespace, so that 479 // config.HostRootUID() returns the correct result. 480 // 481 // Only set it if the container will have its own cgroup 482 // namespace and the cgroupfs will be mounted read/write. 483 // 484 hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == "" 485 hasRwCgroupfs := false 486 if hasCgroupNS { 487 for _, m := range config.Mounts { 488 if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 { 489 hasRwCgroupfs = true 490 break 491 } 492 } 493 } 494 processUid := 0 495 if spec.Process != nil { 496 // Chown the cgroup to the UID running the process, 497 // which is not necessarily UID 0 in the container 498 // namespace (e.g., an unprivileged UID in the host 499 // user namespace). 500 processUid = int(spec.Process.User.UID) 501 } 502 if hasCgroupNS && hasRwCgroupfs { 503 ownerUid, err := config.HostUID(processUid) 504 // There are two error cases; we can ignore both. 505 // 506 // 1. uidMappings is unset. Either there is no user 507 // namespace (fine), or it is an error (which is 508 // checked elsewhere). 509 // 510 // 2. The user is unmapped in the user namespace. This is an 511 // unusual configuration and might be an error. But it too 512 // will be checked elsewhere, so we can ignore it here. 513 // 514 if err == nil { 515 config.Cgroups.OwnerUID = &ownerUid 516 } 517 } 518 519 if spec.Process != nil { 520 config.OomScoreAdj = spec.Process.OOMScoreAdj 521 config.NoNewPrivileges = spec.Process.NoNewPrivileges 522 config.Umask = spec.Process.User.Umask 523 config.ProcessLabel = spec.Process.SelinuxLabel 524 if spec.Process.Capabilities != nil { 525 config.Capabilities = &configs.Capabilities{ 526 Bounding: spec.Process.Capabilities.Bounding, 527 Effective: spec.Process.Capabilities.Effective, 528 Permitted: spec.Process.Capabilities.Permitted, 529 Inheritable: spec.Process.Capabilities.Inheritable, 530 Ambient: spec.Process.Capabilities.Ambient, 531 } 532 } 533 if spec.Process.Scheduler != nil { 534 s := *spec.Process.Scheduler 535 config.Scheduler = &s 536 } 537 538 if spec.Process.IOPriority != nil { 539 ioPriority := *spec.Process.IOPriority 540 config.IOPriority = &ioPriority 541 } 542 } 543 createHooks(spec, config) 544 config.Version = specs.Version 545 return config, nil 546 } 547 548 func toConfigIDMap(specMaps []specs.LinuxIDMapping) []configs.IDMap { 549 if specMaps == nil { 550 return nil 551 } 552 idmaps := make([]configs.IDMap, len(specMaps)) 553 for i, id := range specMaps { 554 idmaps[i] = configs.IDMap{ 555 ContainerID: int64(id.ContainerID), 556 HostID: int64(id.HostID), 557 Size: int64(id.Size), 558 } 559 } 560 return idmaps 561 } 562 563 func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error) { 564 if !filepath.IsAbs(m.Destination) { 565 // Relax validation for backward compatibility 566 // TODO (runc v1.x.x): change warning to an error 567 // return nil, fmt.Errorf("mount destination %s is not absolute", m.Destination) 568 logrus.Warnf("mount destination %s is not absolute. Support for non-absolute mount destinations will be removed in a future release.", m.Destination) 569 } 570 mnt := parseMountOptions(m.Options) 571 572 mnt.Destination = m.Destination 573 mnt.Source = m.Source 574 mnt.Device = m.Type 575 if mnt.Flags&unix.MS_BIND != 0 { 576 // Any "type" the user specified is meaningless (and ignored) for 577 // bind-mounts -- so we set it to "bind" because rootfs_linux.go 578 // (incorrectly) relies on this for some checks. 579 mnt.Device = "bind" 580 if !filepath.IsAbs(mnt.Source) { 581 mnt.Source = filepath.Join(cwd, m.Source) 582 } 583 } 584 585 if m.UIDMappings != nil || m.GIDMappings != nil { 586 if mnt.IDMapping == nil { 587 // Neither "idmap" nor "ridmap" were specified. 588 mnt.IDMapping = new(configs.MountIDMapping) 589 } 590 mnt.IDMapping.UIDMappings = toConfigIDMap(m.UIDMappings) 591 mnt.IDMapping.GIDMappings = toConfigIDMap(m.GIDMappings) 592 } 593 594 // None of the mount arguments can contain a null byte. Normally such 595 // strings would either cause some other failure or would just be truncated 596 // when we hit the null byte, but because we serialise these strings as 597 // netlink messages (which don't have special null-byte handling) we need 598 // to block this as early as possible. 599 if strings.IndexByte(mnt.Source, 0) >= 0 || 600 strings.IndexByte(mnt.Destination, 0) >= 0 || 601 strings.IndexByte(mnt.Device, 0) >= 0 { 602 return nil, errors.New("mount field contains null byte") 603 } 604 605 return mnt, nil 606 } 607 608 // checkPropertyName checks if systemd property name is valid. A valid name 609 // should consist of latin letters only, and have least 3 of them. 610 func checkPropertyName(s string) error { 611 if len(s) < 3 { 612 return errors.New("too short") 613 } 614 // Check ASCII characters rather than Unicode runes, 615 // so we have to use indexes rather than range. 616 for i := 0; i < len(s); i++ { 617 ch := s[i] 618 if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') { 619 continue 620 } 621 return errors.New("contains non-alphabetic character") 622 } 623 return nil 624 } 625 626 // getLinuxPersonalityFromStr converts the string domain received from spec to equivalent integer. 627 func getLinuxPersonalityFromStr(domain string) (int, error) { 628 if domain == string(specs.PerLinux32) { 629 return configs.PerLinux32, nil 630 } else if domain == string(specs.PerLinux) { 631 return configs.PerLinux, nil 632 } 633 return -1, fmt.Errorf("invalid personality domain %s", domain) 634 } 635 636 // Some systemd properties are documented as having "Sec" suffix 637 // (e.g. TimeoutStopSec) but are expected to have "USec" suffix 638 // here, so let's provide conversion to improve compatibility. 639 func convertSecToUSec(value dbus.Variant) (dbus.Variant, error) { 640 var sec uint64 641 const M = 1000000 642 vi := value.Value() 643 switch value.Signature().String() { 644 case "y": 645 sec = uint64(vi.(byte)) * M 646 case "n": 647 sec = uint64(vi.(int16)) * M 648 case "q": 649 sec = uint64(vi.(uint16)) * M 650 case "i": 651 sec = uint64(vi.(int32)) * M 652 case "u": 653 sec = uint64(vi.(uint32)) * M 654 case "x": 655 sec = uint64(vi.(int64)) * M 656 case "t": 657 sec = vi.(uint64) * M 658 case "d": 659 sec = uint64(vi.(float64) * M) 660 default: 661 return value, errors.New("not a number") 662 } 663 return dbus.MakeVariant(sec), nil 664 } 665 666 func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) { 667 const keyPrefix = "org.systemd.property." 668 var sp []systemdDbus.Property 669 670 for k, v := range spec.Annotations { 671 name := strings.TrimPrefix(k, keyPrefix) 672 if len(name) == len(k) { // prefix not there 673 continue 674 } 675 if err := checkPropertyName(name); err != nil { 676 return nil, fmt.Errorf("annotation %s name incorrect: %w", k, err) 677 } 678 value, err := dbus.ParseVariant(v, dbus.Signature{}) 679 if err != nil { 680 return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err) 681 } 682 // Check for Sec suffix. 683 if trimName := strings.TrimSuffix(name, "Sec"); len(trimName) < len(name) { 684 // Check for a lowercase ascii a-z just before Sec. 685 if ch := trimName[len(trimName)-1]; ch >= 'a' && ch <= 'z' { 686 // Convert from Sec to USec. 687 name = trimName + "USec" 688 value, err = convertSecToUSec(value) 689 if err != nil { 690 return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err) 691 } 692 } 693 } 694 sp = append(sp, systemdDbus.Property{Name: name, Value: value}) 695 } 696 697 return sp, nil 698 } 699 700 func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*configs.Cgroup, error) { 701 var ( 702 myCgroupPath string 703 704 spec = opts.Spec 705 useSystemdCgroup = opts.UseSystemdCgroup 706 name = opts.CgroupName 707 ) 708 709 c := &configs.Cgroup{ 710 Systemd: useSystemdCgroup, 711 Rootless: opts.RootlessCgroups, 712 Resources: &configs.Resources{}, 713 } 714 715 if useSystemdCgroup { 716 sp, err := initSystemdProps(spec) 717 if err != nil { 718 return nil, err 719 } 720 c.SystemdProps = sp 721 } 722 723 if spec.Linux != nil && spec.Linux.CgroupsPath != "" { 724 if useSystemdCgroup { 725 myCgroupPath = spec.Linux.CgroupsPath 726 } else { 727 myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) 728 } 729 } 730 731 if useSystemdCgroup { 732 if myCgroupPath == "" { 733 // Default for c.Parent is set by systemd cgroup drivers. 734 c.ScopePrefix = "runc" 735 c.Name = name 736 } else { 737 // Parse the path from expected "slice:prefix:name" 738 // for e.g. "system.slice:docker:1234" 739 parts := strings.Split(myCgroupPath, ":") 740 if len(parts) != 3 { 741 return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) 742 } 743 c.Parent = parts[0] 744 c.ScopePrefix = parts[1] 745 c.Name = parts[2] 746 } 747 } else { 748 if myCgroupPath == "" { 749 c.Name = name 750 } 751 c.Path = myCgroupPath 752 } 753 754 // In rootless containers, any attempt to make cgroup changes is likely to fail. 755 // libcontainer will validate this but ignores the error. 756 if spec.Linux != nil { 757 r := spec.Linux.Resources 758 if r != nil { 759 for i, d := range r.Devices { 760 var ( 761 t = "a" 762 major = int64(-1) 763 minor = int64(-1) 764 ) 765 if d.Type != "" { 766 t = d.Type 767 } 768 if d.Major != nil { 769 major = *d.Major 770 } 771 if d.Minor != nil { 772 minor = *d.Minor 773 } 774 if d.Access == "" { 775 return nil, fmt.Errorf("device access at %d field cannot be empty", i) 776 } 777 dt, err := stringToCgroupDeviceRune(t) 778 if err != nil { 779 return nil, err 780 } 781 c.Resources.Devices = append(c.Resources.Devices, &devices.Rule{ 782 Type: dt, 783 Major: major, 784 Minor: minor, 785 Permissions: devices.Permissions(d.Access), 786 Allow: d.Allow, 787 }) 788 } 789 if r.Memory != nil { 790 if r.Memory.Limit != nil { 791 c.Resources.Memory = *r.Memory.Limit 792 } 793 if r.Memory.Reservation != nil { 794 c.Resources.MemoryReservation = *r.Memory.Reservation 795 } 796 if r.Memory.Swap != nil { 797 c.Resources.MemorySwap = *r.Memory.Swap 798 } 799 if r.Memory.Kernel != nil || r.Memory.KernelTCP != nil { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility. 800 logrus.Warn("Kernel memory settings are ignored and will be removed") 801 } 802 if r.Memory.Swappiness != nil { 803 c.Resources.MemorySwappiness = r.Memory.Swappiness 804 } 805 if r.Memory.DisableOOMKiller != nil { 806 c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller 807 } 808 if r.Memory.CheckBeforeUpdate != nil { 809 c.Resources.MemoryCheckBeforeUpdate = *r.Memory.CheckBeforeUpdate 810 } 811 } 812 if r.CPU != nil { 813 if r.CPU.Shares != nil { 814 c.Resources.CpuShares = *r.CPU.Shares 815 816 // CpuWeight is used for cgroupv2 and should be converted 817 c.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(c.Resources.CpuShares) 818 } 819 if r.CPU.Quota != nil { 820 c.Resources.CpuQuota = *r.CPU.Quota 821 } 822 if r.CPU.Burst != nil { 823 c.Resources.CpuBurst = r.CPU.Burst 824 } 825 if r.CPU.Period != nil { 826 c.Resources.CpuPeriod = *r.CPU.Period 827 } 828 if r.CPU.RealtimeRuntime != nil { 829 c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime 830 } 831 if r.CPU.RealtimePeriod != nil { 832 c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod 833 } 834 c.Resources.CpusetCpus = r.CPU.Cpus 835 c.Resources.CpusetMems = r.CPU.Mems 836 c.Resources.CPUIdle = r.CPU.Idle 837 } 838 if r.Pids != nil { 839 c.Resources.PidsLimit = r.Pids.Limit 840 } 841 if r.BlockIO != nil { 842 if r.BlockIO.Weight != nil { 843 c.Resources.BlkioWeight = *r.BlockIO.Weight 844 } 845 if r.BlockIO.LeafWeight != nil { 846 c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight 847 } 848 for _, wd := range r.BlockIO.WeightDevice { 849 var weight, leafWeight uint16 850 if wd.Weight != nil { 851 weight = *wd.Weight 852 } 853 if wd.LeafWeight != nil { 854 leafWeight = *wd.LeafWeight 855 } 856 weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) 857 c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) 858 } 859 for _, td := range r.BlockIO.ThrottleReadBpsDevice { 860 rate := td.Rate 861 throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) 862 c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) 863 } 864 for _, td := range r.BlockIO.ThrottleWriteBpsDevice { 865 rate := td.Rate 866 throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) 867 c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) 868 } 869 for _, td := range r.BlockIO.ThrottleReadIOPSDevice { 870 rate := td.Rate 871 throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) 872 c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) 873 } 874 for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { 875 rate := td.Rate 876 throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) 877 c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) 878 } 879 } 880 for _, l := range r.HugepageLimits { 881 c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ 882 Pagesize: l.Pagesize, 883 Limit: l.Limit, 884 }) 885 } 886 if len(r.Rdma) > 0 { 887 c.Resources.Rdma = make(map[string]configs.LinuxRdma, len(r.Rdma)) 888 for k, v := range r.Rdma { 889 c.Resources.Rdma[k] = configs.LinuxRdma{ 890 HcaHandles: v.HcaHandles, 891 HcaObjects: v.HcaObjects, 892 } 893 } 894 } 895 if r.Network != nil { 896 if r.Network.ClassID != nil { 897 c.Resources.NetClsClassid = *r.Network.ClassID 898 } 899 for _, m := range r.Network.Priorities { 900 c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ 901 Interface: m.Name, 902 Priority: int64(m.Priority), 903 }) 904 } 905 } 906 if len(r.Unified) > 0 { 907 // copy the map 908 c.Resources.Unified = make(map[string]string, len(r.Unified)) 909 for k, v := range r.Unified { 910 c.Resources.Unified[k] = v 911 } 912 } 913 } 914 } 915 916 // Append the default allowed devices to the end of the list. 917 for _, device := range defaultDevs { 918 c.Resources.Devices = append(c.Resources.Devices, &device.Rule) 919 } 920 return c, nil 921 } 922 923 func stringToCgroupDeviceRune(s string) (devices.Type, error) { 924 switch s { 925 case "a": 926 return devices.WildcardDevice, nil 927 case "b": 928 return devices.BlockDevice, nil 929 case "c": 930 return devices.CharDevice, nil 931 default: 932 return 0, fmt.Errorf("invalid cgroup device type %q", s) 933 } 934 } 935 936 func stringToDeviceRune(s string) (devices.Type, error) { 937 switch s { 938 case "p": 939 return devices.FifoDevice, nil 940 case "u", "c": 941 return devices.CharDevice, nil 942 case "b": 943 return devices.BlockDevice, nil 944 default: 945 return 0, fmt.Errorf("invalid device type %q", s) 946 } 947 } 948 949 func createDevices(spec *specs.Spec, config *configs.Config) ([]*devices.Device, error) { 950 // If a spec device is redundant with a default device, remove that default 951 // device (the spec one takes priority). 952 dedupedAllowDevs := []*devices.Device{} 953 954 next: 955 for _, ad := range AllowedDevices { 956 if ad.Path != "" && spec.Linux != nil { 957 for _, sd := range spec.Linux.Devices { 958 if sd.Path == ad.Path { 959 continue next 960 } 961 } 962 } 963 dedupedAllowDevs = append(dedupedAllowDevs, ad) 964 if ad.Path != "" { 965 config.Devices = append(config.Devices, ad) 966 } 967 } 968 969 // Merge in additional devices from the spec. 970 if spec.Linux != nil { 971 for _, d := range spec.Linux.Devices { 972 var uid, gid uint32 973 var filemode os.FileMode = 0o666 974 975 if d.UID != nil { 976 uid = *d.UID 977 } 978 if d.GID != nil { 979 gid = *d.GID 980 } 981 dt, err := stringToDeviceRune(d.Type) 982 if err != nil { 983 return nil, err 984 } 985 if d.FileMode != nil { 986 filemode = *d.FileMode &^ unix.S_IFMT 987 } 988 device := &devices.Device{ 989 Rule: devices.Rule{ 990 Type: dt, 991 Major: d.Major, 992 Minor: d.Minor, 993 }, 994 Path: d.Path, 995 FileMode: filemode, 996 Uid: uid, 997 Gid: gid, 998 } 999 config.Devices = append(config.Devices, device) 1000 } 1001 } 1002 1003 return dedupedAllowDevs, nil 1004 } 1005 1006 func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { 1007 if spec.Linux != nil { 1008 config.UIDMappings = toConfigIDMap(spec.Linux.UIDMappings) 1009 config.GIDMappings = toConfigIDMap(spec.Linux.GIDMappings) 1010 } 1011 if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { 1012 // Cache the current userns mappings in our configuration, so that we 1013 // can calculate uid and gid mappings within runc. These mappings are 1014 // never used for configuring the container if the path is set. 1015 uidMap, gidMap, err := userns.GetUserNamespaceMappings(path) 1016 if err != nil { 1017 return fmt.Errorf("failed to cache mappings for userns: %w", err) 1018 } 1019 // We cannot allow uid or gid mappings to be set if we are also asked 1020 // to join a userns. 1021 if config.UIDMappings != nil || config.GIDMappings != nil { 1022 // FIXME: It turns out that containerd and CRIO pass both a userns 1023 // path and the mappings of the namespace in the same config.json. 1024 // Such a configuration is technically not valid, but we used to 1025 // require mappings be specified, and thus users worked around our 1026 // bug -- so we can't regress it at the moment. But we also don't 1027 // want to produce broken behaviour if the mapping doesn't match 1028 // the userns. So (for now) we output a warning if the actual 1029 // userns mappings match the configuration, otherwise we return an 1030 // error. 1031 if !userns.IsSameMapping(uidMap, config.UIDMappings) || 1032 !userns.IsSameMapping(gidMap, config.GIDMappings) { 1033 return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one") 1034 } 1035 logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on <https://github.com/opencontainers/runc> if you see this warning and cannot update your configuration.") 1036 } 1037 1038 config.UIDMappings = uidMap 1039 config.GIDMappings = gidMap 1040 logrus.WithFields(logrus.Fields{ 1041 "uid_map": uidMap, 1042 "gid_map": gidMap, 1043 }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") 1044 } 1045 rootUID, err := config.HostRootUID() 1046 if err != nil { 1047 return err 1048 } 1049 rootGID, err := config.HostRootGID() 1050 if err != nil { 1051 return err 1052 } 1053 for _, node := range config.Devices { 1054 node.Uid = uint32(rootUID) 1055 node.Gid = uint32(rootGID) 1056 } 1057 return nil 1058 } 1059 1060 // parseMountOptions parses options and returns a configs.Mount 1061 // structure with fields that depends on options set accordingly. 1062 func parseMountOptions(options []string) *configs.Mount { 1063 var ( 1064 data []string 1065 m configs.Mount 1066 recAttrSet, recAttrClr uint64 1067 ) 1068 initMaps() 1069 for _, o := range options { 1070 // If the option does not exist in the mountFlags table, 1071 // or the flag is not supported on the platform, 1072 // then it is a data value for a specific fs type. 1073 if f, exists := mountFlags[o]; exists && f.flag != 0 { 1074 // FIXME: The *atime flags are special (they are more of an enum 1075 // with quite hairy semantics) and thus arguably setting some of 1076 // them should clear unrelated flags. 1077 if f.clear { 1078 m.Flags &= ^f.flag 1079 m.ClearedFlags |= f.flag 1080 } else { 1081 m.Flags |= f.flag 1082 m.ClearedFlags &= ^f.flag 1083 } 1084 } else if f, exists := mountPropagationMapping[o]; exists && f != 0 { 1085 m.PropagationFlags = append(m.PropagationFlags, f) 1086 } else if f, exists := recAttrFlags[o]; exists { 1087 if f.clear { 1088 recAttrClr |= f.flag 1089 recAttrSet &= ^f.flag 1090 } else { 1091 recAttrSet |= f.flag 1092 recAttrClr &= ^f.flag 1093 if f.flag&unix.MOUNT_ATTR__ATIME == f.flag { 1094 // https://man7.org/linux/man-pages/man2/mount_setattr.2.html 1095 // "cannot simply specify the access-time setting in attr_set, but must also include MOUNT_ATTR__ATIME in the attr_clr field." 1096 recAttrClr |= unix.MOUNT_ATTR__ATIME 1097 } 1098 } 1099 } else if f, exists := extensionFlags[o]; exists { 1100 if f.clear { 1101 m.Extensions &= ^f.flag 1102 } else { 1103 m.Extensions |= f.flag 1104 } 1105 } else if fn, exists := complexFlags[o]; exists { 1106 fn(&m) 1107 } else { 1108 data = append(data, o) 1109 } 1110 } 1111 m.Data = strings.Join(data, ",") 1112 if recAttrSet != 0 || recAttrClr != 0 { 1113 m.RecAttr = &unix.MountAttr{ 1114 Attr_set: recAttrSet, 1115 Attr_clr: recAttrClr, 1116 } 1117 } 1118 return &m 1119 } 1120 1121 func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { 1122 if config == nil { 1123 return nil, nil 1124 } 1125 1126 // No default action specified, no syscalls listed, assume seccomp disabled 1127 if config.DefaultAction == "" && len(config.Syscalls) == 0 { 1128 return nil, nil 1129 } 1130 1131 newConfig := new(configs.Seccomp) 1132 newConfig.Syscalls = []*configs.Syscall{} 1133 1134 // The list of flags defined in runtime-spec is a subset of the flags 1135 // in the seccomp() syscall. 1136 if config.Flags == nil { 1137 // No flags are set explicitly (not even the empty set); 1138 // set the default of specs.LinuxSeccompFlagSpecAllow, 1139 // if it is supported by the libseccomp and the kernel. 1140 if err := seccomp.FlagSupported(specs.LinuxSeccompFlagSpecAllow); err == nil { 1141 newConfig.Flags = []specs.LinuxSeccompFlag{specs.LinuxSeccompFlagSpecAllow} 1142 } 1143 } else { 1144 // Fail early if some flags are unknown or unsupported. 1145 for _, flag := range config.Flags { 1146 if err := seccomp.FlagSupported(flag); err != nil { 1147 return nil, err 1148 } 1149 newConfig.Flags = append(newConfig.Flags, flag) 1150 } 1151 } 1152 1153 if len(config.Architectures) > 0 { 1154 newConfig.Architectures = []string{} 1155 for _, arch := range config.Architectures { 1156 newArch, err := seccomp.ConvertStringToArch(string(arch)) 1157 if err != nil { 1158 return nil, err 1159 } 1160 newConfig.Architectures = append(newConfig.Architectures, newArch) 1161 } 1162 } 1163 1164 // Convert default action from string representation 1165 newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) 1166 if err != nil { 1167 return nil, err 1168 } 1169 newConfig.DefaultAction = newDefaultAction 1170 newConfig.DefaultErrnoRet = config.DefaultErrnoRet 1171 1172 newConfig.ListenerPath = config.ListenerPath 1173 newConfig.ListenerMetadata = config.ListenerMetadata 1174 1175 // Loop through all syscall blocks and convert them to libcontainer format 1176 for _, call := range config.Syscalls { 1177 newAction, err := seccomp.ConvertStringToAction(string(call.Action)) 1178 if err != nil { 1179 return nil, err 1180 } 1181 1182 for _, name := range call.Names { 1183 newCall := configs.Syscall{ 1184 Name: name, 1185 Action: newAction, 1186 ErrnoRet: call.ErrnoRet, 1187 Args: []*configs.Arg{}, 1188 } 1189 // Loop through all the arguments of the syscall and convert them 1190 for _, arg := range call.Args { 1191 newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) 1192 if err != nil { 1193 return nil, err 1194 } 1195 1196 newArg := configs.Arg{ 1197 Index: arg.Index, 1198 Value: arg.Value, 1199 ValueTwo: arg.ValueTwo, 1200 Op: newOp, 1201 } 1202 1203 newCall.Args = append(newCall.Args, &newArg) 1204 } 1205 newConfig.Syscalls = append(newConfig.Syscalls, &newCall) 1206 } 1207 } 1208 1209 return newConfig, nil 1210 } 1211 1212 func createHooks(rspec *specs.Spec, config *configs.Config) { 1213 config.Hooks = configs.Hooks{} 1214 if rspec.Hooks != nil { 1215 for _, h := range rspec.Hooks.Prestart { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility. 1216 cmd := createCommandHook(h) 1217 config.Hooks[configs.Prestart] = append(config.Hooks[configs.Prestart], configs.NewCommandHook(cmd)) 1218 } 1219 for _, h := range rspec.Hooks.CreateRuntime { 1220 cmd := createCommandHook(h) 1221 config.Hooks[configs.CreateRuntime] = append(config.Hooks[configs.CreateRuntime], configs.NewCommandHook(cmd)) 1222 } 1223 for _, h := range rspec.Hooks.CreateContainer { 1224 cmd := createCommandHook(h) 1225 config.Hooks[configs.CreateContainer] = append(config.Hooks[configs.CreateContainer], configs.NewCommandHook(cmd)) 1226 } 1227 for _, h := range rspec.Hooks.StartContainer { 1228 cmd := createCommandHook(h) 1229 config.Hooks[configs.StartContainer] = append(config.Hooks[configs.StartContainer], configs.NewCommandHook(cmd)) 1230 } 1231 for _, h := range rspec.Hooks.Poststart { 1232 cmd := createCommandHook(h) 1233 config.Hooks[configs.Poststart] = append(config.Hooks[configs.Poststart], configs.NewCommandHook(cmd)) 1234 } 1235 for _, h := range rspec.Hooks.Poststop { 1236 cmd := createCommandHook(h) 1237 config.Hooks[configs.Poststop] = append(config.Hooks[configs.Poststop], configs.NewCommandHook(cmd)) 1238 } 1239 } 1240 } 1241 1242 func createCommandHook(h specs.Hook) configs.Command { 1243 cmd := configs.Command{ 1244 Path: h.Path, 1245 Args: h.Args, 1246 Env: h.Env, 1247 } 1248 if h.Timeout != nil { 1249 d := time.Duration(*h.Timeout) * time.Second 1250 cmd.Timeout = &d 1251 } 1252 return cmd 1253 }