github.com/containerd/nerdctl/v2@v2.0.0-beta.5.0.20240520001846-b5758f54fa28/pkg/cmd/container/run_cgroup_linux.go (about) 1 /* 2 Copyright The containerd Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package container 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "path/filepath" 24 "strings" 25 26 "github.com/containerd/containerd/containers" 27 "github.com/containerd/containerd/oci" 28 "github.com/containerd/log" 29 "github.com/containerd/nerdctl/v2/pkg/api/types" 30 "github.com/containerd/nerdctl/v2/pkg/infoutil" 31 "github.com/containerd/nerdctl/v2/pkg/rootlessutil" 32 "github.com/docker/go-units" 33 "github.com/opencontainers/runtime-spec/specs-go" 34 ) 35 36 type customMemoryOptions struct { 37 MemoryReservation *int64 38 MemorySwappiness *uint64 39 disableOOMKiller *bool 40 } 41 42 func generateCgroupOpts(id string, options types.ContainerCreateOptions) ([]oci.SpecOpts, error) { 43 if options.KernelMemory != "" { 44 log.L.Warnf("The --kernel-memory flag is no longer supported. This flag is a noop.") 45 } 46 47 if options.Memory == "" && options.OomKillDisable { 48 log.L.Warn("Disabling the OOM killer on containers without setting a '-m/--memory' limit may be dangerous.") 49 } 50 51 if options.GOptions.CgroupManager == "none" { 52 if !rootlessutil.IsRootless() { 53 return nil, errors.New(`cgroup-manager "none" is only supported for rootless`) 54 } 55 56 if options.CPUs > 0.0 || options.Memory != "" || options.MemorySwap != "" || options.PidsLimit > 0 { 57 log.L.Warn(`cgroup manager is set to "none", discarding resource limit requests. ` + 58 "(Hint: enable cgroup v2 with systemd: https://rootlesscontaine.rs/getting-started/common/cgroup2/)") 59 } 60 if options.CgroupParent != "" { 61 log.L.Warnf(`cgroup manager is set to "none", ignoring cgroup parent %q`+ 62 "(Hint: enable cgroup v2 with systemd: https://rootlesscontaine.rs/getting-started/common/cgroup2/)", options.CgroupParent) 63 } 64 return []oci.SpecOpts{oci.WithCgroup("")}, nil 65 } 66 67 var opts []oci.SpecOpts // nolint: prealloc 68 path, err := generateCgroupPath(id, options.GOptions.CgroupManager, options.CgroupParent) 69 if err != nil { 70 return nil, err 71 } 72 if path != "" { 73 opts = append(opts, oci.WithCgroup(path)) 74 } 75 76 // cpus: from https://github.com/containerd/containerd/blob/v1.4.3/cmd/ctr/commands/run/run_unix.go#L187-L193 77 if options.CPUs > 0.0 { 78 var ( 79 period = uint64(100000) 80 quota = int64(options.CPUs * 100000.0) 81 ) 82 opts = append(opts, oci.WithCPUCFS(quota, period)) 83 } 84 85 if options.CPUShares != 0 { 86 opts = append(opts, oci.WithCPUShares(options.CPUShares)) 87 } 88 89 if options.CPUSetCPUs != "" { 90 opts = append(opts, oci.WithCPUs(options.CPUSetCPUs)) 91 } 92 if options.CPUQuota != -1 || options.CPUPeriod != 0 { 93 if options.CPUs > 0.0 { 94 return nil, errors.New("cpus and quota/period should be used separately") 95 } 96 opts = append(opts, oci.WithCPUCFS(options.CPUQuota, options.CPUPeriod)) 97 } 98 if options.CPUSetMems != "" { 99 opts = append(opts, oci.WithCPUsMems(options.CPUSetMems)) 100 } 101 102 var mem64 int64 103 if options.Memory != "" { 104 mem64, err = units.RAMInBytes(options.Memory) 105 if err != nil { 106 return nil, fmt.Errorf("failed to parse memory bytes %q: %w", options.Memory, err) 107 } 108 opts = append(opts, oci.WithMemoryLimit(uint64(mem64))) 109 } 110 111 var memReserve64 int64 112 if options.MemoryReservation != "" { 113 memReserve64, err = units.RAMInBytes(options.MemoryReservation) 114 if err != nil { 115 return nil, fmt.Errorf("failed to parse memory bytes %q: %w", options.MemoryReservation, err) 116 } 117 } 118 var memSwap64 int64 119 if options.MemorySwap != "" { 120 if options.MemorySwap == "-1" { 121 memSwap64 = -1 122 } else { 123 memSwap64, err = units.RAMInBytes(options.MemorySwap) 124 if err != nil { 125 return nil, fmt.Errorf("failed to parse memory-swap bytes %q: %w", options.MemorySwap, err) 126 } 127 if mem64 > 0 && memSwap64 > 0 && memSwap64 < mem64 { 128 return nil, fmt.Errorf("minimum memoryswap limit should be larger than memory limit, see usage") 129 } 130 } 131 } else { 132 // if `--memory-swap` is unset, the container can use as much swap as the `--memory` setting. 133 memSwap64 = mem64 * 2 134 } 135 if memSwap64 == 0 { 136 // if --memory-swap is set to 0, the setting is ignored, and the value is treated as unset. 137 memSwap64 = mem64 * 2 138 } 139 if memSwap64 != 0 { 140 opts = append(opts, oci.WithMemorySwap(memSwap64)) 141 } 142 if mem64 > 0 && memReserve64 > 0 && mem64 < memReserve64 { 143 return nil, fmt.Errorf("minimum memory limit can not be less than memory reservation limit, see usage") 144 } 145 if options.MemorySwappiness64 > 100 || options.MemorySwappiness64 < -1 { 146 return nil, fmt.Errorf("invalid value: %v, valid memory swappiness range is 0-100", options.MemorySwappiness64) 147 } 148 149 var customMemRes customMemoryOptions 150 if memReserve64 >= 0 && options.MemoryReservationChanged { 151 customMemRes.MemoryReservation = &memReserve64 152 } 153 if options.MemorySwappiness64 >= 0 && options.MemorySwappiness64Changed { 154 memSwapinessUint64 := uint64(options.MemorySwappiness64) 155 customMemRes.MemorySwappiness = &memSwapinessUint64 156 } 157 if options.OomKillDisable { 158 customMemRes.disableOOMKiller = &options.OomKillDisable 159 } 160 opts = append(opts, withCustomMemoryResources(customMemRes)) 161 162 if options.PidsLimit > 0 { 163 opts = append(opts, oci.WithPidsLimit(options.PidsLimit)) 164 } 165 166 if len(options.CgroupConf) > 0 && infoutil.CgroupsVersion() == "1" { 167 return nil, errors.New("cannot use --cgroup-conf without cgroup v2") 168 } 169 170 unifieds := make(map[string]string) 171 for _, unified := range options.CgroupConf { 172 splitUnified := strings.SplitN(unified, "=", 2) 173 if len(splitUnified) < 2 { 174 return nil, errors.New("--cgroup-conf must be formatted KEY=VALUE") 175 } 176 unifieds[splitUnified[0]] = splitUnified[1] 177 } 178 opts = append(opts, withUnified(unifieds)) 179 180 if options.BlkioWeight != 0 && !infoutil.BlockIOWeight(options.GOptions.CgroupManager) { 181 log.L.Warn("kernel support for cgroup blkio weight missing, weight discarded") 182 options.BlkioWeight = 0 183 } 184 if options.BlkioWeight > 0 && options.BlkioWeight < 10 || options.BlkioWeight > 1000 { 185 return nil, errors.New("range of blkio weight is from 10 to 1000") 186 } 187 opts = append(opts, withBlkioWeight(options.BlkioWeight)) 188 189 switch options.Cgroupns { 190 case "private": 191 ns := specs.LinuxNamespace{ 192 Type: specs.CgroupNamespace, 193 } 194 opts = append(opts, oci.WithLinuxNamespace(ns)) 195 case "host": 196 opts = append(opts, oci.WithHostNamespace(specs.CgroupNamespace)) 197 default: 198 return nil, fmt.Errorf("unknown cgroupns mode %q", options.Cgroupns) 199 } 200 201 for _, f := range options.Device { 202 devPath, conPath, mode, err := ParseDevice(f) 203 if err != nil { 204 return nil, fmt.Errorf("failed to parse device %q: %w", f, err) 205 } 206 opts = append(opts, oci.WithDevices(devPath, conPath, mode)) 207 } 208 209 return opts, nil 210 } 211 212 func generateCgroupPath(id, cgroupManager, cgroupParent string) (string, error) { 213 var ( 214 path string 215 usingSystemd = cgroupManager == "systemd" 216 slice = "system.slice" 217 scopePrefix = ":nerdctl:" 218 ) 219 if rootlessutil.IsRootlessChild() { 220 slice = "user.slice" 221 } 222 223 if cgroupParent == "" { 224 if usingSystemd { 225 // "slice:prefix:name" 226 path = slice + scopePrefix + id 227 } 228 // Nothing to do for the non-systemd case if a parent wasn't supplied, 229 // containerd already sets a default cgroup path as /<namespace>/<containerID> 230 return path, nil 231 } 232 233 // If the user asked for a cgroup parent, we will use systemd, 234 // Docker uses the following: 235 // parent + prefix (in our case, nerdctl) + containerID. 236 // 237 // In the non systemd case, it's just /parent/containerID 238 if usingSystemd { 239 if len(cgroupParent) <= 6 || !strings.HasSuffix(cgroupParent, ".slice") { 240 return "", errors.New(`cgroup-parent for systemd cgroup should be a valid slice named as "xxx.slice"`) 241 } 242 path = cgroupParent + scopePrefix + id 243 } else { 244 path = filepath.Join(cgroupParent, id) 245 } 246 247 return path, nil 248 } 249 250 // ParseDevice parses the give device string into hostDevPath, containerPath and mode(defaults: "rwm"). 251 func ParseDevice(s string) (hostDevPath string, containerPath string, mode string, err error) { 252 mode = "rwm" 253 split := strings.Split(s, ":") 254 var containerDevPath string 255 switch len(split) { 256 case 1: // e.g. "/dev/sda1" 257 hostDevPath = split[0] 258 containerDevPath = hostDevPath 259 case 2: // e.g., "/dev/sda1:rwm", or "/dev/sda1:/dev/sda1 260 hostDevPath = split[0] 261 if !strings.Contains(split[1], "/") { 262 containerDevPath = hostDevPath 263 mode = split[1] 264 } else { 265 containerDevPath = split[1] 266 } 267 case 3: // e.g., "/dev/sda1:/dev/sda1:rwm" 268 hostDevPath = split[0] 269 containerDevPath = split[1] 270 mode = split[2] 271 default: 272 return "", "", "", errors.New("too many `:` symbols") 273 } 274 275 if !filepath.IsAbs(hostDevPath) { 276 return "", "", "", fmt.Errorf("%q is not an absolute path", hostDevPath) 277 } 278 279 if err := validateDeviceMode(mode); err != nil { 280 return "", "", "", err 281 } 282 return hostDevPath, containerDevPath, mode, nil 283 } 284 285 func validateDeviceMode(mode string) error { 286 for _, r := range mode { 287 switch r { 288 case 'r', 'w', 'm': 289 default: 290 return fmt.Errorf("invalid mode %q: unexpected rune %v", mode, r) 291 } 292 } 293 return nil 294 } 295 296 func withUnified(unified map[string]string) oci.SpecOpts { 297 return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) (err error) { 298 if unified == nil { 299 return nil 300 } 301 s.Linux.Resources.Unified = make(map[string]string) 302 for k, v := range unified { 303 s.Linux.Resources.Unified[k] = v 304 } 305 return nil 306 } 307 } 308 309 func withBlkioWeight(blkioWeight uint16) oci.SpecOpts { 310 return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error { 311 if blkioWeight == 0 { 312 return nil 313 } 314 s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{Weight: &blkioWeight} 315 return nil 316 } 317 } 318 319 func withCustomMemoryResources(memoryOptions customMemoryOptions) oci.SpecOpts { 320 return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error { 321 if s.Linux != nil { 322 if s.Linux.Resources == nil { 323 s.Linux.Resources = &specs.LinuxResources{} 324 } 325 if s.Linux.Resources.Memory == nil { 326 s.Linux.Resources.Memory = &specs.LinuxMemory{} 327 } 328 if memoryOptions.disableOOMKiller != nil { 329 s.Linux.Resources.Memory.DisableOOMKiller = memoryOptions.disableOOMKiller 330 } 331 if memoryOptions.MemorySwappiness != nil { 332 s.Linux.Resources.Memory.Swappiness = memoryOptions.MemorySwappiness 333 } 334 if memoryOptions.MemoryReservation != nil { 335 s.Linux.Resources.Memory.Reservation = memoryOptions.MemoryReservation 336 } 337 } 338 return nil 339 } 340 }