github.com/containerd/nerdctl/v2@v2.0.0-beta.5.0.20240520001846-b5758f54fa28/pkg/cmd/container/run_cgroup_linux.go (about)

     1  /*
     2     Copyright The containerd Authors.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package container
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"path/filepath"
    24  	"strings"
    25  
    26  	"github.com/containerd/containerd/containers"
    27  	"github.com/containerd/containerd/oci"
    28  	"github.com/containerd/log"
    29  	"github.com/containerd/nerdctl/v2/pkg/api/types"
    30  	"github.com/containerd/nerdctl/v2/pkg/infoutil"
    31  	"github.com/containerd/nerdctl/v2/pkg/rootlessutil"
    32  	"github.com/docker/go-units"
    33  	"github.com/opencontainers/runtime-spec/specs-go"
    34  )
    35  
// customMemoryOptions bundles the optional memory settings that
// withCustomMemoryResources copies into the OCI spec. Every field is a
// pointer; a nil field is skipped and leaves the spec value untouched.
type customMemoryOptions struct {
	// MemoryReservation, when non-nil, is stored in the spec's Memory.Reservation.
	MemoryReservation *int64
	// MemorySwappiness, when non-nil, is stored in the spec's Memory.Swappiness.
	MemorySwappiness *uint64
	// disableOOMKiller, when non-nil, is stored in the spec's Memory.DisableOOMKiller.
	disableOOMKiller *bool
}
    41  
    42  func generateCgroupOpts(id string, options types.ContainerCreateOptions) ([]oci.SpecOpts, error) {
    43  	if options.KernelMemory != "" {
    44  		log.L.Warnf("The --kernel-memory flag is no longer supported. This flag is a noop.")
    45  	}
    46  
    47  	if options.Memory == "" && options.OomKillDisable {
    48  		log.L.Warn("Disabling the OOM killer on containers without setting a '-m/--memory' limit may be dangerous.")
    49  	}
    50  
    51  	if options.GOptions.CgroupManager == "none" {
    52  		if !rootlessutil.IsRootless() {
    53  			return nil, errors.New(`cgroup-manager "none" is only supported for rootless`)
    54  		}
    55  
    56  		if options.CPUs > 0.0 || options.Memory != "" || options.MemorySwap != "" || options.PidsLimit > 0 {
    57  			log.L.Warn(`cgroup manager is set to "none", discarding resource limit requests. ` +
    58  				"(Hint: enable cgroup v2 with systemd: https://rootlesscontaine.rs/getting-started/common/cgroup2/)")
    59  		}
    60  		if options.CgroupParent != "" {
    61  			log.L.Warnf(`cgroup manager is set to "none", ignoring cgroup parent %q`+
    62  				"(Hint: enable cgroup v2 with systemd: https://rootlesscontaine.rs/getting-started/common/cgroup2/)", options.CgroupParent)
    63  		}
    64  		return []oci.SpecOpts{oci.WithCgroup("")}, nil
    65  	}
    66  
    67  	var opts []oci.SpecOpts // nolint: prealloc
    68  	path, err := generateCgroupPath(id, options.GOptions.CgroupManager, options.CgroupParent)
    69  	if err != nil {
    70  		return nil, err
    71  	}
    72  	if path != "" {
    73  		opts = append(opts, oci.WithCgroup(path))
    74  	}
    75  
    76  	// cpus: from https://github.com/containerd/containerd/blob/v1.4.3/cmd/ctr/commands/run/run_unix.go#L187-L193
    77  	if options.CPUs > 0.0 {
    78  		var (
    79  			period = uint64(100000)
    80  			quota  = int64(options.CPUs * 100000.0)
    81  		)
    82  		opts = append(opts, oci.WithCPUCFS(quota, period))
    83  	}
    84  
    85  	if options.CPUShares != 0 {
    86  		opts = append(opts, oci.WithCPUShares(options.CPUShares))
    87  	}
    88  
    89  	if options.CPUSetCPUs != "" {
    90  		opts = append(opts, oci.WithCPUs(options.CPUSetCPUs))
    91  	}
    92  	if options.CPUQuota != -1 || options.CPUPeriod != 0 {
    93  		if options.CPUs > 0.0 {
    94  			return nil, errors.New("cpus and quota/period should be used separately")
    95  		}
    96  		opts = append(opts, oci.WithCPUCFS(options.CPUQuota, options.CPUPeriod))
    97  	}
    98  	if options.CPUSetMems != "" {
    99  		opts = append(opts, oci.WithCPUsMems(options.CPUSetMems))
   100  	}
   101  
   102  	var mem64 int64
   103  	if options.Memory != "" {
   104  		mem64, err = units.RAMInBytes(options.Memory)
   105  		if err != nil {
   106  			return nil, fmt.Errorf("failed to parse memory bytes %q: %w", options.Memory, err)
   107  		}
   108  		opts = append(opts, oci.WithMemoryLimit(uint64(mem64)))
   109  	}
   110  
   111  	var memReserve64 int64
   112  	if options.MemoryReservation != "" {
   113  		memReserve64, err = units.RAMInBytes(options.MemoryReservation)
   114  		if err != nil {
   115  			return nil, fmt.Errorf("failed to parse memory bytes %q: %w", options.MemoryReservation, err)
   116  		}
   117  	}
   118  	var memSwap64 int64
   119  	if options.MemorySwap != "" {
   120  		if options.MemorySwap == "-1" {
   121  			memSwap64 = -1
   122  		} else {
   123  			memSwap64, err = units.RAMInBytes(options.MemorySwap)
   124  			if err != nil {
   125  				return nil, fmt.Errorf("failed to parse memory-swap bytes %q: %w", options.MemorySwap, err)
   126  			}
   127  			if mem64 > 0 && memSwap64 > 0 && memSwap64 < mem64 {
   128  				return nil, fmt.Errorf("minimum memoryswap limit should be larger than memory limit, see usage")
   129  			}
   130  		}
   131  	} else {
   132  		// if `--memory-swap` is unset, the container can use as much swap as the `--memory` setting.
   133  		memSwap64 = mem64 * 2
   134  	}
   135  	if memSwap64 == 0 {
   136  		// if --memory-swap is set to 0, the setting is ignored, and the value is treated as unset.
   137  		memSwap64 = mem64 * 2
   138  	}
   139  	if memSwap64 != 0 {
   140  		opts = append(opts, oci.WithMemorySwap(memSwap64))
   141  	}
   142  	if mem64 > 0 && memReserve64 > 0 && mem64 < memReserve64 {
   143  		return nil, fmt.Errorf("minimum memory limit can not be less than memory reservation limit, see usage")
   144  	}
   145  	if options.MemorySwappiness64 > 100 || options.MemorySwappiness64 < -1 {
   146  		return nil, fmt.Errorf("invalid value: %v, valid memory swappiness range is 0-100", options.MemorySwappiness64)
   147  	}
   148  
   149  	var customMemRes customMemoryOptions
   150  	if memReserve64 >= 0 && options.MemoryReservationChanged {
   151  		customMemRes.MemoryReservation = &memReserve64
   152  	}
   153  	if options.MemorySwappiness64 >= 0 && options.MemorySwappiness64Changed {
   154  		memSwapinessUint64 := uint64(options.MemorySwappiness64)
   155  		customMemRes.MemorySwappiness = &memSwapinessUint64
   156  	}
   157  	if options.OomKillDisable {
   158  		customMemRes.disableOOMKiller = &options.OomKillDisable
   159  	}
   160  	opts = append(opts, withCustomMemoryResources(customMemRes))
   161  
   162  	if options.PidsLimit > 0 {
   163  		opts = append(opts, oci.WithPidsLimit(options.PidsLimit))
   164  	}
   165  
   166  	if len(options.CgroupConf) > 0 && infoutil.CgroupsVersion() == "1" {
   167  		return nil, errors.New("cannot use --cgroup-conf without cgroup v2")
   168  	}
   169  
   170  	unifieds := make(map[string]string)
   171  	for _, unified := range options.CgroupConf {
   172  		splitUnified := strings.SplitN(unified, "=", 2)
   173  		if len(splitUnified) < 2 {
   174  			return nil, errors.New("--cgroup-conf must be formatted KEY=VALUE")
   175  		}
   176  		unifieds[splitUnified[0]] = splitUnified[1]
   177  	}
   178  	opts = append(opts, withUnified(unifieds))
   179  
   180  	if options.BlkioWeight != 0 && !infoutil.BlockIOWeight(options.GOptions.CgroupManager) {
   181  		log.L.Warn("kernel support for cgroup blkio weight missing, weight discarded")
   182  		options.BlkioWeight = 0
   183  	}
   184  	if options.BlkioWeight > 0 && options.BlkioWeight < 10 || options.BlkioWeight > 1000 {
   185  		return nil, errors.New("range of blkio weight is from 10 to 1000")
   186  	}
   187  	opts = append(opts, withBlkioWeight(options.BlkioWeight))
   188  
   189  	switch options.Cgroupns {
   190  	case "private":
   191  		ns := specs.LinuxNamespace{
   192  			Type: specs.CgroupNamespace,
   193  		}
   194  		opts = append(opts, oci.WithLinuxNamespace(ns))
   195  	case "host":
   196  		opts = append(opts, oci.WithHostNamespace(specs.CgroupNamespace))
   197  	default:
   198  		return nil, fmt.Errorf("unknown cgroupns mode %q", options.Cgroupns)
   199  	}
   200  
   201  	for _, f := range options.Device {
   202  		devPath, conPath, mode, err := ParseDevice(f)
   203  		if err != nil {
   204  			return nil, fmt.Errorf("failed to parse device %q: %w", f, err)
   205  		}
   206  		opts = append(opts, oci.WithDevices(devPath, conPath, mode))
   207  	}
   208  
   209  	return opts, nil
   210  }
   211  
   212  func generateCgroupPath(id, cgroupManager, cgroupParent string) (string, error) {
   213  	var (
   214  		path         string
   215  		usingSystemd = cgroupManager == "systemd"
   216  		slice        = "system.slice"
   217  		scopePrefix  = ":nerdctl:"
   218  	)
   219  	if rootlessutil.IsRootlessChild() {
   220  		slice = "user.slice"
   221  	}
   222  
   223  	if cgroupParent == "" {
   224  		if usingSystemd {
   225  			// "slice:prefix:name"
   226  			path = slice + scopePrefix + id
   227  		}
   228  		// Nothing to do for the non-systemd case if a parent wasn't supplied,
   229  		// containerd already sets a default cgroup path as /<namespace>/<containerID>
   230  		return path, nil
   231  	}
   232  
   233  	// If the user asked for a cgroup parent, we will use systemd,
   234  	// Docker uses the following:
   235  	// parent + prefix (in our case, nerdctl) + containerID.
   236  	//
   237  	// In the non systemd case, it's just /parent/containerID
   238  	if usingSystemd {
   239  		if len(cgroupParent) <= 6 || !strings.HasSuffix(cgroupParent, ".slice") {
   240  			return "", errors.New(`cgroup-parent for systemd cgroup should be a valid slice named as "xxx.slice"`)
   241  		}
   242  		path = cgroupParent + scopePrefix + id
   243  	} else {
   244  		path = filepath.Join(cgroupParent, id)
   245  	}
   246  
   247  	return path, nil
   248  }
   249  
   250  // ParseDevice parses the give device string into hostDevPath, containerPath and mode(defaults: "rwm").
   251  func ParseDevice(s string) (hostDevPath string, containerPath string, mode string, err error) {
   252  	mode = "rwm"
   253  	split := strings.Split(s, ":")
   254  	var containerDevPath string
   255  	switch len(split) {
   256  	case 1: // e.g. "/dev/sda1"
   257  		hostDevPath = split[0]
   258  		containerDevPath = hostDevPath
   259  	case 2: // e.g., "/dev/sda1:rwm", or "/dev/sda1:/dev/sda1
   260  		hostDevPath = split[0]
   261  		if !strings.Contains(split[1], "/") {
   262  			containerDevPath = hostDevPath
   263  			mode = split[1]
   264  		} else {
   265  			containerDevPath = split[1]
   266  		}
   267  	case 3: // e.g., "/dev/sda1:/dev/sda1:rwm"
   268  		hostDevPath = split[0]
   269  		containerDevPath = split[1]
   270  		mode = split[2]
   271  	default:
   272  		return "", "", "", errors.New("too many `:` symbols")
   273  	}
   274  
   275  	if !filepath.IsAbs(hostDevPath) {
   276  		return "", "", "", fmt.Errorf("%q is not an absolute path", hostDevPath)
   277  	}
   278  
   279  	if err := validateDeviceMode(mode); err != nil {
   280  		return "", "", "", err
   281  	}
   282  	return hostDevPath, containerDevPath, mode, nil
   283  }
   284  
   285  func validateDeviceMode(mode string) error {
   286  	for _, r := range mode {
   287  		switch r {
   288  		case 'r', 'w', 'm':
   289  		default:
   290  			return fmt.Errorf("invalid mode %q: unexpected rune %v", mode, r)
   291  		}
   292  	}
   293  	return nil
   294  }
   295  
   296  func withUnified(unified map[string]string) oci.SpecOpts {
   297  	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) (err error) {
   298  		if unified == nil {
   299  			return nil
   300  		}
   301  		s.Linux.Resources.Unified = make(map[string]string)
   302  		for k, v := range unified {
   303  			s.Linux.Resources.Unified[k] = v
   304  		}
   305  		return nil
   306  	}
   307  }
   308  
   309  func withBlkioWeight(blkioWeight uint16) oci.SpecOpts {
   310  	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error {
   311  		if blkioWeight == 0 {
   312  			return nil
   313  		}
   314  		s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{Weight: &blkioWeight}
   315  		return nil
   316  	}
   317  }
   318  
   319  func withCustomMemoryResources(memoryOptions customMemoryOptions) oci.SpecOpts {
   320  	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *oci.Spec) error {
   321  		if s.Linux != nil {
   322  			if s.Linux.Resources == nil {
   323  				s.Linux.Resources = &specs.LinuxResources{}
   324  			}
   325  			if s.Linux.Resources.Memory == nil {
   326  				s.Linux.Resources.Memory = &specs.LinuxMemory{}
   327  			}
   328  			if memoryOptions.disableOOMKiller != nil {
   329  				s.Linux.Resources.Memory.DisableOOMKiller = memoryOptions.disableOOMKiller
   330  			}
   331  			if memoryOptions.MemorySwappiness != nil {
   332  				s.Linux.Resources.Memory.Swappiness = memoryOptions.MemorySwappiness
   333  			}
   334  			if memoryOptions.MemoryReservation != nil {
   335  				s.Linux.Resources.Memory.Reservation = memoryOptions.MemoryReservation
   336  			}
   337  		}
   338  		return nil
   339  	}
   340  }