k8s.io/kubernetes@v1.29.3/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go (about)

     1  //go:build linux
     2  // +build linux
     3  
     4  /*
     5  Copyright 2018 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package kuberuntime
    21  
import (
	"errors"
	"fmt"
	"math"
	"math/bits"
	"os"
	"path/filepath"
	"strconv"
	"sync"
	"time"

	"github.com/containerd/cgroups"
	cadvisorv1 "github.com/google/cadvisor/info/v1"
	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/qos"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)
    49  
// defaultPageSize is the memory page size reported by the operating system,
// in bytes. It is used to round the computed memory.high value down to a
// multiple of the page size.
var defaultPageSize = int64(os.Getpagesize())
    51  
    52  // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
    53  func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
    54  	enforceMemoryQoS := false
    55  	// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
    56  	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
    57  		isCgroup2UnifiedMode() {
    58  		enforceMemoryQoS = true
    59  	}
    60  	cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	config.Linux = cl
    65  
    66  	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.UserNamespacesSupport) {
    67  		if cl.SecurityContext.NamespaceOptions.UsernsOptions != nil {
    68  			for _, mount := range config.Mounts {
    69  				mount.UidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Uids
    70  				mount.GidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Gids
    71  			}
    72  		}
    73  	}
    74  	return nil
    75  }
    76  
    77  // generateLinuxContainerConfig generates linux container config for kubelet runtime v1.
    78  func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) {
    79  	sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username)
    80  	if err != nil {
    81  		return nil, err
    82  	}
    83  	lc := &runtimeapi.LinuxContainerConfig{
    84  		Resources:       m.generateLinuxContainerResources(pod, container, enforceMemoryQoS),
    85  		SecurityContext: sc,
    86  	}
    87  
    88  	if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER {
    89  		lc.SecurityContext.NamespaceOptions.Pid = runtimeapi.NamespaceMode_TARGET
    90  		lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID
    91  	}
    92  
    93  	return lc, nil
    94  }
    95  
    96  // generateLinuxContainerResources generates linux container resources config for runtime
    97  func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, container *v1.Container, enforceMemoryQoS bool) *runtimeapi.LinuxContainerResources {
    98  	// set linux container resources
    99  	var cpuRequest *resource.Quantity
   100  	if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists {
   101  		cpuRequest = container.Resources.Requests.Cpu()
   102  	}
   103  	lcr := m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory())
   104  
   105  	lcr.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container,
   106  		int64(m.machineInfo.MemoryCapacity)))
   107  
   108  	lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
   109  
   110  	// Configure swap for the container
   111  	m.configureContainerSwapResources(lcr, pod, container)
   112  
   113  	// Set memory.min and memory.high to enforce MemoryQoS
   114  	if enforceMemoryQoS {
   115  		unified := map[string]string{}
   116  		memoryRequest := container.Resources.Requests.Memory().Value()
   117  		memoryLimit := container.Resources.Limits.Memory().Value()
   118  		if memoryRequest != 0 {
   119  			unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
   120  		}
   121  
   122  		// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
   123  		// Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high.
   124  		if memoryRequest != memoryLimit {
   125  			// The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27.
   126  			// It will be set based on formula:
   127  			// `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize`
   128  			// where default value of memory throttling factor is set to 0.9
   129  			// More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos
   130  			memoryHigh := int64(0)
   131  			if memoryLimit != 0 {
   132  				memoryHigh = int64(math.Floor(
   133  					float64(memoryRequest)+
   134  						(float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
   135  			} else {
   136  				allocatable := m.getNodeAllocatable()
   137  				allocatableMemory, ok := allocatable[v1.ResourceMemory]
   138  				if ok && allocatableMemory.Value() > 0 {
   139  					memoryHigh = int64(math.Floor(
   140  						float64(memoryRequest)+
   141  							(float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
   142  				}
   143  			}
   144  			if memoryHigh != 0 && memoryHigh > memoryRequest {
   145  				unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
   146  			}
   147  		}
   148  		if len(unified) > 0 {
   149  			if lcr.Unified == nil {
   150  				lcr.Unified = unified
   151  			} else {
   152  				for k, v := range unified {
   153  					lcr.Unified[k] = v
   154  				}
   155  			}
   156  			klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified)
   157  		}
   158  	}
   159  
   160  	return lcr
   161  }
   162  
   163  // configureContainerSwapResources configures the swap resources for a specified (linux) container.
   164  // Swap is only configured if a swap cgroup controller is available and the NodeSwap feature gate is enabled.
   165  func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
   166  	if !swapControllerAvailable() {
   167  		klog.InfoS("No swap cgroup controller present", "swapBehavior", m.memorySwapBehavior, "pod", klog.KObj(pod), "containerName", container.Name)
   168  		return
   169  	}
   170  	swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo)
   171  
   172  	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
   173  		swapConfigurationHelper.ConfigureNoSwap(lcr)
   174  		return
   175  	}
   176  
   177  	// NOTE(ehashman): Behavior is defined in the opencontainers runtime spec:
   178  	// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
   179  	switch m.memorySwapBehavior {
   180  	case kubelettypes.LimitedSwap:
   181  		swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
   182  	default:
   183  		swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
   184  	}
   185  }
   186  
   187  // generateContainerResources generates platform specific (linux) container resources config for runtime
   188  func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources {
   189  	enforceMemoryQoS := false
   190  	// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
   191  	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
   192  		isCgroup2UnifiedMode() {
   193  		enforceMemoryQoS = true
   194  	}
   195  	return &runtimeapi.ContainerResources{
   196  		Linux: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS),
   197  	}
   198  }
   199  
   200  // calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits
   201  func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources {
   202  	resources := runtimeapi.LinuxContainerResources{}
   203  	var cpuShares int64
   204  
   205  	memLimit := memoryLimit.Value()
   206  
   207  	// If request is not specified, but limit is, we want request to default to limit.
   208  	// API server does this for new containers, but we repeat this logic in Kubelet
   209  	// for containers running on existing Kubernetes clusters.
   210  	if cpuRequest == nil && cpuLimit != nil {
   211  		cpuShares = int64(cm.MilliCPUToShares(cpuLimit.MilliValue()))
   212  	} else {
   213  		// if cpuRequest.Amount is nil, then MilliCPUToShares will return the minimal number
   214  		// of CPU shares.
   215  		cpuShares = int64(cm.MilliCPUToShares(cpuRequest.MilliValue()))
   216  	}
   217  	resources.CpuShares = cpuShares
   218  	if memLimit != 0 {
   219  		resources.MemoryLimitInBytes = memLimit
   220  	}
   221  
   222  	if m.cpuCFSQuota {
   223  		// if cpuLimit.Amount is nil, then the appropriate default value is returned
   224  		// to allow full usage of cpu resource.
   225  		cpuPeriod := int64(quotaPeriod)
   226  		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
   227  			// kubeGenericRuntimeManager.cpuCFSQuotaPeriod is provided in time.Duration,
   228  			// but we need to convert it to number of microseconds which is used by kernel.
   229  			cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond)
   230  		}
   231  		cpuQuota := milliCPUToQuota(cpuLimit.MilliValue(), cpuPeriod)
   232  		resources.CpuQuota = cpuQuota
   233  		resources.CpuPeriod = cpuPeriod
   234  	}
   235  
   236  	// runc requires cgroupv2 for unified mode
   237  	if isCgroup2UnifiedMode() {
   238  		resources.Unified = map[string]string{
   239  			// Ask the kernel to kill all processes in the container cgroup in case of OOM.
   240  			// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
   241  			// more info.
   242  			"memory.oom.group": "1",
   243  		}
   244  	}
   245  	return &resources
   246  }
   247  
   248  // GetHugepageLimitsFromResources returns limits of each hugepages from resources.
   249  func GetHugepageLimitsFromResources(resources v1.ResourceRequirements) []*runtimeapi.HugepageLimit {
   250  	var hugepageLimits []*runtimeapi.HugepageLimit
   251  
   252  	// For each page size, limit to 0.
   253  	for _, pageSize := range libcontainercgroups.HugePageSizes() {
   254  		hugepageLimits = append(hugepageLimits, &runtimeapi.HugepageLimit{
   255  			PageSize: pageSize,
   256  			Limit:    uint64(0),
   257  		})
   258  	}
   259  
   260  	requiredHugepageLimits := map[string]uint64{}
   261  	for resourceObj, amountObj := range resources.Limits {
   262  		if !v1helper.IsHugePageResourceName(resourceObj) {
   263  			continue
   264  		}
   265  
   266  		pageSize, err := v1helper.HugePageSizeFromResourceName(resourceObj)
   267  		if err != nil {
   268  			klog.InfoS("Failed to get hugepage size from resource", "object", resourceObj, "err", err)
   269  			continue
   270  		}
   271  
   272  		sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize.Value())
   273  		if err != nil {
   274  			klog.InfoS("Size is invalid", "object", resourceObj, "err", err)
   275  			continue
   276  		}
   277  		requiredHugepageLimits[sizeString] = uint64(amountObj.Value())
   278  	}
   279  
   280  	for _, hugepageLimit := range hugepageLimits {
   281  		if limit, exists := requiredHugepageLimits[hugepageLimit.PageSize]; exists {
   282  			hugepageLimit.Limit = limit
   283  		}
   284  	}
   285  
   286  	return hugepageLimits
   287  }
   288  
   289  func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources {
   290  	var cStatusResources *kubecontainer.ContainerResources
   291  	runtimeStatusResources := statusResources.GetLinux()
   292  	if runtimeStatusResources != nil {
   293  		var cpuLimit, memLimit, cpuRequest *resource.Quantity
   294  		if runtimeStatusResources.CpuPeriod > 0 {
   295  			milliCPU := quotaToMilliCPU(runtimeStatusResources.CpuQuota, runtimeStatusResources.CpuPeriod)
   296  			if milliCPU > 0 {
   297  				cpuLimit = resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
   298  			}
   299  		}
   300  		if runtimeStatusResources.CpuShares > 0 {
   301  			milliCPU := sharesToMilliCPU(runtimeStatusResources.CpuShares)
   302  			if milliCPU > 0 {
   303  				cpuRequest = resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
   304  			}
   305  		}
   306  		if runtimeStatusResources.MemoryLimitInBytes > 0 {
   307  			memLimit = resource.NewQuantity(runtimeStatusResources.MemoryLimitInBytes, resource.BinarySI)
   308  		}
   309  		if cpuLimit != nil || memLimit != nil || cpuRequest != nil {
   310  			cStatusResources = &kubecontainer.ContainerResources{
   311  				CPULimit:    cpuLimit,
   312  				CPURequest:  cpuRequest,
   313  				MemoryLimit: memLimit,
   314  			}
   315  		}
   316  	}
   317  	return cStatusResources
   318  }
   319  
   320  // Note: this function variable is being added here so it would be possible to mock
   321  // the cgroup version for unit tests by assigning a new mocked function into it. Without it,
   322  // the cgroup version would solely depend on the environment running the test.
   323  var isCgroup2UnifiedMode = func() bool {
   324  	return libcontainercgroups.IsCgroup2UnifiedMode()
   325  }
   326  
   327  var (
   328  	swapControllerAvailability     bool
   329  	swapControllerAvailabilityOnce sync.Once
   330  )
   331  
   332  // Note: this function variable is being added here so it would be possible to mock
   333  // the swap controller availability for unit tests by assigning a new function to it. Without it,
   334  // the swap controller availability would solely depend on the environment running the test.
   335  var swapControllerAvailable = func() bool {
   336  	// See https://github.com/containerd/containerd/pull/7838/
   337  	swapControllerAvailabilityOnce.Do(func() {
   338  		const warn = "Failed to detect the availability of the swap controller, assuming not available"
   339  		p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes"
   340  		if isCgroup2UnifiedMode() {
   341  			// memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max
   342  			_, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup")
   343  			if err != nil {
   344  				klog.V(5).ErrorS(fmt.Errorf("failed to parse /proc/self/cgroup: %w", err), warn)
   345  				return
   346  			}
   347  			p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max")
   348  		}
   349  		if _, err := os.Stat(p); err != nil {
   350  			if !errors.Is(err, os.ErrNotExist) {
   351  				klog.V(5).ErrorS(err, warn)
   352  			}
   353  			return
   354  		}
   355  		swapControllerAvailability = true
   356  	})
   357  	return swapControllerAvailability
   358  }
   359  
   360  type swapConfigurationHelper struct {
   361  	machineInfo cadvisorv1.MachineInfo
   362  }
   363  
   364  func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
   365  	return &swapConfigurationHelper{machineInfo: machineInfo}
   366  }
   367  
   368  func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
   369  	podQos := kubeapiqos.GetPodQOS(pod)
   370  	containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
   371  	memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0
   372  
   373  	if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
   374  		m.ConfigureNoSwap(lcr)
   375  		return
   376  	}
   377  
   378  	containerMemoryRequest := container.Resources.Requests.Memory()
   379  	swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
   380  
   381  	if err != nil {
   382  		klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
   383  		m.ConfigureNoSwap(lcr)
   384  		return
   385  	}
   386  
   387  	m.configureSwap(lcr, swapLimit)
   388  }
   389  
   390  func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
   391  	if !isCgroup2UnifiedMode() {
   392  		if swapControllerAvailable() {
   393  			// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
   394  			// Some swapping is still possible.
   395  			// Note that if memory limit is 0, memory swap limit is ignored.
   396  			lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
   397  		}
   398  		return
   399  	}
   400  
   401  	m.configureSwap(lcr, 0)
   402  }
   403  
   404  func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
   405  	if !isCgroup2UnifiedMode() {
   406  		m.ConfigureNoSwap(lcr)
   407  		return
   408  	}
   409  
   410  	if lcr.Unified == nil {
   411  		lcr.Unified = map[string]string{}
   412  	}
   413  
   414  	lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
   415  }
   416  
   417  func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
   418  	if !isCgroup2UnifiedMode() {
   419  		klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
   420  		return
   421  	}
   422  
   423  	if lcr.Unified == nil {
   424  		lcr.Unified = map[string]string{}
   425  	}
   426  
   427  	lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
   428  }
   429  
   430  // The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
   431  // For more info, please look at the following KEP: https://kep.k8s.io/2400
   432  func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
   433  	if nodeTotalMemory <= 0 {
   434  		return 0, fmt.Errorf("total node memory is 0")
   435  	}
   436  	if containerMemoryRequest > nodeTotalMemory {
   437  		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
   438  	}
   439  
   440  	containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
   441  	swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)
   442  
   443  	return int64(swapAllocation), nil
   444  }