k8s.io/kubernetes@v1.29.3/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go (about) 1 //go:build linux 2 // +build linux 3 4 /* 5 Copyright 2018 The Kubernetes Authors. 6 7 Licensed under the Apache License, Version 2.0 (the "License"); 8 you may not use this file except in compliance with the License. 9 You may obtain a copy of the License at 10 11 http://www.apache.org/licenses/LICENSE-2.0 12 13 Unless required by applicable law or agreed to in writing, software 14 distributed under the License is distributed on an "AS IS" BASIS, 15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 See the License for the specific language governing permissions and 17 limitations under the License. 18 */ 19 20 package kuberuntime 21 22 import ( 23 "errors" 24 "fmt" 25 "math" 26 "os" 27 "path/filepath" 28 "strconv" 29 "sync" 30 "time" 31 32 "github.com/containerd/cgroups" 33 cadvisorv1 "github.com/google/cadvisor/info/v1" 34 libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" 35 36 v1 "k8s.io/api/core/v1" 37 "k8s.io/apimachinery/pkg/api/resource" 38 utilfeature "k8s.io/apiserver/pkg/util/feature" 39 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" 40 "k8s.io/klog/v2" 41 v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" 42 kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" 43 kubefeatures "k8s.io/kubernetes/pkg/features" 44 "k8s.io/kubernetes/pkg/kubelet/cm" 45 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 46 "k8s.io/kubernetes/pkg/kubelet/qos" 47 kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" 48 ) 49 50 var defaultPageSize = int64(os.Getpagesize()) 51 52 // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig. 
53 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { 54 enforceMemoryQoS := false 55 // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 56 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && 57 isCgroup2UnifiedMode() { 58 enforceMemoryQoS = true 59 } 60 cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS) 61 if err != nil { 62 return err 63 } 64 config.Linux = cl 65 66 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.UserNamespacesSupport) { 67 if cl.SecurityContext.NamespaceOptions.UsernsOptions != nil { 68 for _, mount := range config.Mounts { 69 mount.UidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Uids 70 mount.GidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Gids 71 } 72 } 73 } 74 return nil 75 } 76 77 // generateLinuxContainerConfig generates linux container config for kubelet runtime v1. 
78 func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) { 79 sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username) 80 if err != nil { 81 return nil, err 82 } 83 lc := &runtimeapi.LinuxContainerConfig{ 84 Resources: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS), 85 SecurityContext: sc, 86 } 87 88 if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER { 89 lc.SecurityContext.NamespaceOptions.Pid = runtimeapi.NamespaceMode_TARGET 90 lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID 91 } 92 93 return lc, nil 94 } 95 96 // generateLinuxContainerResources generates linux container resources config for runtime 97 func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, container *v1.Container, enforceMemoryQoS bool) *runtimeapi.LinuxContainerResources { 98 // set linux container resources 99 var cpuRequest *resource.Quantity 100 if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists { 101 cpuRequest = container.Resources.Requests.Cpu() 102 } 103 lcr := m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory()) 104 105 lcr.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container, 106 int64(m.machineInfo.MemoryCapacity))) 107 108 lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) 109 110 // Configure swap for the container 111 m.configureContainerSwapResources(lcr, pod, container) 112 113 // Set memory.min and memory.high to enforce MemoryQoS 114 if enforceMemoryQoS { 115 unified := map[string]string{} 116 memoryRequest := container.Resources.Requests.Memory().Value() 117 memoryLimit := container.Resources.Limits.Memory().Value() 118 if memoryRequest != 0 { 119 
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10) 120 } 121 122 // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. 123 // Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high. 124 if memoryRequest != memoryLimit { 125 // The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27. 126 // It will be set based on formula: 127 // `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize` 128 // where default value of memory throttling factor is set to 0.9 129 // More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos 130 memoryHigh := int64(0) 131 if memoryLimit != 0 { 132 memoryHigh = int64(math.Floor( 133 float64(memoryRequest)+ 134 (float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize 135 } else { 136 allocatable := m.getNodeAllocatable() 137 allocatableMemory, ok := allocatable[v1.ResourceMemory] 138 if ok && allocatableMemory.Value() > 0 { 139 memoryHigh = int64(math.Floor( 140 float64(memoryRequest)+ 141 (float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize 142 } 143 } 144 if memoryHigh != 0 && memoryHigh > memoryRequest { 145 unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10) 146 } 147 } 148 if len(unified) > 0 { 149 if lcr.Unified == nil { 150 lcr.Unified = unified 151 } else { 152 for k, v := range unified { 153 lcr.Unified[k] = v 154 } 155 } 156 klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified) 157 } 158 } 159 160 return lcr 161 } 162 163 // configureContainerSwapResources configures the swap 
resources for a specified (linux) container. 164 // Swap is only configured if a swap cgroup controller is available and the NodeSwap feature gate is enabled. 165 func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { 166 if !swapControllerAvailable() { 167 klog.InfoS("No swap cgroup controller present", "swapBehavior", m.memorySwapBehavior, "pod", klog.KObj(pod), "containerName", container.Name) 168 return 169 } 170 swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo) 171 172 if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { 173 swapConfigurationHelper.ConfigureNoSwap(lcr) 174 return 175 } 176 177 // NOTE(ehashman): Behavior is defined in the opencontainers runtime spec: 178 // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory 179 switch m.memorySwapBehavior { 180 case kubelettypes.LimitedSwap: 181 swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container) 182 default: 183 swapConfigurationHelper.ConfigureUnlimitedSwap(lcr) 184 } 185 } 186 187 // generateContainerResources generates platform specific (linux) container resources config for runtime 188 func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources { 189 enforceMemoryQoS := false 190 // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 191 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && 192 isCgroup2UnifiedMode() { 193 enforceMemoryQoS = true 194 } 195 return &runtimeapi.ContainerResources{ 196 Linux: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS), 197 } 198 } 199 200 // calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits 201 func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, 
cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources { 202 resources := runtimeapi.LinuxContainerResources{} 203 var cpuShares int64 204 205 memLimit := memoryLimit.Value() 206 207 // If request is not specified, but limit is, we want request to default to limit. 208 // API server does this for new containers, but we repeat this logic in Kubelet 209 // for containers running on existing Kubernetes clusters. 210 if cpuRequest == nil && cpuLimit != nil { 211 cpuShares = int64(cm.MilliCPUToShares(cpuLimit.MilliValue())) 212 } else { 213 // if cpuRequest.Amount is nil, then MilliCPUToShares will return the minimal number 214 // of CPU shares. 215 cpuShares = int64(cm.MilliCPUToShares(cpuRequest.MilliValue())) 216 } 217 resources.CpuShares = cpuShares 218 if memLimit != 0 { 219 resources.MemoryLimitInBytes = memLimit 220 } 221 222 if m.cpuCFSQuota { 223 // if cpuLimit.Amount is nil, then the appropriate default value is returned 224 // to allow full usage of cpu resource. 225 cpuPeriod := int64(quotaPeriod) 226 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) { 227 // kubeGenericRuntimeManager.cpuCFSQuotaPeriod is provided in time.Duration, 228 // but we need to convert it to number of microseconds which is used by kernel. 229 cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond) 230 } 231 cpuQuota := milliCPUToQuota(cpuLimit.MilliValue(), cpuPeriod) 232 resources.CpuQuota = cpuQuota 233 resources.CpuPeriod = cpuPeriod 234 } 235 236 // runc requires cgroupv2 for unified mode 237 if isCgroup2UnifiedMode() { 238 resources.Unified = map[string]string{ 239 // Ask the kernel to kill all processes in the container cgroup in case of OOM. 240 // See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for 241 // more info. 242 "memory.oom.group": "1", 243 } 244 } 245 return &resources 246 } 247 248 // GetHugepageLimitsFromResources returns limits of each hugepages from resources. 
249 func GetHugepageLimitsFromResources(resources v1.ResourceRequirements) []*runtimeapi.HugepageLimit { 250 var hugepageLimits []*runtimeapi.HugepageLimit 251 252 // For each page size, limit to 0. 253 for _, pageSize := range libcontainercgroups.HugePageSizes() { 254 hugepageLimits = append(hugepageLimits, &runtimeapi.HugepageLimit{ 255 PageSize: pageSize, 256 Limit: uint64(0), 257 }) 258 } 259 260 requiredHugepageLimits := map[string]uint64{} 261 for resourceObj, amountObj := range resources.Limits { 262 if !v1helper.IsHugePageResourceName(resourceObj) { 263 continue 264 } 265 266 pageSize, err := v1helper.HugePageSizeFromResourceName(resourceObj) 267 if err != nil { 268 klog.InfoS("Failed to get hugepage size from resource", "object", resourceObj, "err", err) 269 continue 270 } 271 272 sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize.Value()) 273 if err != nil { 274 klog.InfoS("Size is invalid", "object", resourceObj, "err", err) 275 continue 276 } 277 requiredHugepageLimits[sizeString] = uint64(amountObj.Value()) 278 } 279 280 for _, hugepageLimit := range hugepageLimits { 281 if limit, exists := requiredHugepageLimits[hugepageLimit.PageSize]; exists { 282 hugepageLimit.Limit = limit 283 } 284 } 285 286 return hugepageLimits 287 } 288 289 func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources { 290 var cStatusResources *kubecontainer.ContainerResources 291 runtimeStatusResources := statusResources.GetLinux() 292 if runtimeStatusResources != nil { 293 var cpuLimit, memLimit, cpuRequest *resource.Quantity 294 if runtimeStatusResources.CpuPeriod > 0 { 295 milliCPU := quotaToMilliCPU(runtimeStatusResources.CpuQuota, runtimeStatusResources.CpuPeriod) 296 if milliCPU > 0 { 297 cpuLimit = resource.NewMilliQuantity(milliCPU, resource.DecimalSI) 298 } 299 } 300 if runtimeStatusResources.CpuShares > 0 { 301 milliCPU := sharesToMilliCPU(runtimeStatusResources.CpuShares) 302 if milliCPU > 0 { 303 
cpuRequest = resource.NewMilliQuantity(milliCPU, resource.DecimalSI) 304 } 305 } 306 if runtimeStatusResources.MemoryLimitInBytes > 0 { 307 memLimit = resource.NewQuantity(runtimeStatusResources.MemoryLimitInBytes, resource.BinarySI) 308 } 309 if cpuLimit != nil || memLimit != nil || cpuRequest != nil { 310 cStatusResources = &kubecontainer.ContainerResources{ 311 CPULimit: cpuLimit, 312 CPURequest: cpuRequest, 313 MemoryLimit: memLimit, 314 } 315 } 316 } 317 return cStatusResources 318 } 319 320 // Note: this function variable is being added here so it would be possible to mock 321 // the cgroup version for unit tests by assigning a new mocked function into it. Without it, 322 // the cgroup version would solely depend on the environment running the test. 323 var isCgroup2UnifiedMode = func() bool { 324 return libcontainercgroups.IsCgroup2UnifiedMode() 325 } 326 327 var ( 328 swapControllerAvailability bool 329 swapControllerAvailabilityOnce sync.Once 330 ) 331 332 // Note: this function variable is being added here so it would be possible to mock 333 // the swap controller availability for unit tests by assigning a new function to it. Without it, 334 // the swap controller availability would solely depend on the environment running the test. 
335 var swapControllerAvailable = func() bool { 336 // See https://github.com/containerd/containerd/pull/7838/ 337 swapControllerAvailabilityOnce.Do(func() { 338 const warn = "Failed to detect the availability of the swap controller, assuming not available" 339 p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes" 340 if isCgroup2UnifiedMode() { 341 // memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max 342 _, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup") 343 if err != nil { 344 klog.V(5).ErrorS(fmt.Errorf("failed to parse /proc/self/cgroup: %w", err), warn) 345 return 346 } 347 p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max") 348 } 349 if _, err := os.Stat(p); err != nil { 350 if !errors.Is(err, os.ErrNotExist) { 351 klog.V(5).ErrorS(err, warn) 352 } 353 return 354 } 355 swapControllerAvailability = true 356 }) 357 return swapControllerAvailability 358 } 359 360 type swapConfigurationHelper struct { 361 machineInfo cadvisorv1.MachineInfo 362 } 363 364 func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper { 365 return &swapConfigurationHelper{machineInfo: machineInfo} 366 } 367 368 func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { 369 podQos := kubeapiqos.GetPodQOS(pod) 370 containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() 371 memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 372 373 if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit { 374 m.ConfigureNoSwap(lcr) 375 return 376 } 377 378 containerMemoryRequest := container.Resources.Requests.Memory() 379 swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), 
int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) 380 381 if err != nil { 382 klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap") 383 m.ConfigureNoSwap(lcr) 384 return 385 } 386 387 m.configureSwap(lcr, swapLimit) 388 } 389 390 func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) { 391 if !isCgroup2UnifiedMode() { 392 if swapControllerAvailable() { 393 // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit 394 // Some swapping is still possible. 395 // Note that if memory limit is 0, memory swap limit is ignored. 396 lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes 397 } 398 return 399 } 400 401 m.configureSwap(lcr, 0) 402 } 403 404 func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) { 405 if !isCgroup2UnifiedMode() { 406 m.ConfigureNoSwap(lcr) 407 return 408 } 409 410 if lcr.Unified == nil { 411 lcr.Unified = map[string]string{} 412 } 413 414 lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max" 415 } 416 417 func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) { 418 if !isCgroup2UnifiedMode() { 419 klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected") 420 return 421 } 422 423 if lcr.Unified == nil { 424 lcr.Unified = map[string]string{} 425 } 426 427 lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory) 428 } 429 430 // The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>. 
// For more info, please look at the following KEP: https://kep.k8s.io/2400
//
// An error is returned when nodeTotalMemory is not positive or when
// containerMemoryRequest exceeds nodeTotalMemory. The fractional part of the
// computed allocation is dropped by the conversion to int64.
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	switch {
	case nodeTotalMemory <= 0:
		return 0, fmt.Errorf("total node memory is 0")
	case containerMemoryRequest > nodeTotalMemory:
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	// Share of the node's memory requested by the container, applied to the swap pool.
	requestShare := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(requestShare * float64(totalPodsSwapAvailable)), nil
}