// k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go

//go:build linux
// +build linux

/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kuberuntime

import (
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"sync"
	"time"

	"github.com/containerd/cgroups"
	cadvisorv1 "github.com/google/cadvisor/info/v1"
	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/klog/v2"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/qos"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

// defaultPageSize is the memory page size reported by os.Getpagesize, captured once at
// package initialization. It is used in this file to round memory.high down to a page boundary.
var defaultPageSize = int64(os.Getpagesize())

// applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
53 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { 54 enforceMemoryQoS := false 55 // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 56 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && 57 isCgroup2UnifiedMode() { 58 enforceMemoryQoS = true 59 } 60 cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS) 61 if err != nil { 62 return err 63 } 64 config.Linux = cl 65 66 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.UserNamespacesSupport) { 67 if cl.SecurityContext.NamespaceOptions.UsernsOptions != nil { 68 for _, mount := range config.Mounts { 69 mount.UidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Uids 70 mount.GidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Gids 71 } 72 } 73 } 74 return nil 75 } 76 77 // generateLinuxContainerConfig generates linux container config for kubelet runtime v1. 
78 func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) { 79 sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username) 80 if err != nil { 81 return nil, err 82 } 83 lc := &runtimeapi.LinuxContainerConfig{ 84 Resources: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS), 85 SecurityContext: sc, 86 } 87 88 if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER { 89 lc.SecurityContext.NamespaceOptions.Pid = runtimeapi.NamespaceMode_TARGET 90 lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID 91 } 92 93 return lc, nil 94 } 95 96 // generateLinuxContainerResources generates linux container resources config for runtime 97 func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, container *v1.Container, enforceMemoryQoS bool) *runtimeapi.LinuxContainerResources { 98 // set linux container resources 99 var cpuRequest *resource.Quantity 100 if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists { 101 cpuRequest = container.Resources.Requests.Cpu() 102 } 103 lcr := m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory()) 104 105 lcr.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container, 106 int64(m.machineInfo.MemoryCapacity))) 107 108 lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) 109 110 // Configure swap for the container 111 m.configureContainerSwapResources(lcr, pod, container) 112 113 // Set memory.min and memory.high to enforce MemoryQoS 114 if enforceMemoryQoS { 115 unified := map[string]string{} 116 memoryRequest := container.Resources.Requests.Memory().Value() 117 memoryLimit := container.Resources.Limits.Memory().Value() 118 if memoryRequest != 0 { 119 
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10) 120 } 121 122 // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. 123 // Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high. 124 if memoryRequest != memoryLimit { 125 // The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27. 126 // It will be set based on formula: 127 // `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize` 128 // where default value of memory throttling factor is set to 0.9 129 // More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos 130 memoryHigh := int64(0) 131 if memoryLimit != 0 { 132 memoryHigh = int64(math.Floor( 133 float64(memoryRequest)+ 134 (float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize 135 } else { 136 allocatable := m.getNodeAllocatable() 137 allocatableMemory, ok := allocatable[v1.ResourceMemory] 138 if ok && allocatableMemory.Value() > 0 { 139 memoryHigh = int64(math.Floor( 140 float64(memoryRequest)+ 141 (float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize 142 } 143 } 144 if memoryHigh != 0 && memoryHigh > memoryRequest { 145 unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10) 146 } 147 } 148 if len(unified) > 0 { 149 if lcr.Unified == nil { 150 lcr.Unified = unified 151 } else { 152 for k, v := range unified { 153 lcr.Unified[k] = v 154 } 155 } 156 klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified) 157 } 158 } 159 160 return lcr 161 } 162 163 // configureContainerSwapResources configures the swap 
resources for a specified (linux) container. 164 // Swap is only configured if a swap cgroup controller is available and the NodeSwap feature gate is enabled. 165 func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { 166 if !swapControllerAvailable() { 167 return 168 } 169 170 swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo) 171 if m.memorySwapBehavior == kubelettypes.LimitedSwap { 172 if !isCgroup2UnifiedMode() { 173 swapConfigurationHelper.ConfigureNoSwap(lcr) 174 return 175 } 176 } 177 178 if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { 179 swapConfigurationHelper.ConfigureNoSwap(lcr) 180 return 181 } 182 183 // NOTE(ehashman): Behavior is defined in the opencontainers runtime spec: 184 // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory 185 switch m.memorySwapBehavior { 186 case kubelettypes.NoSwap: 187 swapConfigurationHelper.ConfigureNoSwap(lcr) 188 case kubelettypes.LimitedSwap: 189 swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container) 190 default: 191 swapConfigurationHelper.ConfigureNoSwap(lcr) 192 } 193 } 194 195 // generateContainerResources generates platform specific (linux) container resources config for runtime 196 func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources { 197 enforceMemoryQoS := false 198 // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 199 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && 200 isCgroup2UnifiedMode() { 201 enforceMemoryQoS = true 202 } 203 return &runtimeapi.ContainerResources{ 204 Linux: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS), 205 } 206 } 207 208 // calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource 
requests, limits 209 func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources { 210 resources := runtimeapi.LinuxContainerResources{} 211 var cpuShares int64 212 213 memLimit := memoryLimit.Value() 214 215 // If request is not specified, but limit is, we want request to default to limit. 216 // API server does this for new containers, but we repeat this logic in Kubelet 217 // for containers running on existing Kubernetes clusters. 218 if cpuRequest == nil && cpuLimit != nil { 219 cpuShares = int64(cm.MilliCPUToShares(cpuLimit.MilliValue())) 220 } else { 221 // if cpuRequest.Amount is nil, then MilliCPUToShares will return the minimal number 222 // of CPU shares. 223 cpuShares = int64(cm.MilliCPUToShares(cpuRequest.MilliValue())) 224 } 225 resources.CpuShares = cpuShares 226 if memLimit != 0 { 227 resources.MemoryLimitInBytes = memLimit 228 } 229 230 if m.cpuCFSQuota { 231 // if cpuLimit.Amount is nil, then the appropriate default value is returned 232 // to allow full usage of cpu resource. 233 cpuPeriod := int64(quotaPeriod) 234 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) { 235 // kubeGenericRuntimeManager.cpuCFSQuotaPeriod is provided in time.Duration, 236 // but we need to convert it to number of microseconds which is used by kernel. 237 cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond) 238 } 239 cpuQuota := milliCPUToQuota(cpuLimit.MilliValue(), cpuPeriod) 240 resources.CpuQuota = cpuQuota 241 resources.CpuPeriod = cpuPeriod 242 } 243 244 // runc requires cgroupv2 for unified mode 245 if isCgroup2UnifiedMode() { 246 resources.Unified = map[string]string{ 247 // Ask the kernel to kill all processes in the container cgroup in case of OOM. 248 // See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for 249 // more info. 
250 "memory.oom.group": "1", 251 } 252 } 253 return &resources 254 } 255 256 // GetHugepageLimitsFromResources returns limits of each hugepages from resources. 257 func GetHugepageLimitsFromResources(resources v1.ResourceRequirements) []*runtimeapi.HugepageLimit { 258 var hugepageLimits []*runtimeapi.HugepageLimit 259 260 // For each page size, limit to 0. 261 for _, pageSize := range libcontainercgroups.HugePageSizes() { 262 hugepageLimits = append(hugepageLimits, &runtimeapi.HugepageLimit{ 263 PageSize: pageSize, 264 Limit: uint64(0), 265 }) 266 } 267 268 requiredHugepageLimits := map[string]uint64{} 269 for resourceObj, amountObj := range resources.Limits { 270 if !v1helper.IsHugePageResourceName(resourceObj) { 271 continue 272 } 273 274 pageSize, err := v1helper.HugePageSizeFromResourceName(resourceObj) 275 if err != nil { 276 klog.InfoS("Failed to get hugepage size from resource", "object", resourceObj, "err", err) 277 continue 278 } 279 280 sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize.Value()) 281 if err != nil { 282 klog.InfoS("Size is invalid", "object", resourceObj, "err", err) 283 continue 284 } 285 requiredHugepageLimits[sizeString] = uint64(amountObj.Value()) 286 } 287 288 for _, hugepageLimit := range hugepageLimits { 289 if limit, exists := requiredHugepageLimits[hugepageLimit.PageSize]; exists { 290 hugepageLimit.Limit = limit 291 } 292 } 293 294 return hugepageLimits 295 } 296 297 func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources { 298 var cStatusResources *kubecontainer.ContainerResources 299 runtimeStatusResources := statusResources.GetLinux() 300 if runtimeStatusResources != nil { 301 var cpuLimit, memLimit, cpuRequest *resource.Quantity 302 if runtimeStatusResources.CpuPeriod > 0 { 303 milliCPU := quotaToMilliCPU(runtimeStatusResources.CpuQuota, runtimeStatusResources.CpuPeriod) 304 if milliCPU > 0 { 305 cpuLimit = resource.NewMilliQuantity(milliCPU, 
resource.DecimalSI) 306 } 307 } 308 if runtimeStatusResources.CpuShares > 0 { 309 milliCPU := sharesToMilliCPU(runtimeStatusResources.CpuShares) 310 if milliCPU > 0 { 311 cpuRequest = resource.NewMilliQuantity(milliCPU, resource.DecimalSI) 312 } 313 } 314 if runtimeStatusResources.MemoryLimitInBytes > 0 { 315 memLimit = resource.NewQuantity(runtimeStatusResources.MemoryLimitInBytes, resource.BinarySI) 316 } 317 if cpuLimit != nil || memLimit != nil || cpuRequest != nil { 318 cStatusResources = &kubecontainer.ContainerResources{ 319 CPULimit: cpuLimit, 320 CPURequest: cpuRequest, 321 MemoryLimit: memLimit, 322 } 323 } 324 } 325 return cStatusResources 326 } 327 328 // Note: this function variable is being added here so it would be possible to mock 329 // the cgroup version for unit tests by assigning a new mocked function into it. Without it, 330 // the cgroup version would solely depend on the environment running the test. 331 var isCgroup2UnifiedMode = func() bool { 332 return libcontainercgroups.IsCgroup2UnifiedMode() 333 } 334 335 var ( 336 swapControllerAvailability bool 337 swapControllerAvailabilityOnce sync.Once 338 ) 339 340 // Note: this function variable is being added here so it would be possible to mock 341 // the swap controller availability for unit tests by assigning a new function to it. Without it, 342 // the swap controller availability would solely depend on the environment running the test. 
343 var swapControllerAvailable = func() bool { 344 // See https://github.com/containerd/containerd/pull/7838/ 345 swapControllerAvailabilityOnce.Do(func() { 346 const warn = "Failed to detect the availability of the swap controller, assuming not available" 347 p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes" 348 if isCgroup2UnifiedMode() { 349 // memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max 350 _, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup") 351 if err != nil { 352 klog.V(5).ErrorS(fmt.Errorf("failed to parse /proc/self/cgroup: %w", err), warn) 353 return 354 } 355 p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max") 356 } 357 if _, err := os.Stat(p); err != nil { 358 if !errors.Is(err, os.ErrNotExist) { 359 klog.V(5).ErrorS(err, warn) 360 } 361 return 362 } 363 swapControllerAvailability = true 364 }) 365 return swapControllerAvailability 366 } 367 368 type swapConfigurationHelper struct { 369 machineInfo cadvisorv1.MachineInfo 370 } 371 372 func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper { 373 return &swapConfigurationHelper{machineInfo: machineInfo} 374 } 375 376 func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { 377 podQos := kubeapiqos.GetPodQOS(pod) 378 containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() 379 memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 380 381 if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit { 382 m.ConfigureNoSwap(lcr) 383 return 384 } 385 386 containerMemoryRequest := container.Resources.Requests.Memory() 387 swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), 
int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) 388 389 if err != nil { 390 klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap") 391 m.ConfigureNoSwap(lcr) 392 return 393 } 394 395 m.configureSwap(lcr, swapLimit) 396 } 397 398 func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) { 399 if !isCgroup2UnifiedMode() { 400 if swapControllerAvailable() { 401 // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit 402 // Some swapping is still possible. 403 // Note that if memory limit is 0, memory swap limit is ignored. 404 lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes 405 } 406 return 407 } 408 409 m.configureSwap(lcr, 0) 410 } 411 412 func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) { 413 if !isCgroup2UnifiedMode() { 414 klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected") 415 return 416 } 417 418 if lcr.Unified == nil { 419 lcr.Unified = map[string]string{} 420 } 421 422 lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory) 423 } 424 425 // The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>. 
// For more info, please look at the following KEP: https://kep.k8s.io/2400
//
// An error is returned when nodeTotalMemory is not positive or when containerMemoryRequest
// exceeds nodeTotalMemory. The result is truncated towards zero.
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	switch {
	case nodeTotalMemory <= 0:
		return 0, fmt.Errorf("total node memory is 0")
	case containerMemoryRequest > nodeTotalMemory:
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	// Share of the node memory requested by the container, applied to the available swap.
	memoryShare := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(memoryShare * float64(totalPodsSwapAvailable)), nil
}