k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/helpers_linux.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strconv"

	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	utilfeature "k8s.io/apiserver/pkg/util/feature"

	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/api/v1/resource"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cm/util"
)

const (
	// These limits are defined in the kernel:
	// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
	MinShares = 2
	MaxShares = 262144

	SharesPerCPU  = 1024
	MilliCPUToCPU = 1000

	// 100000 microseconds is equivalent to 100ms
	QuotaPeriod = 100000
	// 1000 microseconds is equivalent to 1ms
	// defined here:
	// https://github.com/torvalds/linux/blob/cac03ac368fabff0122853de2422d4e17a32de08/kernel/sched/core.c#L10546
	MinQuotaPeriod = 1000
)

// MilliCPUToQuota converts milliCPU to CFS quota and period values.
// Both the input period and the resulting quota are numbers of microseconds.
func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) {
	// CFS quota is measured in two values:
	//  - cfs_period_us=100ms (the window over which usage is measured, given by period)
	//  - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
	// so in the above example, you are limited to 20% of a single CPU
	// for multi-cpu environments, you just scale equivalent amounts
	// see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details

	if milliCPU == 0 {
		return
	}

	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
		period = QuotaPeriod
	}

	// we then convert the milliCPU to a value normalized over a period
	quota = (milliCPU * period) / MilliCPUToCPU

	// quota needs to be a minimum of 1ms.
	if quota < MinQuotaPeriod {
		quota = MinQuotaPeriod
	}
	return
}

// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) uint64 {
	if milliCPU == 0 {
		// Docker converts zero milliCPU to unset, which maps to the kernel
		// default for unset: 1024. Return 2 here to really match the kernel
		// default for zero milliCPU.
		return MinShares
	}
	// Conceptually (milliCPU / MilliCPUToCPU) * SharesPerCPU, but factored to improve rounding.
	shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
	if shares < MinShares {
		return MinShares
	}
	if shares > MaxShares {
		return MaxShares
	}
	return uint64(shares)
}
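// As a worked sketch of the two conversions above (hypothetical values;
// this assumes the CPUCFSQuotaPeriod feature gate is disabled, so the
// default 100ms period applies):
//
//	MilliCPUToShares(500)             // (500 * 1024) / 1000 = 512 shares
//	MilliCPUToShares(1)               // 1 rounds below MinShares, clamped to 2
//	MilliCPUToQuota(250, QuotaPeriod) // (250 * 100000) / 1000 = 25000us per 100000us period
//	MilliCPUToQuota(5, QuotaPeriod)   // 500us is clamped up to MinQuotaPeriod (1000us)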
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
	hugePageLimits := map[int64]int64{}
	for k, v := range resourceList {
		if v1helper.IsHugePageResourceName(k) {
			pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
			if value, exists := hugePageLimits[pageSize.Value()]; exists {
				hugePageLimits[pageSize.Value()] = value + v.Value()
			} else {
				hugePageLimits[pageSize.Value()] = v.Value()
			}
		}
	}
	return hugePageLimits
}

// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
	inPlacePodVerticalScalingEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.InPlacePodVerticalScaling)
	// sum requests and limits.
	reqs := resource.PodRequests(pod, resource.PodResourcesOptions{
		InPlacePodVerticalScalingEnabled: inPlacePodVerticalScalingEnabled,
	})
	// track if limits were applied for each resource.
	memoryLimitsDeclared := true
	cpuLimitsDeclared := true

	limits := resource.PodLimits(pod, resource.PodResourcesOptions{
		InPlacePodVerticalScalingEnabled: inPlacePodVerticalScalingEnabled,
		ContainerFn: func(res v1.ResourceList, containerType podutil.ContainerType) {
			if res.Cpu().IsZero() {
				cpuLimitsDeclared = false
			}
			if res.Memory().IsZero() {
				memoryLimitsDeclared = false
			}
		},
	})
	// map hugepage pagesize (bytes) to limits (bytes)
	hugePageLimits := HugePageLimits(reqs)

	cpuRequests := int64(0)
	cpuLimits := int64(0)
	memoryLimits := int64(0)
	if request, found := reqs[v1.ResourceCPU]; found {
		cpuRequests = request.MilliValue()
	}
	if limit, found := limits[v1.ResourceCPU]; found {
		cpuLimits = limit.MilliValue()
	}
	if limit, found := limits[v1.ResourceMemory]; found {
		memoryLimits = limit.Value()
	}

	// convert to CFS values
	cpuShares := MilliCPUToShares(cpuRequests)
	cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))

	// quota is not capped when cfs quota is disabled
	if !enforceCPULimits {
		cpuQuota = int64(-1)
	}

	// determine the qos class
	qosClass := v1qos.GetPodQOS(pod)

	// build the result
	result := &ResourceConfig{}
	if qosClass == v1.PodQOSGuaranteed {
		result.CPUShares = &cpuShares
		result.CPUQuota = &cpuQuota
		result.CPUPeriod = &cpuPeriod
		result.Memory = &memoryLimits
	} else if qosClass == v1.PodQOSBurstable {
		result.CPUShares = &cpuShares
		if cpuLimitsDeclared {
			result.CPUQuota = &cpuQuota
			result.CPUPeriod = &cpuPeriod
		}
		if memoryLimitsDeclared {
			result.Memory = &memoryLimits
		}
	} else {
		shares := uint64(MinShares)
		result.CPUShares = &shares
	}
	result.HugePageLimit = hugePageLimits

	if enforceMemoryQoS {
		memoryMin := int64(0)
		if request, found := reqs[v1.ResourceMemory]; found {
			memoryMin = request.Value()
		}
		if memoryMin > 0 {
			result.Unified = map[string]string{
				Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
			}
		}
	}

	return result
}
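// As an illustration of the QoS branches above, a hypothetical Burstable pod
// (one container requesting 200m CPU with limits of 400m CPU and 256Mi
// memory), with enforceCPULimits=true and a 100000us period, would come out
// roughly as follows (a sketch, not the output of a real run):
//
//	cpuShares := MilliCPUToShares(200)       // 204
//	cpuQuota := MilliCPUToQuota(400, 100000) // 40000
//	// Both limits are declared, so CPUQuota, CPUPeriod, and Memory
//	// (268435456 bytes) are all set alongside CPUShares.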
// getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems
func getCgroupSubsystemsV1() (*CgroupSubsystems, error) {
	// get all cgroup mounts.
	allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
	if err != nil {
		return &CgroupSubsystems{}, err
	}
	if len(allCgroups) == 0 {
		return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
	}
	mountPoints := make(map[string]string, len(allCgroups))
	for _, mount := range allCgroups {
		// The kubelet previously used an arbitrary mount point per cgroup
		// subsystem; it now deterministically picks the mount point with the
		// shortest path, so the choice is stable across restarts.
		// ref. issue: https://github.com/kubernetes/kubernetes/issues/95488

		for _, subsystem := range mount.Subsystems {
			previous := mountPoints[subsystem]
			if previous == "" || len(mount.Mountpoint) < len(previous) {
				mountPoints[subsystem] = mount.Mountpoint
			}
		}
	}
	return &CgroupSubsystems{
		Mounts:      allCgroups,
		MountPoints: mountPoints,
	}, nil
}

// getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems
func getCgroupSubsystemsV2() (*CgroupSubsystems, error) {
	controllers, err := libcontainercgroups.GetAllSubsystems()
	if err != nil {
		return nil, err
	}

	mounts := []libcontainercgroups.Mount{}
	mountPoints := make(map[string]string, len(controllers))
	for _, controller := range controllers {
		mountPoints[controller] = util.CgroupRoot
		m := libcontainercgroups.Mount{
			Mountpoint: util.CgroupRoot,
			Root:       util.CgroupRoot,
			Subsystems: []string{controller},
		}
		mounts = append(mounts, m)
	}

	return &CgroupSubsystems{
		Mounts:      mounts,
		MountPoints: mountPoints,
	}, nil
}

// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
	if libcontainercgroups.IsCgroup2UnifiedMode() {
		return getCgroupSubsystemsV2()
	}

	return getCgroupSubsystemsV1()
}
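// A minimal usage sketch for the lookup above (the mount path shown is
// hypothetical and depends on the host):
//
//	subsystems, err := GetCgroupSubsystems()
//	if err != nil {
//		return err
//	}
//	cpuMount := subsystems.MountPoints["cpu"] // e.g. "/sys/fs/cgroup/cpu" on cgroup v1
//
// On a cgroup v2 (unified) host, every controller maps to the same root,
// util.CgroupRoot, rather than a per-subsystem mount point.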
// getCgroupProcs takes a cgroup directory name as an argument,
// reads through the cgroup's procs file, and returns a list of tgids.
// It returns an empty list if the procs file does not exist.
func getCgroupProcs(dir string) ([]int, error) {
	procsFile := filepath.Join(dir, "cgroup.procs")
	f, err := os.Open(procsFile)
	if err != nil {
		if os.IsNotExist(err) {
			// The procsFile does not exist, so no pids are attached to this directory.
			return []int{}, nil
		}
		return nil, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	out := []int{}
	for s.Scan() {
		if t := s.Text(); t != "" {
			pid, err := strconv.Atoi(t)
			if err != nil {
				return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
			}
			out = append(out, pid)
		}
	}
	return out, nil
}

// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
	return podCgroupNamePrefix + string(podUID)
}

// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
	nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot)
	if cgroupsPerQOS {
		nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName)
	}
	if cgroupDriver == "systemd" {
		return nodeAllocatableRoot.ToSystemd()
	}
	return nodeAllocatableRoot.ToCgroupfs()
}

// GetKubeletContainer returns the cgroup the kubelet will use
func GetKubeletContainer(kubeletCgroups string) (string, error) {
	if kubeletCgroups == "" {
		cont, err := getContainer(os.Getpid())
		if err != nil {
			return "", err
		}
		return cont, nil
	}
	return kubeletCgroups, nil
}
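// A closing sketch of the path helpers above (hypothetical inputs and
// outputs; the exact strings depend on the configured cgroup root, the
// cgroup driver, and the node allocatable cgroup name):
//
//	pids, _ := getCgroupProcs("/sys/fs/cgroup/cpu/kubepods") // tgids attached to that cgroup, if any
//	NodeAllocatableRoot("/", true, "systemd")                // e.g. "/kubepods.slice"
//	NodeAllocatableRoot("/", true, "cgroupfs")               // e.g. "/kubepods"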