k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/qos_container_manager_linux.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"

	"k8s.io/apimachinery/pkg/util/wait"

	units "github.com/docker/go-units"
	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
	utilfeature "k8s.io/apiserver/pkg/util/feature"

	"k8s.io/kubernetes/pkg/api/v1/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	kubefeatures "k8s.io/kubernetes/pkg/features"
)

const (
	// how often the qos cgroup manager will perform periodic update
	// of the qos level cgroup resource constraints
	periodicQOSCgroupUpdateInterval = 1 * time.Minute
)

type QOSContainerManager interface {
	Start(func() v1.ResourceList, ActivePodsFunc) error
	GetQOSContainersInfo() QOSContainersInfo
	UpdateCgroups() error
}

type qosContainerManagerImpl struct {
	sync.Mutex
	qosContainersInfo  QOSContainersInfo
	subsystems         *CgroupSubsystems
	cgroupManager      CgroupManager
	activePods         ActivePodsFunc
	getNodeAllocatable func() v1.ResourceList
	cgroupRoot         CgroupName
	qosReserved        map[v1.ResourceName]int64
}

func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
	if !nodeConfig.CgroupsPerQOS {
		return &qosContainerManagerNoop{
			cgroupRoot: cgroupRoot,
		}, nil
	}

	return &qosContainerManagerImpl{
		subsystems:    subsystems,
		cgroupManager: cgroupManager,
		cgroupRoot:    cgroupRoot,
		qosReserved:   nodeConfig.QOSReserved,
	}, nil
}

func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
	return m.qosContainersInfo
}
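// As a rough sketch of what Start below produces (paths assume the default
// "kubepods" cgroup root and the cgroupfs driver; the systemd driver uses
// different names), the resulting QoS hierarchy is:
//
//	kubepods/            <- Guaranteed pods are placed directly here
//	kubepods/burstable   <- Burstable pods
//	kubepods/besteffort  <- BestEffort pods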
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
	cm := m.cgroupManager
	rootContainer := m.cgroupRoot
	if !cm.Exists(rootContainer) {
		return fmt.Errorf("root container %v doesn't exist", rootContainer)
	}

	// Top level QoS cgroups are created only for the Burstable
	// and Best Effort classes
	qosClasses := map[v1.PodQOSClass]CgroupName{
		v1.PodQOSBurstable:  NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
		v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
	}

	// Create containers for both qos classes
	for qosClass, containerName := range qosClasses {
		resourceParameters := &ResourceConfig{}
		// the BestEffort QoS class has a statically configured minShares value
		if qosClass == v1.PodQOSBestEffort {
			minShares := uint64(MinShares)
			resourceParameters.CPUShares = &minShares
		}

		// containerConfig object stores the cgroup specifications
		containerConfig := &CgroupConfig{
			Name:               containerName,
			ResourceParameters: resourceParameters,
		}

		// for each enumerated huge page size, the qos tiers are unbounded
		m.setHugePagesUnbounded(containerConfig)

		// check if it exists
		if !cm.Exists(containerName) {
			if err := cm.Create(containerConfig); err != nil {
				return fmt.Errorf("failed to create top level %v QOS cgroup: %v", qosClass, err)
			}
		} else {
			// to ensure we actually have the right state, we update the config on startup
			if err := cm.Update(containerConfig); err != nil {
				return fmt.Errorf("failed to update top level %v QOS cgroup: %v", qosClass, err)
			}
		}
	}
	// Store the top level qos container names
	m.qosContainersInfo = QOSContainersInfo{
		Guaranteed: rootContainer,
		Burstable:  qosClasses[v1.PodQOSBurstable],
		BestEffort: qosClasses[v1.PodQOSBestEffort],
	}
	m.getNodeAllocatable = getNodeAllocatable
	m.activePods = activePods

	// update qos cgroup tiers on startup and at periodic intervals
	// to ensure the desired state stays in sync with the actual state.
	go wait.Until(func() {
		err := m.UpdateCgroups()
		if err != nil {
			klog.InfoS("Failed to reserve QoS requests", "err", err)
		}
	}, periodicQOSCgroupUpdateInterval, wait.NeverStop)

	return nil
}

// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
	hugePageLimit := map[int64]int64{}
	for _, pageSize := range libcontainercgroups.HugePageSizes() {
		pageSizeBytes, err := units.RAMInBytes(pageSize)
		if err != nil {
			return err
		}
		hugePageLimit[pageSizeBytes] = int64(1 << 62)
	}
	cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
	return nil
}

func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	for _, v := range configs {
		if err := m.setHugePagesUnbounded(v); err != nil {
			return err
		}
	}
	return nil
}

func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
	pods := m.activePods()
	burstablePodCPURequest := int64(0)
	reuseReqs := make(v1.ResourceList, 4)
	for i := range pods {
		pod := pods[i]
		qosClass := v1qos.GetPodQOS(pod)
		if qosClass != v1.PodQOSBurstable {
			// we only care about the burstable qos tier
			continue
		}
		req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
		if request, found := req[v1.ResourceCPU]; found {
			burstablePodCPURequest += request.MilliValue()
		}
	}

	// make sure best effort is always 2 shares
	bestEffortCPUShares := uint64(MinShares)
	configs[v1.PodQOSBestEffort].ResourceParameters.CPUShares = &bestEffortCPUShares

	// set burstable shares based on the currently observed state
	burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
	configs[v1.PodQOSBurstable].ResourceParameters.CPUShares = &burstableCPUShares
	return nil
}
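// A minimal worked example of the CPU shares arithmetic above, assuming
// MilliCPUToShares performs the usual cgroup v1 conversion of roughly
// 1024 shares per full CPU, clamped to MinShares (2) at the low end:
//
//	two Burstable pods requesting 500m and 250m CPU
//	  -> burstablePodCPURequest = 750 (milliCPU)
//	  -> burstable cgroup cpu.shares: 750 * 1024 / 1000 = 768
//	  -> besteffort cgroup cpu.shares: MinShares = 2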
// getQoSMemoryRequests sums and returns the memory requests of all pods for
// the guaranteed and burstable qos classes.
func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 {
	qosMemoryRequests := map[v1.PodQOSClass]int64{
		v1.PodQOSGuaranteed: 0,
		v1.PodQOSBurstable:  0,
	}

	// Sum the memory requests of pods in each QOS class
	pods := m.activePods()
	reuseReqs := make(v1.ResourceList, 4)
	for _, pod := range pods {
		podMemoryRequest := int64(0)
		qosClass := v1qos.GetPodQOS(pod)
		if qosClass == v1.PodQOSBestEffort {
			// memory requests are not set for Best Effort pods
			continue
		}
		req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
		if request, found := req[v1.ResourceMemory]; found {
			podMemoryRequest += request.Value()
		}
		qosMemoryRequests[qosClass] += podMemoryRequest
	}

	return qosMemoryRequests
}

// setMemoryReserve sums the memory requests of all pods in each QOS class,
// calculates the QOS class memory limits, and sets those limits in the
// CgroupConfig for each QOS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	qosMemoryRequests := m.getQoSMemoryRequests()

	resources := m.getNodeAllocatable()
	allocatableResource, ok := resources[v1.ResourceMemory]
	if !ok {
		klog.V(2).InfoS("Allocatable memory value could not be determined, not setting QoS memory limits")
		return
	}
	allocatable := allocatableResource.Value()
	if allocatable == 0 {
		klog.V(2).InfoS("Allocatable memory reported as 0, might be in standalone mode, not setting QoS memory limits")
		return
	}

	for qos, limits := range qosMemoryRequests {
		klog.V(2).InfoS("QoS pod memory limit", "qos", qos, "limits", limits, "percentReserve", percentReserve)
	}

	// Calculate QOS memory limits
	burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
	bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
	configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
	configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
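// A worked example of the reserve arithmetic above, with hypothetical
// numbers: allocatable memory = 10GiB, Guaranteed requests = 2GiB,
// Burstable requests = 1GiB, percentReserve = 50:
//
//	burstableLimit  = 10GiB - 2GiB*50/100 = 9GiB    (limit on the burstable cgroup)
//	bestEffortLimit =  9GiB - 1GiB*50/100 = 8.5GiB  (limit on the besteffort cgroup)
//
// i.e. each tier's ceiling shrinks by the reserved fraction of the memory
// requested by the tiers above it.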
// retrySetMemoryReserve checks for any QoS cgroups whose usage is over the
// limit that the first Update() attempted to set and adjusts their memory
// limit down to the current usage to prevent further growth.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	// Unreclaimable memory usage may have already exceeded the desired limit.
	// Attempt to set the limit near the current usage to put pressure
	// on the cgroup and prevent further growth.
	for qos, config := range configs {
		usage, err := m.cgroupManager.MemoryUsage(config.Name)
		if err != nil {
			klog.V(2).InfoS("Failed to get resource stats", "err", err)
			return
		}

		// Because there is no good way to determine whether the original Update()
		// on the memory resource was successful, we determine failure of the
		// first attempt by checking if the usage is above the limit we attempted
		// to set. If it is, we assume the first attempt to set the limit failed
		// and try again, setting the limit to the usage. Otherwise we leave
		// the CgroupConfig as is.
		if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
			configs[qos].ResourceParameters.Memory = &usage
		}
	}
}

// setMemoryQoS sums the memory requests of all pods in the Burstable class,
// and sets that sum as memory.min in the Unified field of the CgroupConfig.
func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) {
	qosMemoryRequests := m.getQoSMemoryRequests()

	// Calculate the memory.min:
	// for burstable(/kubepods/burstable): sum of all burstable pods
	// for guaranteed(/kubepods): sum of all guaranteed and burstable pods
	burstableMin := qosMemoryRequests[v1.PodQOSBurstable]
	guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin

	if burstableMin > 0 {
		if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
			configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
		}
		configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
	}

	if guaranteedMin > 0 {
		if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
			configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
		}
		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
	}
}
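// For illustration, assuming the default kubepods cgroup root, a cgroup v2
// host, and the MemoryQoS feature gate enabled: with Burstable pods
// requesting 1GiB of memory in total and Guaranteed pods requesting 2GiB,
// the values written above would be
//
//	kubepods/burstable  memory.min = 1073741824   (1GiB)
//	kubepods            memory.min = 3221225472   (2GiB + 1GiB)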
func (m *qosContainerManagerImpl) UpdateCgroups() error {
	m.Lock()
	defer m.Unlock()

	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
		v1.PodQOSGuaranteed: {
			Name:               m.qosContainersInfo.Guaranteed,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBurstable: {
			Name:               m.qosContainersInfo.Burstable,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBestEffort: {
			Name:               m.qosContainersInfo.BestEffort,
			ResourceParameters: &ResourceConfig{},
		},
	}

	// update the qos level cgroup settings for cpu shares
	if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
		return err
	}

	// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
	if err := m.setHugePagesConfig(qosConfigs); err != nil {
		return err
	}

	// update the qos level cgroup v2 settings of memory qos if feature enabled
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
		libcontainercgroups.IsCgroup2UnifiedMode() {
		m.setMemoryQoS(qosConfigs)
	}

	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.setMemoryReserve(qosConfigs, percentReserve)
			}
		}

		updateSuccess := true
		for _, config := range qosConfigs {
			err := m.cgroupManager.Update(config)
			if err != nil {
				updateSuccess = false
			}
		}
		if updateSuccess {
			klog.V(4).InfoS("Updated QoS cgroup configuration")
			return nil
		}

		// If the resource can adjust the ResourceConfig to increase the likelihood of
		// success, call the adjustment function here. Otherwise, Update() will
		// be called again with the same values.
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.retrySetMemoryReserve(qosConfigs, percentReserve)
			}
		}
	}

	for _, config := range qosConfigs {
		err := m.cgroupManager.Update(config)
		if err != nil {
			klog.ErrorS(err, "Failed to update QoS cgroup configuration")
			return err
		}
	}

	klog.V(4).InfoS("Updated QoS cgroup configuration")
	return nil
}

type qosContainerManagerNoop struct {
	cgroupRoot CgroupName
}

var _ QOSContainerManager = &qosContainerManagerNoop{}

func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
	return QOSContainersInfo{}
}

func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
	return nil
}

func (m *qosContainerManagerNoop) UpdateCgroups() error {
	return nil
}
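// A minimal sketch of how a caller might drive the QOSContainerManager
// interface defined in this file (the surrounding container manager wiring
// and variable names here are illustrative only, not the kubelet's actual
// call sites):
//
//	qosCM, err := NewQOSContainerManager(subsystems, cgroupRoot, nodeConfig, cgroupManager)
//	if err != nil {
//		return err
//	}
//	if err := qosCM.Start(getNodeAllocatable, activePods); err != nil {
//		return err
//	}
//	// later, e.g. whenever the set of active pods changes:
//	_ = qosCM.UpdateCgroups()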