k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/cpumanager/policy_static.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cpumanager

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/utils/cpuset"
)

const (

	// PolicyStatic is the name of the static policy.
	// If options are given, they will be ignored and backward-compatible
	// behaviour (up to and including 1.21) will be enforced.
	PolicyStatic policyName = "static"
	// ErrorSMTAlignment represents the type of an SMTAlignmentError.
	ErrorSMTAlignment = "SMTAlignmentError"
)

// SMTAlignmentError represents an error due to SMT alignment.
type SMTAlignmentError struct {
	RequestedCPUs         int
	CpusPerCore           int
	AvailablePhysicalCPUs int
}

func (e SMTAlignmentError) Error() string {
	if e.AvailablePhysicalCPUs > 0 {
		return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
	}
	return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}

// Type returns the human-readable type of this error. Used in admission control to populate the Admission Failure reason.
func (e SMTAlignmentError) Type() string {
	return ErrorSMTAlignment
}

// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
//
// This policy allocates CPUs exclusively for a container if all the following
// conditions are met:
//
//   - The pod QoS class is Guaranteed.
//   - The CPU request is a positive integer.
//
// The static policy maintains the following sets of logical CPUs:
//
//   - SHARED: Burstable, BestEffort, and non-integral Guaranteed containers
//     run here. Initially this contains all CPU IDs on the system. As
//     exclusive allocations are created and destroyed, this CPU set shrinks
//     and grows, accordingly. This is stored in the state as the default
//     CPU set.
//
//   - RESERVED: A subset of the shared pool which is not exclusively
//     allocatable. The membership of this pool is static for the lifetime of
//     the Kubelet. The size of the reserved pool is
//     ceil(systemreserved.cpu + kubereserved.cpu).
//     Reserved CPUs are taken topologically starting with the lowest-indexed
//     physical core, as reported by cAdvisor.
//
//   - ASSIGNABLE: Equal to SHARED - RESERVED. Exclusive CPUs are allocated
//     from this pool.
//
//   - EXCLUSIVE ALLOCATIONS: CPU sets assigned exclusively to one container.
//     These are stored as explicit assignments in the state.
//
// When an exclusive allocation is made, the static policy also updates the
// default cpuset in the state abstraction. The CPU manager's periodic
// reconcile loop takes care of rewriting the cpuset in cgroupfs for any
// containers that may be running in the shared pool. For this reason,
// applications running within exclusively-allocated containers must tolerate
// potentially sharing their allocated CPUs for up to the CPU manager
// reconcile period.
type staticPolicy struct {
	// cpu socket topology
	topology *topology.CPUTopology
	// set of CPUs that is not available for exclusive assignment
	reservedCPUs cpuset.CPUSet
	// Superset of reservedCPUs. It includes not just the reservedCPUs themselves,
	// but also any siblings of those reservedCPUs on the same physical core.
	// NOTE: If the reserved set consists of full physical cores from the beginning
	// (e.g. only complete pairs of core siblings were reserved), this set is
	// expected to be identical to the reserved set.
	reservedPhysicalCPUs cpuset.CPUSet
	// topology manager reference to get container Topology affinity
	affinity topologymanager.Store
	// set of CPUs to reuse across allocations in a pod
	cpusToReuse map[string]cpuset.CPUSet
	// options allow fine-tuning the behaviour of the policy
	options StaticPolicyOptions
}

// Ensure staticPolicy implements Policy interface
var _ Policy = &staticPolicy{}

// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) {
	opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
	if err != nil {
		return nil, err
	}
	err = ValidateStaticPolicyOptions(opts, topology, affinity)
	if err != nil {
		return nil, err
	}

	klog.InfoS("Static policy created with configuration", "options", opts)

	policy := &staticPolicy{
		topology:    topology,
		affinity:    affinity,
		cpusToReuse: make(map[string]cpuset.CPUSet),
		options:     opts,
	}

	allCPUs := topology.CPUDetails.CPUs()
	var reserved cpuset.CPUSet
	if reservedCPUs.Size() > 0 {
		reserved = reservedCPUs
	} else {
		// takeByTopology allocates CPUs associated with low-numbered cores from
		// allCPUs.
		//
		// For example: Given a system with 8 CPUs available and HT enabled,
		// if numReservedCPUs=2, then reserved={0,4}
		reserved, _ = policy.takeByTopology(allCPUs, numReservedCPUs)
	}

	if reserved.Size() != numReservedCPUs {
		err := fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs)
		return nil, err
	}

	var reservedPhysicalCPUs cpuset.CPUSet
	for _, cpu := range reserved.UnsortedList() {
		core, err := topology.CPUCoreID(cpu)
		if err != nil {
			return nil, fmt.Errorf("[cpumanager] unable to build the reserved physical CPUs from the reserved set: %w", err)
		}
		reservedPhysicalCPUs = reservedPhysicalCPUs.Union(topology.CPUDetails.CPUsInCores(core))
	}

	klog.InfoS("Reserved CPUs not available for exclusive assignment", "reservedSize", reserved.Size(), "reserved", reserved, "reservedPhysicalCPUs", reservedPhysicalCPUs)
	policy.reservedCPUs = reserved
	policy.reservedPhysicalCPUs = reservedPhysicalCPUs

	return policy, nil
}

func (p *staticPolicy) Name() string {
	return string(PolicyStatic)
}

func (p *staticPolicy) Start(s state.State) error {
	if err := p.validateState(s); err != nil {
		klog.ErrorS(err, "Static policy invalid state, please drain node and remove policy state file")
		return err
	}
	return nil
}

func (p *staticPolicy) validateState(s state.State) error {
	tmpAssignments := s.GetCPUAssignments()
	tmpDefaultCPUset := s.GetDefaultCPUSet()

	// Default cpuset cannot be empty when assignments exist
	if tmpDefaultCPUset.IsEmpty() {
		if len(tmpAssignments) != 0 {
			return fmt.Errorf("default cpuset cannot be empty")
		}
		// state is empty; initialize it
		allCPUs := p.topology.CPUDetails.CPUs()
		s.SetDefaultCPUSet(allCPUs)
		return nil
	}

	// State has already been initialized from file (is not empty)
	// 1. Check whether the reserved cpuset is still part of the default cpuset, because:
	//    - kube/system reserved may have changed (increased), which may lead to some containers not being able to start
	//    - the user may have tampered with the file
	if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) {
		return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
			p.reservedCPUs.String(), tmpDefaultCPUset.String())
	}

	// 2. Check if state for static policy is consistent
	for pod := range tmpAssignments {
		for container, cset := range tmpAssignments[pod] {
			// None of the CPUs in the DEFAULT cset should be in s.assignments
			if !tmpDefaultCPUset.Intersection(cset).IsEmpty() {
				return fmt.Errorf("pod: %s, container: %s cpuset: \"%s\" overlaps with default cpuset \"%s\"",
					pod, container, cset.String(), tmpDefaultCPUset.String())
			}
		}
	}

	// 3. It's possible that the set of available CPUs has changed since
	//    the state was written. This can happen, for example, when a CPU is
	//    offlined while the kubelet is not running. If this happens, the
	//    CPU manager will run into trouble when it later tries to
	//    assign non-existent CPUs to containers. Validate that the
	//    topology that was received during CPU manager startup matches the
	//    set of CPUs stored in the state.
	totalKnownCPUs := tmpDefaultCPUset.Clone()
	tmpCPUSets := []cpuset.CPUSet{}
	for pod := range tmpAssignments {
		for _, cset := range tmpAssignments[pod] {
			tmpCPUSets = append(tmpCPUSets, cset)
		}
	}
	totalKnownCPUs = totalKnownCPUs.Union(tmpCPUSets...)
	if !totalKnownCPUs.Equals(p.topology.CPUDetails.CPUs()) {
		return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
			p.topology.CPUDetails.CPUs().String(), totalKnownCPUs.String())
	}

	return nil
}

// GetAllocatableCPUs returns the total set of CPUs available for allocation.
func (p *staticPolicy) GetAllocatableCPUs(s state.State) cpuset.CPUSet {
	return p.topology.CPUDetails.CPUs().Difference(p.reservedCPUs)
}

// GetAvailableCPUs returns the set of unassigned CPUs minus the reserved set.
func (p *staticPolicy) GetAvailableCPUs(s state.State) cpuset.CPUSet {
	return s.GetDefaultCPUSet().Difference(p.reservedCPUs)
}

func (p *staticPolicy) GetAvailablePhysicalCPUs(s state.State) cpuset.CPUSet {
	return s.GetDefaultCPUSet().Difference(p.reservedPhysicalCPUs)
}

func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, cset cpuset.CPUSet) {
	// If entries for pods other than the current pod exist in p.cpusToReuse, delete them.
	for podUID := range p.cpusToReuse {
		if podUID != string(pod.UID) {
			delete(p.cpusToReuse, podUID)
		}
	}
	// If no cpuset exists for cpusToReuse by this pod yet, create one.
	if _, ok := p.cpusToReuse[string(pod.UID)]; !ok {
		p.cpusToReuse[string(pod.UID)] = cpuset.New()
	}
	// Check if the container is an init container.
	// If so, add its cpuset to the cpuset of reusable CPUs for any new allocations.
	for _, initContainer := range pod.Spec.InitContainers {
		if container.Name == initContainer.Name {
			if types.IsRestartableInitContainer(&initContainer) {
				// If the container is a restartable init container, we should not
				// reuse its cpuset, as a restartable init container can run with
				// regular containers.
				break
			}
			p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Union(cset)
			return
		}
	}
	// Otherwise it is an app container.
	// Remove its cpuset from the cpuset of reusable CPUs for any new allocations.
	p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Difference(cset)
}

func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
	numCPUs := p.guaranteedCPUs(pod, container)
	if numCPUs == 0 {
		// container belongs in the shared pool (nothing to do; use default cpuset)
		return nil
	}

	klog.InfoS("Static policy: Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
	// container belongs in an exclusively allocated pool
	metrics.CPUManagerPinningRequestsTotal.Inc()
	defer func() {
		if rerr != nil {
			metrics.CPUManagerPinningErrorsTotal.Inc()
		}
	}()

	if p.options.FullPhysicalCPUsOnly {
		CPUsPerCore := p.topology.CPUsPerCore()
		if (numCPUs % CPUsPerCore) != 0 {
			// Since the CPU Manager has been enabled with strict SMT alignment, a guaranteed pod can only be
			// admitted if its CPU request is a multiple of the number of virtual CPUs per physical core.
			// If the CPU request is not such a multiple, the Pod will be put in the Failed state with
			// SMTAlignmentError as the reason. Since the allocation happens in terms of physical cores and
			// the scheduler is responsible for ensuring that the workload goes to a node that has enough CPUs,
			// the pod will be placed on a node where enough physical cores are available to be allocated.
			// As with the plain static policy, takeByTopology first tries to allocate CPUs from the same socket,
			// and only if the request cannot be satisfied on a single socket does the allocation spill over,
			// still occupying all CPUs of each physical core. Allocation of individual threads never has to occur.
			return SMTAlignmentError{
				RequestedCPUs: numCPUs,
				CpusPerCore:   CPUsPerCore,
			}
		}

		availablePhysicalCPUs := p.GetAvailablePhysicalCPUs(s).Size()

		// It's legal to reserve CPUs which are not core siblings. In this case the CPU allocator can descend to
		// single cores when picking CPUs, which would void the guarantee of FullPhysicalCPUsOnly. To prevent this,
		// we additionally consider all the core siblings of the reserved CPUs as unavailable when computing the
		// free CPUs, before starting the actual allocation. This way, by construction, every possible allocation
		// whose size is a multiple of the SMT level consists of full physical cores.
		if numCPUs > availablePhysicalCPUs {
			return SMTAlignmentError{
				RequestedCPUs:         numCPUs,
				CpusPerCore:           CPUsPerCore,
				AvailablePhysicalCPUs: availablePhysicalCPUs,
			}
		}
	}
	if cpuset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
		p.updateCPUsToReuse(pod, container, cpuset)
		klog.InfoS("Static policy: container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
		return nil
	}

	// Call Topology Manager to get the aligned socket affinity across all hint providers.
	hint := p.affinity.GetAffinity(string(pod.UID), container.Name)
	klog.InfoS("Topology Affinity", "pod", klog.KObj(pod), "containerName", container.Name, "affinity", hint)

	// Allocate CPUs according to the NUMA affinity contained in the hint.
	cpuset, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
	if err != nil {
		klog.ErrorS(err, "Unable to allocate CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "numCPUs", numCPUs)
		return err
	}
	s.SetCPUSet(string(pod.UID), container.Name, cpuset)
	p.updateCPUsToReuse(pod, container, cpuset)

	return nil
}

// getAssignedCPUsOfSiblings returns the assigned CPUs of the given container's siblings (all containers other than the given container) in the given pod `podUID`.
func getAssignedCPUsOfSiblings(s state.State, podUID string, containerName string) cpuset.CPUSet {
	assignments := s.GetCPUAssignments()
	cset := cpuset.New()
	for name, cpus := range assignments[podUID] {
		if containerName == name {
			continue
		}
		cset = cset.Union(cpus)
	}
	return cset
}

func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerName string) error {
	klog.InfoS("Static policy: RemoveContainer", "podUID", podUID, "containerName", containerName)
	cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName)
	if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
		s.Delete(podUID, containerName)
		// Mutate the shared pool, adding released cpus.
		toRelease = toRelease.Difference(cpusInUse)
		s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
	}
	return nil
}

func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (cpuset.CPUSet, error) {
	klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "socket", numaAffinity)

	allocatableCPUs := p.GetAvailableCPUs(s).Union(reusableCPUs)

	// If there are aligned CPUs in numaAffinity, attempt to take those first.
	result := cpuset.New()
	if numaAffinity != nil {
		alignedCPUs := p.getAlignedCPUs(numaAffinity, allocatableCPUs)

		numAlignedToAlloc := alignedCPUs.Size()
		if numCPUs < numAlignedToAlloc {
			numAlignedToAlloc = numCPUs
		}

		alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
		if err != nil {
			return cpuset.New(), err
		}

		result = result.Union(alignedCPUs)
	}

	// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
	remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
	if err != nil {
		return cpuset.New(), err
	}
	result = result.Union(remainingCPUs)

	// Remove allocated CPUs from the shared CPUSet.
	s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result))

	klog.InfoS("AllocateCPUs", "result", result)
	return result, nil
}

func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
	if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
		return 0
	}
	cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
	// In-place pod resize feature makes Container.Resources field mutable for CPU & memory.
	// AllocatedResources holds the value of Container.Resources.Requests when the pod was admitted.
	// We should return this value because this is what kubelet agreed to allocate for the container
	// and the value configured with runtime.
	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
		if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
			cpuQuantity = cs.AllocatedResources[v1.ResourceCPU]
		}
	}
	if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
		return 0
	}
	// Safe downcast to do for all systems with < 2.1 billion CPUs.
	// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
	// https://golang.org/ref/spec#Numeric_types
	return int(cpuQuantity.Value())
}

func (p *staticPolicy) podGuaranteedCPUs(pod *v1.Pod) int {
	// The maximum of requested CPUs by init containers.
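	// Illustrative example of the calculation below (following the KEP-753
	// resource-calculation rules referenced further down): a restartable
	// (sidecar) init container requesting 1 CPU, then a regular init
	// container requesting 4 CPUs, and an app container requesting 2 CPUs
	// yield max(1+4, 2+1) = 5 exclusive CPUs for the pod.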
	requestedByInitContainers := 0
	requestedByRestartableInitContainers := 0
	for _, container := range pod.Spec.InitContainers {
		if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
			continue
		}
		requestedCPU := p.guaranteedCPUs(pod, &container)
		// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/753-sidecar-containers#resources-calculation-for-scheduling-and-pod-admission
		// for the detail.
		if types.IsRestartableInitContainer(&container) {
			requestedByRestartableInitContainers += requestedCPU
		} else if requestedByRestartableInitContainers+requestedCPU > requestedByInitContainers {
			requestedByInitContainers = requestedByRestartableInitContainers + requestedCPU
		}
	}

	// The sum of requested CPUs by app containers.
	requestedByAppContainers := 0
	for _, container := range pod.Spec.Containers {
		if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
			continue
		}
		requestedByAppContainers += p.guaranteedCPUs(pod, &container)
	}

	requestedByLongRunningContainers := requestedByAppContainers + requestedByRestartableInitContainers
	if requestedByInitContainers > requestedByLongRunningContainers {
		return requestedByInitContainers
	}
	return requestedByLongRunningContainers
}

func (p *staticPolicy) takeByTopology(availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
	if p.options.DistributeCPUsAcrossNUMA {
		cpuGroupSize := 1
		if p.options.FullPhysicalCPUsOnly {
			cpuGroupSize = p.topology.CPUsPerCore()
		}
		return takeByTopologyNUMADistributed(p.topology, availableCPUs, numCPUs, cpuGroupSize)
	}
	return takeByTopologyNUMAPacked(p.topology, availableCPUs, numCPUs)
}

func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
	// Get a count of how many guaranteed CPUs have been requested.
	requested := p.guaranteedCPUs(pod, container)

	// Number of required CPUs is not an integer or a container is not part of the Guaranteed QoS class.
	// It will be treated by the TopologyManager as having no preference and cause it to ignore this
	// resource when considering pod alignment.
	// In terms of hints, this is equal to: TopologyHints[NUMANodeAffinity: nil, Preferred: true].
	if requested == 0 {
		return nil
	}

	// Short circuit to regenerate the same hints if there are already
	// guaranteed CPUs allocated to the Container. This might happen after a
	// kubelet restart, for example.
	if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists {
		if allocated.Size() != requested {
			klog.ErrorS(nil, "CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "requestedSize", requested, "allocatedSize", allocated.Size())
			// An empty list of hints will be treated as a preference that cannot be satisfied.
			// In definition of hints this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false].
			// For all but the best-effort policy, the Topology Manager will throw a pod-admission error.
			return map[string][]topologymanager.TopologyHint{
				string(v1.ResourceCPU): {},
			}
		}
		klog.InfoS("Regenerating TopologyHints for CPUs already allocated", "pod", klog.KObj(pod), "containerName", container.Name)
		return map[string][]topologymanager.TopologyHint{
			string(v1.ResourceCPU): p.generateCPUTopologyHints(allocated, cpuset.CPUSet{}, requested),
		}
	}

	// Get a list of available CPUs.
	available := p.GetAvailableCPUs(s)

	// Get a list of reusable CPUs (e.g. CPUs reused from initContainers).
	// It should be an empty CPUSet for a newly created pod.
	reusable := p.cpusToReuse[string(pod.UID)]

	// Generate hints.
	cpuHints := p.generateCPUTopologyHints(available, reusable, requested)
	klog.InfoS("TopologyHints generated", "pod", klog.KObj(pod), "containerName", container.Name, "cpuHints", cpuHints)

	return map[string][]topologymanager.TopologyHint{
		string(v1.ResourceCPU): cpuHints,
	}
}

func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
	// Get a count of how many guaranteed CPUs have been requested by the Pod.
	requested := p.podGuaranteedCPUs(pod)

	// Number of required CPUs is not an integer or the pod is not part of the Guaranteed QoS class.
	// It will be treated by the TopologyManager as having no preference and cause it to ignore this
	// resource when considering pod alignment.
	// In terms of hints, this is equal to: TopologyHints[NUMANodeAffinity: nil, Preferred: true].
	if requested == 0 {
		return nil
	}

	assignedCPUs := cpuset.New()
	for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
		requestedByContainer := p.guaranteedCPUs(pod, &container)
		// Short circuit to regenerate the same hints if there are already
		// guaranteed CPUs allocated to the Container. This might happen after a
		// kubelet restart, for example.
		if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists {
			if allocated.Size() != requestedByContainer {
				klog.ErrorS(nil, "CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "requestedSize", requested, "requestedByContainer", requestedByContainer, "allocatedSize", allocated.Size())
				// An empty list of hints will be treated as a preference that cannot be satisfied.
				// In definition of hints this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false].
				// For all but the best-effort policy, the Topology Manager will throw a pod-admission error.
				return map[string][]topologymanager.TopologyHint{
					string(v1.ResourceCPU): {},
				}
			}
			// A set of CPUs already assigned to containers in this pod
			assignedCPUs = assignedCPUs.Union(allocated)
		}
	}
	if assignedCPUs.Size() == requested {
		klog.InfoS("Regenerating TopologyHints for CPUs already allocated", "pod", klog.KObj(pod))
		return map[string][]topologymanager.TopologyHint{
			string(v1.ResourceCPU): p.generateCPUTopologyHints(assignedCPUs, cpuset.CPUSet{}, requested),
		}
	}

	// Get a list of available CPUs.
	available := p.GetAvailableCPUs(s)

	// Get a list of reusable CPUs (e.g. CPUs reused from initContainers).
	// It should be an empty CPUSet for a newly created pod.
	reusable := p.cpusToReuse[string(pod.UID)]

	// Ensure any CPUs already assigned to containers in this pod are included as part of the hint generation.
	reusable = reusable.Union(assignedCPUs)

	// Generate hints.
	cpuHints := p.generateCPUTopologyHints(available, reusable, requested)
	klog.InfoS("TopologyHints generated", "pod", klog.KObj(pod), "cpuHints", cpuHints)

	return map[string][]topologymanager.TopologyHint{
		string(v1.ResourceCPU): cpuHints,
	}
}

// generateCPUTopologyHints generates a set of TopologyHints given the set of
// available CPUs and the number of CPUs being requested.
//
// It follows the convention of marking all hints that have the same number of
// bits set as the narrowest matching NUMANodeAffinity with 'Preferred: true', and
// marking all others with 'Preferred: false'.
func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reusableCPUs cpuset.CPUSet, request int) []topologymanager.TopologyHint {
	// Initialize minAffinitySize to include all NUMA Nodes.
	minAffinitySize := p.topology.CPUDetails.NUMANodes().Size()

	// Iterate through all combinations of NUMA node bitmasks and build hints from them.
	hints := []topologymanager.TopologyHint{}
	bitmask.IterateBitMasks(p.topology.CPUDetails.NUMANodes().List(), func(mask bitmask.BitMask) {
		// First, update minAffinitySize for the current request size.
		cpusInMask := p.topology.CPUDetails.CPUsInNUMANodes(mask.GetBits()...).Size()
		if cpusInMask >= request && mask.Count() < minAffinitySize {
			minAffinitySize = mask.Count()
		}

		// Then check to see if we have enough CPUs available on the current
		// NUMA node bitmask to satisfy the CPU request.
		numMatching := 0
		for _, c := range reusableCPUs.List() {
			// Disregard this mask if the NUMA node of any reusable CPU is not covered by it.
			if !mask.IsSet(p.topology.CPUDetails[c].NUMANodeID) {
				return
			}
			numMatching++
		}

		// Finally, check to see if enough available CPUs remain on the current
		// NUMA node combination to satisfy the CPU request.
		for _, c := range availableCPUs.List() {
			if mask.IsSet(p.topology.CPUDetails[c].NUMANodeID) {
				numMatching++
			}
		}

		// If they don't, then move onto the next combination.
		if numMatching < request {
			return
		}

		// Otherwise, create a new hint from the NUMA node bitmask and add it to the
		// list of hints. We set all hint preferences to 'false' on the first
		// pass through.
		hints = append(hints, topologymanager.TopologyHint{
			NUMANodeAffinity: mask,
			Preferred:        false,
		})
	})

	// Loop back through all hints and update the 'Preferred' field based on
	// counting the number of bits set in the affinity mask and comparing it
	// to the minAffinitySize. Only those with an equal number of bits set (and
	// with a minimal set of NUMA nodes) will be considered preferred.
	for i := range hints {
		if p.options.AlignBySocket && p.isHintSocketAligned(hints[i], minAffinitySize) {
			hints[i].Preferred = true
			continue
		}
		if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
			hints[i].Preferred = true
		}
	}

	return hints
}

// isHintSocketAligned returns true if the NUMA nodes in the hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
	numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
	numaNodesPerSocket := p.topology.NumNUMANodes / p.topology.NumSockets
	if numaNodesPerSocket == 0 {
		return false
	}
	// minSockets is the minimum number of sockets required to satisfy the allocation.
	// A hint is considered socket aligned if the number of sockets its NUMA nodes span equals minSockets.
	minSockets := (minAffinitySize + numaNodesPerSocket - 1) / numaNodesPerSocket
	return p.topology.CPUDetails.SocketsInNUMANodes(numaNodesBitMask...).Size() == minSockets
}

// getAlignedCPUs returns the set of aligned CPUs based on the NUMA affinity mask and the configured policy options.
func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableCPUs cpuset.CPUSet) cpuset.CPUSet {
	alignedCPUs := cpuset.New()
	numaBits := numaAffinity.GetBits()

	// If the align-by-socket policy option is enabled, the NUMA-based hint is expanded to a
	// socket-aligned hint. This ensures that socket-aligned available CPUs are allocated first,
	// before we try to find CPUs across sockets to satisfy the allocation request.
	if p.options.AlignBySocket {
		socketBits := p.topology.CPUDetails.SocketsInNUMANodes(numaBits...).UnsortedList()
		for _, socketID := range socketBits {
			alignedCPUs = alignedCPUs.Union(allocatableCPUs.Intersection(p.topology.CPUDetails.CPUsInSockets(socketID)))
		}
		return alignedCPUs
	}

	for _, numaNodeID := range numaBits {
		alignedCPUs = alignedCPUs.Union(allocatableCPUs.Intersection(p.topology.CPUDetails.CPUsInNUMANodes(numaNodeID)))
	}

	return alignedCPUs
}
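
The pool arithmetic described in the staticPolicy doc comment can be illustrated with a small standalone sketch. This snippet is not part of the upstream file; the concrete CPU IDs are assumed, matching the 8-CPU, HT-enabled example used in NewStaticPolicy (reserved={0,4}, core siblings paired as (0,4), (1,5), (2,6), (3,7)).

package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

func main() {
	// SHARED starts out as all CPUs on the system (the default cpuset in state).
	allCPUs := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7)
	// RESERVED is never available for exclusive assignment.
	reserved := cpuset.New(0, 4)

	// ASSIGNABLE = SHARED - RESERVED: exclusive allocations are carved from here.
	assignable := allCPUs.Difference(reserved)

	// A hypothetical exclusive allocation of two full physical cores
	// (FullPhysicalCPUsOnly) takes both hyperthread siblings of each core.
	exclusive := cpuset.New(1, 5, 2, 6)

	// The shared pool shrinks accordingly; the CPU manager's reconcile loop
	// rewrites cgroupfs for containers still running in the shared pool.
	shared := allCPUs.Difference(exclusive)

	fmt.Println("assignable:", assignable) // 1-3,5-7
	fmt.Println("shared:    ", shared)     // 0,3-4,7
}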