k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/memorymanager/policy_static.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package memorymanager

import (
    "fmt"
    "reflect"
    "sort"

    cadvisorapi "github.com/google/cadvisor/info/v1"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    "k8s.io/klog/v2"
    podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    corehelper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    "k8s.io/kubernetes/pkg/features"
    "k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
    "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
    "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
    "k8s.io/kubernetes/pkg/kubelet/types"
)

const policyTypeStatic policyType = "Static"

type systemReservedMemory map[int]map[v1.ResourceName]uint64
type reusableMemory map[string]map[string]map[v1.ResourceName]uint64

// staticPolicy is an implementation of the Policy interface for the static policy
type staticPolicy struct {
    // machineInfo contains machine memory related information
    machineInfo *cadvisorapi.MachineInfo
    // systemReserved contains the memory reserved for kube
    systemReserved systemReservedMemory
    // affinity is the topology manager reference used to get container topology affinity
    affinity topologymanager.Store
    // initContainersReusableMemory contains the memory allocated for init
    // containers that can be reused.
    // Note that the restartable init container memory is not included here,
    // because it is not reusable.
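    // The map is keyed by pod UID, then by the NUMA affinity bitmask string
    // (bitmask.BitMask.String()), and finally by resource name; the value is
    // the reusable size in bytes.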
    initContainersReusableMemory reusableMemory
}

var _ Policy = &staticPolicy{}

// NewPolicyStatic returns a new static policy instance
func NewPolicyStatic(machineInfo *cadvisorapi.MachineInfo, reserved systemReservedMemory, affinity topologymanager.Store) (Policy, error) {
    var totalSystemReserved uint64
    for _, node := range reserved {
        if _, ok := node[v1.ResourceMemory]; !ok {
            continue
        }
        totalSystemReserved += node[v1.ResourceMemory]
    }

    // check if we have some reserved memory for the system
    if totalSystemReserved <= 0 {
        return nil, fmt.Errorf("[memorymanager] you should specify the system reserved memory")
    }

    return &staticPolicy{
        machineInfo:                  machineInfo,
        systemReserved:               reserved,
        affinity:                     affinity,
        initContainersReusableMemory: reusableMemory{},
    }, nil
}

func (p *staticPolicy) Name() string {
    return string(policyTypeStatic)
}

func (p *staticPolicy) Start(s state.State) error {
    if err := p.validateState(s); err != nil {
        klog.ErrorS(err, "Invalid state, please drain node and remove policy state file")
        return err
    }
    return nil
}

// Allocate call is idempotent
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
    // allocate the memory only for guaranteed pods
    if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
        return nil
    }

    podUID := string(pod.UID)
    klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
    if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
        p.updatePodReusableMemory(pod, container, blocks)

        klog.InfoS("Container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
        return nil
    }

    // Call Topology Manager to get the aligned affinity across all hint providers.
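    // The returned hint may have a nil NUMANodeAffinity, meaning the topology manager
    // expressed no NUMA preference; in that case the default hint is calculated below.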
    hint := p.affinity.GetAffinity(podUID, container.Name)
    klog.InfoS("Got topology affinity", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name, "hint", hint)

    requestedResources, err := getRequestedResources(pod, container)
    if err != nil {
        return err
    }

    machineState := s.GetMachineState()
    bestHint := &hint
    // the topology manager returned a hint with nil NUMA affinity;
    // we should use the default NUMA affinity, calculated the same way as the topology manager does
    if hint.NUMANodeAffinity == nil {
        defaultHint, err := p.getDefaultHint(machineState, pod, requestedResources)
        if err != nil {
            return err
        }

        if !defaultHint.Preferred && bestHint.Preferred {
            return fmt.Errorf("[memorymanager] failed to find the default preferred hint")
        }
        bestHint = defaultHint
    }

    // the topology manager returned a hint that does not completely satisfy the container request;
    // we should extend this hint to one that will satisfy the request and still include the current hint
    if !isAffinitySatisfyRequest(machineState, bestHint.NUMANodeAffinity, requestedResources) {
        extendedHint, err := p.extendTopologyManagerHint(machineState, pod, requestedResources, bestHint.NUMANodeAffinity)
        if err != nil {
            return err
        }

        if !extendedHint.Preferred && bestHint.Preferred {
            return fmt.Errorf("[memorymanager] failed to find the extended preferred hint")
        }
        bestHint = extendedHint
    }

    var containerBlocks []state.Block
    maskBits := bestHint.NUMANodeAffinity.GetBits()
    for resourceName, requestedSize := range requestedResources {
        // update memory blocks
        containerBlocks = append(containerBlocks, state.Block{
            NUMAAffinity: maskBits,
            Size:         requestedSize,
            Type:         resourceName,
        })

        podReusableMemory := p.getPodReusableMemory(pod, bestHint.NUMANodeAffinity, resourceName)
        if podReusableMemory >= requestedSize {
            requestedSize = 0
        } else {
            requestedSize -= podReusableMemory
        }

        // Update nodes memory state
        p.updateMachineState(machineState, maskBits, resourceName, requestedSize)
    }

    p.updatePodReusableMemory(pod, container, containerBlocks)

    s.SetMachineState(machineState)
    s.SetMemoryBlocks(podUID, container.Name, containerBlocks)

    // update init containers memory blocks to reflect the fact that we re-used init containers memory
    // it is possible that the size of the init container memory block will have 0 value, when all memory
    // allocated for it was re-used
    // we only do this so that the sum(memory_for_all_containers) == total amount of allocated memory to the pod, even
    // though the final state here doesn't accurately reflect what was (in reality) allocated to each container
    // TODO: we should refactor our state structs to reflect the amount of the re-used memory
    p.updateInitContainersMemoryBlocks(s, pod, container, containerBlocks)

    return nil
}

func (p *staticPolicy) updateMachineState(machineState state.NUMANodeMap, numaAffinity []int, resourceName v1.ResourceName, requestedSize uint64) {
    for _, nodeID := range numaAffinity {
        machineState[nodeID].NumberOfAssignments++
        machineState[nodeID].Cells = numaAffinity

        // we need to continue to update all affinity mask nodes
        if requestedSize == 0 {
            continue
        }

        // update the node memory state
        nodeResourceMemoryState := machineState[nodeID].MemoryMap[resourceName]
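        // no free memory of this type is left on this node; move to the next node in the mask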
        if nodeResourceMemoryState.Free <= 0 {
            continue
        }

        // the node has enough memory to satisfy the request
        if nodeResourceMemoryState.Free >= requestedSize {
            nodeResourceMemoryState.Reserved += requestedSize
            nodeResourceMemoryState.Free -= requestedSize
            requestedSize = 0
            continue
        }

        // the node does not have enough memory, use the node remaining memory and move to the next node
        requestedSize -= nodeResourceMemoryState.Free
        nodeResourceMemoryState.Reserved += nodeResourceMemoryState.Free
        nodeResourceMemoryState.Free = 0
    }
}

func (p *staticPolicy) getPodReusableMemory(pod *v1.Pod, numaAffinity bitmask.BitMask, resourceName v1.ResourceName) uint64 {
    podReusableMemory, ok := p.initContainersReusableMemory[string(pod.UID)]
    if !ok {
        return 0
    }

    numaReusableMemory, ok := podReusableMemory[numaAffinity.String()]
    if !ok {
        return 0
    }

    return numaReusableMemory[resourceName]
}

// RemoveContainer call is idempotent
func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerName string) {
    blocks := s.GetMemoryBlocks(podUID, containerName)
    if blocks == nil {
        return
    }

    klog.InfoS("RemoveContainer", "podUID", podUID, "containerName", containerName)
    s.Delete(podUID, containerName)

    // Mutate machine memory state to update free and reserved memory
    machineState := s.GetMachineState()
    for _, b := range blocks {
        releasedSize := b.Size
        for _, nodeID := range b.NUMAAffinity {
            machineState[nodeID].NumberOfAssignments--

            // once we do not have any memory allocations on this node, clear node groups
            if machineState[nodeID].NumberOfAssignments == 0 {
                machineState[nodeID].Cells = []int{nodeID}
            }

            // we still need to pass over all NUMA nodes under the affinity mask to update them
            if releasedSize == 0 {
                continue
            }

            nodeResourceMemoryState := machineState[nodeID].MemoryMap[b.Type]

            // if the node does not have reserved memory to free, continue to the next node
            if nodeResourceMemoryState.Reserved == 0 {
                continue
            }

            // the reserved memory is smaller than the amount of memory that should be released,
            // release as much as possible and move to the next node
            if nodeResourceMemoryState.Reserved < releasedSize {
                releasedSize -= nodeResourceMemoryState.Reserved
                nodeResourceMemoryState.Free += nodeResourceMemoryState.Reserved
                nodeResourceMemoryState.Reserved = 0
                continue
            }

            // the reserved memory is big enough to satisfy the released memory
            nodeResourceMemoryState.Free += releasedSize
            nodeResourceMemoryState.Reserved -= releasedSize
            releasedSize = 0
        }
    }

    s.SetMachineState(machineState)
}

func regenerateHints(pod *v1.Pod, ctn *v1.Container, ctnBlocks []state.Block, reqRsrc map[v1.ResourceName]uint64) map[string][]topologymanager.TopologyHint {
    hints := map[string][]topologymanager.TopologyHint{}
    for resourceName := range reqRsrc {
        hints[string(resourceName)] = []topologymanager.TopologyHint{}
    }

    if len(ctnBlocks) != len(reqRsrc) {
        klog.ErrorS(nil, "The number of requested resources by the container differs from the number of memory blocks", "containerName", ctn.Name)
        return nil
    }

    for _, b := range ctnBlocks {
        if _, ok := reqRsrc[b.Type]; !ok {
            klog.ErrorS(nil, "Container requested resources do not have resource of this type", "containerName", ctn.Name, "type", b.Type)
            return nil
        }

        if b.Size != reqRsrc[b.Type] {
            klog.ErrorS(nil, "Memory already allocated with different numbers than requested", "podUID", pod.UID, "type", b.Type, "containerName", ctn.Name, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
            return nil
        }

        containerNUMAAffinity, err := bitmask.NewBitMask(b.NUMAAffinity...)
        if err != nil {
            klog.ErrorS(err, "Failed to generate NUMA bitmask")
            return nil
        }

        klog.InfoS("Regenerating TopologyHints, resource was already allocated to pod", "resourceName", b.Type, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", ctn.Name)
        hints[string(b.Type)] = append(hints[string(b.Type)], topologymanager.TopologyHint{
            NUMANodeAffinity: containerNUMAAffinity,
            Preferred:        true,
        })
    }
    return hints
}

func getPodRequestedResources(pod *v1.Pod) (map[v1.ResourceName]uint64, error) {
    // Maximum resources requested by init containers at any given time.
    reqRsrcsByInitCtrs := make(map[v1.ResourceName]uint64)
    // Total resources requested by restartable init containers.
    reqRsrcsByRestartableInitCtrs := make(map[v1.ResourceName]uint64)
    for _, ctr := range pod.Spec.InitContainers {
        reqRsrcs, err := getRequestedResources(pod, &ctr)

        if err != nil {
            return nil, err
        }
        for rsrcName, qty := range reqRsrcs {
            if _, ok := reqRsrcsByInitCtrs[rsrcName]; !ok {
                reqRsrcsByInitCtrs[rsrcName] = uint64(0)
            }

            // See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/753-sidecar-containers#resources-calculation-for-scheduling-and-pod-admission
            // for the detail.
            if types.IsRestartableInitContainer(&ctr) {
                reqRsrcsByRestartableInitCtrs[rsrcName] += qty
            } else if reqRsrcsByRestartableInitCtrs[rsrcName]+qty > reqRsrcsByInitCtrs[rsrcName] {
                reqRsrcsByInitCtrs[rsrcName] = reqRsrcsByRestartableInitCtrs[rsrcName] + qty
            }
        }
    }

    reqRsrcsByAppCtrs := make(map[v1.ResourceName]uint64)
    for _, ctr := range pod.Spec.Containers {
        reqRsrcs, err := getRequestedResources(pod, &ctr)

        if err != nil {
            return nil, err
        }
        for rsrcName, qty := range reqRsrcs {
            if _, ok := reqRsrcsByAppCtrs[rsrcName]; !ok {
                reqRsrcsByAppCtrs[rsrcName] = uint64(0)
            }

            reqRsrcsByAppCtrs[rsrcName] += qty
        }
    }

    reqRsrcs := make(map[v1.ResourceName]uint64)
    for rsrcName := range reqRsrcsByAppCtrs {
        // Total resources requested by long-running containers.
        reqRsrcsByLongRunningCtrs := reqRsrcsByAppCtrs[rsrcName] + reqRsrcsByRestartableInitCtrs[rsrcName]
        reqRsrcs[rsrcName] = reqRsrcsByLongRunningCtrs

        if reqRsrcs[rsrcName] < reqRsrcsByInitCtrs[rsrcName] {
            reqRsrcs[rsrcName] = reqRsrcsByInitCtrs[rsrcName]
        }
    }
    return reqRsrcs, nil
}

func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
    if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
        return nil
    }

    reqRsrcs, err := getPodRequestedResources(pod)
    if err != nil {
        klog.ErrorS(err, "Failed to get pod requested resources", "pod", klog.KObj(pod), "podUID", pod.UID)
        return nil
    }

    for _, ctn := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
        containerBlocks := s.GetMemoryBlocks(string(pod.UID), ctn.Name)
        // Short circuit to regenerate the same hints if there is already
        // memory allocated for the container. This might happen after a
        // kubelet restart, for example.
        if containerBlocks != nil {
            return regenerateHints(pod, &ctn, containerBlocks, reqRsrcs)
        }
    }

    // the pod topology hints are calculated only once for all containers, so there is no need to pass the re-usable state
    return p.calculateHints(s.GetMachineState(), pod, reqRsrcs)
}

// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
    if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
        return nil
    }

    requestedResources, err := getRequestedResources(pod, container)
    if err != nil {
        klog.ErrorS(err, "Failed to get container requested resources", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name)
        return nil
    }

    containerBlocks := s.GetMemoryBlocks(string(pod.UID), container.Name)
    // Short circuit to regenerate the same hints if there is already
    // memory allocated for the container. This might happen after a
    // kubelet restart, for example.
    if containerBlocks != nil {
        return regenerateHints(pod, container, containerBlocks, requestedResources)
    }

    return p.calculateHints(s.GetMachineState(), pod, requestedResources)
}

func getRequestedResources(pod *v1.Pod, container *v1.Container) (map[v1.ResourceName]uint64, error) {
    requestedResources := map[v1.ResourceName]uint64{}
    resources := container.Resources.Requests
    // In-place pod resize feature makes Container.Resources field mutable for CPU & memory.
    // AllocatedResources holds the value of Container.Resources.Requests when the pod was admitted.
    // We should return this value because this is what kubelet agreed to allocate for the container
    // and the value configured with runtime.
    if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
        if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
            resources = cs.AllocatedResources
        }
    }
    for resourceName, quantity := range resources {
        if resourceName != v1.ResourceMemory && !corehelper.IsHugePageResourceName(resourceName) {
            continue
        }
        requestedSize, succeed := quantity.AsInt64()
        if !succeed {
            return nil, fmt.Errorf("[memorymanager] failed to represent quantity as int64")
        }
        requestedResources[resourceName] = uint64(requestedSize)
    }
    return requestedResources, nil
}

func (p *staticPolicy) calculateHints(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64) map[string][]topologymanager.TopologyHint {
    var numaNodes []int
    for n := range machineState {
        numaNodes = append(numaNodes, n)
    }
    sort.Ints(numaNodes)

    // Initialize minAffinitySize to include all NUMA Cells.
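    // It is narrowed down below to the size of the smallest mask that can still satisfy the request.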
    minAffinitySize := len(numaNodes)

    hints := map[string][]topologymanager.TopologyHint{}
    bitmask.IterateBitMasks(numaNodes, func(mask bitmask.BitMask) {
        maskBits := mask.GetBits()
        singleNUMAHint := len(maskBits) == 1

        totalFreeSize := map[v1.ResourceName]uint64{}
        totalAllocatableSize := map[v1.ResourceName]uint64{}
        // calculate total free and allocatable memory for the node mask
        for _, nodeID := range maskBits {
            for resourceName := range requestedResources {
                if _, ok := totalFreeSize[resourceName]; !ok {
                    totalFreeSize[resourceName] = 0
                }
                totalFreeSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Free

                if _, ok := totalAllocatableSize[resourceName]; !ok {
                    totalAllocatableSize[resourceName] = 0
                }
                totalAllocatableSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Allocatable
            }
        }

        // verify that for all memory types the node mask has enough allocatable resources
        for resourceName, requestedSize := range requestedResources {
            if totalAllocatableSize[resourceName] < requestedSize {
                return
            }
        }

        // track the minimum number of NUMA nodes that can satisfy the container resource requests
        if mask.Count() < minAffinitySize {
            minAffinitySize = mask.Count()
        }

        // the node is already in a group with another node, so it cannot be used for a single NUMA node allocation
        if singleNUMAHint && len(machineState[maskBits[0]].Cells) > 1 {
            return
        }

        for _, nodeID := range maskBits {
            // the node is already used for memory allocation
            if !singleNUMAHint && machineState[nodeID].NumberOfAssignments > 0 {
                // the node is used for a single NUMA memory allocation, so it cannot be used for a multi NUMA node allocation
                if len(machineState[nodeID].Cells) == 1 {
                    return
                }

                // the node is already used with a different group of nodes, so it cannot be used within the current hint
                if !areGroupsEqual(machineState[nodeID].Cells, maskBits) {
                    return
                }
            }
        }

        // verify that for all memory types the node mask has enough free resources
        for resourceName, requestedSize := range requestedResources {
            podReusableMemory := p.getPodReusableMemory(pod, mask, resourceName)
            if totalFreeSize[resourceName]+podReusableMemory < requestedSize {
                return
            }
        }

        // add the node mask as a topology hint for all memory types
        for resourceName := range requestedResources {
            if _, ok := hints[string(resourceName)]; !ok {
                hints[string(resourceName)] = []topologymanager.TopologyHint{}
            }
            hints[string(resourceName)] = append(hints[string(resourceName)], topologymanager.TopologyHint{
                NUMANodeAffinity: mask,
                Preferred:        false,
            })
        }
    })

    // update the hints' preferred field according to multiNUMAGroups; when it wasn't provided, the default
    // behaviour of preferring the minimal number of NUMA nodes is used
    for resourceName := range requestedResources {
        for i, hint := range hints[string(resourceName)] {
            hints[string(resourceName)][i].Preferred = p.isHintPreferred(hint.NUMANodeAffinity.GetBits(), minAffinitySize)
        }
    }

    return hints
}

func (p *staticPolicy) isHintPreferred(maskBits []int, minAffinitySize int) bool {
    return len(maskBits) == minAffinitySize
}

func areGroupsEqual(group1, group2 []int) bool {
    sort.Ints(group1)
    sort.Ints(group2)

    if len(group1) != len(group2) {
        return false
    }

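    // both groups are sorted above, so an element-wise comparison is sufficient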
    for i, elm := range group1 {
        if group2[i] != elm {
            return false
        }
    }
    return true
}

func (p *staticPolicy) validateState(s state.State) error {
    machineState := s.GetMachineState()
    memoryAssignments := s.GetMemoryAssignments()

    if len(machineState) == 0 {
        // Machine state cannot be empty when assignments exist
        if len(memoryAssignments) != 0 {
            return fmt.Errorf("[memorymanager] machine state can not be empty when it has memory assignments")
        }

        defaultMachineState := p.getDefaultMachineState()
        s.SetMachineState(defaultMachineState)

        return nil
    }

    // calculate all memory assigned to containers
    expectedMachineState := p.getDefaultMachineState()
    for pod, container := range memoryAssignments {
        for containerName, blocks := range container {
            for _, b := range blocks {
                requestedSize := b.Size
                for _, nodeID := range b.NUMAAffinity {
                    nodeState, ok := expectedMachineState[nodeID]
                    if !ok {
                        return fmt.Errorf("[memorymanager] (pod: %s, container: %s) the memory assignment uses the NUMA that does not exist", pod, containerName)
                    }

                    nodeState.NumberOfAssignments++
                    nodeState.Cells = b.NUMAAffinity

                    memoryState, ok := nodeState.MemoryMap[b.Type]
                    if !ok {
                        return fmt.Errorf("[memorymanager] (pod: %s, container: %s) the memory assignment uses memory resource that does not exist", pod, containerName)
                    }

                    if requestedSize == 0 {
                        continue
                    }

                    // this node does not have enough memory, continue to the next one
                    if memoryState.Free <= 0 {
                        continue
                    }

                    // the node has enough memory to satisfy the request
                    if memoryState.Free >= requestedSize {
                        memoryState.Reserved += requestedSize
                        memoryState.Free -= requestedSize
                        requestedSize = 0
                        continue
                    }

                    // the node does not have enough memory, use the node remaining memory and move to the next node
                    requestedSize -= memoryState.Free
                    memoryState.Reserved += memoryState.Free
                    memoryState.Free = 0
                }
            }
        }
    }

    // State has already been initialized from file (is not empty)
    // Validate that the total size, system reserved and reserved memory have not changed; this can happen when:
    // - a physical memory bank is added to or removed from the node
    // - the kubelet system-reserved, kube-reserved or pre-reserved-memory-zone parameters change
    if !areMachineStatesEqual(machineState, expectedMachineState) {
        return fmt.Errorf("[memorymanager] the expected machine state is different from the real one")
    }

    return nil
}

func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
    if len(ms1) != len(ms2) {
        klog.ErrorS(nil, "Node states are different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
        return false
    }

    for nodeID, nodeState1 := range ms1 {
        nodeState2, ok := ms2[nodeID]
        if !ok {
            klog.ErrorS(nil, "Node state does not have node ID", "nodeID", nodeID)
            return false
        }

        if nodeState1.NumberOfAssignments != nodeState2.NumberOfAssignments {
            klog.ErrorS(nil, "Node states number of assignments are different", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
            return false
        }

        if !areGroupsEqual(nodeState1.Cells, nodeState2.Cells) {
            klog.ErrorS(nil, "Node states groups are different", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
            return false
        }

        if len(nodeState1.MemoryMap) != len(nodeState2.MemoryMap) {
            klog.ErrorS(nil, "Node states memory map have different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
            return false
        }

        for resourceName, memoryState1 := range nodeState1.MemoryMap {
            memoryState2, ok := nodeState2.MemoryMap[resourceName]
            if !ok {
                klog.ErrorS(nil, "Memory state does not have resource", "resource", resourceName)
                return false
            }

            if !reflect.DeepEqual(*memoryState1, *memoryState2) {
                klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
                return false
            }
        }
    }
    return true
}

func (p *staticPolicy) getDefaultMachineState() state.NUMANodeMap {
    defaultMachineState := state.NUMANodeMap{}
    nodeHugepages := map[int]uint64{}
    for _, node := range p.machineInfo.Topology {
        defaultMachineState[node.Id] = &state.NUMANodeState{
            NumberOfAssignments: 0,
            MemoryMap:           map[v1.ResourceName]*state.MemoryTable{},
            Cells:               []int{node.Id},
        }

        // fill memory table with huge pages values
        for _, hugepage := range node.HugePages {
            hugepageQuantity := resource.NewQuantity(int64(hugepage.PageSize)*1024, resource.BinarySI)
            resourceName := corehelper.HugePageResourceName(*hugepageQuantity)
            systemReserved := p.getResourceSystemReserved(node.Id, resourceName)
            totalHugepagesSize := hugepage.NumPages * hugepage.PageSize * 1024
            allocatable := totalHugepagesSize - systemReserved
            defaultMachineState[node.Id].MemoryMap[resourceName] = &state.MemoryTable{
                Allocatable:    allocatable,
                Free:           allocatable,
                Reserved:       0,
                SystemReserved: systemReserved,
                TotalMemSize:   totalHugepagesSize,
            }
            if _, ok := nodeHugepages[node.Id]; !ok {
                nodeHugepages[node.Id] = 0
            }
            nodeHugepages[node.Id] += totalHugepagesSize
        }

        // fill memory table with regular memory values
        systemReserved := p.getResourceSystemReserved(node.Id, v1.ResourceMemory)

        allocatable := node.Memory - systemReserved
        // remove memory allocated by hugepages
        if allocatedByHugepages, ok := nodeHugepages[node.Id]; ok {
            allocatable -= allocatedByHugepages
        }
        defaultMachineState[node.Id].MemoryMap[v1.ResourceMemory] = &state.MemoryTable{
            Allocatable:    allocatable,
            Free:           allocatable,
            Reserved:       0,
            SystemReserved: systemReserved,
            TotalMemSize:   node.Memory,
        }
    }
    return defaultMachineState
}

func (p *staticPolicy) getResourceSystemReserved(nodeID int, resourceName v1.ResourceName) uint64 {
    var systemReserved uint64
    if nodeSystemReserved, ok := p.systemReserved[nodeID]; ok {
        if nodeMemorySystemReserved, ok := nodeSystemReserved[resourceName]; ok {
            systemReserved = nodeMemorySystemReserved
        }
    }
    return systemReserved
}

func (p *staticPolicy) getDefaultHint(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64) (*topologymanager.TopologyHint, error) {
    hints := p.calculateHints(machineState, pod, requestedResources)
    if len(hints) < 1 {
        return nil, fmt.Errorf("[memorymanager] failed to get the default NUMA affinity, no NUMA nodes with enough memory is available")
    }

    // hints for all memory types should be the same, so we will check hints only for regular memory type
    return findBestHint(hints[string(v1.ResourceMemory)]), nil
}
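
// isAffinitySatisfyRequest returns true when the NUMA nodes in the given affinity mask
// have enough free memory of every requested resource type to satisfy the request.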
func isAffinitySatisfyRequest(machineState state.NUMANodeMap, mask bitmask.BitMask, requestedResources map[v1.ResourceName]uint64) bool {
    totalFreeSize := map[v1.ResourceName]uint64{}
    for _, nodeID := range mask.GetBits() {
        for resourceName := range requestedResources {
            if _, ok := totalFreeSize[resourceName]; !ok {
                totalFreeSize[resourceName] = 0
            }
            totalFreeSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Free
        }
    }

    // verify that for all memory types the node mask has enough resources
    for resourceName, requestedSize := range requestedResources {
        if totalFreeSize[resourceName] < requestedSize {
            return false
        }
    }

    return true
}

// extendTopologyManagerHint extends the topology manager hint when it does not satisfy the container request.
// The topology manager uses bitwise AND to merge all topology hints into the best one, so in the case of the restricted policy
// it is possible that we will get a subset of the hint that we provided to the topology manager; in this case we want to extend
// it to the original one
func (p *staticPolicy) extendTopologyManagerHint(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64, mask bitmask.BitMask) (*topologymanager.TopologyHint, error) {
    hints := p.calculateHints(machineState, pod, requestedResources)

    var filteredHints []topologymanager.TopologyHint
    // hints for all memory types should be the same, so we will check hints only for regular memory type
    for _, hint := range hints[string(v1.ResourceMemory)] {
        affinityBits := hint.NUMANodeAffinity.GetBits()
        // keep only the hints that include the current hint
        if isHintInGroup(mask.GetBits(), affinityBits) {
            filteredHints = append(filteredHints, hint)
        }
    }

    if len(filteredHints) < 1 {
        return nil, fmt.Errorf("[memorymanager] failed to find NUMA nodes to extend the current topology hint")
    }

    // try to find the preferred hint with the minimal number of NUMA nodes, relevant for the restricted policy
    return findBestHint(filteredHints), nil
}

func isHintInGroup(hint []int, group []int) bool {
    sort.Ints(hint)
    sort.Ints(group)

    hintIndex := 0
    for i := range group {
        if hintIndex == len(hint) {
            return true
        }

        if group[i] != hint[hintIndex] {
            continue
        }
        hintIndex++
    }

    return hintIndex == len(hint)
}

func findBestHint(hints []topologymanager.TopologyHint) *topologymanager.TopologyHint {
    // try to find the preferred hint with the minimal number of NUMA nodes, relevant for the restricted policy
    bestHint := topologymanager.TopologyHint{}
    for _, hint := range hints {
        if bestHint.NUMANodeAffinity == nil {
            bestHint = hint
            continue
        }

        // the current hint is preferred, while the best hint so far is not
        if hint.Preferred && !bestHint.Preferred {
            bestHint = hint
            continue
        }

        // both hints have the same preferred value, but the current hint has fewer NUMA nodes than the best one
        if hint.Preferred == bestHint.Preferred && hint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) {
            bestHint = hint
        }
    }
    return &bestHint
}

// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (p *staticPolicy) GetAllocatableMemory(s state.State) []state.Block {
    var allocatableMemory []state.Block
    machineState := s.GetMachineState()
    for numaNodeID, numaNodeState := range machineState {
        for resourceName, memoryTable := range numaNodeState.MemoryMap {
            if memoryTable.Allocatable == 0 {
                continue
            }

            block := state.Block{
                NUMAAffinity: []int{numaNodeID},
                Type:         resourceName,
                Size:         memoryTable.Allocatable,
            }
            allocatableMemory = append(allocatableMemory, block)
        }
    }
    return allocatableMemory
}

func (p *staticPolicy) updatePodReusableMemory(pod *v1.Pod, container *v1.Container, memoryBlocks []state.Block) {
    podUID := string(pod.UID)

    // If entries for pods other than the current pod exist in p.initContainersReusableMemory, delete them.
    for uid := range p.initContainersReusableMemory {
        if podUID != uid {
            delete(p.initContainersReusableMemory, uid)
        }
    }

    if isRegularInitContainer(pod, container) {
        if _, ok := p.initContainersReusableMemory[podUID]; !ok {
            p.initContainersReusableMemory[podUID] = map[string]map[v1.ResourceName]uint64{}
        }

        for _, block := range memoryBlocks {
            blockBitMask, _ := bitmask.NewBitMask(block.NUMAAffinity...)
            blockBitMaskString := blockBitMask.String()

            if _, ok := p.initContainersReusableMemory[podUID][blockBitMaskString]; !ok {
                p.initContainersReusableMemory[podUID][blockBitMaskString] = map[v1.ResourceName]uint64{}
            }

            if blockReusableMemory := p.initContainersReusableMemory[podUID][blockBitMaskString][block.Type]; block.Size > blockReusableMemory {
                p.initContainersReusableMemory[podUID][blockBitMaskString][block.Type] = block.Size
            }
        }

        return
    }

    // update the re-usable memory once it is used by the app container
    for _, block := range memoryBlocks {
        blockBitMask, _ := bitmask.NewBitMask(block.NUMAAffinity...)
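        // decrease the pod's reusable memory for this NUMA mask and resource type by the
        // size consumed by the app container, clamping at zero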
        if podReusableMemory := p.getPodReusableMemory(pod, blockBitMask, block.Type); podReusableMemory != 0 {
            if block.Size >= podReusableMemory {
                p.initContainersReusableMemory[podUID][blockBitMask.String()][block.Type] = 0
            } else {
                p.initContainersReusableMemory[podUID][blockBitMask.String()][block.Type] -= block.Size
            }
        }
    }
}

func (p *staticPolicy) updateInitContainersMemoryBlocks(s state.State, pod *v1.Pod, container *v1.Container, containerMemoryBlocks []state.Block) {
    podUID := string(pod.UID)

    for _, containerBlock := range containerMemoryBlocks {
        blockSize := containerBlock.Size
        for _, initContainer := range pod.Spec.InitContainers {
            // we do not want to continue updates once we reach the current container
            if initContainer.Name == container.Name {
                break
            }

            if blockSize == 0 {
                break
            }

            if types.IsRestartableInitContainer(&initContainer) {
                // we should not reuse the resource from any restartable init
                // container
                continue
            }

            initContainerBlocks := s.GetMemoryBlocks(podUID, initContainer.Name)
            if len(initContainerBlocks) == 0 {
                continue
            }

            for i := range initContainerBlocks {
                initContainerBlock := &initContainerBlocks[i]
                if initContainerBlock.Size == 0 {
                    continue
                }

                if initContainerBlock.Type != containerBlock.Type {
                    continue
                }

                if !isNUMAAffinitiesEqual(initContainerBlock.NUMAAffinity, containerBlock.NUMAAffinity) {
                    continue
                }

                if initContainerBlock.Size > blockSize {
                    initContainerBlock.Size -= blockSize
                    blockSize = 0
                } else {
                    blockSize -= initContainerBlock.Size
                    initContainerBlock.Size = 0
                }
            }

            s.SetMemoryBlocks(podUID, initContainer.Name, initContainerBlocks)
        }
    }
}

func isRegularInitContainer(pod *v1.Pod, container *v1.Container) bool {
    for _, initContainer := range pod.Spec.InitContainers {
        if initContainer.Name == container.Name {
            return !types.IsRestartableInitContainer(&initContainer)
        }
    }

    return false
}

func isNUMAAffinitiesEqual(numaAffinity1, numaAffinity2 []int) bool {
    bitMask1, err := bitmask.NewBitMask(numaAffinity1...)
    if err != nil {
        klog.ErrorS(err, "failed to create bit mask", "numaAffinity1", numaAffinity1)
        return false
    }

    bitMask2, err := bitmask.NewBitMask(numaAffinity2...)
    if err != nil {
        klog.ErrorS(err, "failed to create bit mask", "numaAffinity2", numaAffinity2)
        return false
    }

    return bitMask1.IsEqual(bitMask2)
}