k8s.io/kubernetes@v1.29.3/pkg/kubelet/cm/memorymanager/policy_static.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package memorymanager
    18  
    19  import (
    20  	"fmt"
    21  	"reflect"
    22  	"sort"
    23  
    24  	cadvisorapi "github.com/google/cadvisor/info/v1"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/api/resource"
    28  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    29  	"k8s.io/klog/v2"
    30  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    31  	corehelper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    32  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    33  	"k8s.io/kubernetes/pkg/features"
    34  	"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
    35  	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
    36  	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
    37  	"k8s.io/kubernetes/pkg/kubelet/types"
    38  )
    39  
    40  const policyTypeStatic policyType = "Static"
    41  
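         // systemReservedMemory maps a NUMA node ID to the amount of memory reserved on it per resource
         // (regular memory and hugepages), typically derived from the kubelet reserved-memory configuration.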
    42  type systemReservedMemory map[int]map[v1.ResourceName]uint64
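         // reusableMemory is keyed by pod UID, then by the NUMA affinity bitmask string, then by resource
         // name, and tracks init container memory that can be reused by subsequent containers of the same pod.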
    43  type reusableMemory map[string]map[string]map[v1.ResourceName]uint64
    44  
     45  // staticPolicy is an implementation of the Policy interface for the static policy
    46  type staticPolicy struct {
     47  	// machineInfo contains machine memory-related information
     48  	machineInfo *cadvisorapi.MachineInfo
     49  	// systemReserved contains the memory reserved for system and kube components
     50  	systemReserved systemReservedMemory
     51  	// affinity is the topology manager reference used to get the container topology affinity
    52  	affinity topologymanager.Store
    53  	// initContainersReusableMemory contains the memory allocated for init
    54  	// containers that can be reused.
    55  	// Note that the restartable init container memory is not included here,
    56  	// because it is not reusable.
    57  	initContainersReusableMemory reusableMemory
    58  }
    59  
    60  var _ Policy = &staticPolicy{}
    61  
     62  // NewPolicyStatic returns a new static policy instance
    63  func NewPolicyStatic(machineInfo *cadvisorapi.MachineInfo, reserved systemReservedMemory, affinity topologymanager.Store) (Policy, error) {
    64  	var totalSystemReserved uint64
    65  	for _, node := range reserved {
    66  		if _, ok := node[v1.ResourceMemory]; !ok {
    67  			continue
    68  		}
    69  		totalSystemReserved += node[v1.ResourceMemory]
    70  	}
    71  
    72  	// check if we have some reserved memory for the system
    73  	if totalSystemReserved <= 0 {
    74  		return nil, fmt.Errorf("[memorymanager] you should specify the system reserved memory")
    75  	}
    76  
    77  	return &staticPolicy{
    78  		machineInfo:                  machineInfo,
    79  		systemReserved:               reserved,
    80  		affinity:                     affinity,
    81  		initContainersReusableMemory: reusableMemory{},
    82  	}, nil
    83  }
    84  
    85  func (p *staticPolicy) Name() string {
    86  	return string(policyTypeStatic)
    87  }
    88  
    89  func (p *staticPolicy) Start(s state.State) error {
    90  	if err := p.validateState(s); err != nil {
    91  		klog.ErrorS(err, "Invalid state, please drain node and remove policy state file")
    92  		return err
    93  	}
    94  	return nil
    95  }
    96  
    97  // Allocate call is idempotent
    98  func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
    99  	// allocate the memory only for guaranteed pods
   100  	if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
   101  		return nil
   102  	}
   103  
   104  	podUID := string(pod.UID)
   105  	klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
   106  	if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
   107  		p.updatePodReusableMemory(pod, container, blocks)
   108  
   109  		klog.InfoS("Container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
   110  		return nil
   111  	}
   112  
   113  	// Call Topology Manager to get the aligned affinity across all hint providers.
   114  	hint := p.affinity.GetAffinity(podUID, container.Name)
   115  	klog.InfoS("Got topology affinity", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name, "hint", hint)
   116  
   117  	requestedResources, err := getRequestedResources(pod, container)
   118  	if err != nil {
   119  		return err
   120  	}
   121  
   122  	machineState := s.GetMachineState()
   123  	bestHint := &hint
    124  	// the topology manager returned a hint with nil NUMA affinity,
    125  	// so we should use the default NUMA affinity, calculated the same way the topology manager calculates it
   126  	if hint.NUMANodeAffinity == nil {
   127  		defaultHint, err := p.getDefaultHint(machineState, pod, requestedResources)
   128  		if err != nil {
   129  			return err
   130  		}
   131  
   132  		if !defaultHint.Preferred && bestHint.Preferred {
   133  			return fmt.Errorf("[memorymanager] failed to find the default preferred hint")
   134  		}
   135  		bestHint = defaultHint
   136  	}
   137  
    138  	// the topology manager returned a hint that does not completely satisfy the container request;
    139  	// we should extend this hint to one that satisfies the request and includes the current hint
   140  	if !isAffinitySatisfyRequest(machineState, bestHint.NUMANodeAffinity, requestedResources) {
   141  		extendedHint, err := p.extendTopologyManagerHint(machineState, pod, requestedResources, bestHint.NUMANodeAffinity)
   142  		if err != nil {
   143  			return err
   144  		}
   145  
   146  		if !extendedHint.Preferred && bestHint.Preferred {
   147  			return fmt.Errorf("[memorymanager] failed to find the extended preferred hint")
   148  		}
   149  		bestHint = extendedHint
   150  	}
   151  
   152  	var containerBlocks []state.Block
   153  	maskBits := bestHint.NUMANodeAffinity.GetBits()
   154  	for resourceName, requestedSize := range requestedResources {
   155  		// update memory blocks
   156  		containerBlocks = append(containerBlocks, state.Block{
   157  			NUMAAffinity: maskBits,
   158  			Size:         requestedSize,
   159  			Type:         resourceName,
   160  		})
   161  
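         		// Memory reused from non-restartable init containers does not need to be reserved again.
         		// Illustrative example (assumed values): a 1Gi app container that follows a 1Gi regular init
         		// container with the same NUMA affinity reserves no additional memory in the machine state.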
   162  		podReusableMemory := p.getPodReusableMemory(pod, bestHint.NUMANodeAffinity, resourceName)
   163  		if podReusableMemory >= requestedSize {
   164  			requestedSize = 0
   165  		} else {
   166  			requestedSize -= podReusableMemory
   167  		}
   168  
   169  		// Update nodes memory state
   170  		p.updateMachineState(machineState, maskBits, resourceName, requestedSize)
   171  	}
   172  
   173  	p.updatePodReusableMemory(pod, container, containerBlocks)
   174  
   175  	s.SetMachineState(machineState)
   176  	s.SetMemoryBlocks(podUID, container.Name, containerBlocks)
   177  
    178  	// update the init containers' memory blocks to reflect the fact that we reused init container memory.
    179  	// It is possible that an init container memory block will end up with size 0 when all of the memory
    180  	// allocated for it was reused.
    181  	// We only do this so that sum(memory_for_all_containers) == total amount of memory allocated to the pod, even
    182  	// though the final state here doesn't accurately reflect what was (in reality) allocated to each container.
    183  	// TODO: we should refactor our state structs to reflect the amount of reused memory
   184  	p.updateInitContainersMemoryBlocks(s, pod, container, containerBlocks)
   185  
   186  	return nil
   187  }
   188  
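         // updateMachineState spreads the requested size across the NUMA nodes of the affinity mask,
         // consuming each node's free memory in order.
         //
         // Illustrative walk-through (assumed values, not taken from real state): for a 3Gi request with
         // mask {0, 1}, where node 0 has 2Gi free and node 1 has 4Gi free, 2Gi is reserved on node 0
         // (its free memory drops to 0) and the remaining 1Gi is reserved on node 1.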
   189  func (p *staticPolicy) updateMachineState(machineState state.NUMANodeMap, numaAffinity []int, resourceName v1.ResourceName, requestedSize uint64) {
   190  	for _, nodeID := range numaAffinity {
   191  		machineState[nodeID].NumberOfAssignments++
   192  		machineState[nodeID].Cells = numaAffinity
   193  
    194  		// even when nothing is left to reserve, we still need to update the remaining nodes in the affinity mask
   195  		if requestedSize == 0 {
   196  			continue
   197  		}
   198  
   199  		// update the node memory state
   200  		nodeResourceMemoryState := machineState[nodeID].MemoryMap[resourceName]
   201  		if nodeResourceMemoryState.Free <= 0 {
   202  			continue
   203  		}
   204  
   205  		// the node has enough memory to satisfy the request
   206  		if nodeResourceMemoryState.Free >= requestedSize {
   207  			nodeResourceMemoryState.Reserved += requestedSize
   208  			nodeResourceMemoryState.Free -= requestedSize
   209  			requestedSize = 0
   210  			continue
   211  		}
   212  
    213  		// the node does not have enough memory; use the node's remaining memory and move to the next node
   214  		requestedSize -= nodeResourceMemoryState.Free
   215  		nodeResourceMemoryState.Reserved += nodeResourceMemoryState.Free
   216  		nodeResourceMemoryState.Free = 0
   217  	}
   218  }
   219  
   220  func (p *staticPolicy) getPodReusableMemory(pod *v1.Pod, numaAffinity bitmask.BitMask, resourceName v1.ResourceName) uint64 {
   221  	podReusableMemory, ok := p.initContainersReusableMemory[string(pod.UID)]
   222  	if !ok {
   223  		return 0
   224  	}
   225  
   226  	numaReusableMemory, ok := podReusableMemory[numaAffinity.String()]
   227  	if !ok {
   228  		return 0
   229  	}
   230  
   231  	return numaReusableMemory[resourceName]
   232  }
   233  
   234  // RemoveContainer call is idempotent
   235  func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerName string) {
   236  	blocks := s.GetMemoryBlocks(podUID, containerName)
   237  	if blocks == nil {
   238  		return
   239  	}
   240  
   241  	klog.InfoS("RemoveContainer", "podUID", podUID, "containerName", containerName)
   242  	s.Delete(podUID, containerName)
   243  
   244  	// Mutate machine memory state to update free and reserved memory
   245  	machineState := s.GetMachineState()
   246  	for _, b := range blocks {
   247  		releasedSize := b.Size
   248  		for _, nodeID := range b.NUMAAffinity {
   249  			machineState[nodeID].NumberOfAssignments--
   250  
   251  			// once we do not have any memory allocations on this node, clear node groups
   252  			if machineState[nodeID].NumberOfAssignments == 0 {
   253  				machineState[nodeID].Cells = []int{nodeID}
   254  			}
   255  
    256  			// we still need to pass over all NUMA nodes under the affinity mask to update them
   257  			if releasedSize == 0 {
   258  				continue
   259  			}
   260  
   261  			nodeResourceMemoryState := machineState[nodeID].MemoryMap[b.Type]
   262  
   263  			// if the node does not have reserved memory to free, continue to the next node
   264  			if nodeResourceMemoryState.Reserved == 0 {
   265  				continue
   266  			}
   267  
    268  			// the reserved memory is smaller than the amount of memory that should be released;
    269  			// release as much as possible and move to the next node
   270  			if nodeResourceMemoryState.Reserved < releasedSize {
   271  				releasedSize -= nodeResourceMemoryState.Reserved
   272  				nodeResourceMemoryState.Free += nodeResourceMemoryState.Reserved
   273  				nodeResourceMemoryState.Reserved = 0
   274  				continue
   275  			}
   276  
    277  			// the reserved memory is big enough to satisfy the released memory
   278  			nodeResourceMemoryState.Free += releasedSize
   279  			nodeResourceMemoryState.Reserved -= releasedSize
   280  			releasedSize = 0
   281  		}
   282  	}
   283  
   284  	s.SetMachineState(machineState)
   285  }
   286  
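         // regenerateHints rebuilds the topology hints for a container whose memory blocks are already
         // recorded in the state, marking the recorded NUMA affinity of each block as preferred; it returns
         // nil when the recorded blocks no longer match the current request.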
   287  func regenerateHints(pod *v1.Pod, ctn *v1.Container, ctnBlocks []state.Block, reqRsrc map[v1.ResourceName]uint64) map[string][]topologymanager.TopologyHint {
   288  	hints := map[string][]topologymanager.TopologyHint{}
   289  	for resourceName := range reqRsrc {
   290  		hints[string(resourceName)] = []topologymanager.TopologyHint{}
   291  	}
   292  
   293  	if len(ctnBlocks) != len(reqRsrc) {
   294  		klog.ErrorS(nil, "The number of requested resources by the container differs from the number of memory blocks", "containerName", ctn.Name)
   295  		return nil
   296  	}
   297  
   298  	for _, b := range ctnBlocks {
   299  		if _, ok := reqRsrc[b.Type]; !ok {
   300  			klog.ErrorS(nil, "Container requested resources do not have resource of this type", "containerName", ctn.Name, "type", b.Type)
   301  			return nil
   302  		}
   303  
   304  		if b.Size != reqRsrc[b.Type] {
   305  			klog.ErrorS(nil, "Memory already allocated with different numbers than requested", "podUID", pod.UID, "type", b.Type, "containerName", ctn.Name, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
   306  			return nil
   307  		}
   308  
   309  		containerNUMAAffinity, err := bitmask.NewBitMask(b.NUMAAffinity...)
   310  		if err != nil {
   311  			klog.ErrorS(err, "Failed to generate NUMA bitmask")
   312  			return nil
   313  		}
   314  
   315  		klog.InfoS("Regenerating TopologyHints, resource was already allocated to pod", "resourceName", b.Type, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", ctn.Name)
   316  		hints[string(b.Type)] = append(hints[string(b.Type)], topologymanager.TopologyHint{
   317  			NUMANodeAffinity: containerNUMAAffinity,
   318  			Preferred:        true,
   319  		})
   320  	}
   321  	return hints
   322  }
   323  
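         // getPodRequestedResources aggregates the pod-level request per resource as
         // max(app containers + restartable init containers, peak usage of the ordinary init containers,
         // where each peak also includes the restartable init containers declared before it); see KEP-753.
         //
         // Illustrative example (assumed values): app containers request 2Gi in total, a restartable
         // (sidecar) init container declared first requests 1Gi, and a later ordinary init container
         // requests 4Gi; the pod request is max(2Gi+1Gi, 1Gi+4Gi) = 5Gi.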
   324  func getPodRequestedResources(pod *v1.Pod) (map[v1.ResourceName]uint64, error) {
    325  	// Maximum resources requested by init containers at any given time.
   326  	reqRsrcsByInitCtrs := make(map[v1.ResourceName]uint64)
   327  	// Total resources requested by restartable init containers.
   328  	reqRsrcsByRestartableInitCtrs := make(map[v1.ResourceName]uint64)
   329  	for _, ctr := range pod.Spec.InitContainers {
   330  		reqRsrcs, err := getRequestedResources(pod, &ctr)
   331  
   332  		if err != nil {
   333  			return nil, err
   334  		}
   335  		for rsrcName, qty := range reqRsrcs {
   336  			if _, ok := reqRsrcsByInitCtrs[rsrcName]; !ok {
   337  				reqRsrcsByInitCtrs[rsrcName] = uint64(0)
   338  			}
   339  
   340  			// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/753-sidecar-containers#resources-calculation-for-scheduling-and-pod-admission
    341  			// for details.
   342  			if types.IsRestartableInitContainer(&ctr) {
   343  				reqRsrcsByRestartableInitCtrs[rsrcName] += qty
   344  			} else if reqRsrcsByRestartableInitCtrs[rsrcName]+qty > reqRsrcsByInitCtrs[rsrcName] {
   345  				reqRsrcsByInitCtrs[rsrcName] = reqRsrcsByRestartableInitCtrs[rsrcName] + qty
   346  			}
   347  		}
   348  	}
   349  
   350  	reqRsrcsByAppCtrs := make(map[v1.ResourceName]uint64)
   351  	for _, ctr := range pod.Spec.Containers {
   352  		reqRsrcs, err := getRequestedResources(pod, &ctr)
   353  
   354  		if err != nil {
   355  			return nil, err
   356  		}
   357  		for rsrcName, qty := range reqRsrcs {
   358  			if _, ok := reqRsrcsByAppCtrs[rsrcName]; !ok {
   359  				reqRsrcsByAppCtrs[rsrcName] = uint64(0)
   360  			}
   361  
   362  			reqRsrcsByAppCtrs[rsrcName] += qty
   363  		}
   364  	}
   365  
   366  	reqRsrcs := make(map[v1.ResourceName]uint64)
   367  	for rsrcName := range reqRsrcsByAppCtrs {
   368  		// Total resources requested by long-running containers.
   369  		reqRsrcsByLongRunningCtrs := reqRsrcsByAppCtrs[rsrcName] + reqRsrcsByRestartableInitCtrs[rsrcName]
   370  		reqRsrcs[rsrcName] = reqRsrcsByLongRunningCtrs
   371  
   372  		if reqRsrcs[rsrcName] < reqRsrcsByInitCtrs[rsrcName] {
   373  			reqRsrcs[rsrcName] = reqRsrcsByInitCtrs[rsrcName]
   374  		}
   375  	}
   376  	return reqRsrcs, nil
   377  }
   378  
   379  func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
   380  	if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
   381  		return nil
   382  	}
   383  
   384  	reqRsrcs, err := getPodRequestedResources(pod)
   385  	if err != nil {
   386  		klog.ErrorS(err, "Failed to get pod requested resources", "pod", klog.KObj(pod), "podUID", pod.UID)
   387  		return nil
   388  	}
   389  
   390  	for _, ctn := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
   391  		containerBlocks := s.GetMemoryBlocks(string(pod.UID), ctn.Name)
    392  		// Short circuit to regenerate the same hints if there is already
   393  		// memory allocated for the container. This might happen after a
   394  		// kubelet restart, for example.
   395  		if containerBlocks != nil {
   396  			return regenerateHints(pod, &ctn, containerBlocks, reqRsrcs)
   397  		}
   398  	}
   399  
    400  	// the pod topology hints are calculated only once for all containers, so there is no need to pass the reusable state
   401  	return p.calculateHints(s.GetMachineState(), pod, reqRsrcs)
   402  }
   403  
   404  // GetTopologyHints implements the topologymanager.HintProvider Interface
   405  // and is consulted to achieve NUMA aware resource alignment among this
   406  // and other resource controllers.
   407  func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
   408  	if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
   409  		return nil
   410  	}
   411  
   412  	requestedResources, err := getRequestedResources(pod, container)
   413  	if err != nil {
   414  		klog.ErrorS(err, "Failed to get container requested resources", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name)
   415  		return nil
   416  	}
   417  
   418  	containerBlocks := s.GetMemoryBlocks(string(pod.UID), container.Name)
    419  	// Short circuit to regenerate the same hints if there is already
   420  	// memory allocated for the container. This might happen after a
   421  	// kubelet restart, for example.
   422  	if containerBlocks != nil {
   423  		return regenerateHints(pod, container, containerBlocks, requestedResources)
   424  	}
   425  
   426  	return p.calculateHints(s.GetMachineState(), pod, requestedResources)
   427  }
   428  
   429  func getRequestedResources(pod *v1.Pod, container *v1.Container) (map[v1.ResourceName]uint64, error) {
   430  	requestedResources := map[v1.ResourceName]uint64{}
   431  	resources := container.Resources.Requests
    432  	// The in-place pod resize feature makes the Container.Resources field mutable for CPU and memory.
    433  	// AllocatedResources holds the value of Container.Resources.Requests when the pod was admitted.
    434  	// We should return this value because it is what the kubelet agreed to allocate for the container
    435  	// and the value configured with the runtime.
   436  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
   437  		if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
   438  			resources = cs.AllocatedResources
   439  		}
   440  	}
   441  	for resourceName, quantity := range resources {
   442  		if resourceName != v1.ResourceMemory && !corehelper.IsHugePageResourceName(resourceName) {
   443  			continue
   444  		}
   445  		requestedSize, succeed := quantity.AsInt64()
   446  		if !succeed {
   447  			return nil, fmt.Errorf("[memorymanager] failed to represent quantity as int64")
   448  		}
   449  		requestedResources[resourceName] = uint64(requestedSize)
   450  	}
   451  	return requestedResources, nil
   452  }
   453  
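         // calculateHints iterates over all possible NUMA node masks and returns, per resource, the masks
         // that can satisfy the request; only masks whose size equals the minimal satisfying size end up
         // marked as preferred.
         //
         // Illustrative example (assumed values): on a fresh two-node machine with 8Gi allocatable and free
         // per node, a 6Gi request produces hints {0}, {1} and {0,1} with {0} and {1} preferred, while a
         // 12Gi request produces only {0,1}, which is then preferred.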
   454  func (p *staticPolicy) calculateHints(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64) map[string][]topologymanager.TopologyHint {
   455  	var numaNodes []int
   456  	for n := range machineState {
   457  		numaNodes = append(numaNodes, n)
   458  	}
   459  	sort.Ints(numaNodes)
   460  
   461  	// Initialize minAffinitySize to include all NUMA Cells.
   462  	minAffinitySize := len(numaNodes)
   463  
   464  	hints := map[string][]topologymanager.TopologyHint{}
   465  	bitmask.IterateBitMasks(numaNodes, func(mask bitmask.BitMask) {
   466  		maskBits := mask.GetBits()
   467  		singleNUMAHint := len(maskBits) == 1
   468  
   469  		totalFreeSize := map[v1.ResourceName]uint64{}
   470  		totalAllocatableSize := map[v1.ResourceName]uint64{}
   471  		// calculate total free and allocatable memory for the node mask
   472  		for _, nodeID := range maskBits {
   473  			for resourceName := range requestedResources {
   474  				if _, ok := totalFreeSize[resourceName]; !ok {
   475  					totalFreeSize[resourceName] = 0
   476  				}
   477  				totalFreeSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Free
   478  
   479  				if _, ok := totalAllocatableSize[resourceName]; !ok {
   480  					totalAllocatableSize[resourceName] = 0
   481  				}
   482  				totalAllocatableSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Allocatable
   483  			}
   484  		}
   485  
   486  		// verify that for all memory types the node mask has enough allocatable resources
   487  		for resourceName, requestedSize := range requestedResources {
   488  			if totalAllocatableSize[resourceName] < requestedSize {
   489  				return
   490  			}
   491  		}
   492  
    493  		// track the minimum number of NUMA nodes that can satisfy the container's resource requests
   494  		if mask.Count() < minAffinitySize {
   495  			minAffinitySize = mask.Count()
   496  		}
   497  
    498  		// the node is already in a group with another node; it can not be used for a single NUMA node allocation
   499  		if singleNUMAHint && len(machineState[maskBits[0]].Cells) > 1 {
   500  			return
   501  		}
   502  
   503  		for _, nodeID := range maskBits {
    504  			// the node is already used for memory allocation
    505  			if !singleNUMAHint && machineState[nodeID].NumberOfAssignments > 0 {
    506  				// the node is used for a single NUMA memory allocation; it can not be used for a multi NUMA node allocation
    507  				if len(machineState[nodeID].Cells) == 1 {
    508  					return
    509  				}
    510  
    511  				// the node is already used with a different group of nodes; it can not be used within the current hint
   512  				if !areGroupsEqual(machineState[nodeID].Cells, maskBits) {
   513  					return
   514  				}
   515  			}
   516  		}
   517  
   518  		// verify that for all memory types the node mask has enough free resources
   519  		for resourceName, requestedSize := range requestedResources {
   520  			podReusableMemory := p.getPodReusableMemory(pod, mask, resourceName)
   521  			if totalFreeSize[resourceName]+podReusableMemory < requestedSize {
   522  				return
   523  			}
   524  		}
   525  
   526  		// add the node mask as topology hint for all memory types
   527  		for resourceName := range requestedResources {
   528  			if _, ok := hints[string(resourceName)]; !ok {
   529  				hints[string(resourceName)] = []topologymanager.TopologyHint{}
   530  			}
   531  			hints[string(resourceName)] = append(hints[string(resourceName)], topologymanager.TopologyHint{
   532  				NUMANodeAffinity: mask,
   533  				Preferred:        false,
   534  			})
   535  		}
   536  	})
   537  
    538  	// update each hint's Preferred field; by default we prefer hints that use the minimal number of
    539  	// NUMA nodes able to satisfy the container request
   540  	for resourceName := range requestedResources {
   541  		for i, hint := range hints[string(resourceName)] {
   542  			hints[string(resourceName)][i].Preferred = p.isHintPreferred(hint.NUMANodeAffinity.GetBits(), minAffinitySize)
   543  		}
   544  	}
   545  
   546  	return hints
   547  }
   548  
   549  func (p *staticPolicy) isHintPreferred(maskBits []int, minAffinitySize int) bool {
   550  	return len(maskBits) == minAffinitySize
   551  }
   552  
   553  func areGroupsEqual(group1, group2 []int) bool {
   554  	sort.Ints(group1)
   555  	sort.Ints(group2)
   556  
   557  	if len(group1) != len(group2) {
   558  		return false
   559  	}
   560  
   561  	for i, elm := range group1 {
   562  		if group2[i] != elm {
   563  			return false
   564  		}
   565  	}
   566  	return true
   567  }
   568  
   569  func (p *staticPolicy) validateState(s state.State) error {
   570  	machineState := s.GetMachineState()
   571  	memoryAssignments := s.GetMemoryAssignments()
   572  
   573  	if len(machineState) == 0 {
   574  		// Machine state cannot be empty when assignments exist
   575  		if len(memoryAssignments) != 0 {
   576  			return fmt.Errorf("[memorymanager] machine state can not be empty when it has memory assignments")
   577  		}
   578  
   579  		defaultMachineState := p.getDefaultMachineState()
   580  		s.SetMachineState(defaultMachineState)
   581  
   582  		return nil
   583  	}
   584  
   585  	// calculate all memory assigned to containers
   586  	expectedMachineState := p.getDefaultMachineState()
   587  	for pod, container := range memoryAssignments {
   588  		for containerName, blocks := range container {
   589  			for _, b := range blocks {
   590  				requestedSize := b.Size
   591  				for _, nodeID := range b.NUMAAffinity {
   592  					nodeState, ok := expectedMachineState[nodeID]
   593  					if !ok {
   594  						return fmt.Errorf("[memorymanager] (pod: %s, container: %s) the memory assignment uses the NUMA that does not exist", pod, containerName)
   595  					}
   596  
   597  					nodeState.NumberOfAssignments++
   598  					nodeState.Cells = b.NUMAAffinity
   599  
   600  					memoryState, ok := nodeState.MemoryMap[b.Type]
   601  					if !ok {
   602  						return fmt.Errorf("[memorymanager] (pod: %s, container: %s) the memory assignment uses memory resource that does not exist", pod, containerName)
   603  					}
   604  
   605  					if requestedSize == 0 {
   606  						continue
   607  					}
   608  
    609  					// this node does not have enough memory; continue to the next one
   610  					if memoryState.Free <= 0 {
   611  						continue
   612  					}
   613  
   614  					// the node has enough memory to satisfy the request
   615  					if memoryState.Free >= requestedSize {
   616  						memoryState.Reserved += requestedSize
   617  						memoryState.Free -= requestedSize
   618  						requestedSize = 0
   619  						continue
   620  					}
   621  
    622  					// the node does not have enough memory; use the node's remaining memory and move to the next node
   623  					requestedSize -= memoryState.Free
   624  					memoryState.Reserved += memoryState.Free
   625  					memoryState.Free = 0
   626  				}
   627  			}
   628  		}
   629  	}
   630  
    631  	// The state has already been initialized from file (it is not empty).
    632  	// Validate that the total size, system reserved and reserved memory have not changed; this can happen when:
    633  	// - a physical memory bank is added to or removed from the node
    634  	// - the kubelet system-reserved, kube-reserved or pre-reserved-memory-zone parameters change
   635  	if !areMachineStatesEqual(machineState, expectedMachineState) {
   636  		return fmt.Errorf("[memorymanager] the expected machine state is different from the real one")
   637  	}
   638  
   639  	return nil
   640  }
   641  
   642  func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
   643  	if len(ms1) != len(ms2) {
   644  		klog.ErrorS(nil, "Node states are different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
   645  		return false
   646  	}
   647  
   648  	for nodeID, nodeState1 := range ms1 {
   649  		nodeState2, ok := ms2[nodeID]
   650  		if !ok {
   651  			klog.ErrorS(nil, "Node state does not have node ID", "nodeID", nodeID)
   652  			return false
   653  		}
   654  
   655  		if nodeState1.NumberOfAssignments != nodeState2.NumberOfAssignments {
   656  			klog.ErrorS(nil, "Node states number of assignments are different", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
   657  			return false
   658  		}
   659  
   660  		if !areGroupsEqual(nodeState1.Cells, nodeState2.Cells) {
   661  			klog.ErrorS(nil, "Node states groups are different", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
   662  			return false
   663  		}
   664  
   665  		if len(nodeState1.MemoryMap) != len(nodeState2.MemoryMap) {
   666  			klog.ErrorS(nil, "Node states memory map have different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
   667  			return false
   668  		}
   669  
   670  		for resourceName, memoryState1 := range nodeState1.MemoryMap {
   671  			memoryState2, ok := nodeState2.MemoryMap[resourceName]
   672  			if !ok {
   673  				klog.ErrorS(nil, "Memory state does not have resource", "resource", resourceName)
   674  				return false
   675  			}
   676  
   677  			if !reflect.DeepEqual(*memoryState1, *memoryState2) {
   678  				klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
   679  				return false
   680  			}
   681  		}
   682  	}
   683  	return true
   684  }
   685  
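         // getDefaultMachineState builds the machine state from the cAdvisor topology: per NUMA node,
         // hugepages allocatable = total hugepages - system reserved, and regular memory allocatable =
         // total memory - system reserved - total hugepages.
         //
         // Illustrative example (assumed values): a node with 64Gi of memory, 2Gi system reserved for
         // regular memory and 8Gi of pre-allocated hugepages gets 8Gi allocatable hugepages and
         // 64Gi - 2Gi - 8Gi = 54Gi allocatable regular memory.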
   686  func (p *staticPolicy) getDefaultMachineState() state.NUMANodeMap {
   687  	defaultMachineState := state.NUMANodeMap{}
   688  	nodeHugepages := map[int]uint64{}
   689  	for _, node := range p.machineInfo.Topology {
   690  		defaultMachineState[node.Id] = &state.NUMANodeState{
   691  			NumberOfAssignments: 0,
   692  			MemoryMap:           map[v1.ResourceName]*state.MemoryTable{},
   693  			Cells:               []int{node.Id},
   694  		}
   695  
   696  		// fill memory table with huge pages values
   697  		for _, hugepage := range node.HugePages {
   698  			hugepageQuantity := resource.NewQuantity(int64(hugepage.PageSize)*1024, resource.BinarySI)
   699  			resourceName := corehelper.HugePageResourceName(*hugepageQuantity)
   700  			systemReserved := p.getResourceSystemReserved(node.Id, resourceName)
   701  			totalHugepagesSize := hugepage.NumPages * hugepage.PageSize * 1024
   702  			allocatable := totalHugepagesSize - systemReserved
   703  			defaultMachineState[node.Id].MemoryMap[resourceName] = &state.MemoryTable{
   704  				Allocatable:    allocatable,
   705  				Free:           allocatable,
   706  				Reserved:       0,
   707  				SystemReserved: systemReserved,
   708  				TotalMemSize:   totalHugepagesSize,
   709  			}
   710  			if _, ok := nodeHugepages[node.Id]; !ok {
   711  				nodeHugepages[node.Id] = 0
   712  			}
   713  			nodeHugepages[node.Id] += totalHugepagesSize
   714  		}
   715  
   716  		// fill memory table with regular memory values
   717  		systemReserved := p.getResourceSystemReserved(node.Id, v1.ResourceMemory)
   718  
   719  		allocatable := node.Memory - systemReserved
   720  		// remove memory allocated by hugepages
   721  		if allocatedByHugepages, ok := nodeHugepages[node.Id]; ok {
   722  			allocatable -= allocatedByHugepages
   723  		}
   724  		defaultMachineState[node.Id].MemoryMap[v1.ResourceMemory] = &state.MemoryTable{
   725  			Allocatable:    allocatable,
   726  			Free:           allocatable,
   727  			Reserved:       0,
   728  			SystemReserved: systemReserved,
   729  			TotalMemSize:   node.Memory,
   730  		}
   731  	}
   732  	return defaultMachineState
   733  }
   734  
   735  func (p *staticPolicy) getResourceSystemReserved(nodeID int, resourceName v1.ResourceName) uint64 {
   736  	var systemReserved uint64
   737  	if nodeSystemReserved, ok := p.systemReserved[nodeID]; ok {
   738  		if nodeMemorySystemReserved, ok := nodeSystemReserved[resourceName]; ok {
   739  			systemReserved = nodeMemorySystemReserved
   740  		}
   741  	}
   742  	return systemReserved
   743  }
   744  
   745  func (p *staticPolicy) getDefaultHint(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64) (*topologymanager.TopologyHint, error) {
   746  	hints := p.calculateHints(machineState, pod, requestedResources)
   747  	if len(hints) < 1 {
   748  		return nil, fmt.Errorf("[memorymanager] failed to get the default NUMA affinity, no NUMA nodes with enough memory is available")
   749  	}
   750  
   751  	// hints for all memory types should be the same, so we will check hints only for regular memory type
   752  	return findBestHint(hints[string(v1.ResourceMemory)]), nil
   753  }
   754  
   755  func isAffinitySatisfyRequest(machineState state.NUMANodeMap, mask bitmask.BitMask, requestedResources map[v1.ResourceName]uint64) bool {
   756  	totalFreeSize := map[v1.ResourceName]uint64{}
   757  	for _, nodeID := range mask.GetBits() {
   758  		for resourceName := range requestedResources {
   759  			if _, ok := totalFreeSize[resourceName]; !ok {
   760  				totalFreeSize[resourceName] = 0
   761  			}
   762  			totalFreeSize[resourceName] += machineState[nodeID].MemoryMap[resourceName].Free
   763  		}
   764  	}
   765  
   766  	// verify that for all memory types the node mask has enough resources
   767  	for resourceName, requestedSize := range requestedResources {
   768  		if totalFreeSize[resourceName] < requestedSize {
   769  			return false
   770  		}
   771  	}
   772  
   773  	return true
   774  }
   775  
    776  // extendTopologyManagerHint extends the topology manager hint when it does not satisfy the container request.
    777  // The topology manager uses bitwise AND to merge all topology hints into the best one, so with the restricted policy
    778  // it is possible that we will get a subset of the hint that we provided to the topology manager; in this case we want
    779  // to extend it to the original one.
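         // For example (illustrative): if the merged hint is {0} but the request needs memory from two NUMA
         // nodes, the hint is extended to a superset such as {0, 1} that can satisfy the request.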
   780  func (p *staticPolicy) extendTopologyManagerHint(machineState state.NUMANodeMap, pod *v1.Pod, requestedResources map[v1.ResourceName]uint64, mask bitmask.BitMask) (*topologymanager.TopologyHint, error) {
   781  	hints := p.calculateHints(machineState, pod, requestedResources)
   782  
   783  	var filteredHints []topologymanager.TopologyHint
   784  	// hints for all memory types should be the same, so we will check hints only for regular memory type
   785  	for _, hint := range hints[string(v1.ResourceMemory)] {
   786  		affinityBits := hint.NUMANodeAffinity.GetBits()
    787  		// filter out all hints that do not include the current hint
   788  		if isHintInGroup(mask.GetBits(), affinityBits) {
   789  			filteredHints = append(filteredHints, hint)
   790  		}
   791  	}
   792  
   793  	if len(filteredHints) < 1 {
   794  		return nil, fmt.Errorf("[memorymanager] failed to find NUMA nodes to extend the current topology hint")
   795  	}
   796  
   797  	// try to find the preferred hint with the minimal number of NUMA nodes, relevant for the restricted policy
   798  	return findBestHint(filteredHints), nil
   799  }
   800  
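         // isHintInGroup reports whether every NUMA node in hint is also present in group, i.e. whether
         // hint is a subset of group; for example (illustrative), {1} is in {0, 1}, but {1, 2} is not.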
   801  func isHintInGroup(hint []int, group []int) bool {
   802  	sort.Ints(hint)
   803  	sort.Ints(group)
   804  
   805  	hintIndex := 0
   806  	for i := range group {
   807  		if hintIndex == len(hint) {
   808  			return true
   809  		}
   810  
   811  		if group[i] != hint[hintIndex] {
   812  			continue
   813  		}
   814  		hintIndex++
   815  	}
   816  
   817  	return hintIndex == len(hint)
   818  }
   819  
   820  func findBestHint(hints []topologymanager.TopologyHint) *topologymanager.TopologyHint {
   821  	// try to find the preferred hint with the minimal number of NUMA nodes, relevant for the restricted policy
   822  	bestHint := topologymanager.TopologyHint{}
   823  	for _, hint := range hints {
   824  		if bestHint.NUMANodeAffinity == nil {
   825  			bestHint = hint
   826  			continue
   827  		}
   828  
    829  		// prefer the current hint when it is preferred and the best hint so far is not
   830  		if hint.Preferred && !bestHint.Preferred {
   831  			bestHint = hint
   832  			continue
   833  		}
   834  
    835  		// both hints have the same preferred value, but the current hint has fewer NUMA nodes than the best one so far
   836  		if hint.Preferred == bestHint.Preferred && hint.NUMANodeAffinity.IsNarrowerThan(bestHint.NUMANodeAffinity) {
   837  			bestHint = hint
   838  		}
   839  	}
   840  	return &bestHint
   841  }
   842  
   843  // GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
   844  func (p *staticPolicy) GetAllocatableMemory(s state.State) []state.Block {
   845  	var allocatableMemory []state.Block
   846  	machineState := s.GetMachineState()
   847  	for numaNodeID, numaNodeState := range machineState {
   848  		for resourceName, memoryTable := range numaNodeState.MemoryMap {
   849  			if memoryTable.Allocatable == 0 {
   850  				continue
   851  			}
   852  
   853  			block := state.Block{
   854  				NUMAAffinity: []int{numaNodeID},
   855  				Type:         resourceName,
   856  				Size:         memoryTable.Allocatable,
   857  			}
   858  			allocatableMemory = append(allocatableMemory, block)
   859  		}
   860  	}
   861  	return allocatableMemory
   862  }
   863  
   864  func (p *staticPolicy) updatePodReusableMemory(pod *v1.Pod, container *v1.Container, memoryBlocks []state.Block) {
   865  	podUID := string(pod.UID)
   866  
    867  	// If entries for pods other than the current pod exist in p.initContainersReusableMemory, delete them.
   868  	for uid := range p.initContainersReusableMemory {
   869  		if podUID != uid {
   870  			delete(p.initContainersReusableMemory, uid)
   871  		}
   872  	}
   873  
   874  	if isRegularInitContainer(pod, container) {
   875  		if _, ok := p.initContainersReusableMemory[podUID]; !ok {
   876  			p.initContainersReusableMemory[podUID] = map[string]map[v1.ResourceName]uint64{}
   877  		}
   878  
   879  		for _, block := range memoryBlocks {
   880  			blockBitMask, _ := bitmask.NewBitMask(block.NUMAAffinity...)
   881  			blockBitMaskString := blockBitMask.String()
   882  
   883  			if _, ok := p.initContainersReusableMemory[podUID][blockBitMaskString]; !ok {
   884  				p.initContainersReusableMemory[podUID][blockBitMaskString] = map[v1.ResourceName]uint64{}
   885  			}
   886  
   887  			if blockReusableMemory := p.initContainersReusableMemory[podUID][blockBitMaskString][block.Type]; block.Size > blockReusableMemory {
   888  				p.initContainersReusableMemory[podUID][blockBitMaskString][block.Type] = block.Size
   889  			}
   890  		}
   891  
   892  		return
   893  	}
   894  
    895  	// update the reusable memory once it is used by the app container
   896  	for _, block := range memoryBlocks {
   897  		blockBitMask, _ := bitmask.NewBitMask(block.NUMAAffinity...)
   898  		if podReusableMemory := p.getPodReusableMemory(pod, blockBitMask, block.Type); podReusableMemory != 0 {
   899  			if block.Size >= podReusableMemory {
   900  				p.initContainersReusableMemory[podUID][blockBitMask.String()][block.Type] = 0
   901  			} else {
   902  				p.initContainersReusableMemory[podUID][blockBitMask.String()][block.Type] -= block.Size
   903  			}
   904  		}
   905  	}
   906  }
   907  
   908  func (p *staticPolicy) updateInitContainersMemoryBlocks(s state.State, pod *v1.Pod, container *v1.Container, containerMemoryBlocks []state.Block) {
   909  	podUID := string(pod.UID)
   910  
   911  	for _, containerBlock := range containerMemoryBlocks {
   912  		blockSize := containerBlock.Size
   913  		for _, initContainer := range pod.Spec.InitContainers {
   914  			// we do not want to continue updates once we reach the current container
   915  			if initContainer.Name == container.Name {
   916  				break
   917  			}
   918  
   919  			if blockSize == 0 {
   920  				break
   921  			}
   922  
   923  			if types.IsRestartableInitContainer(&initContainer) {
   924  				// we should not reuse the resource from any restartable init
   925  				// container
   926  				continue
   927  			}
   928  
   929  			initContainerBlocks := s.GetMemoryBlocks(podUID, initContainer.Name)
   930  			if len(initContainerBlocks) == 0 {
   931  				continue
   932  			}
   933  
   934  			for i := range initContainerBlocks {
   935  				initContainerBlock := &initContainerBlocks[i]
   936  				if initContainerBlock.Size == 0 {
   937  					continue
   938  				}
   939  
   940  				if initContainerBlock.Type != containerBlock.Type {
   941  					continue
   942  				}
   943  
   944  				if !isNUMAAffinitiesEqual(initContainerBlock.NUMAAffinity, containerBlock.NUMAAffinity) {
   945  					continue
   946  				}
   947  
   948  				if initContainerBlock.Size > blockSize {
   949  					initContainerBlock.Size -= blockSize
   950  					blockSize = 0
   951  				} else {
   952  					blockSize -= initContainerBlock.Size
   953  					initContainerBlock.Size = 0
   954  				}
   955  			}
   956  
   957  			s.SetMemoryBlocks(podUID, initContainer.Name, initContainerBlocks)
   958  		}
   959  	}
   960  }
   961  
   962  func isRegularInitContainer(pod *v1.Pod, container *v1.Container) bool {
   963  	for _, initContainer := range pod.Spec.InitContainers {
   964  		if initContainer.Name == container.Name {
   965  			return !types.IsRestartableInitContainer(&initContainer)
   966  		}
   967  	}
   968  
   969  	return false
   970  }
   971  
   972  func isNUMAAffinitiesEqual(numaAffinity1, numaAffinity2 []int) bool {
   973  	bitMask1, err := bitmask.NewBitMask(numaAffinity1...)
   974  	if err != nil {
   975  		klog.ErrorS(err, "failed to create bit mask", "numaAffinity1", numaAffinity1)
   976  		return false
   977  	}
   978  
   979  	bitMask2, err := bitmask.NewBitMask(numaAffinity2...)
   980  	if err != nil {
   981  		klog.ErrorS(err, "failed to create bit mask", "numaAffinity2", numaAffinity2)
   982  		return false
   983  	}
   984  
   985  	return bitMask1.IsEqual(bitMask2)
   986  }