github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package dynamicpolicy
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math"
    23  	"sort"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
    28  
    29  	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
    30  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/calculator"
    31  	advisorapi "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor"
    32  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
    33  	cpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/util"
    34  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
    35  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    36  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    37  	"github.com/kubewharf/katalyst-core/pkg/util/machine"
    38  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    39  	qosutil "github.com/kubewharf/katalyst-core/pkg/util/qos"
    40  )
    41  
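// sharedCoresAllocationHandler allocates cpus for shared_cores containers.
// A container admitted for the first time is placed in ramp-up on the pooled cpus,
// unless its pod is already past pending and its specified pool entry exists, in which
// case it is put into that pool directly; containers still in ramp-up keep the pooled
// cpus; otherwise the container is re-assigned to its specified pool and the pools are adjusted.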
    42  func (p *DynamicPolicy) sharedCoresAllocationHandler(_ context.Context,
    43  	req *pluginapi.ResourceRequest,
    44  ) (*pluginapi.ResourceAllocationResponse, error) {
    45  	if req == nil {
    46  		return nil, fmt.Errorf("sharedCoresAllocationHandler got nil request")
    47  	}
    48  
    49  	_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
    50  	if err != nil {
     51  		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
    52  	}
    53  
    54  	machineState := p.state.GetMachineState()
    55  	pooledCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs,
    56  		state.CheckDedicated, state.CheckDedicatedNUMABinding)
    57  	if pooledCPUs.IsEmpty() {
    58  		general.Errorf("pod: %s/%s, container: %s get empty pooledCPUs", req.PodNamespace, req.PodName, req.ContainerName)
    59  		return nil, fmt.Errorf("get empty pooledCPUs")
    60  	}
    61  
    62  	pooledCPUsTopologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, pooledCPUs)
    63  	if err != nil {
     64  		general.Errorf("pod: %s/%s, container: %s GetNumaAwareAssignments failed with error: %v",
     65  			req.PodNamespace, req.PodName, req.ContainerName, err)
     66  		return nil, fmt.Errorf("GetNumaAwareAssignments failed with error: %v", err)
    67  	}
    68  
    69  	needSet := true
    70  	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
    71  	err = updateAllocationInfoByReq(req, allocationInfo)
    72  	if err != nil {
    73  		general.Errorf("pod: %s/%s, container: %s updateAllocationInfoByReq failed with error: %v",
    74  			req.PodNamespace, req.PodName, req.ContainerName, err)
    75  		return nil, fmt.Errorf("updateAllocationInfoByReq failed with error: %v", err)
    76  	}
    77  
    78  	if allocationInfo == nil {
     79  		general.Infof("pod: %s/%s, container: %s is met for the first time, do ramp up with pooled cpus: %s",
    80  			req.PodNamespace, req.PodName, req.ContainerName, pooledCPUs.String())
    81  
    82  		shouldRampUp := p.shouldSharedCoresRampUp(req.PodUid)
    83  
    84  		allocationInfo = &state.AllocationInfo{
    85  			PodUid:                           req.PodUid,
    86  			PodNamespace:                     req.PodNamespace,
    87  			PodName:                          req.PodName,
    88  			ContainerName:                    req.ContainerName,
    89  			ContainerType:                    req.ContainerType.String(),
    90  			ContainerIndex:                   req.ContainerIndex,
    91  			RampUp:                           shouldRampUp,
    92  			OwnerPoolName:                    advisorapi.EmptyOwnerPoolName,
    93  			PodRole:                          req.PodRole,
    94  			PodType:                          req.PodType,
    95  			AllocationResult:                 pooledCPUs,
    96  			OriginalAllocationResult:         pooledCPUs.Clone(),
    97  			TopologyAwareAssignments:         pooledCPUsTopologyAwareAssignments,
    98  			OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(pooledCPUsTopologyAwareAssignments),
    99  			InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
   100  			Labels:                           general.DeepCopyMap(req.Labels),
   101  			Annotations:                      general.DeepCopyMap(req.Annotations),
   102  			QoSLevel:                         apiconsts.PodAnnotationQoSLevelSharedCores,
   103  			RequestQuantity:                  reqFloat64,
   104  		}
   105  
   106  		if !shouldRampUp {
   107  			targetPoolName := allocationInfo.GetSpecifiedPoolName()
   108  			poolAllocationInfo := p.state.GetAllocationInfo(targetPoolName, advisorapi.FakedContainerName)
   109  
   110  			if poolAllocationInfo == nil {
    111  				general.Infof("pod: %s/%s, container: %s is active, but its specified pool entry doesn't exist, try to ramp it up",
   112  					req.PodNamespace, req.PodName, req.ContainerName)
   113  				allocationInfo.RampUp = true
   114  			} else {
   115  				p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
   116  				err := p.doAndCheckPutAllocationInfo(allocationInfo, false)
   117  				if err != nil {
   118  					return nil, err
   119  				}
   120  
   121  				needSet = false
   122  			}
   123  		}
   124  	} else if allocationInfo.RampUp {
   125  		general.Infof("pod: %s/%s, container: %s is still in ramp up, allocate pooled cpus: %s",
   126  			req.PodNamespace, req.PodName, req.ContainerName, pooledCPUs.String())
   127  
   128  		allocationInfo.AllocationResult = pooledCPUs
   129  		allocationInfo.OriginalAllocationResult = pooledCPUs.Clone()
   130  		allocationInfo.TopologyAwareAssignments = pooledCPUsTopologyAwareAssignments
   131  		allocationInfo.OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(pooledCPUsTopologyAwareAssignments)
   132  	} else {
   133  		err := p.doAndCheckPutAllocationInfo(allocationInfo, true)
   134  		if err != nil {
   135  			return nil, err
   136  		}
   137  
   138  		needSet = false
   139  	}
   140  
   141  	if needSet {
    142  		// update pod entries directly.
    143  		// if any subsequent step fails, the deferred cleanup in the allocation entry point
    144  		// will delete the current allocationInfo from podEntries.
   145  		p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
   146  		podEntries := p.state.GetPodEntries()
   147  
   148  		updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   149  		if err != nil {
   150  			general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
   151  				req.PodNamespace, req.PodName, req.ContainerName, err)
   152  			return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   153  		}
   154  		p.state.SetMachineState(updatedMachineState)
   155  	}
   156  
   157  	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
   158  	if err != nil {
   159  		general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
   160  			req.PodNamespace, req.PodName, req.ContainerName, err)
    161  		return nil, fmt.Errorf("PackAllocationResponse failed with error: %v", err)
   162  	}
   163  	return resp, nil
   164  }
   165  
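// reclaimedCoresAllocationHandler allocates cpus for reclaimed_cores containers by binding
// them to the current cpuset of the reclaim pool; it fails if the reclaim pool entry
// doesn't exist yet or is empty.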
   166  func (p *DynamicPolicy) reclaimedCoresAllocationHandler(_ context.Context,
   167  	req *pluginapi.ResourceRequest,
   168  ) (*pluginapi.ResourceAllocationResponse, error) {
   169  	if req == nil {
   170  		return nil, fmt.Errorf("reclaimedCoresAllocationHandler got nil request")
   171  	}
   172  
   173  	_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
   174  	if err != nil {
    175  		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
   176  	}
   177  
   178  	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
   179  	err = updateAllocationInfoByReq(req, allocationInfo)
   180  	if err != nil {
   181  		general.Errorf("pod: %s/%s, container: %s updateAllocationInfoByReq failed with error: %v",
   182  			req.PodNamespace, req.PodName, req.ContainerName, err)
   183  		return nil, fmt.Errorf("updateAllocationInfoByReq failed with error: %v", err)
   184  	}
   185  
   186  	reclaimedAllocationInfo := p.state.GetAllocationInfo(state.PoolNameReclaim, advisorapi.FakedContainerName)
   187  	if reclaimedAllocationInfo == nil {
    188  		general.Errorf("allocation for pod: %s/%s, container: %s failed because pool: %s is not ready",
   189  			req.PodNamespace, req.PodName, req.ContainerName, state.PoolNameReclaim)
   190  
   191  		return nil, fmt.Errorf("pool: %s is not ready", state.PoolNameReclaim)
   192  	} else if reclaimedAllocationInfo.AllocationResult.Size() == 0 {
    193  		general.Errorf("allocation for pod: %s/%s, container: %s failed because pool: %s is empty",
   194  			req.PodNamespace, req.PodName, req.ContainerName, state.PoolNameReclaim)
   195  
    196  		return nil, fmt.Errorf("pool: %s is empty", state.PoolNameReclaim)
   197  	}
   198  
   199  	if allocationInfo != nil {
   200  		general.Infof("pod: %s/%s, container: %s with old allocation result: %s, allocate by reclaimedCPUSet: %s",
   201  			req.PodNamespace, req.PodName, req.ContainerName, allocationInfo.AllocationResult.String(), reclaimedAllocationInfo.AllocationResult.String())
   202  	} else {
    203  		general.Infof("pod: %s/%s, container: %s is met for the first time, allocate by reclaimedCPUSet: %s",
   204  			req.PodNamespace, req.PodName, req.ContainerName, reclaimedAllocationInfo.AllocationResult.String())
   205  
   206  		allocationInfo = &state.AllocationInfo{
   207  			PodUid:          req.PodUid,
   208  			PodNamespace:    req.PodNamespace,
   209  			PodName:         req.PodName,
   210  			ContainerName:   req.ContainerName,
   211  			ContainerType:   req.ContainerType.String(),
   212  			ContainerIndex:  req.ContainerIndex,
   213  			OwnerPoolName:   state.PoolNameReclaim,
   214  			PodRole:         req.PodRole,
   215  			PodType:         req.PodType,
   216  			InitTimestamp:   time.Now().Format(util.QRMTimeFormat),
   217  			Labels:          general.DeepCopyMap(req.Labels),
   218  			Annotations:     general.DeepCopyMap(req.Annotations),
   219  			QoSLevel:        apiconsts.PodAnnotationQoSLevelReclaimedCores,
   220  			RequestQuantity: reqFloat64,
   221  		}
   222  	}
   223  
   224  	allocationInfo.OwnerPoolName = state.PoolNameReclaim
   225  	allocationInfo.AllocationResult = reclaimedAllocationInfo.AllocationResult.Clone()
   226  	allocationInfo.OriginalAllocationResult = reclaimedAllocationInfo.OriginalAllocationResult.Clone()
   227  	allocationInfo.TopologyAwareAssignments = machine.DeepcopyCPUAssignment(reclaimedAllocationInfo.TopologyAwareAssignments)
   228  	allocationInfo.OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(reclaimedAllocationInfo.OriginalTopologyAwareAssignments)
   229  
   230  	// update pod entries directly.
    231  	// if any subsequent step fails, the deferred cleanup in the allocation entry point will delete the current allocationInfo from podEntries.
   232  	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
   233  	podEntries := p.state.GetPodEntries()
   234  
   235  	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   236  	if err != nil {
   237  		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
   238  			req.PodNamespace, req.PodName, req.ContainerName, err)
   239  		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   240  	}
   241  
   242  	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
   243  	if err != nil {
   244  		general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
   245  			req.PodNamespace, req.PodName, req.ContainerName, err)
    246  		return nil, fmt.Errorf("PackAllocationResponse failed with error: %v", err)
   247  	}
   248  	p.state.SetMachineState(updatedMachineState)
   249  
   250  	return resp, nil
   251  }
   252  
   253  func (p *DynamicPolicy) dedicatedCoresAllocationHandler(ctx context.Context,
   254  	req *pluginapi.ResourceRequest,
   255  ) (*pluginapi.ResourceAllocationResponse, error) {
   256  	if req == nil {
   257  		return nil, fmt.Errorf("dedicatedCoresAllocationHandler got nil req")
   258  	}
   259  
   260  	switch req.Annotations[apiconsts.PodAnnotationMemoryEnhancementNumaBinding] {
   261  	case apiconsts.PodAnnotationMemoryEnhancementNumaBindingEnable:
   262  		return p.dedicatedCoresWithNUMABindingAllocationHandler(ctx, req)
   263  	default:
   264  		return p.dedicatedCoresWithoutNUMABindingAllocationHandler(ctx, req)
   265  	}
   266  }
   267  
   268  func (p *DynamicPolicy) dedicatedCoresWithoutNUMABindingAllocationHandler(_ context.Context,
   269  	_ *pluginapi.ResourceRequest,
   270  ) (*pluginapi.ResourceAllocationResponse, error) {
   271  	// todo: support dedicated_cores without NUMA binding
   272  	return nil, fmt.Errorf("not support dedicated_cores without NUMA binding")
   273  }
   274  
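// dedicatedCoresWithNUMABindingAllocationHandler allocates cpus for dedicated_cores containers
// with NUMA binding: it picks cpus inside the hinted NUMA nodes, marks the container as ramp-up,
// regenerates the machine state and then adjusts the existing allocation entries; sidecar
// containers are delegated to dedicatedCoresWithNUMABindingAllocationSidecarHandler.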
   275  func (p *DynamicPolicy) dedicatedCoresWithNUMABindingAllocationHandler(ctx context.Context,
   276  	req *pluginapi.ResourceRequest,
   277  ) (*pluginapi.ResourceAllocationResponse, error) {
   278  	if req.ContainerType == pluginapi.ContainerType_SIDECAR {
   279  		return p.dedicatedCoresWithNUMABindingAllocationSidecarHandler(ctx, req)
   280  	}
   281  
   282  	var machineState state.NUMANodeMap
   283  	oldAllocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
   284  	if oldAllocationInfo == nil {
   285  		machineState = p.state.GetMachineState()
   286  	} else {
   287  		p.state.Delete(req.PodUid, req.ContainerName)
   288  		podEntries := p.state.GetPodEntries()
   289  
   290  		var err error
   291  		machineState, err = generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   292  		if err != nil {
   293  			general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
   294  				req.PodNamespace, req.PodName, req.ContainerName, err)
   295  			return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   296  		}
   297  	}
   298  
   299  	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
   300  	if err != nil {
    301  		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
   302  	}
   303  
   304  	result, err := p.allocateNumaBindingCPUs(reqInt, req.Hint, machineState, req.Annotations)
   305  	if err != nil {
   306  		general.ErrorS(err, "unable to allocate CPUs",
   307  			"podNamespace", req.PodNamespace,
   308  			"podName", req.PodName,
   309  			"containerName", req.ContainerName,
   310  			"numCPUsInt", reqInt,
   311  			"numCPUsFloat64", reqFloat64)
   312  		return nil, err
   313  	}
   314  
   315  	general.InfoS("allocate CPUs successfully",
   316  		"podNamespace", req.PodNamespace,
   317  		"podName", req.PodName,
   318  		"containerName", req.ContainerName,
   319  		"numCPUsInt", reqInt,
   320  		"numCPUsFloat64", reqFloat64,
   321  		"result", result.String())
   322  
   323  	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, result)
   324  	if err != nil {
   325  		general.ErrorS(err, "unable to calculate topologyAwareAssignments",
   326  			"podNamespace", req.PodNamespace,
   327  			"podName", req.PodName,
   328  			"containerName", req.ContainerName,
   329  			"numCPUsInt", reqInt,
   330  			"numCPUsFloat64", reqFloat64,
   331  			"result cpuset", result.String())
   332  		return nil, err
   333  	}
   334  
   335  	allocationInfo := &state.AllocationInfo{
   336  		PodUid:                           req.PodUid,
   337  		PodNamespace:                     req.PodNamespace,
   338  		PodName:                          req.PodName,
   339  		ContainerName:                    req.ContainerName,
   340  		ContainerType:                    req.ContainerType.String(),
   341  		ContainerIndex:                   req.ContainerIndex,
   342  		RampUp:                           true,
   343  		PodRole:                          req.PodRole,
   344  		PodType:                          req.PodType,
   345  		OwnerPoolName:                    state.PoolNameDedicated,
   346  		AllocationResult:                 result.Clone(),
   347  		OriginalAllocationResult:         result.Clone(),
   348  		TopologyAwareAssignments:         topologyAwareAssignments,
   349  		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
   350  		InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
   351  		QoSLevel:                         apiconsts.PodAnnotationQoSLevelDedicatedCores,
   352  		Labels:                           general.DeepCopyMap(req.Labels),
   353  		Annotations:                      general.DeepCopyMap(req.Annotations),
   354  		RequestQuantity:                  reqFloat64,
   355  	}
   356  
   357  	// update pod entries directly.
    358  	// if any subsequent step fails, the deferred cleanup in the allocation entry point will delete the current allocationInfo from podEntries.
   359  	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
   360  	podEntries := p.state.GetPodEntries()
   361  
   362  	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   363  	if err != nil {
   364  		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
   365  			req.PodNamespace, req.PodName, req.ContainerName, err)
   366  		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   367  	}
   368  	p.state.SetMachineState(updatedMachineState)
   369  
   370  	err = p.adjustAllocationEntries()
   371  	if err != nil {
    372  		general.Errorf("pod: %s/%s, container: %s adjustAllocationEntries failed with error: %v",
   373  			req.PodNamespace, req.PodName, req.ContainerName, err)
   374  		return nil, fmt.Errorf("adjustAllocationEntries failed with error: %v", err)
   375  	}
   376  
   377  	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
   378  	if err != nil {
    379  		general.Errorf("pod: %s/%s, container: %s PackAllocationResponse failed with error: %v",
    380  			req.PodNamespace, req.PodName, req.ContainerName, err)
    381  		return nil, fmt.Errorf("PackAllocationResponse failed with error: %v", err)
   382  	}
   383  	return resp, nil
   384  }
   385  
    386  // dedicatedCoresWithNUMABindingAllocationSidecarHandler allocates for sidecar containers; currently we set the cpuset of a sidecar to the cpuset of its main container
   387  func (p *DynamicPolicy) dedicatedCoresWithNUMABindingAllocationSidecarHandler(_ context.Context,
   388  	req *pluginapi.ResourceRequest,
   389  ) (*pluginapi.ResourceAllocationResponse, error) {
   390  	_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
   391  	if err != nil {
    392  		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
   393  	}
   394  
   395  	podEntries := p.state.GetPodEntries()
   396  	if podEntries[req.PodUid] == nil {
    397  		general.Infof("there is no pod entry, pod: %s/%s, sidecar: %s, waiting for the next reconcile",
   398  			req.PodNamespace, req.PodName, req.ContainerName)
   399  		return &pluginapi.ResourceAllocationResponse{}, nil
   400  	}
   401  
   402  	mainContainerAllocationInfo := podEntries[req.PodUid].GetMainContainerEntry()
   403  
   404  	// todo: consider sidecar without reconcile in vpa
   405  	if mainContainerAllocationInfo == nil {
    406  		general.Infof("main container is not found for pod: %s/%s, sidecar: %s, waiting for the next reconcile",
   407  			req.PodNamespace, req.PodName, req.ContainerName)
   408  		return &pluginapi.ResourceAllocationResponse{}, nil
   409  	}
   410  
   411  	allocationInfo := &state.AllocationInfo{
   412  		PodUid:                           req.PodUid,
   413  		PodNamespace:                     req.PodNamespace,
   414  		PodName:                          req.PodName,
   415  		ContainerName:                    req.ContainerName,
   416  		ContainerType:                    req.ContainerType.String(),
   417  		ContainerIndex:                   req.ContainerIndex,
   418  		PodRole:                          req.PodRole,
   419  		PodType:                          req.PodType,
   420  		AllocationResult:                 mainContainerAllocationInfo.AllocationResult.Clone(),
   421  		OriginalAllocationResult:         mainContainerAllocationInfo.OriginalAllocationResult.Clone(),
   422  		TopologyAwareAssignments:         machine.DeepcopyCPUAssignment(mainContainerAllocationInfo.TopologyAwareAssignments),
   423  		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(mainContainerAllocationInfo.OriginalTopologyAwareAssignments),
   424  		InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
   425  		QoSLevel:                         apiconsts.PodAnnotationQoSLevelDedicatedCores,
   426  		Labels:                           general.DeepCopyMap(req.Labels),
   427  		Annotations:                      general.DeepCopyMap(req.Annotations),
   428  		RequestQuantity:                  reqFloat64,
   429  	}
   430  
   431  	// update pod entries directly.
    432  	// if any subsequent step fails, the deferred cleanup in the allocation entry point will delete the current allocationInfo from podEntries.
   433  	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
   434  	podEntries = p.state.GetPodEntries()
   435  
   436  	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   437  	if err != nil {
   438  		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
   439  			req.PodNamespace, req.PodName, req.ContainerName, err)
   440  		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   441  	}
   442  	p.state.SetMachineState(updatedMachineState)
   443  
   444  	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
   445  	if err != nil {
   446  		general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
   447  			req.PodNamespace, req.PodName, req.ContainerName, err)
    448  		return nil, fmt.Errorf("PackAllocationResponse failed with error: %v", err)
   449  	}
   450  	return resp, nil
   451  }
   452  
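// allocateNumaBindingCPUs picks numCPUs cpus from the NUMA nodes given by the hint.
// NUMA-exclusive containers take all available cpus of the hinted NUMA nodes, while
// non-exclusive containers take cpus selected by TakeByTopology.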
   453  func (p *DynamicPolicy) allocateNumaBindingCPUs(numCPUs int, hint *pluginapi.TopologyHint,
   454  	machineState state.NUMANodeMap, reqAnnotations map[string]string,
   455  ) (machine.CPUSet, error) {
   456  	if hint == nil {
   457  		return machine.NewCPUSet(), fmt.Errorf("hint is nil")
   458  	} else if len(hint.Nodes) == 0 {
   459  		return machine.NewCPUSet(), fmt.Errorf("hint is empty")
   460  	} else if qosutil.AnnotationsIndicateNUMABinding(reqAnnotations) &&
   461  		!qosutil.AnnotationsIndicateNUMAExclusive(reqAnnotations) &&
   462  		len(hint.Nodes) > 1 {
    463  		return machine.NewCPUSet(), fmt.Errorf("non-exclusive NUMA binding container has a hint spanning more than 1 NUMA node")
   464  	}
   465  
   466  	result := machine.NewCPUSet()
   467  	alignedAvailableCPUs := machine.CPUSet{}
   468  	for _, numaNode := range hint.Nodes {
   469  		alignedAvailableCPUs = alignedAvailableCPUs.Union(machineState[int(numaNode)].GetAvailableCPUSet(p.reservedCPUs))
   470  	}
   471  
   472  	var alignedCPUs machine.CPUSet
   473  
   474  	if qosutil.AnnotationsIndicateNUMAExclusive(reqAnnotations) {
    475  		// todo: currently we hack dedicated_cores with NUMA binding to take up the whole NUMA node,
    476  		//  and we will modify the strategy here if the assumption above breaks.
   477  		alignedCPUs = alignedAvailableCPUs.Clone()
   478  	} else {
   479  		var err error
   480  		alignedCPUs, err = calculator.TakeByTopology(p.machineInfo, alignedAvailableCPUs, numCPUs)
   481  		if err != nil {
    482  			general.ErrorS(err, "take cpus for non-exclusive NUMA binding container failed",
   483  				"hints", hint.Nodes,
   484  				"alignedAvailableCPUs", alignedAvailableCPUs.String())
   485  
   486  			return machine.NewCPUSet(),
    487  				fmt.Errorf("take cpus for non-exclusive NUMA binding container failed with err: %v", err)
   488  		}
   489  	}
   490  
   491  	general.InfoS("allocate by hints",
   492  		"hints", hint.Nodes,
   493  		"alignedAvailableCPUs", alignedAvailableCPUs.String(),
   494  		"alignedAllocatedCPUs", alignedCPUs)
   495  
    496  	// currently, result equals alignedCPUs;
    497  	// we may extend it with non-aligned cpus to meet the requirement later
   498  	result = result.Union(alignedCPUs)
   499  	leftNumCPUs := numCPUs - result.Size()
   500  	if leftNumCPUs > 0 {
   501  		general.Errorf("result cpus: %s in hint NUMA nodes: %d with size: %d can't meet cpus request: %d",
   502  			result.String(), hint.Nodes, result.Size(), numCPUs)
   503  
   504  		return machine.NewCPUSet(), fmt.Errorf("results can't meet cpus request")
   505  	}
   506  	return result, nil
   507  }
   508  
   509  // putAllocationsAndAdjustAllocationEntries calculates and generates the latest checkpoint
    510  // - unlike adjustAllocationEntries, it also takes the given allocationInfos into account
   511  func (p *DynamicPolicy) putAllocationsAndAdjustAllocationEntries(allocationInfos []*state.AllocationInfo, incrByReq bool) error {
   512  	if len(allocationInfos) == 0 {
   513  		return nil
   514  	}
   515  
   516  	entries := p.state.GetPodEntries()
   517  	machineState := p.state.GetMachineState()
   518  
   519  	var poolsQuantityMap map[string]int
   520  	if p.enableCPUAdvisor {
    521  		// if sys advisor is enabled, we trust the pool ratios that sys advisor indicates
   522  		poolsQuantityMap = machine.ParseCPUAssignmentQuantityMap(entries.GetFilteredPoolsCPUSetMap(state.ResidentPools))
   523  	} else {
    524  		// otherwise we sum container requests for each pool to get the pool ratios
   525  		poolsQuantityMap = state.GetSharedQuantityMapFromPodEntries(entries, allocationInfos)
   526  	}
   527  
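	// validate the given allocationInfos and, when incrByReq is set, accumulate their
	// requests per specified pool so that each pool's desired quantity can be increased accordingly.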
   528  	incrQuantityMap := make(map[string]float64)
   529  	for _, allocationInfo := range allocationInfos {
   530  		if allocationInfo == nil {
   531  			return fmt.Errorf("found nil allocationInfo in input parameter")
   532  		} else if !state.CheckShared(allocationInfo) {
   533  			return fmt.Errorf("put container with invalid qos level: %s into pool", allocationInfo.QoSLevel)
   534  		} else if entries[allocationInfo.PodUid][allocationInfo.ContainerName] == nil {
   535  			return fmt.Errorf("entry %s/%s, %s isn't found in state",
   536  				allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
   537  		}
   538  
   539  		poolName := allocationInfo.GetSpecifiedPoolName()
   540  		if poolName == advisorapi.EmptyOwnerPoolName {
   541  			return fmt.Errorf("allocationInfo points to empty poolName")
   542  		}
   543  
   544  		if incrByReq {
   545  			requestQuantity := state.GetContainerRequestedCores()(allocationInfo)
   546  			incrQuantityMap[poolName] += requestQuantity
   547  			general.Infof("put allocation with request quantity: %.3f", requestQuantity)
   548  		}
   549  	}
   550  
   551  	for poolName, incrQuantity := range incrQuantityMap {
   552  		incrInt := int(math.Ceil(incrQuantity))
   553  		poolsQuantityMap[poolName] += incrInt
   554  		general.Infof("increase pool: %s by %d", poolName, incrInt)
   555  	}
   556  
   557  	isolatedQuantityMap := state.GetIsolatedQuantityMapFromPodEntries(entries, allocationInfos)
   558  	err := p.adjustPoolsAndIsolatedEntries(poolsQuantityMap, isolatedQuantityMap, entries, machineState)
   559  	if err != nil {
   560  		return fmt.Errorf("adjustPoolsAndIsolatedEntries failed with error: %v", err)
   561  	}
   562  
   563  	return nil
   564  }
   565  
   566  // adjustAllocationEntries calculates and generates the latest checkpoint
   567  func (p *DynamicPolicy) adjustAllocationEntries() error {
   568  	entries := p.state.GetPodEntries()
   569  	machineState := p.state.GetMachineState()
   570  
    571  	// since adjustAllocationEntries will cause pools to be re-generated,
    572  	// if sys advisor is enabled, we trust the pool ratios that sys advisor indicates,
    573  	// otherwise we sum container requests for each pool to get the pool ratios
   574  	var poolsQuantityMap map[string]int
   575  	if p.enableCPUAdvisor {
   576  		poolsQuantityMap = machine.ParseCPUAssignmentQuantityMap(entries.GetFilteredPoolsCPUSetMap(state.ResidentPools))
   577  	} else {
   578  		poolsQuantityMap = state.GetSharedQuantityMapFromPodEntries(entries, nil)
   579  	}
   580  	isolatedQuantityMap := state.GetIsolatedQuantityMapFromPodEntries(entries, nil)
   581  
   582  	err := p.adjustPoolsAndIsolatedEntries(poolsQuantityMap, isolatedQuantityMap, entries, machineState)
   583  	if err != nil {
   584  		return fmt.Errorf("adjustPoolsAndIsolatedEntries failed with error: %v", err)
   585  	}
   586  
   587  	return nil
   588  }
   589  
   590  // adjustPoolsAndIsolatedEntries works for the following steps
    591  // 1. calculate pools and isolated cpusets according to the expected quantities
   592  // 2. make reclaimed overlap with numa-binding
   593  // 3. apply them to local state
   594  // 4. clean pools
   595  func (p *DynamicPolicy) adjustPoolsAndIsolatedEntries(poolsQuantityMap map[string]int,
   596  	isolatedQuantityMap map[string]map[string]int, entries state.PodEntries, machineState state.NUMANodeMap,
   597  ) error {
   598  	availableCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs, nil, state.CheckDedicatedNUMABinding)
   599  
   600  	poolsCPUSet, isolatedCPUSet, err := p.generatePoolsAndIsolation(poolsQuantityMap, isolatedQuantityMap, availableCPUs)
   601  	if err != nil {
   602  		return fmt.Errorf("generatePoolsAndIsolation failed with error: %v", err)
   603  	}
   604  
   605  	err = p.reclaimOverlapNUMABinding(poolsCPUSet, entries)
   606  	if err != nil {
   607  		return fmt.Errorf("reclaimOverlapNUMABinding failed with error: %v", err)
   608  	}
   609  
   610  	err = p.applyPoolsAndIsolatedInfo(poolsCPUSet, isolatedCPUSet, entries, machineState)
   611  	if err != nil {
   612  		return fmt.Errorf("applyPoolsAndIsolatedInfo failed with error: %v", err)
   613  	}
   614  
   615  	err = p.cleanPools()
   616  	if err != nil {
   617  		return fmt.Errorf("cleanPools failed with error: %v", err)
   618  	}
   619  
   620  	return nil
   621  }
   622  
    623  // reclaimOverlapNUMABinding unions the calculated reclaim pool (placed on empty NUMAs) with the intersection
    624  // of the previous reclaim pool and the allocations of non-ramp-up dedicated_cores numa_binding containers
   625  func (p *DynamicPolicy) reclaimOverlapNUMABinding(poolsCPUSet map[string]machine.CPUSet, entries state.PodEntries) error {
   626  	// reclaimOverlapNUMABinding only works with cpu advisor and reclaim enabled
   627  	if !(p.enableCPUAdvisor && p.dynamicConfig.GetDynamicConfiguration().EnableReclaim) {
   628  		return nil
   629  	}
   630  
   631  	if entries.CheckPoolEmpty(state.PoolNameReclaim) {
    632  		return fmt.Errorf("reclaim pool is missing in current entries")
   633  	}
   634  
   635  	curReclaimCPUSet := entries[state.PoolNameReclaim][advisorapi.FakedContainerName].AllocationResult.Clone()
   636  	nonOverlapReclaimCPUSet := poolsCPUSet[state.PoolNameReclaim].Clone()
   637  	general.Infof("curReclaimCPUSet: %s", curReclaimCPUSet.String())
   638  
   639  	for _, containerEntries := range entries {
   640  		if containerEntries.IsPoolEntry() {
   641  			continue
   642  		}
   643  
   644  		for _, allocationInfo := range containerEntries {
   645  			if !(allocationInfo != nil && state.CheckDedicatedNUMABinding(allocationInfo) && allocationInfo.CheckMainContainer()) {
   646  				continue
   647  			} else if allocationInfo.RampUp {
    648  				general.Infof("dedicated numa_binding pod: %s/%s container: %s is in ramp up, do not overlap reclaim pool with it",
   649  					allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
   650  				continue
   651  			}
   652  
   653  			poolsCPUSet[state.PoolNameReclaim] = poolsCPUSet[state.PoolNameReclaim].Union(curReclaimCPUSet.Intersection(allocationInfo.AllocationResult))
   654  		}
   655  	}
   656  
   657  	if poolsCPUSet[state.PoolNameReclaim].IsEmpty() {
   658  		return fmt.Errorf("reclaim pool is empty after overlapping with dedicated_cores numa_binding containers")
   659  	}
   660  
   661  	general.Infof("nonOverlapReclaimCPUSet: %s, finalReclaimCPUSet: %s", nonOverlapReclaimCPUSet.String(), poolsCPUSet[state.PoolNameReclaim].String())
   662  	return nil
   663  }
   664  
   665  // applyPoolsAndIsolatedInfo generates the latest checkpoint by pools and isolated cpusets calculation results.
    666  // 1. construct entries for isolated containers (probably dedicated_cores without numa_binding)
   667  // 2. construct entries for all pools
   668  // 3. construct entries for shared and reclaimed containers
   669  func (p *DynamicPolicy) applyPoolsAndIsolatedInfo(poolsCPUSet map[string]machine.CPUSet,
   670  	isolatedCPUSet map[string]map[string]machine.CPUSet, curEntries state.PodEntries, machineState state.NUMANodeMap,
   671  ) error {
   672  	newPodEntries := make(state.PodEntries)
   673  	unionDedicatedIsolatedCPUSet := machine.NewCPUSet()
   674  
   675  	// walk through all isolated CPUSet map to store those pods/containers in pod entries
   676  	for podUID, containerEntries := range isolatedCPUSet {
   677  		for containerName, isolatedCPUs := range containerEntries {
   678  			allocationInfo := curEntries[podUID][containerName]
   679  			if allocationInfo == nil {
   680  				general.Errorf("isolated pod: %s, container: %s without entry in current checkpoint", podUID, containerName)
   681  				continue
   682  			} else if !state.CheckDedicated(allocationInfo) || state.CheckNUMABinding(allocationInfo) {
   683  				general.Errorf("isolated pod: %s, container: %s isn't dedicated_cores without NUMA binding", podUID, containerName)
   684  				continue
   685  			}
   686  
   687  			topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, isolatedCPUs)
   688  			if err != nil {
   689  				general.ErrorS(err, "Unable to calculate topologyAwareAssignments",
   690  					"podNamespace", allocationInfo.PodNamespace,
   691  					"podName", allocationInfo.PodName,
   692  					"containerName", allocationInfo.ContainerName,
   693  					"result cpuset", isolatedCPUs.String())
   694  				continue
   695  			}
   696  
   697  			general.InfoS("isolate info",
   698  				"podNamespace", allocationInfo.PodNamespace,
   699  				"podName", allocationInfo.PodName,
   700  				"containerName", allocationInfo.ContainerName,
   701  				"result cpuset", isolatedCPUs.String(),
   702  				"result cpuset size", isolatedCPUs.Size(),
   703  				"qosLevel", allocationInfo.QoSLevel)
   704  
   705  			if newPodEntries[podUID] == nil {
   706  				newPodEntries[podUID] = make(state.ContainerEntries)
   707  			}
   708  
   709  			newPodEntries[podUID][containerName] = allocationInfo.Clone()
   710  			newPodEntries[podUID][containerName].OwnerPoolName = state.PoolNameDedicated
   711  			newPodEntries[podUID][containerName].AllocationResult = isolatedCPUs.Clone()
   712  			newPodEntries[podUID][containerName].OriginalAllocationResult = isolatedCPUs.Clone()
   713  			newPodEntries[podUID][containerName].TopologyAwareAssignments = topologyAwareAssignments
   714  			newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(topologyAwareAssignments)
   715  
   716  			unionDedicatedIsolatedCPUSet = unionDedicatedIsolatedCPUSet.Union(isolatedCPUs)
   717  		}
   718  	}
   719  
   720  	if poolsCPUSet[state.PoolNameReclaim].IsEmpty() {
    721  		return fmt.Errorf("entry: %s is empty", state.PoolNameReclaim)
   722  	}
   723  
   724  	// walk through all pools CPUSet map to store those pools in pod entries
   725  	for poolName, cset := range poolsCPUSet {
   726  		general.Infof("try to apply pool %s: %s", poolName, cset.String())
   727  
   728  		topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, cset)
   729  		if err != nil {
   730  			return fmt.Errorf("unable to calculate topologyAwareAssignments for pool: %s, result cpuset: %s, error: %v",
   731  				poolName, cset.String(), err)
   732  		}
   733  
   734  		allocationInfo := curEntries[poolName][advisorapi.FakedContainerName]
   735  		if allocationInfo != nil {
   736  			general.Infof("pool: %s allocation result transform from %s to %s",
   737  				poolName, allocationInfo.AllocationResult.String(), cset.String())
   738  		}
   739  
   740  		if newPodEntries[poolName] == nil {
   741  			newPodEntries[poolName] = make(state.ContainerEntries)
   742  		}
   743  		newPodEntries[poolName][advisorapi.FakedContainerName] = &state.AllocationInfo{
   744  			PodUid:                           poolName,
   745  			OwnerPoolName:                    poolName,
   746  			AllocationResult:                 cset.Clone(),
   747  			OriginalAllocationResult:         cset.Clone(),
   748  			TopologyAwareAssignments:         topologyAwareAssignments,
   749  			OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
   750  		}
   751  
   752  		_ = p.emitter.StoreInt64(util.MetricNamePoolSize, int64(cset.Size()), metrics.MetricTypeNameRaw,
   753  			metrics.MetricTag{Key: "poolName", Val: poolName},
   754  			metrics.MetricTag{Key: "pool_type", Val: state.GetPoolType(poolName)})
   755  	}
   756  
   757  	// rampUpCPUs includes common reclaimed pool
   758  	rampUpCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs,
   759  		nil, state.CheckDedicatedNUMABinding).Difference(unionDedicatedIsolatedCPUSet)
   760  	rampUpCPUsTopologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, rampUpCPUs)
   761  	if err != nil {
   762  		return fmt.Errorf("unable to calculate topologyAwareAssignments for rampUpCPUs, result cpuset: %s, error: %v",
   763  			rampUpCPUs.String(), err)
   764  	}
   765  
   766  	// walk through current pod entries to handle container-related entries (besides pooled entries)
   767  	for podUID, containerEntries := range curEntries {
   768  		if containerEntries.IsPoolEntry() {
   769  			continue
   770  		}
   771  
   772  	containerLoop:
   773  		for containerName, allocationInfo := range containerEntries {
   774  			if allocationInfo == nil {
   775  				general.Errorf("pod: %s, container: %s has nil allocationInfo", podUID, containerName)
   776  				continue
   777  			}
   778  
   779  			if newPodEntries[podUID][containerName] != nil {
   780  				// adapt to old checkpoint without RequestQuantity property
   781  				newPodEntries[podUID][containerName].RequestQuantity = state.GetContainerRequestedCores()(allocationInfo)
   782  				general.Infof("pod: %s/%s, container: %s, qosLevel: %s is isolated, ignore original allocationInfo",
   783  					allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, allocationInfo.QoSLevel)
   784  				continue
   785  			}
   786  
   787  			if newPodEntries[podUID] == nil {
   788  				newPodEntries[podUID] = make(state.ContainerEntries)
   789  			}
   790  
   791  			newPodEntries[podUID][containerName] = allocationInfo.Clone()
   792  			switch allocationInfo.QoSLevel {
   793  			case apiconsts.PodAnnotationQoSLevelDedicatedCores:
   794  				newPodEntries[podUID][containerName].OwnerPoolName = allocationInfo.GetPoolName()
   795  
    796  				// for numa_binding containers, we just clone the existing checkpoint entry
   797  				if state.CheckDedicatedNUMABinding(allocationInfo) {
   798  					continue containerLoop
   799  				}
   800  
   801  				// dedicated_cores without numa_binding is not isolated, we will try to isolate it in next adjustment.
   802  				general.Warningf("pod: %s/%s, container: %s is dedicated_cores without numa_binding but not isolated, "+
    803  					"we put it into fallback pool: %s temporarily",
   804  					allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, rampUpCPUs.String())
   805  
   806  				newPodEntries[podUID][containerName].OwnerPoolName = state.PoolNameFallback
   807  				newPodEntries[podUID][containerName].AllocationResult = rampUpCPUs.Clone()
   808  				newPodEntries[podUID][containerName].OriginalAllocationResult = rampUpCPUs.Clone()
   809  				newPodEntries[podUID][containerName].TopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
   810  				newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
   811  
   812  			case apiconsts.PodAnnotationQoSLevelSharedCores, apiconsts.PodAnnotationQoSLevelReclaimedCores:
   813  				ownerPoolName := allocationInfo.GetPoolName()
   814  
   815  				if allocationInfo.RampUp {
    816  					general.Infof("pod: %s/%s container: %s is in ramp up, set its allocation result from %s to rampUpCPUs: %s",
   817  						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName,
   818  						allocationInfo.AllocationResult.String(), rampUpCPUs.String())
   819  
   820  					newPodEntries[podUID][containerName].OwnerPoolName = advisorapi.EmptyOwnerPoolName
   821  					newPodEntries[podUID][containerName].AllocationResult = rampUpCPUs.Clone()
   822  					newPodEntries[podUID][containerName].OriginalAllocationResult = rampUpCPUs.Clone()
   823  					newPodEntries[podUID][containerName].TopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
   824  					newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
   825  				} else if newPodEntries[ownerPoolName][advisorapi.FakedContainerName] == nil {
   826  					general.Warningf("pod: %s/%s container: %s get owner pool: %s allocationInfo failed. reuse its allocation result: %s",
   827  						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName,
   828  						ownerPoolName, allocationInfo.AllocationResult.String())
   829  				} else {
   830  					poolEntry := newPodEntries[ownerPoolName][advisorapi.FakedContainerName]
   831  					general.Infof("put pod: %s/%s container: %s to pool: %s, set its allocation result from %s to %s",
   832  						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName,
   833  						ownerPoolName, allocationInfo.AllocationResult.String(), poolEntry.AllocationResult.String())
   834  
   835  					newPodEntries[podUID][containerName].OwnerPoolName = ownerPoolName
   836  					newPodEntries[podUID][containerName].AllocationResult = poolEntry.AllocationResult.Clone()
   837  					newPodEntries[podUID][containerName].OriginalAllocationResult = poolEntry.OriginalAllocationResult.Clone()
   838  					newPodEntries[podUID][containerName].TopologyAwareAssignments = machine.DeepcopyCPUAssignment(poolEntry.TopologyAwareAssignments)
   839  					newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(poolEntry.TopologyAwareAssignments)
   840  				}
   841  			default:
   842  				return fmt.Errorf("invalid qosLevel: %s for pod: %s/%s container: %s",
   843  					allocationInfo.QoSLevel, allocationInfo.PodNamespace,
   844  					allocationInfo.PodName, allocationInfo.ContainerName)
   845  			}
   846  		}
   847  	}
   848  
   849  	// use pod entries generated above to generate machine state info, and store in local state
   850  	machineState, err = generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, newPodEntries)
   851  	if err != nil {
   852  		return fmt.Errorf("calculate machineState by newPodEntries failed with error: %v", err)
   853  	}
   854  	p.state.SetPodEntries(newPodEntries)
   855  	p.state.SetMachineState(machineState)
   856  
   857  	return nil
   858  }
   859  
   860  // generatePoolsAndIsolation is used to generate cpuset pools and isolated cpuset
    861  // 1. allocate isolated cpusets for pods/containers; if they can't fit together with the pools, isolated containers fall back into pools
    862  // 2. use the remaining cores to allocate among the different pools, shrinking them proportionally if the demand exceeds the available cores
    863  // 3. apportion the reclaimed pool to other pools if reclaim is disabled
   864  func (p *DynamicPolicy) generatePoolsAndIsolation(poolsQuantityMap map[string]int,
   865  	isolatedQuantityMap map[string]map[string]int, availableCPUs machine.CPUSet) (poolsCPUSet map[string]machine.CPUSet,
   866  	isolatedCPUSet map[string]map[string]machine.CPUSet, err error,
   867  ) {
   868  	// clear pool map with zero quantity
   869  	for poolName, quantity := range poolsQuantityMap {
   870  		if quantity == 0 {
   871  			general.Warningf("pool: %s with 0 quantity, skip generate", poolName)
   872  			delete(poolsQuantityMap, poolName)
   873  		}
   874  	}
   875  
   876  	// clear isolated map with zero quantity
   877  	for podUID, containerEntries := range isolatedQuantityMap {
   878  		for containerName, quantity := range containerEntries {
   879  			if quantity == 0 {
   880  				general.Warningf("isolated pod: %s, container: %s with 0 quantity, skip generate it", podUID, containerName)
   881  				delete(containerEntries, containerName)
   882  			}
   883  		}
   884  		if len(containerEntries) == 0 {
    885  			general.Warningf("isolated pod: %s all container entries skipped", podUID)
   886  			delete(isolatedQuantityMap, podUID)
   887  		}
   888  	}
   889  
   890  	availableSize := availableCPUs.Size()
   891  
   892  	poolsCPUSet = make(map[string]machine.CPUSet)
   893  	poolsTotalQuantity := general.SumUpMapValues(poolsQuantityMap)
   894  
   895  	isolatedCPUSet = make(map[string]map[string]machine.CPUSet)
   896  	isolatedTotalQuantity := general.SumUpMultipleMapValues(isolatedQuantityMap)
   897  
   898  	general.Infof("isolatedTotalQuantity: %d, poolsTotalQuantity: %d, availableSize: %d",
   899  		isolatedTotalQuantity, poolsTotalQuantity, availableSize)
   900  
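	// three cases are handled below:
	// 1. pools and isolated containers both fit into availableCPUs
	// 2. only pools fit; isolated containers fall back into their pools
	// 3. even pools don't fit; pool quantities are shrunk proportionally to availableSize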
   901  	var tErr error
   902  	if poolsTotalQuantity+isolatedTotalQuantity <= availableSize {
   903  		general.Infof("all pools and isolated containers could be allocated")
   904  
   905  		isolatedCPUSet, availableCPUs, tErr = p.takeCPUsForContainers(isolatedQuantityMap, availableCPUs)
   906  		if tErr != nil {
   907  			err = fmt.Errorf("allocate isolated cpus for dedicated_cores failed with error: %v", tErr)
   908  			return
   909  		}
   910  
   911  		poolsCPUSet, availableCPUs, tErr = p.takeCPUsForPools(poolsQuantityMap, availableCPUs)
   912  		if tErr != nil {
   913  			err = fmt.Errorf("allocate cpus for pools failed with error: %v", tErr)
   914  			return
   915  		}
   916  	} else if poolsTotalQuantity <= availableSize {
   917  		general.Infof("all pools could be allocated, all isolated containers would be put to pools")
   918  
   919  		poolsCPUSet, availableCPUs, tErr = p.takeCPUsForPools(poolsQuantityMap, availableCPUs)
   920  		if tErr != nil {
   921  			err = fmt.Errorf("allocate cpus for pools failed with error: %v", tErr)
   922  			return
   923  		}
   924  	} else if poolsTotalQuantity > 0 {
   925  		general.Infof("can't allocate for all pools")
   926  
   927  		totalProportionalPoolsQuantity := 0
   928  		proportionalPoolsQuantityMap := make(map[string]int)
   929  
   930  		for poolName, poolQuantity := range poolsQuantityMap {
   931  			proportionalSize := general.Max(getProportionalSize(poolQuantity, poolsTotalQuantity, availableSize, true /*ceil*/), 1)
   932  			proportionalPoolsQuantityMap[poolName] = proportionalSize
   933  			totalProportionalPoolsQuantity += proportionalSize
   934  		}
   935  
   936  		poolNames := make([]string, 0, len(proportionalPoolsQuantityMap))
   937  
   938  		for poolName := range proportionalPoolsQuantityMap {
   939  			poolNames = append(poolNames, poolName)
   940  		}
   941  
   942  		sort.Slice(poolNames, func(x, y int) bool {
   943  			// sort in descending order
   944  			return proportionalPoolsQuantityMap[poolNames[x]] > proportionalPoolsQuantityMap[poolNames[y]]
   945  		})
   946  
    947  		// corner case: after the division, the total count may exceed the available total
   948  		for totalProportionalPoolsQuantity > availableSize {
   949  			curTotalProportionalPoolsQuantity := totalProportionalPoolsQuantity
   950  
   951  			for _, poolName := range poolNames {
   952  				quantity := proportionalPoolsQuantityMap[poolName]
   953  
   954  				if quantity > 1 && totalProportionalPoolsQuantity > 0 {
   955  					quantity--
   956  					totalProportionalPoolsQuantity--
   957  					proportionalPoolsQuantityMap[poolName] = quantity
   958  
   959  					if totalProportionalPoolsQuantity == availableSize {
   960  						break
   961  					}
   962  				}
   963  			}
   964  
    965  			// no pool can be shrunk further (each already has only one cpu), so stop
   966  			if curTotalProportionalPoolsQuantity == totalProportionalPoolsQuantity {
   967  				break
   968  			}
   969  		}
   970  
   971  		general.Infof("poolsQuantityMap: %v, proportionalPoolsQuantityMap: %v, availableSize: %d",
   972  			poolsQuantityMap, proportionalPoolsQuantityMap, availableSize)
   973  
    974  		// availableSize can't give every pool at least one cpu,
    975  		// so we make all pools equal to availableCPUs in this case.
   976  		if totalProportionalPoolsQuantity > availableSize {
   977  			for poolName := range poolsQuantityMap {
   978  				poolsCPUSet[poolName] = availableCPUs.Clone()
   979  			}
   980  		} else {
   981  			poolsCPUSet, availableCPUs, tErr = p.takeCPUsForPools(proportionalPoolsQuantityMap, availableCPUs)
   982  			if tErr != nil {
   983  				err = fmt.Errorf("allocate cpus for pools failed with error: %v", tErr)
   984  				return
   985  			}
   986  		}
   987  	}
   988  
   989  	if poolsCPUSet[state.PoolNameReserve].IsEmpty() {
   990  		poolsCPUSet[state.PoolNameReserve] = p.reservedCPUs.Clone()
   991  		general.Infof("set pool %s:%s", state.PoolNameReserve, poolsCPUSet[state.PoolNameReserve].String())
   992  	} else {
   993  		err = fmt.Errorf("static pool %s result: %s is generated dynamically", state.PoolNameReserve, poolsCPUSet[state.PoolNameReserve].String())
   994  		return
   995  	}
   996  
   997  	poolsCPUSet[state.PoolNameReclaim] = poolsCPUSet[state.PoolNameReclaim].Union(availableCPUs)
   998  	if poolsCPUSet[state.PoolNameReclaim].IsEmpty() {
    999  		// the reclaimed pool must exist even when the node isn't in hybrid mode, even if that causes overlap
  1000  		allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs)
  1001  		reclaimedCPUSet, _, tErr := calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize)
  1002  		if tErr != nil {
   1003  			err = fmt.Errorf("fallback takeByNUMABalance failed in generatePoolsAndIsolation for reclaimedCPUSet with error: %v", tErr)
  1004  			return
  1005  		}
  1006  
  1007  		general.Infof("fallback takeByNUMABalance in generatePoolsAndIsolation for reclaimedCPUSet: %s", reclaimedCPUSet.String())
  1008  		poolsCPUSet[state.PoolNameReclaim] = reclaimedCPUSet
  1009  	}
  1010  
  1011  	enableReclaim := p.dynamicConfig.GetDynamicConfiguration().EnableReclaim
  1012  	if !enableReclaim && poolsCPUSet[state.PoolNameReclaim].Size() > reservedReclaimedCPUsSize {
  1013  		poolsCPUSet[state.PoolNameReclaim] = p.apportionReclaimedPool(poolsCPUSet, poolsCPUSet[state.PoolNameReclaim].Clone())
  1014  		general.Infof("apportionReclaimedPool finished, current %s pool: %s",
  1015  			state.PoolNameReclaim, poolsCPUSet[state.PoolNameReclaim].String())
  1016  	}
  1017  
  1018  	return
  1019  }
  1020  
   1021  // apportionReclaimedPool tries to allocate reclaimed cores to non-reclaimed pools.
   1022  // if reclaim is disabled on the current node, this can be used as a downgrade strategy
   1023  // to disable reclaimed workloads in an emergency.
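// the reclaimed cpus are redistributed to non-resident pools in proportion to their current
// sizes, while always keeping at least reservedReclaimedCPUsSize cpus in the reclaim pool.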
  1024  func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CPUSet, reclaimedCPUs machine.CPUSet) machine.CPUSet {
  1025  	totalSize := 0
  1026  	for poolName, poolCPUs := range poolsCPUSet {
  1027  		if state.ResidentPools.Has(poolName) {
  1028  			continue
  1029  		}
  1030  		totalSize += poolCPUs.Size()
  1031  	}
  1032  
  1033  	availableSize := reclaimedCPUs.Size() - reservedReclaimedCPUsSize
  1034  	if availableSize <= 0 || totalSize == 0 {
  1035  		return reclaimedCPUs
  1036  	}
  1037  
  1038  	for poolName, poolCPUs := range poolsCPUSet {
  1039  		if state.ResidentPools.Has(poolName) {
  1040  			continue
  1041  		}
  1042  		proportionalSize := general.Max(getProportionalSize(poolCPUs.Size(), totalSize, availableSize, false /*ceil*/), 1)
  1043  
  1044  		var err error
  1045  		var cpuset machine.CPUSet
  1046  		cpuset, reclaimedCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, reclaimedCPUs, proportionalSize)
  1047  		if err != nil {
  1048  			general.Errorf("take %d cpus from reclaimedCPUs: %s, size: %d failed with error: %v",
  1049  				proportionalSize, reclaimedCPUs.String(), reclaimedCPUs.Size(), err)
  1050  			return reclaimedCPUs
  1051  		}
  1052  
  1053  		poolsCPUSet[poolName] = poolCPUs.Union(cpuset)
  1054  		general.Infof("take %s to %s; prev: %s, current: %s", cpuset.String(), poolName, poolCPUs.String(), poolsCPUSet[poolName].String())
  1055  
  1056  		if reclaimedCPUs.Size() <= reservedReclaimedCPUsSize {
  1057  			break
  1058  		}
  1059  	}
  1060  
  1061  	return reclaimedCPUs
  1062  }
  1063  
  1064  // takeCPUsForPools tries to allocate cpuset for each given pool,
  1065  // and it will consider the total available cpuset during calculation.
  1066  // the returned value includes cpuset pool map and remaining available cpuset.
  1067  func (p *DynamicPolicy) takeCPUsForPools(poolsQuantityMap map[string]int,
  1068  	availableCPUs machine.CPUSet,
  1069  ) (map[string]machine.CPUSet, machine.CPUSet, error) {
  1070  	poolsCPUSet := make(map[string]machine.CPUSet)
  1071  	clonedAvailableCPUs := availableCPUs.Clone()
  1072  
   1073  	// iterate pools in sorted order to avoid generating them in random map-iteration order
  1074  	sortedPoolNames := general.GetSortedMapKeys(poolsQuantityMap)
  1075  	for _, poolName := range sortedPoolNames {
  1076  		req := poolsQuantityMap[poolName]
  1077  		general.Infof("allocated for pool: %s with req: %d", poolName, req)
  1078  
  1079  		var err error
  1080  		var cset machine.CPUSet
  1081  		cset, availableCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, req)
  1082  		if err != nil {
  1083  			return nil, clonedAvailableCPUs, fmt.Errorf("take cpu for pool: %s of req: %d failed with error: %v",
  1084  				poolName, req, err)
  1085  		}
  1086  		poolsCPUSet[poolName] = cset
  1087  	}
  1088  	return poolsCPUSet, availableCPUs, nil
  1089  }
  1090  
  1091  // takeCPUsForContainers tries to allocate cpuset for the given pod/container combinations,
  1092  // and it will consider the total available cpuset during calculation.
  1093  // the returned value includes cpuset map for pod/container combinations and remaining available cpuset.
  1094  func (p *DynamicPolicy) takeCPUsForContainers(containersQuantityMap map[string]map[string]int,
  1095  	availableCPUs machine.CPUSet,
  1096  ) (map[string]map[string]machine.CPUSet, machine.CPUSet, error) {
  1097  	containersCPUSet := make(map[string]map[string]machine.CPUSet)
  1098  	clonedAvailableCPUs := availableCPUs.Clone()
  1099  
  1100  	for podUID, containerQuantities := range containersQuantityMap {
  1101  		if len(containerQuantities) > 0 {
  1102  			containersCPUSet[podUID] = make(map[string]machine.CPUSet)
  1103  		}
  1104  
  1105  		for containerName, quantity := range containerQuantities {
  1106  			general.Infof("allocated for pod: %s container: %s with req: %d", podUID, containerName, quantity)
  1107  
  1108  			var err error
  1109  			var cset machine.CPUSet
  1110  			cset, availableCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, quantity)
  1111  			if err != nil {
  1112  				return nil, clonedAvailableCPUs, fmt.Errorf("take cpu for pod: %s container: %s of req: %d failed with error: %v",
  1113  					podUID, containerName, quantity, err)
  1114  			}
  1115  			containersCPUSet[podUID][containerName] = cset
  1116  		}
  1117  	}
  1118  	return containersCPUSet, availableCPUs, nil
  1119  }
  1120  
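// shouldSharedCoresRampUp returns true when the pod can't be fetched from the metaServer or is
// still pending (i.e. it is being admitted for the first time); a pod already past pending is
// assigned to its pool directly without ramp-up.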
  1121  func (p *DynamicPolicy) shouldSharedCoresRampUp(podUID string) bool {
  1122  	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
  1123  	defer cancel()
  1124  	pod, err := p.metaServer.GetPod(ctx, podUID)
  1125  
  1126  	if err != nil {
   1127  		general.Errorf("get pod failed with error: %v, try to ramp it up", err)
  1128  		return true
  1129  	} else if pod == nil {
   1130  		general.Infof("can't get pod: %s from metaServer, try to ramp it up", podUID)
  1131  		return true
  1132  	} else if !native.PodIsPending(pod) {
   1133  		general.Infof("pod: %s/%s isn't pending (not admitted for the first time), do not ramp it up", pod.Namespace, pod.Name)
  1134  		return false
  1135  	} else {
   1136  		general.Infof("pod: %s/%s isn't active, try to ramp it up", pod.Namespace, pod.Name)
  1137  		return true
  1138  	}
  1139  }
  1140  
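// doAndCheckPutAllocationInfo puts the given allocationInfo into its specified pool via
// putAllocationsAndAdjustAllocationEntries and then verifies that the entry still exists
// in the state after the adjustment.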
  1141  func (p *DynamicPolicy) doAndCheckPutAllocationInfo(allocationInfo *state.AllocationInfo, incrByReq bool) error {
  1142  	if allocationInfo == nil {
  1143  		return fmt.Errorf("doAndCheckPutAllocationInfo got nil allocationInfo")
  1144  	}
  1145  
   1146  	// we need to adjust pools; putAllocationsAndAdjustAllocationEntries will set the allocationInfo after the adjustment
  1147  	err := p.putAllocationsAndAdjustAllocationEntries([]*state.AllocationInfo{allocationInfo}, incrByReq)
  1148  	if err != nil {
  1149  		general.Errorf("pod: %s/%s, container: %s putAllocationsAndAdjustAllocationEntries failed with error: %v",
  1150  			allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, err)
  1151  		return fmt.Errorf("putAllocationsAndAdjustAllocationEntries failed with error: %v", err)
  1152  	}
  1153  
  1154  	checkedAllocationInfo := p.state.GetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName)
  1155  	if checkedAllocationInfo == nil {
  1156  		general.Errorf("pod: %s/%s, container: %s get nil allocationInfo after putAllocationsAndAdjustAllocationEntries",
  1157  			allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
   1158  		return fmt.Errorf("got nil allocationInfo after putAllocationsAndAdjustAllocationEntries")
  1159  	}
  1160  
  1161  	return nil
  1162  }