github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package dynamicpolicy
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"google.golang.org/grpc"
    26  	v1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/util/sets"
    28  	"k8s.io/apimachinery/pkg/util/wait"
    29  	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
    30  	maputil "k8s.io/kubernetes/pkg/util/maps"
    31  	"k8s.io/utils/clock"
    32  
    33  	"github.com/kubewharf/katalyst-api/pkg/consts"
    34  	"github.com/kubewharf/katalyst-api/pkg/plugins/skeleton"
    35  	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
    36  	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
    37  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc"
    38  	cpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/consts"
    39  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/calculator"
    40  	advisorapi "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor"
    41  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpueviction"
    42  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
    43  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/validator"
    44  	cpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/util"
    45  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
    46  	"github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler"
    47  	"github.com/kubewharf/katalyst-core/pkg/config"
    48  	dynamicconfig "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic"
    49  	"github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic/crd"
    50  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    51  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    52  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    53  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    54  	"github.com/kubewharf/katalyst-core/pkg/util/machine"
    55  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    56  	"github.com/kubewharf/katalyst-core/pkg/util/process"
    57  	"github.com/kubewharf/katalyst-core/pkg/util/timemonitor"
    58  )
    59  
    60  const (
    61  	cpuPluginStateFileName = "cpu_plugin_state"
    62  
    63  	reservedReclaimedCPUsSize = 4
    64  
    65  	cpusetCheckPeriod = 10 * time.Second
    66  	stateCheckPeriod  = 30 * time.Second
    67  	maxResidualTime   = 5 * time.Minute
    68  	syncCPUIdlePeriod = 30 * time.Second
    69  
    70  	healthCheckTolerationTimes = 3
    71  )
    72  
    73  var (
    74  	readonlyStateLock sync.RWMutex
    75  	readonlyState     state.ReadonlyState
    76  )
    77  
    78  // GetReadonlyState returns state.ReadonlyState to provide a way
    79  // to obtain the running state of the plugin
    80  func GetReadonlyState() (state.ReadonlyState, error) {
    81  	readonlyStateLock.RLock()
    82  	defer readonlyStateLock.RUnlock()
    83  
    84  	if readonlyState == nil {
    85  		return nil, fmt.Errorf("readonlyState isn't set")
    86  	}
    87  	return readonlyState, nil
    88  }
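
// Below is a minimal usage sketch (not code from this repository's callers): it
// assumes a read-only consumer such as an eviction or reporting component, and
// that GetPodEntries is exposed by state.ReadonlyState as it is used elsewhere
// in this file. Callers should tolerate the "isn't set" error during startup.
//
//	if ro, err := dynamicpolicy.GetReadonlyState(); err != nil {
//		// the plugin has not been constructed yet; retry later
//	} else {
//		_ = ro.GetPodEntries() // inspect allocations without mutating plugin state
//	}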
    89  
    90  // DynamicPolicy is the policy used by default;
    91  // it takes dynamic runtime information into account to calculate
    92  // and adjust resource requirements and configurations
    93  type DynamicPolicy struct {
    94  	sync.RWMutex
    95  	name    string
    96  	stopCh  chan struct{}
    97  	started bool
    98  
    99  	emitter     metrics.MetricEmitter
   100  	metaServer  *metaserver.MetaServer
   101  	machineInfo *machine.KatalystMachineInfo
   102  
   103  	advisorClient    advisorapi.CPUAdvisorClient
   104  	advisorConn      *grpc.ClientConn
   105  	advisorValidator *validator.CPUAdvisorValidator
   106  	advisorapi.UnimplementedCPUPluginServer
   107  	lwRecvTimeMonitor *timemonitor.TimeMonitor
   108  
   109  	state              state.State
   110  	residualHitMap     map[string]int64
   111  	allocationHandlers map[string]util.AllocationHandler
   112  	hintHandlers       map[string]util.HintHandler
   113  
   114  	cpuPressureEviction       agent.Component
   115  	cpuPressureEvictionCancel context.CancelFunc
   116  
   117  	// these fields are parsed from configuration
   118  	// todo: if we want to use dynamic configuration, we'd better not use self-defined conf
   119  	enableCPUAdvisor              bool
   120  	reservedCPUs                  machine.CPUSet
   121  	cpuAdvisorSocketAbsPath       string
   122  	cpuPluginSocketAbsPath        string
   123  	extraStateFileAbsPath         string
   124  	enableCPUIdle                 bool
   125  	enableSyncingCPUIdle          bool
   126  	reclaimRelativeRootCgroupPath string
   127  	qosConfig                     *generic.QoSConfiguration
   128  	dynamicConfig                 *dynamicconfig.DynamicAgentConfiguration
   129  	podDebugAnnoKeys              []string
   130  	transitionPeriod              time.Duration
   131  }
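
// Note on locking (a descriptive remark, inferred from the method bodies below):
// the embedded sync.RWMutex guards the fields above together with the
// checkpointed state; mutating entry points such as Allocate, RemovePod and
// GetResourcesAllocation take the write lock, while read-only paths such as
// GetTopologyHints and GetTopologyAwareResources take the read lock.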
   132  
   133  func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration,
   134  	_ interface{}, agentName string,
   135  ) (bool, agent.Component, error) {
   136  	reservedCPUs, reserveErr := cpuutil.GetCoresReservedForSystem(conf, agentCtx.MetaServer, agentCtx.KatalystMachineInfo, agentCtx.CPUDetails.CPUs().Clone())
   137  	if reserveErr != nil {
   138  		return false, agent.ComponentStub{}, fmt.Errorf("GetCoresReservedForSystem for reservedCPUsNum: %d failed with error: %v",
   139  			conf.ReservedCPUCores, reserveErr)
   140  	}
   141  
   142  	stateImpl, stateErr := state.NewCheckpointState(conf.GenericQRMPluginConfiguration.StateFileDirectory, cpuPluginStateFileName,
   143  		cpuconsts.CPUResourcePluginPolicyNameDynamic, agentCtx.CPUTopology, conf.SkipCPUStateCorruption)
   144  	if stateErr != nil {
   145  		return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", stateErr)
   146  	}
   147  
   148  	readonlyStateLock.Lock()
   149  	readonlyState = stateImpl
   150  	readonlyStateLock.Unlock()
   151  
   152  	wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{
   153  		Key: util.QRMPluginPolicyTagName,
   154  		Val: cpuconsts.CPUResourcePluginPolicyNameDynamic,
   155  	})
   156  
   157  	var (
   158  		cpuPressureEviction agent.Component
   159  		err                 error
   160  	)
   161  	if conf.EnableCPUPressureEviction {
   162  		cpuPressureEviction, err = cpueviction.NewCPUPressureEviction(
   163  			agentCtx.EmitterPool.GetDefaultMetricsEmitter(), agentCtx.MetaServer, conf, stateImpl)
   164  		if err != nil {
   165  			return false, agent.ComponentStub{}, err
   166  		}
   167  	}
   168  
   169  	// since reservedCPUs won't influence stateImpl directly,
   170  	// we don't modify stateImpl with reservedCPUs here.
   171  	// for pods that have already been allocated reservedCPUs,
   172  	// we won't touch them and will wait for them to be deleted in the next update.
   173  	policyImplement := &DynamicPolicy{
   174  		name:   fmt.Sprintf("%s_%s", agentName, cpuconsts.CPUResourcePluginPolicyNameDynamic),
   175  		stopCh: make(chan struct{}),
   176  
   177  		machineInfo: agentCtx.KatalystMachineInfo,
   178  		emitter:     wrappedEmitter,
   179  		metaServer:  agentCtx.MetaServer,
   180  
   181  		state:          stateImpl,
   182  		residualHitMap: make(map[string]int64),
   183  
   184  		advisorValidator: validator.NewCPUAdvisorValidator(stateImpl, agentCtx.KatalystMachineInfo),
   185  
   186  		cpuPressureEviction: cpuPressureEviction,
   187  
   188  		qosConfig:                     conf.QoSConfiguration,
   189  		dynamicConfig:                 conf.DynamicAgentConfiguration,
   190  		cpuAdvisorSocketAbsPath:       conf.CPUAdvisorSocketAbsPath,
   191  		cpuPluginSocketAbsPath:        conf.CPUPluginSocketAbsPath,
   192  		enableCPUAdvisor:              conf.CPUQRMPluginConfig.EnableCPUAdvisor,
   193  		reservedCPUs:                  reservedCPUs,
   194  		extraStateFileAbsPath:         conf.ExtraStateFileAbsPath,
   195  		enableSyncingCPUIdle:          conf.CPUQRMPluginConfig.EnableSyncingCPUIdle,
   196  		enableCPUIdle:                 conf.CPUQRMPluginConfig.EnableCPUIdle,
   197  		reclaimRelativeRootCgroupPath: conf.ReclaimRelativeRootCgroupPath,
   198  		podDebugAnnoKeys:              conf.PodDebugAnnoKeys,
   199  		transitionPeriod:              30 * time.Second,
   200  	}
   201  
   202  	// register allocation behaviors for pods with different QoS levels
   203  	policyImplement.allocationHandlers = map[string]util.AllocationHandler{
   204  		consts.PodAnnotationQoSLevelSharedCores:    policyImplement.sharedCoresAllocationHandler,
   205  		consts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresAllocationHandler,
   206  		consts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresAllocationHandler,
   207  	}
   208  
   209  	// register hint providers for pods with different QoS levels
   210  	policyImplement.hintHandlers = map[string]util.HintHandler{
   211  		consts.PodAnnotationQoSLevelSharedCores:    policyImplement.sharedCoresHintHandler,
   212  		consts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresHintHandler,
   213  		consts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresHintHandler,
   214  	}
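
	// Both maps are keyed by the pod's QoS-level annotation value; a hedged
	// sketch of how they are consumed later in this file (see Allocate and
	// GetTopologyHints), shown here only to illustrate the dispatch pattern:
	//
	//	if handler := policyImplement.allocationHandlers[qosLevel]; handler != nil {
	//		return handler(ctx, req) // e.g. sharedCoresAllocationHandler for shared cores
	//	}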
   215  
   216  	state.SetContainerRequestedCores(policyImplement.getContainerRequestedCores)
   217  
   218  	if err := policyImplement.cleanPools(); err != nil {
   219  		return false, agent.ComponentStub{}, fmt.Errorf("cleanPools failed with error: %v", err)
   220  	}
   221  
   222  	if err := policyImplement.initReservePool(); err != nil {
   223  		return false, agent.ComponentStub{}, fmt.Errorf("dynamic policy initReservePool failed with error: %v", err)
   224  	}
   225  
   226  	if err := policyImplement.initReclaimPool(); err != nil {
   227  		return false, agent.ComponentStub{}, fmt.Errorf("dynamic policy initReclaimPool failed with error: %v", err)
   228  	}
   229  
   230  	err = agentCtx.MetaServer.ConfigurationManager.AddConfigWatcher(crd.AdminQoSConfigurationGVR)
   231  	if err != nil {
   232  		return false, nil, err
   233  	}
   234  
   235  	pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, func(key string, value int64) {
   236  		_ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw)
   237  	})
   238  	if err != nil {
   239  		return false, agent.ComponentStub{}, fmt.Errorf("dynamic policy new plugin wrapper failed with error: %v", err)
   240  	}
   241  
   242  	return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil
   243  }
   244  
   245  func (p *DynamicPolicy) Name() string {
   246  	return p.name
   247  }
   248  
   249  func (p *DynamicPolicy) ResourceName() string {
   250  	return string(v1.ResourceCPU)
   251  }
   252  
   253  func (p *DynamicPolicy) Start() (err error) {
   254  	general.Infof("called")
   255  
   256  	p.Lock()
   257  	defer func() {
   258  		if !p.started {
   259  			if err == nil {
   260  				p.started = true
   261  			} else {
   262  				close(p.stopCh)
   263  			}
   264  		}
   265  		p.Unlock()
   266  	}()
   267  
   268  	if p.started {
   269  		general.Infof("is already started")
   270  		return nil
   271  	}
   272  	p.stopCh = make(chan struct{})
   273  
   274  	go wait.Until(func() {
   275  		_ = p.emitter.StoreInt64(util.MetricNameHeartBeat, 1, metrics.MetricTypeNameRaw)
   276  	}, time.Second*30, p.stopCh)
   277  
   278  	err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(cpuconsts.ClearResidualState, general.HealthzCheckStateNotReady,
   279  		qrm.QRMCPUPluginPeriodicalHandlerGroupName, p.clearResidualState, stateCheckPeriod, healthCheckTolerationTimes)
   280  	if err != nil {
   281  		general.Errorf("start %v failed, err: %v", cpuconsts.ClearResidualState, err)
   282  	}
   283  
   284  	err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(cpuconsts.CheckCPUSet, general.HealthzCheckStateNotReady,
   285  		qrm.QRMCPUPluginPeriodicalHandlerGroupName, p.checkCPUSet, cpusetCheckPeriod, healthCheckTolerationTimes)
   286  	if err != nil {
   287  		general.Errorf("start %v failed, err: %v", cpuconsts.CheckCPUSet, err)
   288  	}
   289  
   290  	// start cpu-idle syncing if needed
   291  	if p.enableSyncingCPUIdle {
   292  		general.Infof("syncCPUIdle enabled")
   293  
   294  		if p.reclaimRelativeRootCgroupPath == "" {
   295  			return fmt.Errorf("syncing cpu idle is enabled but reclaimed relative root cgroup path is not set in configuration")
   296  		}
   297  
   298  		err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(cpuconsts.SyncCPUIdle, general.HealthzCheckStateNotReady,
   299  			qrm.QRMCPUPluginPeriodicalHandlerGroupName, p.syncCPUIdle, syncCPUIdlePeriod, healthCheckTolerationTimes)
   300  		if err != nil {
   301  			general.Errorf("start %v failed, err: %v", cpuconsts.SyncCPUIdle, err)
   302  		}
   303  	}
   304  
   305  	// start cpu-pressure eviction plugin if needed
   306  	if p.cpuPressureEviction != nil {
   307  		var ctx context.Context
   308  		ctx, p.cpuPressureEvictionCancel = context.WithCancel(context.Background())
   309  		go p.cpuPressureEviction.Run(ctx)
   310  	}
   311  
   312  	go wait.Until(func() {
   313  		periodicalhandler.ReadyToStartHandlersByGroup(qrm.QRMCPUPluginPeriodicalHandlerGroupName)
   314  	}, 5*time.Second, p.stopCh)
   315  
   316  	// pre-check necessary socket paths if sys-advisor is enabled
   317  	if !p.enableCPUAdvisor {
   318  		general.Infof("start dynamic policy cpu plugin without sys-advisor")
   319  		return nil
   320  	} else if p.cpuAdvisorSocketAbsPath == "" || p.cpuPluginSocketAbsPath == "" {
   321  		return fmt.Errorf("invalid cpuAdvisorSocketAbsPath: %s or cpuPluginSocketAbsPath: %s",
   322  			p.cpuAdvisorSocketAbsPath, p.cpuPluginSocketAbsPath)
   323  	}
   324  
   325  	general.Infof("start dynamic policy cpu plugin with sys-advisor")
   326  	err = p.initAdvisorClientConn()
   327  	if err != nil {
   328  		general.Errorf("initAdvisorClientConn failed with error: %v", err)
   329  		return
   330  	}
   331  
   332  	go wait.BackoffUntil(func() { p.serveForAdvisor(p.stopCh) }, wait.NewExponentialBackoffManager(
   333  		800*time.Millisecond, 30*time.Second, 2*time.Minute, 2.0, 0, &clock.RealClock{}), true, p.stopCh)
   334  
   335  	communicateWithCPUAdvisorServer := func() {
   336  		general.Infof("waiting for cpu plugin checkpoint server serving confirmation")
   337  		if conn, err := process.Dial(p.cpuPluginSocketAbsPath, 5*time.Second); err != nil {
   338  			general.Errorf("dial check at socket: %s failed with err: %v", p.cpuPluginSocketAbsPath, err)
   339  			return
   340  		} else {
   341  			_ = conn.Close()
   342  		}
   343  		general.Infof("cpu plugin checkpoint server serving confirmed")
   344  
   345  		err = p.pushCPUAdvisor()
   346  		if err != nil {
   347  			general.Errorf("sync existing containers to cpu advisor failed with error: %v", err)
   348  			return
   349  		}
   350  		general.Infof("sync existing containers to cpu advisor successfully")
   351  
   352  		// call the list-watch (lw) method of CPUAdvisorServer and do allocation
   353  		if err = p.lwCPUAdvisorServer(p.stopCh); err != nil {
   354  			general.Errorf("lwCPUAdvisorServer failed with error: %v", err)
   355  		} else {
   356  			general.Infof("lwCPUAdvisorServer finished")
   357  		}
   358  	}
   359  
   360  	general.RegisterHeartbeatCheck(cpuconsts.CommunicateWithAdvisor, 2*time.Minute, general.HealthzCheckStateNotReady, 2*time.Minute)
   361  	go wait.BackoffUntil(communicateWithCPUAdvisorServer, wait.NewExponentialBackoffManager(800*time.Millisecond,
   362  		30*time.Second, 2*time.Minute, 2.0, 0, &clock.RealClock{}), true, p.stopCh)
   363  
   364  	p.lwRecvTimeMonitor = timemonitor.NewTimeMonitor(cpuAdvisorLWRecvTimeMonitorName,
   365  		cpuAdvisorLWRecvTimeMonitorDurationThreshold, cpuAdvisorLWRecvTimeMonitorInterval,
   366  		util.MetricNameLWRecvStuck, p.emitter)
   367  	go p.lwRecvTimeMonitor.Run(p.stopCh)
   368  	return nil
   369  }
   370  
   371  func (p *DynamicPolicy) Stop() error {
   372  	p.Lock()
   373  	defer func() {
   374  		p.started = false
   375  		p.Unlock()
   376  		general.Infof("stopped")
   377  	}()
   378  
   379  	if !p.started {
   380  		general.Warningf("already stopped")
   381  		return nil
   382  	}
   383  
   384  	close(p.stopCh)
   385  
   386  	if p.cpuPressureEvictionCancel != nil {
   387  		p.cpuPressureEvictionCancel()
   388  	}
   389  
   390  	periodicalhandler.StopHandlersByGroup(qrm.QRMCPUPluginPeriodicalHandlerGroupName)
   391  
   392  	if p.advisorConn != nil {
   393  		return p.advisorConn.Close()
   394  	}
   395  
   396  	return nil
   397  }
   398  
   399  // GetResourcesAllocation returns allocation results of corresponding resources
   400  func (p *DynamicPolicy) GetResourcesAllocation(_ context.Context,
   401  	req *pluginapi.GetResourcesAllocationRequest,
   402  ) (*pluginapi.GetResourcesAllocationResponse, error) {
   403  	if req == nil {
   404  		return nil, fmt.Errorf("GetResourcesAllocation got nil req")
   405  	}
   406  
   407  	general.Infof("called")
   408  	p.Lock()
   409  	defer p.Unlock()
   410  
   411  	podEntries := p.state.GetPodEntries()
   412  	machineState := p.state.GetMachineState()
   413  
   414  	// pooledCPUs is the set of all available CPU cores minus those that are reserved
   415  	pooledCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs,
   416  		state.CheckDedicated, state.CheckDedicatedNUMABinding)
   417  	pooledCPUsTopologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, pooledCPUs)
   418  	if err != nil {
   419  		return nil, fmt.Errorf("GetNumaAwareAssignments err: %v", err)
   420  	}
   421  
   422  	podResources := make(map[string]*pluginapi.ContainerResources)
   423  	var allocationInfosJustFinishRampUp []*state.AllocationInfo
   424  	for podUID, containerEntries := range podEntries {
   425  		// if it's a pool entry, don't return it to QRM
   426  		if containerEntries.IsPoolEntry() {
   427  			continue
   428  		}
   429  
   430  		if podResources[podUID] == nil {
   431  			podResources[podUID] = &pluginapi.ContainerResources{}
   432  		}
   433  
   434  		for containerName, allocationInfo := range containerEntries {
   435  			if allocationInfo == nil {
   436  				continue
   437  			}
   438  			allocationInfo = allocationInfo.Clone()
   439  
   440  			initTs, tsErr := time.Parse(util.QRMTimeFormat, allocationInfo.InitTimestamp)
   441  			if tsErr != nil {
   442  				if state.CheckShared(allocationInfo) {
   443  					general.Errorf("pod: %s/%s, container: %s init timestamp parsed failed with error: %v, re-ramp-up it",
   444  						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, tsErr)
   445  
   446  					clonedPooledCPUs := pooledCPUs.Clone()
   447  					clonedPooledCPUsTopologyAwareAssignments := machine.DeepcopyCPUAssignment(pooledCPUsTopologyAwareAssignments)
   448  
   449  					allocationInfo.AllocationResult = clonedPooledCPUs
   450  					allocationInfo.OriginalAllocationResult = clonedPooledCPUs
   451  					allocationInfo.TopologyAwareAssignments = clonedPooledCPUsTopologyAwareAssignments
   452  					allocationInfo.OriginalTopologyAwareAssignments = clonedPooledCPUsTopologyAwareAssignments
   453  					// fill OwnerPoolName with empty string when ramping up
   454  					allocationInfo.OwnerPoolName = advisorapi.EmptyOwnerPoolName
   455  					allocationInfo.RampUp = true
   456  				}
   457  
   458  				allocationInfo.InitTimestamp = time.Now().Format(util.QRMTimeFormat)
   459  				p.state.SetAllocationInfo(podUID, containerName, allocationInfo)
   460  			} else if allocationInfo.RampUp && time.Now().After(initTs.Add(p.transitionPeriod)) {
   461  				general.Infof("pod: %s/%s, container: %s ramp up finished", allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
   462  				allocationInfo.RampUp = false
   463  				p.state.SetAllocationInfo(podUID, containerName, allocationInfo)
   464  
   465  				if state.CheckShared(allocationInfo) {
   466  					allocationInfosJustFinishRampUp = append(allocationInfosJustFinishRampUp, allocationInfo)
   467  				}
   468  			}
   469  
   470  			if podResources[podUID].ContainerResources == nil {
   471  				podResources[podUID].ContainerResources = make(map[string]*pluginapi.ResourceAllocation)
   472  			}
   473  			podResources[podUID].ContainerResources[containerName] = &pluginapi.ResourceAllocation{
   474  				ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{
   475  					string(v1.ResourceCPU): {
   476  						OciPropertyName:   util.OCIPropertyNameCPUSetCPUs,
   477  						IsNodeResource:    false,
   478  						IsScalarResource:  true,
   479  						AllocatedQuantity: float64(allocationInfo.AllocationResult.Size()),
   480  						AllocationResult:  allocationInfo.AllocationResult.String(),
   481  					},
   482  				},
   483  			}
   484  		}
   485  	}
   486  
   487  	if len(allocationInfosJustFinishRampUp) > 0 {
   488  		if err = p.putAllocationsAndAdjustAllocationEntries(allocationInfosJustFinishRampUp, true); err != nil {
   489  			// don't influence the response returned to kubelet when putAllocationsAndAdjustAllocationEntries fails
   490  			general.Errorf("putAllocationsAndAdjustAllocationEntries failed with error: %v", err)
   491  		}
   492  	}
   493  
   494  	return &pluginapi.GetResourcesAllocationResponse{
   495  		PodResources: podResources,
   496  	}, nil
   497  }
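
// For illustration only (the cpuset values below are hypothetical): a container
// bound to CPUs 0-3 and 8-11 would be reported back to QRM roughly as
//
//	ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{
//		"cpu": {
//			OciPropertyName:   util.OCIPropertyNameCPUSetCPUs,
//			IsScalarResource:  true,
//			AllocatedQuantity: 8,
//			AllocationResult:  "0-3,8-11",
//		},
//	}
//
// which the QRM side applies to the container's cpuset cgroup.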
   498  
   499  // GetTopologyAwareResources returns allocation results of corresponding resources in topology-aware format
   500  func (p *DynamicPolicy) GetTopologyAwareResources(_ context.Context,
   501  	req *pluginapi.GetTopologyAwareResourcesRequest,
   502  ) (*pluginapi.GetTopologyAwareResourcesResponse, error) {
   503  	if req == nil {
   504  		return nil, fmt.Errorf("GetTopologyAwareResources got nil req")
   505  	}
   506  
   507  	general.Infof("called")
   508  	p.RLock()
   509  	defer p.RUnlock()
   510  
   511  	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
   512  	if allocationInfo == nil {
   513  		return nil, fmt.Errorf("pod: %s, container: %s is not found in cpu plugin state", req.PodUid, req.ContainerName)
   514  	}
   515  
   516  	resp := &pluginapi.GetTopologyAwareResourcesResponse{
   517  		PodUid:       allocationInfo.PodUid,
   518  		PodName:      allocationInfo.PodName,
   519  		PodNamespace: allocationInfo.PodNamespace,
   520  		ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{
   521  			ContainerName: allocationInfo.ContainerName,
   522  		},
   523  	}
   524  
   525  	if allocationInfo.CheckSideCar() {
   526  		resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{
   527  			string(v1.ResourceCPU): {
   528  				IsNodeResource:                    false,
   529  				IsScalarResource:                  true,
   530  				AggregatedQuantity:                0,
   531  				OriginalAggregatedQuantity:        0,
   532  				TopologyAwareQuantityList:         nil,
   533  				OriginalTopologyAwareQuantityList: nil,
   534  			},
   535  		}
   536  	} else {
   537  		resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{
   538  			string(v1.ResourceCPU): {
   539  				IsNodeResource:                    false,
   540  				IsScalarResource:                  true,
   541  				AggregatedQuantity:                float64(allocationInfo.AllocationResult.Size()),
   542  				OriginalAggregatedQuantity:        float64(allocationInfo.OriginalAllocationResult.Size()),
   543  				TopologyAwareQuantityList:         util.GetTopologyAwareQuantityFromAssignments(allocationInfo.TopologyAwareAssignments),
   544  				OriginalTopologyAwareQuantityList: util.GetTopologyAwareQuantityFromAssignments(allocationInfo.OriginalTopologyAwareAssignments),
   545  			},
   546  		}
   547  	}
   548  
   549  	return resp, nil
   550  }
   551  
   552  // GetTopologyAwareAllocatableResources returns corresponding allocatable resources in topology-aware format
   553  func (p *DynamicPolicy) GetTopologyAwareAllocatableResources(_ context.Context,
   554  	_ *pluginapi.GetTopologyAwareAllocatableResourcesRequest,
   555  ) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) {
   556  	general.Infof("is called")
   557  
   558  	numaNodes := p.machineInfo.CPUDetails.NUMANodes().ToSliceInt()
   559  	topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(numaNodes))
   560  	topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(numaNodes))
   561  
   562  	for _, numaNode := range numaNodes {
   563  		numaNodeCPUs := p.machineInfo.CPUDetails.CPUsInNUMANodes(numaNode).Clone()
   564  		topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{
   565  			ResourceValue: float64(numaNodeCPUs.Difference(p.reservedCPUs).Size()),
   566  			Node:          uint64(numaNode),
   567  		})
   568  		topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{
   569  			ResourceValue: float64(numaNodeCPUs.Size()),
   570  			Node:          uint64(numaNode),
   571  		})
   572  	}
   573  
   574  	return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{
   575  		AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{
   576  			string(v1.ResourceCPU): {
   577  				IsNodeResource:                       false,
   578  				IsScalarResource:                     true,
   579  				AggregatedAllocatableQuantity:        float64(p.machineInfo.NumCPUs - p.reservedCPUs.Size()),
   580  				TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList,
   581  				AggregatedCapacityQuantity:           float64(p.machineInfo.NumCPUs),
   582  				TopologyAwareCapacityQuantityList:    topologyAwareCapacityQuantityList,
   583  			},
   584  		},
   585  	}, nil
   586  }
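
// A worked example with made-up numbers: on a machine with 2 NUMA nodes, 48 CPUs
// (24 per node) and 4 reserved CPUs all located on node 0, the response above
// would carry AggregatedCapacityQuantity = 48, AggregatedAllocatableQuantity =
// 48 - 4 = 44, per-node capacity 24/24 and per-node allocatable 20/24.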
   587  
   588  // GetTopologyHints returns hints of corresponding resources
   589  func (p *DynamicPolicy) GetTopologyHints(ctx context.Context,
   590  	req *pluginapi.ResourceRequest,
   591  ) (resp *pluginapi.ResourceHintsResponse, err error) {
   592  	if req == nil {
   593  		return nil, fmt.Errorf("GetTopologyHints got nil req")
   594  	}
   595  
   596  	// identify whether the pod is a debug pod;
   597  	// if so, apply a specific strategy to it.
   598  	// since the GetKatalystQoSLevelFromResourceReq function will filter annotations,
   599  	// we should do this check before calling GetKatalystQoSLevelFromResourceReq.
   600  	isDebugPod := util.IsDebugPod(req.Annotations, p.podDebugAnnoKeys)
   601  
   602  	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req)
   603  	if err != nil {
   604  		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
   605  			req.PodNamespace, req.PodName, req.ContainerName, err)
   606  		general.Errorf("%s", err.Error())
   607  		return nil, err
   608  	}
   609  
   610  	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
   611  	if err != nil {
   612  		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
   613  	}
   614  
   615  	general.InfoS("called",
   616  		"podNamespace", req.PodNamespace,
   617  		"podName", req.PodName,
   618  		"containerName", req.ContainerName,
   619  		"podType", req.PodType,
   620  		"podRole", req.PodRole,
   621  		"containerType", req.ContainerType,
   622  		"qosLevel", qosLevel,
   623  		"numCPUsInt", reqInt,
   624  		"numCPUsFloat64", reqFloat64,
   625  		"isDebugPod", isDebugPod)
   626  
   627  	if req.ContainerType == pluginapi.ContainerType_INIT || isDebugPod {
   628  		general.Infof("there is no NUMA preference, return nil hint")
   629  		return util.PackResourceHintsResponse(req, string(v1.ResourceCPU),
   630  			map[string]*pluginapi.ListOfTopologyHints{
   631  				string(v1.ResourceCPU): nil, // indicates that there is no numa preference
   632  			})
   633  	}
   634  
   635  	p.RLock()
   636  	defer func() {
   637  		p.RUnlock()
   638  		if err != nil {
   639  			_ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw)
   640  		}
   641  	}()
   642  
   643  	if p.hintHandlers[qosLevel] == nil {
   644  		return nil, fmt.Errorf("katalyst QoS level: %s is not supported yet", qosLevel)
   645  	}
   646  	return p.hintHandlers[qosLevel](ctx, req)
   647  }
   648  
   649  // GetResourcePluginOptions returns options to be communicated with Resource Manager
   650  func (p *DynamicPolicy) GetResourcePluginOptions(context.Context,
   651  	*pluginapi.Empty,
   652  ) (*pluginapi.ResourcePluginOptions, error) {
   653  	general.Infof("called")
   654  	return &pluginapi.ResourcePluginOptions{
   655  		PreStartRequired:      false,
   656  		WithTopologyAlignment: true,
   657  		NeedReconcile:         true,
   658  	}, nil
   659  }
   660  
   661  // Allocate is called during pod admit so that the resource
   662  // plugin can allocate corresponding resource for the container
   663  // according to resource request
   664  func (p *DynamicPolicy) Allocate(ctx context.Context,
   665  	req *pluginapi.ResourceRequest,
   666  ) (resp *pluginapi.ResourceAllocationResponse, respErr error) {
   667  	if req == nil {
   668  		return nil, fmt.Errorf("allocate got nil req")
   669  	}
   670  
   671  	// identify whether the pod is a debug pod;
   672  	// if so, apply a specific strategy to it.
   673  	// since the GetKatalystQoSLevelFromResourceReq function will filter annotations,
   674  	// we should do this check before calling GetKatalystQoSLevelFromResourceReq.
   675  	isDebugPod := util.IsDebugPod(req.Annotations, p.podDebugAnnoKeys)
   676  
   677  	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req)
   678  	if err != nil {
   679  		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
   680  			req.PodNamespace, req.PodName, req.ContainerName, err)
   681  		general.Errorf("%s", err.Error())
   682  		return nil, err
   683  	}
   684  
   685  	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
   686  	if err != nil {
   687  		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
   688  	}
   689  
   690  	general.InfoS("called",
   691  		"podNamespace", req.PodNamespace,
   692  		"podName", req.PodName,
   693  		"containerName", req.ContainerName,
   694  		"podType", req.PodType,
   695  		"podRole", req.PodRole,
   696  		"containerType", req.ContainerType,
   697  		"qosLevel", qosLevel,
   698  		"numCPUsInt", reqInt,
   699  		"numCPUsFloat64", reqFloat64,
   700  		"isDebugPod", isDebugPod)
   701  
   702  	if req.ContainerType == pluginapi.ContainerType_INIT {
   703  		return &pluginapi.ResourceAllocationResponse{
   704  			PodUid:         req.PodUid,
   705  			PodNamespace:   req.PodNamespace,
   706  			PodName:        req.PodName,
   707  			ContainerName:  req.ContainerName,
   708  			ContainerType:  req.ContainerType,
   709  			ContainerIndex: req.ContainerIndex,
   710  			PodRole:        req.PodRole,
   711  			PodType:        req.PodType,
   712  			ResourceName:   string(v1.ResourceCPU),
   713  			Labels:         general.DeepCopyMap(req.Labels),
   714  			Annotations:    general.DeepCopyMap(req.Annotations),
   715  		}, nil
   716  	} else if isDebugPod {
   717  		return &pluginapi.ResourceAllocationResponse{
   718  			PodUid:         req.PodUid,
   719  			PodNamespace:   req.PodNamespace,
   720  			PodName:        req.PodName,
   721  			ContainerName:  req.ContainerName,
   722  			ContainerType:  req.ContainerType,
   723  			ContainerIndex: req.ContainerIndex,
   724  			PodRole:        req.PodRole,
   725  			PodType:        req.PodType,
   726  			ResourceName:   string(v1.ResourceCPU),
   727  			AllocationResult: &pluginapi.ResourceAllocation{
   728  				ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{
   729  					string(v1.ResourceCPU): {
   730  						// return ResourceAllocation with empty OciPropertyName, AllocatedQuantity, AllocationResult for containers in a debug pod,
   731  						// so it won't influence the oci spec properties of the container
   732  						IsNodeResource:   false,
   733  						IsScalarResource: true,
   734  					},
   735  				},
   736  			},
   737  			Labels:      general.DeepCopyMap(req.Labels),
   738  			Annotations: general.DeepCopyMap(req.Annotations),
   739  		}, nil
   740  	}
   741  
   742  	p.Lock()
   743  	defer func() {
   744  		// call sys-advisor to inform it about the latest container
   745  		if p.enableCPUAdvisor && respErr == nil && req.ContainerType != pluginapi.ContainerType_INIT {
   746  			_, err := p.advisorClient.AddContainer(ctx, &advisorsvc.ContainerMetadata{
   747  				PodUid:          req.PodUid,
   748  				PodNamespace:    req.PodNamespace,
   749  				PodName:         req.PodName,
   750  				ContainerName:   req.ContainerName,
   751  				ContainerType:   req.ContainerType,
   752  				ContainerIndex:  req.ContainerIndex,
   753  				Labels:          maputil.CopySS(req.Labels),
   754  				Annotations:     maputil.CopySS(req.Annotations),
   755  				QosLevel:        qosLevel,
   756  				RequestQuantity: uint64(reqInt),
   757  			})
   758  			if err != nil {
   759  				resp = nil
   760  				respErr = fmt.Errorf("add container to qos aware server failed with error: %v", err)
   761  				_ = p.removeContainer(req.PodUid, req.ContainerName)
   762  			}
   763  		} else if respErr != nil {
   764  			_ = p.removeContainer(req.PodUid, req.ContainerName)
   765  			_ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw)
   766  		}
   767  
   768  		p.Unlock()
   769  		return
   770  	}()
   771  
   772  	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
   773  	if allocationInfo != nil && allocationInfo.OriginalAllocationResult.Size() >= reqInt {
   774  		general.InfoS("already allocated and meet requirement",
   775  			"podNamespace", req.PodNamespace,
   776  			"podName", req.PodName,
   777  			"containerName", req.ContainerName,
   778  			"numCPUs", reqInt,
   779  			"originalAllocationResult", allocationInfo.OriginalAllocationResult.String(),
   780  			"currentResult", allocationInfo.AllocationResult.String())
   781  
   782  		return &pluginapi.ResourceAllocationResponse{
   783  			PodUid:         req.PodUid,
   784  			PodNamespace:   req.PodNamespace,
   785  			PodName:        req.PodName,
   786  			ContainerName:  req.ContainerName,
   787  			ContainerType:  req.ContainerType,
   788  			ContainerIndex: req.ContainerIndex,
   789  			PodRole:        req.PodRole,
   790  			PodType:        req.PodType,
   791  			ResourceName:   string(v1.ResourceCPU),
   792  			AllocationResult: &pluginapi.ResourceAllocation{
   793  				ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{
   794  					string(v1.ResourceCPU): {
   795  						OciPropertyName:   util.OCIPropertyNameCPUSetCPUs,
   796  						IsNodeResource:    false,
   797  						IsScalarResource:  true,
   798  						AllocatedQuantity: float64(allocationInfo.AllocationResult.Size()),
   799  						AllocationResult:  allocationInfo.AllocationResult.String(),
   800  					},
   801  				},
   802  			},
   803  			Labels:      general.DeepCopyMap(req.Labels),
   804  			Annotations: general.DeepCopyMap(req.Annotations),
   805  		}, nil
   806  	}
   807  
   808  	if p.allocationHandlers[qosLevel] == nil {
   809  		return nil, fmt.Errorf("katalyst QoS level: %s is not supported yet", qosLevel)
   810  	}
   811  	return p.allocationHandlers[qosLevel](ctx, req)
   812  }
   813  
   814  // PreStartContainer is called, if indicated by resource plugin during registration phase,
   815  // before each container start. Resource plugin can run resource specific operations
   816  // such as resetting the resource before making resources available to the container
   817  func (p *DynamicPolicy) PreStartContainer(context.Context,
   818  	*pluginapi.PreStartContainerRequest,
   819  ) (*pluginapi.PreStartContainerResponse, error) {
   820  	return nil, nil
   821  }
   822  
   823  func (p *DynamicPolicy) RemovePod(ctx context.Context,
   824  	req *pluginapi.RemovePodRequest,
   825  ) (resp *pluginapi.RemovePodResponse, err error) {
   826  	if req == nil {
   827  		return nil, fmt.Errorf("RemovePod got nil req")
   828  	}
   829  	general.InfoS("called", "podUID", req.PodUid)
   830  
   831  	p.Lock()
   832  	defer func() {
   833  		p.Unlock()
   834  		if err != nil {
   835  			general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid)
   836  			_ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw)
   837  		}
   838  	}()
   839  
   840  	if p.enableCPUAdvisor {
   841  		_, err = p.advisorClient.RemovePod(ctx, &advisorsvc.RemovePodRequest{PodUid: req.PodUid})
   842  		if err != nil {
   843  			return nil, fmt.Errorf("remove pod in QoS aware server failed with error: %v", err)
   844  		}
   845  	}
   846  
   847  	err = p.removePod(req.PodUid)
   848  	if err != nil {
   849  		general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid)
   850  		return nil, err
   851  	}
   852  
   853  	aErr := p.adjustAllocationEntries()
   854  	if aErr != nil {
   855  		general.ErrorS(aErr, "adjustAllocationEntries failed", "podUID", req.PodUid)
   856  	}
   857  
   858  	return &pluginapi.RemovePodResponse{}, nil
   859  }
   860  
   861  func (p *DynamicPolicy) removePod(podUID string) error {
   862  	podEntries := p.state.GetPodEntries()
   863  	if len(podEntries[podUID]) == 0 {
   864  		return nil
   865  	}
   866  	delete(podEntries, podUID)
   867  
   868  	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   869  	if err != nil {
   870  		return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   871  	}
   872  
   873  	p.state.SetPodEntries(podEntries)
   874  	p.state.SetMachineState(updatedMachineState)
   875  	return nil
   876  }
   877  
   878  func (p *DynamicPolicy) removeContainer(podUID, containerName string) error {
   879  	podEntries := p.state.GetPodEntries()
   880  
   881  	found := false
   882  	if podEntries[podUID][containerName] != nil {
   883  		found = true
   884  	}
   885  
   886  	delete(podEntries[podUID], containerName)
   887  
   888  	if !found {
   889  		return nil
   890  	}
   891  
   892  	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   893  	if err != nil {
   894  		return fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
   895  	}
   896  
   897  	p.state.SetPodEntries(podEntries)
   898  	p.state.SetMachineState(updatedMachineState)
   899  	return nil
   900  }
   901  
   902  // initAdvisorClientConn initializes cpu-advisor related connections
   903  func (p *DynamicPolicy) initAdvisorClientConn() (err error) {
   904  	cpuAdvisorConn, err := process.Dial(p.cpuAdvisorSocketAbsPath, 5*time.Second)
   905  	if err != nil {
   906  		err = fmt.Errorf("get cpu advisor connection with socket: %s failed with error: %v", p.cpuAdvisorSocketAbsPath, err)
   907  		return
   908  	}
   909  
   910  	p.advisorClient = advisorapi.NewCPUAdvisorClient(cpuAdvisorConn)
   911  	p.advisorConn = cpuAdvisorConn
   912  	return nil
   913  }
   914  
   915  // cleanPools is used to clean pool-related data in the local state
   916  func (p *DynamicPolicy) cleanPools() error {
   917  	remainPools := make(map[string]bool)
   918  
   919  	// walk through pod entries to collect the pools that are still referenced by containers
   920  	podEntries := p.state.GetPodEntries()
   921  	for _, entries := range podEntries {
   922  		if entries.IsPoolEntry() {
   923  			continue
   924  		}
   925  
   926  		for _, allocationInfo := range entries {
   927  			ownerPool := allocationInfo.GetOwnerPoolName()
   928  			if ownerPool != advisorapi.EmptyOwnerPoolName {
   929  				remainPools[ownerPool] = true
   930  			}
   931  		}
   932  	}
   933  
   934  	// if a pool exists in the entries but no container references it, we need to delete it
   935  	poolsToDelete := sets.NewString()
   936  	for poolName, entries := range podEntries {
   937  		if entries.IsPoolEntry() {
   938  			if !remainPools[poolName] && !state.ResidentPools.Has(poolName) {
   939  				poolsToDelete.Insert(poolName)
   940  			}
   941  		}
   942  	}
   943  
   944  	if poolsToDelete.Len() > 0 {
   945  		general.Infof("pools to delete: %v", poolsToDelete.UnsortedList())
   946  		for _, poolName := range poolsToDelete.UnsortedList() {
   947  			delete(podEntries, poolName)
   948  		}
   949  
   950  		machineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
   951  		if err != nil {
   952  			return fmt.Errorf("calculate machineState by podEntries failed with error: %v", err)
   953  		}
   954  
   955  		p.state.SetPodEntries(podEntries)
   956  		p.state.SetMachineState(machineState)
   957  	} else {
   958  		general.Infof("there is no pool to delete")
   959  	}
   960  
   961  	return nil
   962  }
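
// A hypothetical example of the cleanup above: if the state file still contains
// a pool entry named "batch" but no container's OwnerPoolName references it, and
// "batch" is not in state.ResidentPools, the entry is removed and the machine
// state is regenerated from the remaining pod entries.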
   963  
   964  // initReservePool initializes the reserve pool for system-cores workloads
   965  func (p *DynamicPolicy) initReservePool() error {
   966  	reserveAllocationInfo := p.state.GetAllocationInfo(state.PoolNameReserve, advisorapi.FakedContainerName)
   967  	if reserveAllocationInfo != nil && !reserveAllocationInfo.AllocationResult.IsEmpty() {
   968  		general.Infof("pool: %s allocation result transformed from %s to %s",
   969  			state.PoolNameReserve, reserveAllocationInfo.AllocationResult.String(), p.reservedCPUs)
   970  	}
   971  
   972  	general.Infof("initReservePool %s: %s", state.PoolNameReserve, p.reservedCPUs)
   973  	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, p.reservedCPUs)
   974  	if err != nil {
   975  		return fmt.Errorf("unable to calculate topologyAwareAssignments for pool: %s, result cpuset: %s, error: %v",
   976  			state.PoolNameReserve, p.reservedCPUs.String(), err)
   977  	}
   978  
   979  	curReserveAllocationInfo := &state.AllocationInfo{
   980  		PodUid:                           state.PoolNameReserve,
   981  		OwnerPoolName:                    state.PoolNameReserve,
   982  		AllocationResult:                 p.reservedCPUs.Clone(),
   983  		OriginalAllocationResult:         p.reservedCPUs.Clone(),
   984  		TopologyAwareAssignments:         topologyAwareAssignments,
   985  		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
   986  	}
   987  	p.state.SetAllocationInfo(state.PoolNameReserve, advisorapi.FakedContainerName, curReserveAllocationInfo)
   988  
   989  	return nil
   990  }
   991  
   992  // initReclaimPool initializes the pool for reclaimed-cores.
   993  // if this info already exists in the state file, just use it; otherwise calculate it right away
   994  func (p *DynamicPolicy) initReclaimPool() error {
   995  	reclaimedAllocationInfo := p.state.GetAllocationInfo(state.PoolNameReclaim, advisorapi.FakedContainerName)
   996  	if reclaimedAllocationInfo == nil {
   997  		podEntries := p.state.GetPodEntries()
   998  		noneResidentCPUs := podEntries.GetFilteredPoolsCPUSet(state.ResidentPools)
   999  
  1000  		machineState := p.state.GetMachineState()
  1001  		availableCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs,
  1002  			state.CheckDedicated, state.CheckDedicatedNUMABinding).Difference(noneResidentCPUs)
  1003  
  1004  		var initReclaimedCPUSetSize int
  1005  		if availableCPUs.Size() >= reservedReclaimedCPUsSize {
  1006  			initReclaimedCPUSetSize = reservedReclaimedCPUsSize
  1007  		} else {
  1008  			initReclaimedCPUSetSize = availableCPUs.Size()
  1009  		}
  1010  
  1011  		reclaimedCPUSet, _, err := calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, initReclaimedCPUSetSize)
  1012  		if err != nil {
  1013  			return fmt.Errorf("takeByNUMABalance failed in initReclaimPool for %s and %s with error: %v",
  1014  				state.PoolNameShare, state.PoolNameReclaim, err)
  1015  		}
  1016  
  1017  		// for residual pools, we must make them exist even if it causes overlap
  1018  		// todo: noneResidentCPUs is the same as reservedCPUs, why should we do this?
  1019  		allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs)
  1020  		if reclaimedCPUSet.IsEmpty() {
  1021  			reclaimedCPUSet, _, err = calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize)
  1022  			if err != nil {
  1023  				return fmt.Errorf("fallback takeByNUMABalance failed in initReclaimPool for %s with error: %v",
  1024  					state.PoolNameReclaim, err)
  1025  			}
  1026  		}
  1027  
  1028  		for poolName, cset := range map[string]machine.CPUSet{state.PoolNameReclaim: reclaimedCPUSet} {
  1029  			general.Infof("initReclaimPool %s: %s", poolName, cset.String())
  1030  			topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, cset)
  1031  			if err != nil {
  1032  				return fmt.Errorf("unable to calculate topologyAwareAssignments for pool: %s, "+
  1033  					"result cpuset: %s, error: %v", poolName, cset.String(), err)
  1034  			}
  1035  
  1036  			curPoolAllocationInfo := &state.AllocationInfo{
  1037  				PodUid:                           poolName,
  1038  				OwnerPoolName:                    poolName,
  1039  				AllocationResult:                 cset.Clone(),
  1040  				OriginalAllocationResult:         cset.Clone(),
  1041  				TopologyAwareAssignments:         topologyAwareAssignments,
  1042  				OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
  1043  			}
  1044  			p.state.SetAllocationInfo(poolName, advisorapi.FakedContainerName, curPoolAllocationInfo)
  1045  		}
  1046  	} else {
  1047  		general.Infof("initial %s already exists: %s", state.PoolNameReclaim, reclaimedAllocationInfo.AllocationResult.String())
  1048  	}
  1049  
  1050  	return nil
  1051  }
  1052  
  1053  // getContainerRequestedCores parses and returns the requested cores for the given container
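// For example (an illustrative calculation, not a full description of the
// fallback below): a container spec requesting "2500m" CPU has
// MilliValue() == 2500, so the cached RequestQuantity becomes 2500/1000.0 = 2.5
// cores; if the spec cannot be fetched from the metaServer, 0 is returned.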
  1054  func (p *DynamicPolicy) getContainerRequestedCores(allocationInfo *state.AllocationInfo) float64 {
  1055  	if allocationInfo == nil {
  1056  		general.Errorf("got nil allocationInfo")
  1057  		return 0
  1058  	}
  1059  
  1060  	if allocationInfo.RequestQuantity == 0 {
  1061  		if p.metaServer == nil {
  1062  			general.Errorf("got nil metaServer")
  1063  			return 0
  1064  		}
  1065  
  1066  		container, err := p.metaServer.GetContainerSpec(allocationInfo.PodUid, allocationInfo.ContainerName)
  1067  		if err != nil || container == nil {
  1068  			general.Errorf("get container failed with error: %v", err)
  1069  			return 0
  1070  		}
  1071  
  1072  		cpuQuantity := native.CPUQuantityGetter()(container.Resources.Requests)
  1073  		allocationInfo.RequestQuantity = general.MaxFloat64(float64(cpuQuantity.MilliValue())/1000.0, 0)
  1074  		general.Infof("get cpu request quantity: %.3f for pod: %s/%s container: %s from podWatcher",
  1075  			allocationInfo.RequestQuantity, allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
  1076  	}
  1077  	return allocationInfo.RequestQuantity
  1078  }