github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go (about)

     1  /*
     2  Copyright 2022 The Katalyst Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package dynamicpolicy
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"github.com/cilium/ebpf"
    26  	"google.golang.org/grpc"
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/util/wait"
    29  	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
    30  	maputil "k8s.io/kubernetes/pkg/util/maps"
    31  	"k8s.io/utils/clock"
    32  
    33  	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
    34  	"github.com/kubewharf/katalyst-api/pkg/plugins/skeleton"
    35  	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent"
    36  	"github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm"
    37  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc"
    38  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
    39  	memconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/consts"
    40  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor"
    41  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/oom"
    42  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/state"
    43  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/handlers/sockmem"
    44  	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
    45  	"github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler"
    46  	"github.com/kubewharf/katalyst-core/pkg/config"
    47  	"github.com/kubewharf/katalyst-core/pkg/config/generic"
    48  	"github.com/kubewharf/katalyst-core/pkg/metaserver"
    49  	"github.com/kubewharf/katalyst-core/pkg/metrics"
    50  	"github.com/kubewharf/katalyst-core/pkg/util/asyncworker"
    51  	"github.com/kubewharf/katalyst-core/pkg/util/general"
    52  	"github.com/kubewharf/katalyst-core/pkg/util/machine"
    53  	"github.com/kubewharf/katalyst-core/pkg/util/native"
    54  	"github.com/kubewharf/katalyst-core/pkg/util/process"
    55  	"github.com/kubewharf/katalyst-core/pkg/util/timemonitor"
    56  )
    57  
    58  const (
        	// MemoryResourcePluginPolicyNameDynamic is the string form of
        	// apiconsts.ResourcePluginPolicyNameDynamic.
    59  	MemoryResourcePluginPolicyNameDynamic = string(apiconsts.ResourcePluginPolicyNameDynamic)
    60  
        	// memoryPluginStateFileName is the file name handed to
        	// state.NewCheckpointState by NewDynamicPolicy.
    61  	memoryPluginStateFileName                    = "memory_plugin_state"
        	// memoryPluginAsyncWorkersName is the name given to the
        	// asyncworker.AsyncWorkers instance created by NewDynamicPolicy.
    62  	memoryPluginAsyncWorkersName                 = "qrm_memory_plugin_async_workers"
        	// NOTE(review): the topic names below are presumably keys for work
        	// submitted to the async workers; their use sites are not in this
        	// part of the file - confirm.
    63  	memoryPluginAsyncWorkTopicDropCache          = "qrm_memory_plugin_drop_cache"
    64  	memoryPluginAsyncWorkTopicSetExtraCGMemLimit = "qrm_memory_plugin_set_extra_mem_limit"
    65  	memoryPluginAsyncWorkTopicMovePage           = "qrm_memory_plugin_move_page"
    66  	memoryPluginAsyncWorkTopicMemoryOffloading   = "qrm_memory_plugin_mem_offload"
    67  
        	// NOTE(review): timeouts expressed in seconds, going by the names;
        	// they are consumed outside this part of the file - confirm.
    68  	dropCacheTimeoutSeconds          = 30
    69  	setExtraCGMemLimitTimeoutSeconds = 60
    70  )
    71  
    72  const (
        	// memsetCheckPeriod is the interval of the checkMemorySet periodical handler.
    73  	memsetCheckPeriod          = 10 * time.Second
        	// stateCheckPeriod is the interval of the clearResidualState periodical handler.
    74  	stateCheckPeriod           = 30 * time.Second
        	// NOTE(review): maxResidualTime is not referenced in this part of the
        	// file; presumably it bounds how long residual state is tolerated - confirm.
    75  	maxResidualTime            = 5 * time.Minute
        	// setMemoryMigratePeriod is the interval at which Start re-runs setMemoryMigrate.
    76  	setMemoryMigratePeriod     = 5 * time.Second
        	// applyCgroupPeriod is the interval of the applyExternalCgroupParams periodical handler.
    77  	applyCgroupPeriod          = 5 * time.Second
        	// setExtraControlKnobsPeriod is the interval of the setExtraControlKnobByConfigs periodical handler.
    78  	setExtraControlKnobsPeriod = 5 * time.Second
        	// clearOOMPriorityPeriod is the interval of the clearResidualOOMPriority periodical handler.
    79  	clearOOMPriorityPeriod     = 1 * time.Hour
        	// syncOOMPriorityPeriod is the interval of the syncOOMPriority periodical handler.
    80  	syncOOMPriorityPeriod      = 5 * time.Second
    81  
        	// healthCheckTolerationTimes is passed as the last argument of every
        	// periodicalhandler.RegisterPeriodicalHandlerWithHealthz call in Start.
    82  	healthCheckTolerationTimes = 3
        	// dropCacheGracePeriod is the duration registered with
        	// general.RegisterReportCheck for memconsts.DropCache.
    83  	dropCacheGracePeriod       = 60 * time.Second
    84  )
    85  
    86  var (
        	// readonlyStateLock guards readonlyState.
    87  	readonlyStateLock sync.RWMutex
        	// readonlyState is published by NewDynamicPolicy and handed out by
        	// GetReadonlyState; it stays nil until a policy has been constructed.
    88  	readonlyState     state.ReadonlyState
    89  )
    90  
    91  // GetReadonlyState returns state.ReadonlyState to provides a way
    92  // to obtain the running states of the plugin
    93  func GetReadonlyState() (state.ReadonlyState, error) {
    94  	readonlyStateLock.RLock()
    95  	defer readonlyStateLock.RUnlock()
    96  
    97  	if readonlyState == nil {
    98  		return nil, fmt.Errorf("readonlyState isn't setted")
    99  	}
   100  	return readonlyState, nil
   101  }
   102  
   103  type DynamicPolicy struct {
        	// The embedded lock is taken exclusively by Start, Stop and RemovePod
        	// and shared by the Get* query methods.
   104  	sync.RWMutex
   105  
        	// stopCh is re-created by Start and closed by Stop (or by Start itself
        	// when it fails); background goroutines launched by Start watch it.
   106  	stopCh                  chan struct{}
        	// started is set by a successful Start and cleared by Stop.
   107  	started                 bool
        	// qosConfig is used to resolve the QoS level of incoming resource requests.
   108  	qosConfig               *generic.QoSConfiguration
        	// extraControlKnobConfigs is loaded from conf.ExtraControlKnobConfigFile,
        	// or left empty when no file is configured.
   109  	extraControlKnobConfigs commonstate.ExtraControlKnobConfigs
   110  
   111  	// emitter is used to emit metrics.
   112  	// metaServer is used to collect metadata.
   113  	emitter    metrics.MetricEmitter
   114  	metaServer *metaserver.MetaServer
   115  
        	// advisorClient is used by RemovePod to notify the memory advisor when
        	// enableMemoryAdvisor is set.
        	// NOTE(review): advisorClient/advisorConn are presumably set up by
        	// initAdvisorClientConn, which is outside this part of the file - confirm.
   116  	advisorClient     advisorsvc.AdvisorServiceClient
   117  	advisorConn       *grpc.ClientConn
        	// lwRecvTimeMonitor is created and run by Start when the memory advisor is enabled.
   118  	lwRecvTimeMonitor *timemonitor.TimeMonitor
   119  
        	// topology supplies the NUMA node list; state holds the machine state
        	// and the pod resource entries served by the query methods.
   120  	topology *machine.CPUTopology
   121  	state    state.State
   122  
        	// NOTE(review): the three fields below are not used in this part of
        	// the file; migrateMemoryLock presumably guards migratingMemory - confirm.
   123  	migrateMemoryLock sync.Mutex
   124  	migratingMemory   map[string]map[string]bool
   125  	residualHitMap    map[string]int64
   126  
        	// allocationHandlers and hintHandlers are keyed by QoS level annotation value.
   127  	allocationHandlers  map[string]util.AllocationHandler
   128  	hintHandlers        map[string]util.HintHandler
        	// enhancementHandlers holds handlers registered per QRM phase and
        	// enhancement key; RemovePod runs those of apiconsts.QRMPhaseRemovePod.
   129  	enhancementHandlers util.ResourceEnhancementHandlerMap
   130  
   131  	extraStateFileAbsPath string
        	// name is "<agentName>_<policy name>" and is what Name returns.
   132  	name                  string
   133  
        	// podDebugAnnoKeys are the annotation keys used to recognise debug pods.
   134  	podDebugAnnoKeys []string
   135  
        	// asyncWorkers is started by Start with stopCh.
   136  	asyncWorkers *asyncworker.AsyncWorkers
   137  
        	// Feature switches and socket paths copied from the configuration.
   138  	enableSettingMemoryMigrate bool
   139  	enableSettingSockMem       bool
   140  	enableMemoryAdvisor        bool
   141  	memoryAdvisorSocketAbsPath string
   142  	memoryPluginSocketAbsPath  string
   143  
        	// OOM priority support; oomPriorityMap is closed by Stop.
        	// NOTE(review): oomPriorityMapLock presumably guards oomPriorityMap - confirm.
   144  	enableOOMPriority        bool
   145  	oomPriorityMapPinnedPath string
   146  	oomPriorityMapLock       sync.Mutex
   147  	oomPriorityMap           *ebpf.Map
   148  }
   149  
   150  func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration,
   151  	_ interface{}, agentName string,
   152  ) (bool, agent.Component, error) {
   153  	reservedMemory, err := getReservedMemory(conf, agentCtx.MetaServer, agentCtx.MachineInfo)
   154  	if err != nil {
   155  		return false, agent.ComponentStub{}, fmt.Errorf("getReservedMemoryFromOptions failed with error: %v", err)
   156  	}
   157  
   158  	resourcesReservedMemory := map[v1.ResourceName]map[int]uint64{
   159  		v1.ResourceMemory: reservedMemory,
   160  	}
   161  	stateImpl, err := state.NewCheckpointState(conf.GenericQRMPluginConfiguration.StateFileDirectory, memoryPluginStateFileName,
   162  		memconsts.MemoryResourcePluginPolicyNameDynamic, agentCtx.CPUTopology, agentCtx.MachineInfo, resourcesReservedMemory, conf.SkipMemoryStateCorruption)
   163  	if err != nil {
   164  		return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", err)
   165  	}
   166  
   167  	extraControlKnobConfigs := make(commonstate.ExtraControlKnobConfigs)
   168  	if len(conf.ExtraControlKnobConfigFile) > 0 {
   169  		extraControlKnobConfigs, err = commonstate.LoadExtraControlKnobConfigs(conf.ExtraControlKnobConfigFile)
   170  		if err != nil {
   171  			return false, agent.ComponentStub{}, fmt.Errorf("loadExtraControlKnobConfigs failed with error: %v", err)
   172  		}
   173  	} else {
   174  		general.Infof("empty ExtraControlKnobConfigFile, initialize empty extraControlKnobConfigs")
   175  	}
   176  
   177  	readonlyStateLock.Lock()
   178  	readonlyState = stateImpl
   179  	readonlyStateLock.Unlock()
   180  
   181  	wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{
   182  		Key: util.QRMPluginPolicyTagName,
   183  		Val: memconsts.MemoryResourcePluginPolicyNameDynamic,
   184  	})
   185  
   186  	policyImplement := &DynamicPolicy{
   187  		topology:                   agentCtx.CPUTopology,
   188  		qosConfig:                  conf.QoSConfiguration,
   189  		emitter:                    wrappedEmitter,
   190  		metaServer:                 agentCtx.MetaServer,
   191  		state:                      stateImpl,
   192  		stopCh:                     make(chan struct{}),
   193  		migratingMemory:            make(map[string]map[string]bool),
   194  		residualHitMap:             make(map[string]int64),
   195  		enhancementHandlers:        make(util.ResourceEnhancementHandlerMap),
   196  		extraStateFileAbsPath:      conf.ExtraStateFileAbsPath,
   197  		name:                       fmt.Sprintf("%s_%s", agentName, memconsts.MemoryResourcePluginPolicyNameDynamic),
   198  		podDebugAnnoKeys:           conf.PodDebugAnnoKeys,
   199  		asyncWorkers:               asyncworker.NewAsyncWorkers(memoryPluginAsyncWorkersName, wrappedEmitter),
   200  		enableSettingMemoryMigrate: conf.EnableSettingMemoryMigrate,
   201  		enableSettingSockMem:       conf.EnableSettingSockMem,
   202  		enableMemoryAdvisor:        conf.EnableMemoryAdvisor,
   203  		memoryAdvisorSocketAbsPath: conf.MemoryAdvisorSocketAbsPath,
   204  		memoryPluginSocketAbsPath:  conf.MemoryPluginSocketAbsPath,
   205  		extraControlKnobConfigs:    extraControlKnobConfigs, // [TODO]: support modifying extraControlKnobConfigs by KCC
   206  		enableOOMPriority:          conf.EnableOOMPriority,
   207  		oomPriorityMapPinnedPath:   conf.OOMPriorityPinnedMapAbsPath,
   208  	}
   209  
   210  	policyImplement.allocationHandlers = map[string]util.AllocationHandler{
   211  		apiconsts.PodAnnotationQoSLevelSharedCores:    policyImplement.sharedCoresAllocationHandler,
   212  		apiconsts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresAllocationHandler,
   213  		apiconsts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresAllocationHandler,
   214  	}
   215  
   216  	policyImplement.hintHandlers = map[string]util.HintHandler{
   217  		apiconsts.PodAnnotationQoSLevelSharedCores:    policyImplement.sharedCoresHintHandler,
   218  		apiconsts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresHintHandler,
   219  		apiconsts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresHintHandler,
   220  	}
   221  
   222  	if policyImplement.enableOOMPriority {
   223  		policyImplement.enhancementHandlers.Register(apiconsts.QRMPhaseRemovePod,
   224  			apiconsts.PodAnnotationMemoryEnhancementOOMPriority, policyImplement.clearOOMPriority)
   225  	}
   226  
   227  	pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs,
   228  		func(key string, value int64) {
   229  			_ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw)
   230  		})
   231  	if err != nil {
   232  		return false, agent.ComponentStub{}, fmt.Errorf("dynamic policy new plugin wrapper failed with error: %v", err)
   233  	}
   234  
   235  	memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyMemoryLimitInBytes,
   236  		memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryLimitInBytes))
   237  	memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyCPUSetMems,
   238  		memoryadvisor.ControlKnobHandlerWithChecker(handleAdvisorCPUSetMems))
   239  	memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyDropCache,
   240  		memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorDropCache))
   241  	memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobReclaimedMemorySize,
   242  		memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryProvisions))
   243  	memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyBalanceNumaMemory,
   244  		memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleNumaMemoryBalance))
   245  	memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnowKeyMemoryOffloading,
   246  		memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryOffloading))
   247  
   248  	return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil
   249  }
   250  
        // registerControlKnobHandlerCheckRules registers a report check for
        // memconsts.DropCache with dropCacheGracePeriod. It is called by Start.
   251  func (p *DynamicPolicy) registerControlKnobHandlerCheckRules() {
   252  	general.RegisterReportCheck(memconsts.DropCache, dropCacheGracePeriod)
   253  }
   254  
   255  func (p *DynamicPolicy) Start() (err error) {
   256  	general.Infof("called")
   257  
   258  	p.Lock()
   259  	defer func() {
   260  		if !p.started {
   261  			if err == nil {
   262  				p.started = true
   263  			} else {
   264  				close(p.stopCh)
   265  			}
   266  		}
   267  		p.Unlock()
   268  	}()
   269  
   270  	if p.started {
   271  		general.Infof("already started")
   272  		return nil
   273  	}
   274  	p.stopCh = make(chan struct{})
   275  
   276  	p.registerControlKnobHandlerCheckRules()
   277  	go wait.Until(func() {
   278  		_ = p.emitter.StoreInt64(util.MetricNameHeartBeat, 1, metrics.MetricTypeNameRaw)
   279  	}, time.Second*30, p.stopCh)
   280  
   281  	err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.ClearResidualState,
   282  		general.HealthzCheckStateNotReady, qrm.QRMMemoryPluginPeriodicalHandlerGroupName,
   283  		p.clearResidualState, stateCheckPeriod, healthCheckTolerationTimes)
   284  	if err != nil {
   285  		general.Errorf("start %v failed, err: %v", memconsts.ClearResidualState, err)
   286  	}
   287  
   288  	err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.CheckMemSet, general.HealthzCheckStateNotReady,
   289  		qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.checkMemorySet, memsetCheckPeriod, healthCheckTolerationTimes)
   290  	if err != nil {
   291  		general.Errorf("start %v failed, err: %v", memconsts.CheckMemSet, err)
   292  	}
   293  
   294  	err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.ApplyExternalCGParams, general.HealthzCheckStateNotReady,
   295  		qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.applyExternalCgroupParams, applyCgroupPeriod, healthCheckTolerationTimes)
   296  	if err != nil {
   297  		general.Errorf("start %v failed, err: %v", memconsts.ApplyExternalCGParams, err)
   298  	}
   299  
   300  	err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.SetExtraControlKnob, general.HealthzCheckStateNotReady,
   301  		qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.setExtraControlKnobByConfigs, setExtraControlKnobsPeriod, healthCheckTolerationTimes)
   302  	if err != nil {
   303  		general.Errorf("start %v failed, err: %v", memconsts.SetExtraControlKnob, err)
   304  	}
   305  
   306  	err = p.asyncWorkers.Start(p.stopCh)
   307  	if err != nil {
   308  		general.Errorf("start async worker failed, err: %v", err)
   309  	}
   310  
   311  	if p.enableSettingMemoryMigrate {
   312  		general.Infof("setMemoryMigrate enabled")
   313  		go wait.Until(p.setMemoryMigrate, setMemoryMigratePeriod, p.stopCh)
   314  	}
   315  
   316  	if p.enableOOMPriority {
   317  		general.Infof("OOM priority enabled")
   318  		go p.PollOOMBPFInit(p.stopCh)
   319  
   320  		err := periodicalhandler.RegisterPeriodicalHandler(qrm.QRMMemoryPluginPeriodicalHandlerGroupName,
   321  			oom.ClearResidualOOMPriorityPeriodicalHandlerName, p.clearResidualOOMPriority, clearOOMPriorityPeriod)
   322  		if err != nil {
   323  			general.Infof("register clearResidualOOMPriority failed, err=%v", err)
   324  		}
   325  
   326  		err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.OOMPriority, general.HealthzCheckStateNotReady,
   327  			qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.syncOOMPriority, syncOOMPriorityPeriod, healthCheckTolerationTimes)
   328  		if err != nil {
   329  			general.Infof("register syncOOMPriority failed, err=%v", err)
   330  		}
   331  	}
   332  
   333  	if p.enableSettingSockMem {
   334  		general.Infof("setSockMem enabled")
   335  		err := periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.SetSockMem,
   336  			general.HealthzCheckStateNotReady, qrm.QRMMemoryPluginPeriodicalHandlerGroupName,
   337  			sockmem.SetSockMemLimit, 60*time.Second, healthCheckTolerationTimes)
   338  		if err != nil {
   339  			general.Infof("setSockMem failed, err=%v", err)
   340  		}
   341  	}
   342  
   343  	go wait.Until(func() {
   344  		periodicalhandler.ReadyToStartHandlersByGroup(qrm.QRMMemoryPluginPeriodicalHandlerGroupName)
   345  	}, 5*time.Second, p.stopCh)
   346  
   347  	if !p.enableMemoryAdvisor {
   348  		general.Infof("start dynamic policy memory plugin without memory advisor")
   349  		return nil
   350  	} else if p.memoryAdvisorSocketAbsPath == "" {
   351  		return fmt.Errorf("invalid memoryAdvisorSocketAbsPath: %s", p.memoryAdvisorSocketAbsPath)
   352  	}
   353  
   354  	general.Infof("start dynamic policy memory plugin with memory advisor")
   355  	err = p.initAdvisorClientConn()
   356  	if err != nil {
   357  		general.Errorf("initAdvisorClientConn failed with error: %v", err)
   358  		return
   359  	}
   360  
   361  	go wait.BackoffUntil(func() { p.serveForAdvisor(p.stopCh) }, wait.NewExponentialBackoffManager(
   362  		800*time.Millisecond, 30*time.Second, 2*time.Minute, 2.0, 0, &clock.RealClock{}), true, p.stopCh)
   363  
   364  	communicateWithMemoryAdvisorServer := func() {
   365  		general.Infof("waiting memory plugin checkpoint server serving confirmation")
   366  		if conn, err := process.Dial(p.memoryPluginSocketAbsPath, 5*time.Second); err != nil {
   367  			general.Errorf("dial check at socket: %s failed with err: %v", p.memoryPluginSocketAbsPath, err)
   368  			return
   369  		} else {
   370  			_ = conn.Close()
   371  		}
   372  		general.Infof("memory plugin checkpoint server serving confirmed")
   373  
   374  		// keep compatible to old version sys advisor not supporting list containers from memory plugin
   375  		err = p.pushMemoryAdvisor()
   376  		if err != nil {
   377  			general.Errorf("sync existing containers to memory advisor failed with error: %v", err)
   378  			return
   379  		}
   380  
   381  		// call lw of MemoryAdvisorServer and do allocation
   382  		if err := p.lwMemoryAdvisorServer(p.stopCh); err != nil {
   383  			general.Errorf("lwMemoryAdvisorServer failed with error: %v", err)
   384  		} else {
   385  			general.Infof("lwMemoryAdvisorServer finished")
   386  		}
   387  	}
   388  
   389  	general.RegisterHeartbeatCheck(memconsts.CommunicateWithAdvisor, 2*time.Minute, general.HealthzCheckStateNotReady,
   390  		2*time.Minute)
   391  	go wait.BackoffUntil(communicateWithMemoryAdvisorServer, wait.NewExponentialBackoffManager(800*time.Millisecond,
   392  		30*time.Second, 2*time.Minute, 2.0, 0, &clock.RealClock{}), true, p.stopCh)
   393  
   394  	p.lwRecvTimeMonitor = timemonitor.NewTimeMonitor(memoryAdvisorLWRecvTimeMonitorName,
   395  		memoryAdvisorLWRecvTimeMonitorDurationThreshold, memoryAdvisorLWRecvTimeMonitorInterval,
   396  		util.MetricNameLWRecvStuck, p.emitter)
   397  	go p.lwRecvTimeMonitor.Run(p.stopCh)
   398  	return nil
   399  }
   400  
   401  func (p *DynamicPolicy) Stop() error {
   402  	p.Lock()
   403  	defer func() {
   404  		p.oomPriorityMap.Close()
   405  		p.started = false
   406  		p.Unlock()
   407  		general.Warningf("stopped")
   408  	}()
   409  
   410  	if !p.started {
   411  		general.Warningf("already stopped")
   412  		return nil
   413  	}
   414  	close(p.stopCh)
   415  
   416  	periodicalhandler.StopHandlersByGroup(qrm.QRMMemoryPluginPeriodicalHandlerGroupName)
   417  
   418  	return nil
   419  }
   420  
        // Name returns the policy name assigned at construction time,
        // "<agentName>_<policy name>".
   421  func (p *DynamicPolicy) Name() string {
   422  	return p.name
   423  }
   424  
        // ResourceName returns the name of the resource handled by this plugin,
        // i.e. the string form of v1.ResourceMemory.
   425  func (p *DynamicPolicy) ResourceName() string {
   426  	return string(v1.ResourceMemory)
   427  }
   428  
   429  // GetTopologyHints returns hints of corresponding resources
   430  func (p *DynamicPolicy) GetTopologyHints(ctx context.Context,
   431  	req *pluginapi.ResourceRequest,
   432  ) (*pluginapi.ResourceHintsResponse, error) {
   433  	if req == nil {
   434  		return nil, fmt.Errorf("GetTopologyHints got nil req")
   435  	}
   436  
   437  	// identify if the pod is a debug pod,
   438  	// if so, apply specific strategy to it.
   439  	// since GetKatalystQoSLevelFromResourceReq function will filter annotations,
   440  	// we should do it before GetKatalystQoSLevelFromResourceReq.
   441  	isDebugPod := util.IsDebugPod(req.Annotations, p.podDebugAnnoKeys)
   442  
   443  	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req)
   444  	if err != nil {
   445  		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
   446  			req.PodNamespace, req.PodName, req.ContainerName, err)
   447  		general.Errorf("%s", err.Error())
   448  		return nil, err
   449  	}
   450  
   451  	reqInt, _, err := util.GetQuantityFromResourceReq(req)
   452  	if err != nil {
   453  		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
   454  	}
   455  
   456  	general.InfoS("GetTopologyHints is called",
   457  		"podNamespace", req.PodNamespace,
   458  		"podName", req.PodName,
   459  		"containerName", req.ContainerName,
   460  		"podType", req.PodType,
   461  		"podRole", req.PodRole,
   462  		"containerType", req.ContainerType,
   463  		"qosLevel", qosLevel,
   464  		"memoryReq(bytes)", reqInt,
   465  		"isDebugPod", isDebugPod)
   466  
   467  	if req.ContainerType == pluginapi.ContainerType_INIT || isDebugPod {
   468  		general.Infof("there is no NUMA preference, return nil hint")
   469  		return util.PackResourceHintsResponse(req, string(v1.ResourceMemory),
   470  			map[string]*pluginapi.ListOfTopologyHints{
   471  				string(v1.ResourceMemory): nil,
   472  			})
   473  	}
   474  
   475  	p.RLock()
   476  	defer func() {
   477  		p.RUnlock()
   478  		if err != nil {
   479  			_ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw)
   480  		}
   481  	}()
   482  
   483  	if p.hintHandlers[qosLevel] == nil {
   484  		return nil, fmt.Errorf("katalyst QoS level: %s is not supported yet", qosLevel)
   485  	}
   486  	return p.hintHandlers[qosLevel](ctx, req)
   487  }
   488  
   489  func (p *DynamicPolicy) RemovePod(ctx context.Context,
   490  	req *pluginapi.RemovePodRequest,
   491  ) (resp *pluginapi.RemovePodResponse, err error) {
   492  	if req == nil {
   493  		return nil, fmt.Errorf("RemovePod got nil req")
   494  	}
   495  
   496  	general.InfoS("called", "podUID", req.PodUid)
   497  
   498  	p.Lock()
   499  	defer func() {
   500  		p.Unlock()
   501  		if err != nil {
   502  			_ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw)
   503  		}
   504  	}()
   505  
   506  	for lastLevelEnhancementKey, handler := range p.enhancementHandlers[apiconsts.QRMPhaseRemovePod] {
   507  		if p.hasLastLevelEnhancementKey(lastLevelEnhancementKey, req.PodUid) {
   508  			herr := handler(ctx, p.emitter, p.metaServer, req,
   509  				p.state.GetPodResourceEntries())
   510  			if herr != nil {
   511  				return &pluginapi.RemovePodResponse{}, herr
   512  			}
   513  		}
   514  	}
   515  
   516  	if p.enableMemoryAdvisor {
   517  		_, err = p.advisorClient.RemovePod(ctx, &advisorsvc.RemovePodRequest{PodUid: req.PodUid})
   518  		if err != nil {
   519  			return nil, fmt.Errorf("remove pod in QoS aware server failed with error: %v", err)
   520  		}
   521  	}
   522  
   523  	err = p.removePod(req.PodUid)
   524  	if err != nil {
   525  		general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid)
   526  		_ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw)
   527  		return nil, err
   528  	}
   529  
   530  	aErr := p.adjustAllocationEntries()
   531  	if aErr != nil {
   532  		general.ErrorS(aErr, "adjustAllocationEntries failed", "podUID", req.PodUid)
   533  	}
   534  
   535  	return &pluginapi.RemovePodResponse{}, nil
   536  }
   537  
   538  // GetResourcesAllocation returns allocation results of corresponding resources
   539  func (p *DynamicPolicy) GetResourcesAllocation(_ context.Context,
   540  	req *pluginapi.GetResourcesAllocationRequest,
   541  ) (*pluginapi.GetResourcesAllocationResponse, error) {
   542  	if req == nil {
   543  		return nil, fmt.Errorf("GetResourcesAllocation got nil req")
   544  	}
   545  
   546  	p.RLock()
   547  	defer p.RUnlock()
   548  
   549  	podResources := make(map[string]*pluginapi.ContainerResources)
   550  	podEntries := p.state.GetPodResourceEntries()[v1.ResourceMemory]
   551  	for podUID, containerEntries := range podEntries {
   552  		if podResources[podUID] == nil {
   553  			podResources[podUID] = &pluginapi.ContainerResources{}
   554  		}
   555  
   556  		for containerName, allocationInfo := range containerEntries {
   557  			if allocationInfo == nil {
   558  				continue
   559  			}
   560  
   561  			if podResources[podUID].ContainerResources == nil {
   562  				podResources[podUID].ContainerResources = make(map[string]*pluginapi.ResourceAllocation)
   563  			}
   564  
   565  			var err error
   566  			podResources[podUID].ContainerResources[containerName], err = allocationInfo.GetResourceAllocation()
   567  			if err != nil {
   568  				errMsg := "allocationInfo.GetResourceAllocation failed"
   569  				general.ErrorS(err, errMsg,
   570  					"podNamespace", allocationInfo.PodNamespace,
   571  					"podName", allocationInfo.PodName,
   572  					"containerName", allocationInfo.ContainerName)
   573  				return nil, fmt.Errorf(errMsg)
   574  			}
   575  		}
   576  	}
   577  
   578  	return &pluginapi.GetResourcesAllocationResponse{
   579  		PodResources: podResources,
   580  	}, nil
   581  }
   582  
   583  // GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format
   584  func (p *DynamicPolicy) GetTopologyAwareResources(_ context.Context,
   585  	req *pluginapi.GetTopologyAwareResourcesRequest,
   586  ) (*pluginapi.GetTopologyAwareResourcesResponse, error) {
   587  	if req == nil {
   588  		return nil, fmt.Errorf("GetTopologyAwareResources got nil req")
   589  	}
   590  
   591  	p.RLock()
   592  	defer p.RUnlock()
   593  
   594  	allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName)
   595  	if allocationInfo == nil {
   596  		return nil, fmt.Errorf("pod: %s, container: %s is not show up in memory plugin state", req.PodUid, req.ContainerName)
   597  	}
   598  
   599  	topologyAwareQuantityList := util.GetTopologyAwareQuantityFromAssignmentsSize(allocationInfo.TopologyAwareAllocations)
   600  	resp := &pluginapi.GetTopologyAwareResourcesResponse{
   601  		PodUid:       allocationInfo.PodUid,
   602  		PodName:      allocationInfo.PodName,
   603  		PodNamespace: allocationInfo.PodNamespace,
   604  		ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{
   605  			ContainerName: allocationInfo.ContainerName,
   606  		},
   607  	}
   608  
   609  	if allocationInfo.CheckSideCar() {
   610  		resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{
   611  			string(v1.ResourceMemory): {
   612  				IsNodeResource:                    false,
   613  				IsScalarResource:                  true,
   614  				AggregatedQuantity:                0,
   615  				OriginalAggregatedQuantity:        0,
   616  				TopologyAwareQuantityList:         nil,
   617  				OriginalTopologyAwareQuantityList: nil,
   618  			},
   619  		}
   620  	} else {
   621  		resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{
   622  			string(v1.ResourceMemory): {
   623  				IsNodeResource:                    false,
   624  				IsScalarResource:                  true,
   625  				AggregatedQuantity:                float64(allocationInfo.AggregatedQuantity),
   626  				OriginalAggregatedQuantity:        float64(allocationInfo.AggregatedQuantity),
   627  				TopologyAwareQuantityList:         topologyAwareQuantityList,
   628  				OriginalTopologyAwareQuantityList: topologyAwareQuantityList,
   629  			},
   630  		}
   631  	}
   632  
   633  	return resp, nil
   634  }
   635  
   636  // GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format
   637  func (p *DynamicPolicy) GetTopologyAwareAllocatableResources(context.Context,
   638  	*pluginapi.GetTopologyAwareAllocatableResourcesRequest,
   639  ) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) {
   640  	p.RLock()
   641  	defer p.RUnlock()
   642  
   643  	machineState := p.state.GetMachineState()[v1.ResourceMemory]
   644  
   645  	numaNodes := p.topology.CPUDetails.NUMANodes().ToSliceInt()
   646  	topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState))
   647  	topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState))
   648  
   649  	var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint64 = 0, 0
   650  	for _, numaNode := range numaNodes {
   651  		numaNodeState := machineState[numaNode]
   652  		if numaNodeState == nil {
   653  			return nil, fmt.Errorf("nil numaNodeState for NUMA: %d", numaNode)
   654  		}
   655  
   656  		topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{
   657  			ResourceValue: float64(numaNodeState.Allocatable),
   658  			Node:          uint64(numaNode),
   659  		})
   660  		topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{
   661  			ResourceValue: float64(numaNodeState.TotalMemSize),
   662  			Node:          uint64(numaNode),
   663  		})
   664  		aggregatedAllocatableQuantity += numaNodeState.Allocatable
   665  		aggregatedCapacityQuantity += numaNodeState.TotalMemSize
   666  	}
   667  
   668  	return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{
   669  		AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{
   670  			string(v1.ResourceMemory): {
   671  				IsNodeResource:                       false,
   672  				IsScalarResource:                     true,
   673  				AggregatedAllocatableQuantity:        float64(aggregatedAllocatableQuantity),
   674  				TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList,
   675  				AggregatedCapacityQuantity:           float64(aggregatedCapacityQuantity),
   676  				TopologyAwareCapacityQuantityList:    topologyAwareCapacityQuantityList,
   677  			},
   678  		},
   679  	}, nil
   680  }
   681  
   682  // GetResourcePluginOptions returns options to be communicated with Resource Manager
   683  func (p *DynamicPolicy) GetResourcePluginOptions(context.Context,
   684  	*pluginapi.Empty,
   685  ) (*pluginapi.ResourcePluginOptions, error) {
   686  	return &pluginapi.ResourcePluginOptions{
   687  		PreStartRequired:      false,
   688  		WithTopologyAlignment: true,
   689  		NeedReconcile:         true,
   690  	}, nil
   691  }
   692  
   693  // Allocate is called during pod admit so that the resource
   694  // plugin can allocate corresponding resource for the container
   695  // according to resource request
   696  func (p *DynamicPolicy) Allocate(ctx context.Context,
   697  	req *pluginapi.ResourceRequest,
   698  ) (resp *pluginapi.ResourceAllocationResponse, respErr error) {
   699  	if req == nil {
   700  		return nil, fmt.Errorf("Allocate got nil req")
   701  	}
   702  
   703  	// identify if the pod is a debug pod,
   704  	// if so, apply specific strategy to it.
   705  	// since GetKatalystQoSLevelFromResourceReq function will filter annotations,
   706  	// we should do it before GetKatalystQoSLevelFromResourceReq.
   707  	isDebugPod := util.IsDebugPod(req.Annotations, p.podDebugAnnoKeys)
   708  
   709  	qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req)
   710  	if err != nil {
   711  		err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
   712  			req.PodNamespace, req.PodName, req.ContainerName, err)
   713  		general.Errorf("%s", err.Error())
   714  		return nil, err
   715  	}
   716  
   717  	reqInt, _, err := util.GetQuantityFromResourceReq(req)
   718  	if err != nil {
   719  		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
   720  	}
   721  
   722  	general.InfoS("called",
   723  		"podNamespace", req.PodNamespace,
   724  		"podName", req.PodName,
   725  		"containerName", req.ContainerName,
   726  		"podType", req.PodType,
   727  		"podRole", req.PodRole,
   728  		"qosLevel", qosLevel,
   729  		"memoryReq(bytes)", reqInt)
   730  
   731  	if req.ContainerType == pluginapi.ContainerType_INIT {
   732  		return &pluginapi.ResourceAllocationResponse{
   733  			PodUid:         req.PodUid,
   734  			PodNamespace:   req.PodNamespace,
   735  			PodName:        req.PodName,
   736  			ContainerName:  req.ContainerName,
   737  			ContainerType:  req.ContainerType,
   738  			ContainerIndex: req.ContainerIndex,
   739  			PodRole:        req.PodRole,
   740  			PodType:        req.PodType,
   741  			ResourceName:   string(v1.ResourceMemory),
   742  			Labels:         general.DeepCopyMap(req.Labels),
   743  			Annotations:    general.DeepCopyMap(req.Annotations),
   744  		}, nil
   745  	} else if isDebugPod {
   746  		return &pluginapi.ResourceAllocationResponse{
   747  			PodUid:         req.PodUid,
   748  			PodNamespace:   req.PodNamespace,
   749  			PodName:        req.PodName,
   750  			ContainerName:  req.ContainerName,
   751  			ContainerType:  req.ContainerType,
   752  			ContainerIndex: req.ContainerIndex,
   753  			PodRole:        req.PodRole,
   754  			PodType:        req.PodType,
   755  			ResourceName:   string(v1.ResourceMemory),
   756  			AllocationResult: &pluginapi.ResourceAllocation{
   757  				ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{
   758  					string(v1.ResourceMemory): {
   759  						// return ResourceAllocation with empty OciPropertyName, AllocatedQuantity, AllocationResult for containers in debug pod,
   760  						// it won't influence oci spec properties of the container
   761  						IsNodeResource:   false,
   762  						IsScalarResource: true,
   763  					},
   764  				},
   765  			},
   766  			Labels:      general.DeepCopyMap(req.Labels),
   767  			Annotations: general.DeepCopyMap(req.Annotations),
   768  		}, nil
   769  	}
   770  
   771  	p.Lock()
   772  	defer func() {
   773  		// calls sys-advisor to inform the latest container
   774  		if p.enableMemoryAdvisor && respErr == nil && req.ContainerType != pluginapi.ContainerType_INIT {
   775  			_, err := p.advisorClient.AddContainer(ctx, &advisorsvc.ContainerMetadata{
   776  				PodUid:          req.PodUid,
   777  				PodNamespace:    req.PodNamespace,
   778  				PodName:         req.PodName,
   779  				ContainerName:   req.ContainerName,
   780  				ContainerType:   req.ContainerType,
   781  				ContainerIndex:  req.ContainerIndex,
   782  				Labels:          maputil.CopySS(req.Labels),
   783  				Annotations:     maputil.CopySS(req.Annotations),
   784  				QosLevel:        qosLevel,
   785  				RequestQuantity: uint64(reqInt),
   786  			})
   787  			if err != nil {
   788  				resp = nil
   789  				respErr = fmt.Errorf("add container to qos aware server failed with error: %v", err)
   790  				_ = p.removeContainer(req.PodUid, req.ContainerName)
   791  			}
   792  		} else if respErr != nil {
   793  			_ = p.removeContainer(req.PodUid, req.ContainerName)
   794  			_ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw)
   795  		}
   796  
   797  		p.Unlock()
   798  		return
   799  	}()
   800  
   801  	allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName)
   802  	if allocationInfo != nil && allocationInfo.AggregatedQuantity >= uint64(reqInt) {
   803  		general.InfoS("already allocated and meet requirement",
   804  			"podNamespace", req.PodNamespace,
   805  			"podName", req.PodName,
   806  			"containerName", req.ContainerName,
   807  			"memoryReq(bytes)", reqInt,
   808  			"currentResult(bytes)", allocationInfo.AggregatedQuantity)
   809  		return &pluginapi.ResourceAllocationResponse{
   810  			PodUid:         req.PodUid,
   811  			PodNamespace:   req.PodNamespace,
   812  			PodName:        req.PodName,
   813  			ContainerName:  req.ContainerName,
   814  			ContainerType:  req.ContainerType,
   815  			ContainerIndex: req.ContainerIndex,
   816  			PodRole:        req.PodRole,
   817  			PodType:        req.PodType,
   818  			ResourceName:   string(v1.ResourceMemory),
   819  			AllocationResult: &pluginapi.ResourceAllocation{
   820  				ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{
   821  					string(v1.ResourceMemory): {
   822  						OciPropertyName:   util.OCIPropertyNameCPUSetMems,
   823  						IsNodeResource:    false,
   824  						IsScalarResource:  true,
   825  						AllocatedQuantity: float64(allocationInfo.AggregatedQuantity),
   826  						AllocationResult:  allocationInfo.NumaAllocationResult.String(),
   827  					},
   828  				},
   829  			},
   830  			Labels:      general.DeepCopyMap(req.Labels),
   831  			Annotations: general.DeepCopyMap(req.Annotations),
   832  		}, nil
   833  	}
   834  
   835  	if p.allocationHandlers[qosLevel] == nil {
   836  		return nil, fmt.Errorf("katalyst QoS level: %s is not supported yet", qosLevel)
   837  	}
   838  	return p.allocationHandlers[qosLevel](ctx, req)
   839  }
   840  
   841  // PreStartContainer is called, if indicated by resource plugin during registration phase,
   842  // before each container start. Resource plugin can run resource specific operations
   843  // such as resetting the resource before making resources available to the container
   844  func (p *DynamicPolicy) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
   845  	return nil, nil
   846  }
   847  
   848  func (p *DynamicPolicy) removePod(podUID string) error {
   849  	podResourceEntries := p.state.GetPodResourceEntries()
   850  	for _, podEntries := range podResourceEntries {
   851  		delete(podEntries, podUID)
   852  	}
   853  
   854  	resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetReservedMemory())
   855  	if err != nil {
   856  		general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err)
   857  		return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err)
   858  	}
   859  
   860  	p.state.SetPodResourceEntries(podResourceEntries)
   861  	p.state.SetMachineState(resourcesMachineState)
   862  	return nil
   863  }
   864  
   865  func (p *DynamicPolicy) removeContainer(podUID, containerName string) error {
   866  	podResourceEntries := p.state.GetPodResourceEntries()
   867  
   868  	found := false
   869  	for _, podEntries := range podResourceEntries {
   870  		if podEntries[podUID][containerName] != nil {
   871  			found = true
   872  		}
   873  
   874  		delete(podEntries[podUID], containerName)
   875  	}
   876  
   877  	if !found {
   878  		return nil
   879  	}
   880  
   881  	resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetReservedMemory())
   882  	if err != nil {
   883  		general.Errorf("pod: %s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", podUID, containerName, err)
   884  		return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err)
   885  	}
   886  
   887  	p.state.SetPodResourceEntries(podResourceEntries)
   888  	p.state.SetMachineState(resourcesMachineState)
   889  	return nil
   890  }
   891  
   892  // getContainerRequestedMemoryBytes parses and returns requested memory bytes for the given container
   893  func (p *DynamicPolicy) getContainerRequestedMemoryBytes(allocationInfo *state.AllocationInfo) int {
   894  	if allocationInfo == nil {
   895  		general.Errorf("got nil allocationInfo")
   896  		return 0
   897  	}
   898  
   899  	if p.metaServer == nil {
   900  		general.Errorf("got nil metaServer")
   901  		return 0
   902  	}
   903  
   904  	container, err := p.metaServer.GetContainerSpec(allocationInfo.PodUid, allocationInfo.ContainerName)
   905  	if err != nil || container == nil {
   906  		general.Errorf("get container failed with error: %v", err)
   907  		return 0
   908  	}
   909  
   910  	memoryQuantity := native.MemoryQuantityGetter()(container.Resources.Requests)
   911  	requestBytes := general.Max(int(memoryQuantity.Value()), 0)
   912  
   913  	general.Infof("get memory request bytes: %d for pod: %s/%s container: %s from podWatcher",
   914  		requestBytes, allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
   915  	return requestBytes
   916  }
   917  
   918  // hasLastLevelEnhancementKey check if the pod with the given UID has the corresponding last level enhancement key
   919  func (p *DynamicPolicy) hasLastLevelEnhancementKey(lastLevelEnhancementKey string, podUID string) bool {
   920  	podEntries := p.state.GetPodResourceEntries()[v1.ResourceMemory]
   921  
   922  	for _, allocationInfo := range podEntries[podUID] {
   923  		if _, ok := allocationInfo.Annotations[lastLevelEnhancementKey]; ok {
   924  			general.Infof("pod: %s has last level enhancement key: %s", podUID, lastLevelEnhancementKey)
   925  			return true
   926  		}
   927  	}
   928  
   929  	general.Infof("pod: %s does not have last level enhancement key: %s", podUID, lastLevelEnhancementKey)
   930  	return false
   931  }