github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go (about) 1 /* 2 Copyright 2022 The Katalyst Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package dynamicpolicy 18 19 import ( 20 "context" 21 "fmt" 22 "sync" 23 "time" 24 25 "github.com/cilium/ebpf" 26 "google.golang.org/grpc" 27 v1 "k8s.io/api/core/v1" 28 "k8s.io/apimachinery/pkg/util/wait" 29 pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" 30 maputil "k8s.io/kubernetes/pkg/util/maps" 31 "k8s.io/utils/clock" 32 33 apiconsts "github.com/kubewharf/katalyst-api/pkg/consts" 34 "github.com/kubewharf/katalyst-api/pkg/plugins/skeleton" 35 "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent" 36 "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/agent/qrm" 37 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc" 38 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" 39 memconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/consts" 40 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor" 41 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/oom" 42 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/state" 43 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/handlers/sockmem" 44 "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" 45 "github.com/kubewharf/katalyst-core/pkg/agent/utilcomponent/periodicalhandler" 46 "github.com/kubewharf/katalyst-core/pkg/config" 47 "github.com/kubewharf/katalyst-core/pkg/config/generic" 48 "github.com/kubewharf/katalyst-core/pkg/metaserver" 49 "github.com/kubewharf/katalyst-core/pkg/metrics" 50 "github.com/kubewharf/katalyst-core/pkg/util/asyncworker" 51 "github.com/kubewharf/katalyst-core/pkg/util/general" 52 "github.com/kubewharf/katalyst-core/pkg/util/machine" 53 "github.com/kubewharf/katalyst-core/pkg/util/native" 54 "github.com/kubewharf/katalyst-core/pkg/util/process" 55 "github.com/kubewharf/katalyst-core/pkg/util/timemonitor" 56 ) 57 58 const ( 59 MemoryResourcePluginPolicyNameDynamic = string(apiconsts.ResourcePluginPolicyNameDynamic) 60 61 memoryPluginStateFileName = "memory_plugin_state" 62 memoryPluginAsyncWorkersName = "qrm_memory_plugin_async_workers" 63 memoryPluginAsyncWorkTopicDropCache = "qrm_memory_plugin_drop_cache" 64 memoryPluginAsyncWorkTopicSetExtraCGMemLimit = "qrm_memory_plugin_set_extra_mem_limit" 65 memoryPluginAsyncWorkTopicMovePage = "qrm_memory_plugin_move_page" 66 memoryPluginAsyncWorkTopicMemoryOffloading = "qrm_memory_plugin_mem_offload" 67 68 dropCacheTimeoutSeconds = 30 69 setExtraCGMemLimitTimeoutSeconds = 60 70 ) 71 72 const ( 73 memsetCheckPeriod = 10 * time.Second 74 stateCheckPeriod = 30 * time.Second 75 maxResidualTime = 5 * time.Minute 76 setMemoryMigratePeriod = 5 * time.Second 77 applyCgroupPeriod = 5 * time.Second 78 setExtraControlKnobsPeriod = 5 * time.Second 79 clearOOMPriorityPeriod = 1 * time.Hour 80 syncOOMPriorityPeriod = 5 * time.Second 81 82 healthCheckTolerationTimes = 3 83 dropCacheGracePeriod = 60 * time.Second 84 ) 85 86 var ( 87 readonlyStateLock sync.RWMutex 88 readonlyState state.ReadonlyState 89 ) 90 91 // GetReadonlyState returns state.ReadonlyState to provides a way 92 // to obtain the running states of the plugin 93 func GetReadonlyState() (state.ReadonlyState, error) { 94 readonlyStateLock.RLock() 95 defer readonlyStateLock.RUnlock() 96 97 if readonlyState == nil { 98 return nil, fmt.Errorf("readonlyState isn't setted") 99 } 100 return readonlyState, nil 101 } 102 103 type DynamicPolicy struct { 104 sync.RWMutex 105 106 stopCh chan struct{} 107 started bool 108 qosConfig *generic.QoSConfiguration 109 extraControlKnobConfigs commonstate.ExtraControlKnobConfigs 110 111 // emitter is used to emit metrics. 112 // metaServer is used to collect metadata universal metaServer. 113 emitter metrics.MetricEmitter 114 metaServer *metaserver.MetaServer 115 116 advisorClient advisorsvc.AdvisorServiceClient 117 advisorConn *grpc.ClientConn 118 lwRecvTimeMonitor *timemonitor.TimeMonitor 119 120 topology *machine.CPUTopology 121 state state.State 122 123 migrateMemoryLock sync.Mutex 124 migratingMemory map[string]map[string]bool 125 residualHitMap map[string]int64 126 127 allocationHandlers map[string]util.AllocationHandler 128 hintHandlers map[string]util.HintHandler 129 enhancementHandlers util.ResourceEnhancementHandlerMap 130 131 extraStateFileAbsPath string 132 name string 133 134 podDebugAnnoKeys []string 135 136 asyncWorkers *asyncworker.AsyncWorkers 137 138 enableSettingMemoryMigrate bool 139 enableSettingSockMem bool 140 enableMemoryAdvisor bool 141 memoryAdvisorSocketAbsPath string 142 memoryPluginSocketAbsPath string 143 144 enableOOMPriority bool 145 oomPriorityMapPinnedPath string 146 oomPriorityMapLock sync.Mutex 147 oomPriorityMap *ebpf.Map 148 } 149 150 func NewDynamicPolicy(agentCtx *agent.GenericContext, conf *config.Configuration, 151 _ interface{}, agentName string, 152 ) (bool, agent.Component, error) { 153 reservedMemory, err := getReservedMemory(conf, agentCtx.MetaServer, agentCtx.MachineInfo) 154 if err != nil { 155 return false, agent.ComponentStub{}, fmt.Errorf("getReservedMemoryFromOptions failed with error: %v", err) 156 } 157 158 resourcesReservedMemory := map[v1.ResourceName]map[int]uint64{ 159 v1.ResourceMemory: reservedMemory, 160 } 161 stateImpl, err := state.NewCheckpointState(conf.GenericQRMPluginConfiguration.StateFileDirectory, memoryPluginStateFileName, 162 memconsts.MemoryResourcePluginPolicyNameDynamic, agentCtx.CPUTopology, agentCtx.MachineInfo, resourcesReservedMemory, conf.SkipMemoryStateCorruption) 163 if err != nil { 164 return false, agent.ComponentStub{}, fmt.Errorf("NewCheckpointState failed with error: %v", err) 165 } 166 167 extraControlKnobConfigs := make(commonstate.ExtraControlKnobConfigs) 168 if len(conf.ExtraControlKnobConfigFile) > 0 { 169 extraControlKnobConfigs, err = commonstate.LoadExtraControlKnobConfigs(conf.ExtraControlKnobConfigFile) 170 if err != nil { 171 return false, agent.ComponentStub{}, fmt.Errorf("loadExtraControlKnobConfigs failed with error: %v", err) 172 } 173 } else { 174 general.Infof("empty ExtraControlKnobConfigFile, initialize empty extraControlKnobConfigs") 175 } 176 177 readonlyStateLock.Lock() 178 readonlyState = stateImpl 179 readonlyStateLock.Unlock() 180 181 wrappedEmitter := agentCtx.EmitterPool.GetDefaultMetricsEmitter().WithTags(agentName, metrics.MetricTag{ 182 Key: util.QRMPluginPolicyTagName, 183 Val: memconsts.MemoryResourcePluginPolicyNameDynamic, 184 }) 185 186 policyImplement := &DynamicPolicy{ 187 topology: agentCtx.CPUTopology, 188 qosConfig: conf.QoSConfiguration, 189 emitter: wrappedEmitter, 190 metaServer: agentCtx.MetaServer, 191 state: stateImpl, 192 stopCh: make(chan struct{}), 193 migratingMemory: make(map[string]map[string]bool), 194 residualHitMap: make(map[string]int64), 195 enhancementHandlers: make(util.ResourceEnhancementHandlerMap), 196 extraStateFileAbsPath: conf.ExtraStateFileAbsPath, 197 name: fmt.Sprintf("%s_%s", agentName, memconsts.MemoryResourcePluginPolicyNameDynamic), 198 podDebugAnnoKeys: conf.PodDebugAnnoKeys, 199 asyncWorkers: asyncworker.NewAsyncWorkers(memoryPluginAsyncWorkersName, wrappedEmitter), 200 enableSettingMemoryMigrate: conf.EnableSettingMemoryMigrate, 201 enableSettingSockMem: conf.EnableSettingSockMem, 202 enableMemoryAdvisor: conf.EnableMemoryAdvisor, 203 memoryAdvisorSocketAbsPath: conf.MemoryAdvisorSocketAbsPath, 204 memoryPluginSocketAbsPath: conf.MemoryPluginSocketAbsPath, 205 extraControlKnobConfigs: extraControlKnobConfigs, // [TODO]: support modifying extraControlKnobConfigs by KCC 206 enableOOMPriority: conf.EnableOOMPriority, 207 oomPriorityMapPinnedPath: conf.OOMPriorityPinnedMapAbsPath, 208 } 209 210 policyImplement.allocationHandlers = map[string]util.AllocationHandler{ 211 apiconsts.PodAnnotationQoSLevelSharedCores: policyImplement.sharedCoresAllocationHandler, 212 apiconsts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresAllocationHandler, 213 apiconsts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresAllocationHandler, 214 } 215 216 policyImplement.hintHandlers = map[string]util.HintHandler{ 217 apiconsts.PodAnnotationQoSLevelSharedCores: policyImplement.sharedCoresHintHandler, 218 apiconsts.PodAnnotationQoSLevelDedicatedCores: policyImplement.dedicatedCoresHintHandler, 219 apiconsts.PodAnnotationQoSLevelReclaimedCores: policyImplement.reclaimedCoresHintHandler, 220 } 221 222 if policyImplement.enableOOMPriority { 223 policyImplement.enhancementHandlers.Register(apiconsts.QRMPhaseRemovePod, 224 apiconsts.PodAnnotationMemoryEnhancementOOMPriority, policyImplement.clearOOMPriority) 225 } 226 227 pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(policyImplement, conf.QRMPluginSocketDirs, 228 func(key string, value int64) { 229 _ = wrappedEmitter.StoreInt64(key, value, metrics.MetricTypeNameRaw) 230 }) 231 if err != nil { 232 return false, agent.ComponentStub{}, fmt.Errorf("dynamic policy new plugin wrapper failed with error: %v", err) 233 } 234 235 memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyMemoryLimitInBytes, 236 memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryLimitInBytes)) 237 memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyCPUSetMems, 238 memoryadvisor.ControlKnobHandlerWithChecker(handleAdvisorCPUSetMems)) 239 memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyDropCache, 240 memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorDropCache)) 241 memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobReclaimedMemorySize, 242 memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryProvisions)) 243 memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnobKeyBalanceNumaMemory, 244 memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleNumaMemoryBalance)) 245 memoryadvisor.RegisterControlKnobHandler(memoryadvisor.ControlKnowKeyMemoryOffloading, 246 memoryadvisor.ControlKnobHandlerWithChecker(policyImplement.handleAdvisorMemoryOffloading)) 247 248 return true, &agent.PluginWrapper{GenericPlugin: pluginWrapper}, nil 249 } 250 251 func (p *DynamicPolicy) registerControlKnobHandlerCheckRules() { 252 general.RegisterReportCheck(memconsts.DropCache, dropCacheGracePeriod) 253 } 254 255 func (p *DynamicPolicy) Start() (err error) { 256 general.Infof("called") 257 258 p.Lock() 259 defer func() { 260 if !p.started { 261 if err == nil { 262 p.started = true 263 } else { 264 close(p.stopCh) 265 } 266 } 267 p.Unlock() 268 }() 269 270 if p.started { 271 general.Infof("already started") 272 return nil 273 } 274 p.stopCh = make(chan struct{}) 275 276 p.registerControlKnobHandlerCheckRules() 277 go wait.Until(func() { 278 _ = p.emitter.StoreInt64(util.MetricNameHeartBeat, 1, metrics.MetricTypeNameRaw) 279 }, time.Second*30, p.stopCh) 280 281 err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.ClearResidualState, 282 general.HealthzCheckStateNotReady, qrm.QRMMemoryPluginPeriodicalHandlerGroupName, 283 p.clearResidualState, stateCheckPeriod, healthCheckTolerationTimes) 284 if err != nil { 285 general.Errorf("start %v failed, err: %v", memconsts.ClearResidualState, err) 286 } 287 288 err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.CheckMemSet, general.HealthzCheckStateNotReady, 289 qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.checkMemorySet, memsetCheckPeriod, healthCheckTolerationTimes) 290 if err != nil { 291 general.Errorf("start %v failed, err: %v", memconsts.CheckMemSet, err) 292 } 293 294 err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.ApplyExternalCGParams, general.HealthzCheckStateNotReady, 295 qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.applyExternalCgroupParams, applyCgroupPeriod, healthCheckTolerationTimes) 296 if err != nil { 297 general.Errorf("start %v failed, err: %v", memconsts.ApplyExternalCGParams, err) 298 } 299 300 err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.SetExtraControlKnob, general.HealthzCheckStateNotReady, 301 qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.setExtraControlKnobByConfigs, setExtraControlKnobsPeriod, healthCheckTolerationTimes) 302 if err != nil { 303 general.Errorf("start %v failed, err: %v", memconsts.SetExtraControlKnob, err) 304 } 305 306 err = p.asyncWorkers.Start(p.stopCh) 307 if err != nil { 308 general.Errorf("start async worker failed, err: %v", err) 309 } 310 311 if p.enableSettingMemoryMigrate { 312 general.Infof("setMemoryMigrate enabled") 313 go wait.Until(p.setMemoryMigrate, setMemoryMigratePeriod, p.stopCh) 314 } 315 316 if p.enableOOMPriority { 317 general.Infof("OOM priority enabled") 318 go p.PollOOMBPFInit(p.stopCh) 319 320 err := periodicalhandler.RegisterPeriodicalHandler(qrm.QRMMemoryPluginPeriodicalHandlerGroupName, 321 oom.ClearResidualOOMPriorityPeriodicalHandlerName, p.clearResidualOOMPriority, clearOOMPriorityPeriod) 322 if err != nil { 323 general.Infof("register clearResidualOOMPriority failed, err=%v", err) 324 } 325 326 err = periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.OOMPriority, general.HealthzCheckStateNotReady, 327 qrm.QRMMemoryPluginPeriodicalHandlerGroupName, p.syncOOMPriority, syncOOMPriorityPeriod, healthCheckTolerationTimes) 328 if err != nil { 329 general.Infof("register syncOOMPriority failed, err=%v", err) 330 } 331 } 332 333 if p.enableSettingSockMem { 334 general.Infof("setSockMem enabled") 335 err := periodicalhandler.RegisterPeriodicalHandlerWithHealthz(memconsts.SetSockMem, 336 general.HealthzCheckStateNotReady, qrm.QRMMemoryPluginPeriodicalHandlerGroupName, 337 sockmem.SetSockMemLimit, 60*time.Second, healthCheckTolerationTimes) 338 if err != nil { 339 general.Infof("setSockMem failed, err=%v", err) 340 } 341 } 342 343 go wait.Until(func() { 344 periodicalhandler.ReadyToStartHandlersByGroup(qrm.QRMMemoryPluginPeriodicalHandlerGroupName) 345 }, 5*time.Second, p.stopCh) 346 347 if !p.enableMemoryAdvisor { 348 general.Infof("start dynamic policy memory plugin without memory advisor") 349 return nil 350 } else if p.memoryAdvisorSocketAbsPath == "" { 351 return fmt.Errorf("invalid memoryAdvisorSocketAbsPath: %s", p.memoryAdvisorSocketAbsPath) 352 } 353 354 general.Infof("start dynamic policy memory plugin with memory advisor") 355 err = p.initAdvisorClientConn() 356 if err != nil { 357 general.Errorf("initAdvisorClientConn failed with error: %v", err) 358 return 359 } 360 361 go wait.BackoffUntil(func() { p.serveForAdvisor(p.stopCh) }, wait.NewExponentialBackoffManager( 362 800*time.Millisecond, 30*time.Second, 2*time.Minute, 2.0, 0, &clock.RealClock{}), true, p.stopCh) 363 364 communicateWithMemoryAdvisorServer := func() { 365 general.Infof("waiting memory plugin checkpoint server serving confirmation") 366 if conn, err := process.Dial(p.memoryPluginSocketAbsPath, 5*time.Second); err != nil { 367 general.Errorf("dial check at socket: %s failed with err: %v", p.memoryPluginSocketAbsPath, err) 368 return 369 } else { 370 _ = conn.Close() 371 } 372 general.Infof("memory plugin checkpoint server serving confirmed") 373 374 // keep compatible to old version sys advisor not supporting list containers from memory plugin 375 err = p.pushMemoryAdvisor() 376 if err != nil { 377 general.Errorf("sync existing containers to memory advisor failed with error: %v", err) 378 return 379 } 380 381 // call lw of MemoryAdvisorServer and do allocation 382 if err := p.lwMemoryAdvisorServer(p.stopCh); err != nil { 383 general.Errorf("lwMemoryAdvisorServer failed with error: %v", err) 384 } else { 385 general.Infof("lwMemoryAdvisorServer finished") 386 } 387 } 388 389 general.RegisterHeartbeatCheck(memconsts.CommunicateWithAdvisor, 2*time.Minute, general.HealthzCheckStateNotReady, 390 2*time.Minute) 391 go wait.BackoffUntil(communicateWithMemoryAdvisorServer, wait.NewExponentialBackoffManager(800*time.Millisecond, 392 30*time.Second, 2*time.Minute, 2.0, 0, &clock.RealClock{}), true, p.stopCh) 393 394 p.lwRecvTimeMonitor = timemonitor.NewTimeMonitor(memoryAdvisorLWRecvTimeMonitorName, 395 memoryAdvisorLWRecvTimeMonitorDurationThreshold, memoryAdvisorLWRecvTimeMonitorInterval, 396 util.MetricNameLWRecvStuck, p.emitter) 397 go p.lwRecvTimeMonitor.Run(p.stopCh) 398 return nil 399 } 400 401 func (p *DynamicPolicy) Stop() error { 402 p.Lock() 403 defer func() { 404 p.oomPriorityMap.Close() 405 p.started = false 406 p.Unlock() 407 general.Warningf("stopped") 408 }() 409 410 if !p.started { 411 general.Warningf("already stopped") 412 return nil 413 } 414 close(p.stopCh) 415 416 periodicalhandler.StopHandlersByGroup(qrm.QRMMemoryPluginPeriodicalHandlerGroupName) 417 418 return nil 419 } 420 421 func (p *DynamicPolicy) Name() string { 422 return p.name 423 } 424 425 func (p *DynamicPolicy) ResourceName() string { 426 return string(v1.ResourceMemory) 427 } 428 429 // GetTopologyHints returns hints of corresponding resources 430 func (p *DynamicPolicy) GetTopologyHints(ctx context.Context, 431 req *pluginapi.ResourceRequest, 432 ) (*pluginapi.ResourceHintsResponse, error) { 433 if req == nil { 434 return nil, fmt.Errorf("GetTopologyHints got nil req") 435 } 436 437 // identify if the pod is a debug pod, 438 // if so, apply specific strategy to it. 439 // since GetKatalystQoSLevelFromResourceReq function will filter annotations, 440 // we should do it before GetKatalystQoSLevelFromResourceReq. 441 isDebugPod := util.IsDebugPod(req.Annotations, p.podDebugAnnoKeys) 442 443 qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req) 444 if err != nil { 445 err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", 446 req.PodNamespace, req.PodName, req.ContainerName, err) 447 general.Errorf("%s", err.Error()) 448 return nil, err 449 } 450 451 reqInt, _, err := util.GetQuantityFromResourceReq(req) 452 if err != nil { 453 return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) 454 } 455 456 general.InfoS("GetTopologyHints is called", 457 "podNamespace", req.PodNamespace, 458 "podName", req.PodName, 459 "containerName", req.ContainerName, 460 "podType", req.PodType, 461 "podRole", req.PodRole, 462 "containerType", req.ContainerType, 463 "qosLevel", qosLevel, 464 "memoryReq(bytes)", reqInt, 465 "isDebugPod", isDebugPod) 466 467 if req.ContainerType == pluginapi.ContainerType_INIT || isDebugPod { 468 general.Infof("there is no NUMA preference, return nil hint") 469 return util.PackResourceHintsResponse(req, string(v1.ResourceMemory), 470 map[string]*pluginapi.ListOfTopologyHints{ 471 string(v1.ResourceMemory): nil, 472 }) 473 } 474 475 p.RLock() 476 defer func() { 477 p.RUnlock() 478 if err != nil { 479 _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw) 480 } 481 }() 482 483 if p.hintHandlers[qosLevel] == nil { 484 return nil, fmt.Errorf("katalyst QoS level: %s is not supported yet", qosLevel) 485 } 486 return p.hintHandlers[qosLevel](ctx, req) 487 } 488 489 func (p *DynamicPolicy) RemovePod(ctx context.Context, 490 req *pluginapi.RemovePodRequest, 491 ) (resp *pluginapi.RemovePodResponse, err error) { 492 if req == nil { 493 return nil, fmt.Errorf("RemovePod got nil req") 494 } 495 496 general.InfoS("called", "podUID", req.PodUid) 497 498 p.Lock() 499 defer func() { 500 p.Unlock() 501 if err != nil { 502 _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw) 503 } 504 }() 505 506 for lastLevelEnhancementKey, handler := range p.enhancementHandlers[apiconsts.QRMPhaseRemovePod] { 507 if p.hasLastLevelEnhancementKey(lastLevelEnhancementKey, req.PodUid) { 508 herr := handler(ctx, p.emitter, p.metaServer, req, 509 p.state.GetPodResourceEntries()) 510 if herr != nil { 511 return &pluginapi.RemovePodResponse{}, herr 512 } 513 } 514 } 515 516 if p.enableMemoryAdvisor { 517 _, err = p.advisorClient.RemovePod(ctx, &advisorsvc.RemovePodRequest{PodUid: req.PodUid}) 518 if err != nil { 519 return nil, fmt.Errorf("remove pod in QoS aware server failed with error: %v", err) 520 } 521 } 522 523 err = p.removePod(req.PodUid) 524 if err != nil { 525 general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid) 526 _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw) 527 return nil, err 528 } 529 530 aErr := p.adjustAllocationEntries() 531 if aErr != nil { 532 general.ErrorS(aErr, "adjustAllocationEntries failed", "podUID", req.PodUid) 533 } 534 535 return &pluginapi.RemovePodResponse{}, nil 536 } 537 538 // GetResourcesAllocation returns allocation results of corresponding resources 539 func (p *DynamicPolicy) GetResourcesAllocation(_ context.Context, 540 req *pluginapi.GetResourcesAllocationRequest, 541 ) (*pluginapi.GetResourcesAllocationResponse, error) { 542 if req == nil { 543 return nil, fmt.Errorf("GetResourcesAllocation got nil req") 544 } 545 546 p.RLock() 547 defer p.RUnlock() 548 549 podResources := make(map[string]*pluginapi.ContainerResources) 550 podEntries := p.state.GetPodResourceEntries()[v1.ResourceMemory] 551 for podUID, containerEntries := range podEntries { 552 if podResources[podUID] == nil { 553 podResources[podUID] = &pluginapi.ContainerResources{} 554 } 555 556 for containerName, allocationInfo := range containerEntries { 557 if allocationInfo == nil { 558 continue 559 } 560 561 if podResources[podUID].ContainerResources == nil { 562 podResources[podUID].ContainerResources = make(map[string]*pluginapi.ResourceAllocation) 563 } 564 565 var err error 566 podResources[podUID].ContainerResources[containerName], err = allocationInfo.GetResourceAllocation() 567 if err != nil { 568 errMsg := "allocationInfo.GetResourceAllocation failed" 569 general.ErrorS(err, errMsg, 570 "podNamespace", allocationInfo.PodNamespace, 571 "podName", allocationInfo.PodName, 572 "containerName", allocationInfo.ContainerName) 573 return nil, fmt.Errorf(errMsg) 574 } 575 } 576 } 577 578 return &pluginapi.GetResourcesAllocationResponse{ 579 PodResources: podResources, 580 }, nil 581 } 582 583 // GetTopologyAwareResources returns allocation results of corresponding resources as topology aware format 584 func (p *DynamicPolicy) GetTopologyAwareResources(_ context.Context, 585 req *pluginapi.GetTopologyAwareResourcesRequest, 586 ) (*pluginapi.GetTopologyAwareResourcesResponse, error) { 587 if req == nil { 588 return nil, fmt.Errorf("GetTopologyAwareResources got nil req") 589 } 590 591 p.RLock() 592 defer p.RUnlock() 593 594 allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) 595 if allocationInfo == nil { 596 return nil, fmt.Errorf("pod: %s, container: %s is not show up in memory plugin state", req.PodUid, req.ContainerName) 597 } 598 599 topologyAwareQuantityList := util.GetTopologyAwareQuantityFromAssignmentsSize(allocationInfo.TopologyAwareAllocations) 600 resp := &pluginapi.GetTopologyAwareResourcesResponse{ 601 PodUid: allocationInfo.PodUid, 602 PodName: allocationInfo.PodName, 603 PodNamespace: allocationInfo.PodNamespace, 604 ContainerTopologyAwareResources: &pluginapi.ContainerTopologyAwareResources{ 605 ContainerName: allocationInfo.ContainerName, 606 }, 607 } 608 609 if allocationInfo.CheckSideCar() { 610 resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{ 611 string(v1.ResourceMemory): { 612 IsNodeResource: false, 613 IsScalarResource: true, 614 AggregatedQuantity: 0, 615 OriginalAggregatedQuantity: 0, 616 TopologyAwareQuantityList: nil, 617 OriginalTopologyAwareQuantityList: nil, 618 }, 619 } 620 } else { 621 resp.ContainerTopologyAwareResources.AllocatedResources = map[string]*pluginapi.TopologyAwareResource{ 622 string(v1.ResourceMemory): { 623 IsNodeResource: false, 624 IsScalarResource: true, 625 AggregatedQuantity: float64(allocationInfo.AggregatedQuantity), 626 OriginalAggregatedQuantity: float64(allocationInfo.AggregatedQuantity), 627 TopologyAwareQuantityList: topologyAwareQuantityList, 628 OriginalTopologyAwareQuantityList: topologyAwareQuantityList, 629 }, 630 } 631 } 632 633 return resp, nil 634 } 635 636 // GetTopologyAwareAllocatableResources returns corresponding allocatable resources as topology aware format 637 func (p *DynamicPolicy) GetTopologyAwareAllocatableResources(context.Context, 638 *pluginapi.GetTopologyAwareAllocatableResourcesRequest, 639 ) (*pluginapi.GetTopologyAwareAllocatableResourcesResponse, error) { 640 p.RLock() 641 defer p.RUnlock() 642 643 machineState := p.state.GetMachineState()[v1.ResourceMemory] 644 645 numaNodes := p.topology.CPUDetails.NUMANodes().ToSliceInt() 646 topologyAwareAllocatableQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) 647 topologyAwareCapacityQuantityList := make([]*pluginapi.TopologyAwareQuantity, 0, len(machineState)) 648 649 var aggregatedAllocatableQuantity, aggregatedCapacityQuantity uint64 = 0, 0 650 for _, numaNode := range numaNodes { 651 numaNodeState := machineState[numaNode] 652 if numaNodeState == nil { 653 return nil, fmt.Errorf("nil numaNodeState for NUMA: %d", numaNode) 654 } 655 656 topologyAwareAllocatableQuantityList = append(topologyAwareAllocatableQuantityList, &pluginapi.TopologyAwareQuantity{ 657 ResourceValue: float64(numaNodeState.Allocatable), 658 Node: uint64(numaNode), 659 }) 660 topologyAwareCapacityQuantityList = append(topologyAwareCapacityQuantityList, &pluginapi.TopologyAwareQuantity{ 661 ResourceValue: float64(numaNodeState.TotalMemSize), 662 Node: uint64(numaNode), 663 }) 664 aggregatedAllocatableQuantity += numaNodeState.Allocatable 665 aggregatedCapacityQuantity += numaNodeState.TotalMemSize 666 } 667 668 return &pluginapi.GetTopologyAwareAllocatableResourcesResponse{ 669 AllocatableResources: map[string]*pluginapi.AllocatableTopologyAwareResource{ 670 string(v1.ResourceMemory): { 671 IsNodeResource: false, 672 IsScalarResource: true, 673 AggregatedAllocatableQuantity: float64(aggregatedAllocatableQuantity), 674 TopologyAwareAllocatableQuantityList: topologyAwareAllocatableQuantityList, 675 AggregatedCapacityQuantity: float64(aggregatedCapacityQuantity), 676 TopologyAwareCapacityQuantityList: topologyAwareCapacityQuantityList, 677 }, 678 }, 679 }, nil 680 } 681 682 // GetResourcePluginOptions returns options to be communicated with Resource Manager 683 func (p *DynamicPolicy) GetResourcePluginOptions(context.Context, 684 *pluginapi.Empty, 685 ) (*pluginapi.ResourcePluginOptions, error) { 686 return &pluginapi.ResourcePluginOptions{ 687 PreStartRequired: false, 688 WithTopologyAlignment: true, 689 NeedReconcile: true, 690 }, nil 691 } 692 693 // Allocate is called during pod admit so that the resource 694 // plugin can allocate corresponding resource for the container 695 // according to resource request 696 func (p *DynamicPolicy) Allocate(ctx context.Context, 697 req *pluginapi.ResourceRequest, 698 ) (resp *pluginapi.ResourceAllocationResponse, respErr error) { 699 if req == nil { 700 return nil, fmt.Errorf("Allocate got nil req") 701 } 702 703 // identify if the pod is a debug pod, 704 // if so, apply specific strategy to it. 705 // since GetKatalystQoSLevelFromResourceReq function will filter annotations, 706 // we should do it before GetKatalystQoSLevelFromResourceReq. 707 isDebugPod := util.IsDebugPod(req.Annotations, p.podDebugAnnoKeys) 708 709 qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.qosConfig, req) 710 if err != nil { 711 err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", 712 req.PodNamespace, req.PodName, req.ContainerName, err) 713 general.Errorf("%s", err.Error()) 714 return nil, err 715 } 716 717 reqInt, _, err := util.GetQuantityFromResourceReq(req) 718 if err != nil { 719 return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err) 720 } 721 722 general.InfoS("called", 723 "podNamespace", req.PodNamespace, 724 "podName", req.PodName, 725 "containerName", req.ContainerName, 726 "podType", req.PodType, 727 "podRole", req.PodRole, 728 "qosLevel", qosLevel, 729 "memoryReq(bytes)", reqInt) 730 731 if req.ContainerType == pluginapi.ContainerType_INIT { 732 return &pluginapi.ResourceAllocationResponse{ 733 PodUid: req.PodUid, 734 PodNamespace: req.PodNamespace, 735 PodName: req.PodName, 736 ContainerName: req.ContainerName, 737 ContainerType: req.ContainerType, 738 ContainerIndex: req.ContainerIndex, 739 PodRole: req.PodRole, 740 PodType: req.PodType, 741 ResourceName: string(v1.ResourceMemory), 742 Labels: general.DeepCopyMap(req.Labels), 743 Annotations: general.DeepCopyMap(req.Annotations), 744 }, nil 745 } else if isDebugPod { 746 return &pluginapi.ResourceAllocationResponse{ 747 PodUid: req.PodUid, 748 PodNamespace: req.PodNamespace, 749 PodName: req.PodName, 750 ContainerName: req.ContainerName, 751 ContainerType: req.ContainerType, 752 ContainerIndex: req.ContainerIndex, 753 PodRole: req.PodRole, 754 PodType: req.PodType, 755 ResourceName: string(v1.ResourceMemory), 756 AllocationResult: &pluginapi.ResourceAllocation{ 757 ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ 758 string(v1.ResourceMemory): { 759 // return ResourceAllocation with empty OciPropertyName, AllocatedQuantity, AllocationResult for containers in debug pod, 760 // it won't influence oci spec properties of the container 761 IsNodeResource: false, 762 IsScalarResource: true, 763 }, 764 }, 765 }, 766 Labels: general.DeepCopyMap(req.Labels), 767 Annotations: general.DeepCopyMap(req.Annotations), 768 }, nil 769 } 770 771 p.Lock() 772 defer func() { 773 // calls sys-advisor to inform the latest container 774 if p.enableMemoryAdvisor && respErr == nil && req.ContainerType != pluginapi.ContainerType_INIT { 775 _, err := p.advisorClient.AddContainer(ctx, &advisorsvc.ContainerMetadata{ 776 PodUid: req.PodUid, 777 PodNamespace: req.PodNamespace, 778 PodName: req.PodName, 779 ContainerName: req.ContainerName, 780 ContainerType: req.ContainerType, 781 ContainerIndex: req.ContainerIndex, 782 Labels: maputil.CopySS(req.Labels), 783 Annotations: maputil.CopySS(req.Annotations), 784 QosLevel: qosLevel, 785 RequestQuantity: uint64(reqInt), 786 }) 787 if err != nil { 788 resp = nil 789 respErr = fmt.Errorf("add container to qos aware server failed with error: %v", err) 790 _ = p.removeContainer(req.PodUid, req.ContainerName) 791 } 792 } else if respErr != nil { 793 _ = p.removeContainer(req.PodUid, req.ContainerName) 794 _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw) 795 } 796 797 p.Unlock() 798 return 799 }() 800 801 allocationInfo := p.state.GetAllocationInfo(v1.ResourceMemory, req.PodUid, req.ContainerName) 802 if allocationInfo != nil && allocationInfo.AggregatedQuantity >= uint64(reqInt) { 803 general.InfoS("already allocated and meet requirement", 804 "podNamespace", req.PodNamespace, 805 "podName", req.PodName, 806 "containerName", req.ContainerName, 807 "memoryReq(bytes)", reqInt, 808 "currentResult(bytes)", allocationInfo.AggregatedQuantity) 809 return &pluginapi.ResourceAllocationResponse{ 810 PodUid: req.PodUid, 811 PodNamespace: req.PodNamespace, 812 PodName: req.PodName, 813 ContainerName: req.ContainerName, 814 ContainerType: req.ContainerType, 815 ContainerIndex: req.ContainerIndex, 816 PodRole: req.PodRole, 817 PodType: req.PodType, 818 ResourceName: string(v1.ResourceMemory), 819 AllocationResult: &pluginapi.ResourceAllocation{ 820 ResourceAllocation: map[string]*pluginapi.ResourceAllocationInfo{ 821 string(v1.ResourceMemory): { 822 OciPropertyName: util.OCIPropertyNameCPUSetMems, 823 IsNodeResource: false, 824 IsScalarResource: true, 825 AllocatedQuantity: float64(allocationInfo.AggregatedQuantity), 826 AllocationResult: allocationInfo.NumaAllocationResult.String(), 827 }, 828 }, 829 }, 830 Labels: general.DeepCopyMap(req.Labels), 831 Annotations: general.DeepCopyMap(req.Annotations), 832 }, nil 833 } 834 835 if p.allocationHandlers[qosLevel] == nil { 836 return nil, fmt.Errorf("katalyst QoS level: %s is not supported yet", qosLevel) 837 } 838 return p.allocationHandlers[qosLevel](ctx, req) 839 } 840 841 // PreStartContainer is called, if indicated by resource plugin during registration phase, 842 // before each container start. Resource plugin can run resource specific operations 843 // such as resetting the resource before making resources available to the container 844 func (p *DynamicPolicy) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 845 return nil, nil 846 } 847 848 func (p *DynamicPolicy) removePod(podUID string) error { 849 podResourceEntries := p.state.GetPodResourceEntries() 850 for _, podEntries := range podResourceEntries { 851 delete(podEntries, podUID) 852 } 853 854 resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetReservedMemory()) 855 if err != nil { 856 general.Errorf("pod: %s, GenerateMachineStateFromPodEntries failed with error: %v", podUID, err) 857 return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) 858 } 859 860 p.state.SetPodResourceEntries(podResourceEntries) 861 p.state.SetMachineState(resourcesMachineState) 862 return nil 863 } 864 865 func (p *DynamicPolicy) removeContainer(podUID, containerName string) error { 866 podResourceEntries := p.state.GetPodResourceEntries() 867 868 found := false 869 for _, podEntries := range podResourceEntries { 870 if podEntries[podUID][containerName] != nil { 871 found = true 872 } 873 874 delete(podEntries[podUID], containerName) 875 } 876 877 if !found { 878 return nil 879 } 880 881 resourcesMachineState, err := state.GenerateMachineStateFromPodEntries(p.state.GetMachineInfo(), podResourceEntries, p.state.GetReservedMemory()) 882 if err != nil { 883 general.Errorf("pod: %s, container: %s GenerateMachineStateFromPodEntries failed with error: %v", podUID, containerName, err) 884 return fmt.Errorf("calculate machineState by updated pod entries failed with error: %v", err) 885 } 886 887 p.state.SetPodResourceEntries(podResourceEntries) 888 p.state.SetMachineState(resourcesMachineState) 889 return nil 890 } 891 892 // getContainerRequestedMemoryBytes parses and returns requested memory bytes for the given container 893 func (p *DynamicPolicy) getContainerRequestedMemoryBytes(allocationInfo *state.AllocationInfo) int { 894 if allocationInfo == nil { 895 general.Errorf("got nil allocationInfo") 896 return 0 897 } 898 899 if p.metaServer == nil { 900 general.Errorf("got nil metaServer") 901 return 0 902 } 903 904 container, err := p.metaServer.GetContainerSpec(allocationInfo.PodUid, allocationInfo.ContainerName) 905 if err != nil || container == nil { 906 general.Errorf("get container failed with error: %v", err) 907 return 0 908 } 909 910 memoryQuantity := native.MemoryQuantityGetter()(container.Resources.Requests) 911 requestBytes := general.Max(int(memoryQuantity.Value()), 0) 912 913 general.Infof("get memory request bytes: %d for pod: %s/%s container: %s from podWatcher", 914 requestBytes, allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName) 915 return requestBytes 916 } 917 918 // hasLastLevelEnhancementKey check if the pod with the given UID has the corresponding last level enhancement key 919 func (p *DynamicPolicy) hasLastLevelEnhancementKey(lastLevelEnhancementKey string, podUID string) bool { 920 podEntries := p.state.GetPodResourceEntries()[v1.ResourceMemory] 921 922 for _, allocationInfo := range podEntries[podUID] { 923 if _, ok := allocationInfo.Annotations[lastLevelEnhancementKey]; ok { 924 general.Infof("pod: %s has last level enhancement key: %s", podUID, lastLevelEnhancementKey) 925 return true 926 } 927 } 928 929 general.Infof("pod: %s does not have last level enhancement key: %s", podUID, lastLevelEnhancementKey) 930 return false 931 }