github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy_allocation_handlers.go

/*
Copyright 2022 The Katalyst Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package dynamicpolicy

import (
	"context"
	"fmt"
	"math"
	"sort"
	"time"

	v1 "k8s.io/api/core/v1"
	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"

	apiconsts "github.com/kubewharf/katalyst-api/pkg/consts"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/calculator"
	advisorapi "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
	cpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/util"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
	"github.com/kubewharf/katalyst-core/pkg/metrics"
	"github.com/kubewharf/katalyst-core/pkg/util/general"
	"github.com/kubewharf/katalyst-core/pkg/util/machine"
	"github.com/kubewharf/katalyst-core/pkg/util/native"
	qosutil "github.com/kubewharf/katalyst-core/pkg/util/qos"
)

func (p *DynamicPolicy) sharedCoresAllocationHandler(_ context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("sharedCoresAllocationHandler got nil request")
	}

	_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
	}

	machineState := p.state.GetMachineState()
	pooledCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs,
		state.CheckDedicated, state.CheckDedicatedNUMABinding)
	if pooledCPUs.IsEmpty() {
		general.Errorf("pod: %s/%s, container: %s get empty pooledCPUs", req.PodNamespace, req.PodName, req.ContainerName)
		return nil, fmt.Errorf("get empty pooledCPUs")
	}

	pooledCPUsTopologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, pooledCPUs)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GetTopologyAwareAssignmentsByCPUSet failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GetTopologyAwareAssignmentsByCPUSet failed with error: %v", err)
	}

	needSet := true
	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
	err = updateAllocationInfoByReq(req, allocationInfo)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s updateAllocationInfoByReq failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("updateAllocationInfoByReq failed with error: %v", err)
	}

	if allocationInfo == nil {
		general.Infof("pod: %s/%s, container: %s is met for the first time, do ramp up with pooled cpus: %s",
			req.PodNamespace, req.PodName, req.ContainerName, pooledCPUs.String())

		shouldRampUp := p.shouldSharedCoresRampUp(req.PodUid)

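		// build a brand-new allocation entry for this container; while RampUp is true
		// it runs on the whole pooled cpuset and will be moved into its specified pool
		// by a later adjustment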
		allocationInfo = &state.AllocationInfo{
			PodUid: req.PodUid,
			PodNamespace: req.PodNamespace,
			PodName: req.PodName,
			ContainerName: req.ContainerName,
			ContainerType: req.ContainerType.String(),
			ContainerIndex: req.ContainerIndex,
			RampUp: shouldRampUp,
			OwnerPoolName: advisorapi.EmptyOwnerPoolName,
			PodRole: req.PodRole,
			PodType: req.PodType,
			AllocationResult: pooledCPUs,
			OriginalAllocationResult: pooledCPUs.Clone(),
			TopologyAwareAssignments: pooledCPUsTopologyAwareAssignments,
			OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(pooledCPUsTopologyAwareAssignments),
			InitTimestamp: time.Now().Format(util.QRMTimeFormat),
			Labels: general.DeepCopyMap(req.Labels),
			Annotations: general.DeepCopyMap(req.Annotations),
			QoSLevel: apiconsts.PodAnnotationQoSLevelSharedCores,
			RequestQuantity: reqFloat64,
		}

		if !shouldRampUp {
			targetPoolName := allocationInfo.GetSpecifiedPoolName()
			poolAllocationInfo := p.state.GetAllocationInfo(targetPoolName, advisorapi.FakedContainerName)

			if poolAllocationInfo == nil {
				general.Infof("pod: %s/%s, container: %s is active, but its specified pool entry doesn't exist, try to ramp it up",
					req.PodNamespace, req.PodName, req.ContainerName)
				allocationInfo.RampUp = true
			} else {
				p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
				err := p.doAndCheckPutAllocationInfo(allocationInfo, false)
				if err != nil {
					return nil, err
				}

				needSet = false
			}
		}
	} else if allocationInfo.RampUp {
		general.Infof("pod: %s/%s, container: %s is still in ramp up, allocate pooled cpus: %s",
			req.PodNamespace, req.PodName, req.ContainerName, pooledCPUs.String())

		allocationInfo.AllocationResult = pooledCPUs
		allocationInfo.OriginalAllocationResult = pooledCPUs.Clone()
		allocationInfo.TopologyAwareAssignments = pooledCPUsTopologyAwareAssignments
		allocationInfo.OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(pooledCPUsTopologyAwareAssignments)
	} else {
		err := p.doAndCheckPutAllocationInfo(allocationInfo, true)
		if err != nil {
			return nil, err
		}

		needSet = false
	}

	if needSet {
		// update pod entries directly.
		// if one of the subsequent steps fails,
		// we will delete the current allocationInfo from podEntries in the defer function of the allocation function.
		p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
		podEntries := p.state.GetPodEntries()

		updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
		if err != nil {
			general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
				req.PodNamespace, req.PodName, req.ContainerName, err)
			return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
		}
		p.state.SetMachineState(updatedMachineState)
	}

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}

func (p *DynamicPolicy) reclaimedCoresAllocationHandler(_ context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("reclaimedCoresAllocationHandler got nil request")
	}

	_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
	}

	allocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
	err = updateAllocationInfoByReq(req, allocationInfo)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s updateAllocationInfoByReq failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("updateAllocationInfoByReq failed with error: %v", err)
	}

	reclaimedAllocationInfo := p.state.GetAllocationInfo(state.PoolNameReclaim, advisorapi.FakedContainerName)
	if reclaimedAllocationInfo == nil {
		general.Errorf("allocation for pod: %s/%s, container: %s failed, because pool: %s is not ready",
			req.PodNamespace, req.PodName, req.ContainerName, state.PoolNameReclaim)

		return nil, fmt.Errorf("pool: %s is not ready", state.PoolNameReclaim)
	} else if reclaimedAllocationInfo.AllocationResult.Size() == 0 {
		general.Errorf("allocation for pod: %s/%s, container: %s failed, because pool: %s is empty",
			req.PodNamespace, req.PodName, req.ContainerName, state.PoolNameReclaim)

		return nil, fmt.Errorf("pool: %s is empty", state.PoolNameReclaim)
	}

	if allocationInfo != nil {
		general.Infof("pod: %s/%s, container: %s with old allocation result: %s, allocate by reclaimedCPUSet: %s",
			req.PodNamespace, req.PodName, req.ContainerName, allocationInfo.AllocationResult.String(), reclaimedAllocationInfo.AllocationResult.String())
	} else {
		general.Infof("pod: %s/%s, container: %s is met for the first time, allocate by reclaimedCPUSet: %s",
			req.PodNamespace, req.PodName, req.ContainerName, reclaimedAllocationInfo.AllocationResult.String())

		allocationInfo = &state.AllocationInfo{
			PodUid: req.PodUid,
			PodNamespace: req.PodNamespace,
			PodName: req.PodName,
			ContainerName: req.ContainerName,
			ContainerType: req.ContainerType.String(),
			ContainerIndex: req.ContainerIndex,
			OwnerPoolName: state.PoolNameReclaim,
			PodRole: req.PodRole,
			PodType: req.PodType,
			InitTimestamp: time.Now().Format(util.QRMTimeFormat),
			Labels: general.DeepCopyMap(req.Labels),
			Annotations: general.DeepCopyMap(req.Annotations),
			QoSLevel: apiconsts.PodAnnotationQoSLevelReclaimedCores,
			RequestQuantity: reqFloat64,
		}
	}

	allocationInfo.OwnerPoolName = state.PoolNameReclaim
	allocationInfo.AllocationResult = reclaimedAllocationInfo.AllocationResult.Clone()
	allocationInfo.OriginalAllocationResult = reclaimedAllocationInfo.OriginalAllocationResult.Clone()
	allocationInfo.TopologyAwareAssignments = machine.DeepcopyCPUAssignment(reclaimedAllocationInfo.TopologyAwareAssignments)
	allocationInfo.OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(reclaimedAllocationInfo.OriginalTopologyAwareAssignments)

	// update pod entries directly.
	// if one of the subsequent steps fails, we will delete the current allocationInfo from podEntries in the defer function of the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries := p.state.GetPodEntries()

	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	return resp, nil
}

func (p *DynamicPolicy) dedicatedCoresAllocationHandler(ctx context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("dedicatedCoresAllocationHandler got nil req")
	}

	switch req.Annotations[apiconsts.PodAnnotationMemoryEnhancementNumaBinding] {
	case apiconsts.PodAnnotationMemoryEnhancementNumaBindingEnable:
		return p.dedicatedCoresWithNUMABindingAllocationHandler(ctx, req)
	default:
		return p.dedicatedCoresWithoutNUMABindingAllocationHandler(ctx, req)
	}
}

func (p *DynamicPolicy) dedicatedCoresWithoutNUMABindingAllocationHandler(_ context.Context,
	_ *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	// todo: support dedicated_cores without NUMA binding
	return nil, fmt.Errorf("dedicated_cores without NUMA binding is not supported")
}

func (p *DynamicPolicy) dedicatedCoresWithNUMABindingAllocationHandler(ctx context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req.ContainerType == pluginapi.ContainerType_SIDECAR {
		return p.dedicatedCoresWithNUMABindingAllocationSidecarHandler(ctx, req)
	}

	var machineState state.NUMANodeMap
	oldAllocationInfo := p.state.GetAllocationInfo(req.PodUid, req.ContainerName)
	if oldAllocationInfo == nil {
		machineState = p.state.GetMachineState()
	} else {
		p.state.Delete(req.PodUid, req.ContainerName)
		podEntries := p.state.GetPodEntries()

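		// the stale allocation has been deleted above, so regenerate machine state
		// from the remaining pod entries before allocating by the new hint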
		var err error
		machineState, err = generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
		if err != nil {
			general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
				req.PodNamespace, req.PodName, req.ContainerName, err)
			return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
		}
	}

	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
	}

	result, err := p.allocateNumaBindingCPUs(reqInt, req.Hint, machineState, req.Annotations)
	if err != nil {
		general.ErrorS(err, "unable to allocate CPUs",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUsInt", reqInt,
			"numCPUsFloat64", reqFloat64)
		return nil, err
	}

	general.InfoS("allocate CPUs successfully",
		"podNamespace", req.PodNamespace,
		"podName", req.PodName,
		"containerName", req.ContainerName,
		"numCPUsInt", reqInt,
		"numCPUsFloat64", reqFloat64,
		"result", result.String())

	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, result)
	if err != nil {
		general.ErrorS(err, "unable to calculate topologyAwareAssignments",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUsInt", reqInt,
			"numCPUsFloat64", reqFloat64,
			"result cpuset", result.String())
		return nil, err
	}

	allocationInfo := &state.AllocationInfo{
		PodUid: req.PodUid,
		PodNamespace: req.PodNamespace,
		PodName: req.PodName,
		ContainerName: req.ContainerName,
		ContainerType: req.ContainerType.String(),
		ContainerIndex: req.ContainerIndex,
		RampUp: true,
		PodRole: req.PodRole,
		PodType: req.PodType,
		OwnerPoolName: state.PoolNameDedicated,
		AllocationResult: result.Clone(),
		OriginalAllocationResult: result.Clone(),
		TopologyAwareAssignments: topologyAwareAssignments,
		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
		InitTimestamp: time.Now().Format(util.QRMTimeFormat),
		QoSLevel: apiconsts.PodAnnotationQoSLevelDedicatedCores,
		Labels: general.DeepCopyMap(req.Labels),
		Annotations: general.DeepCopyMap(req.Annotations),
		RequestQuantity: reqFloat64,
	}

	// update pod entries directly.
	// if one of the subsequent steps fails, we will delete the current allocationInfo from podEntries in the defer function of the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries := p.state.GetPodEntries()

	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	err = p.adjustAllocationEntries()
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s adjustAllocationEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("adjustAllocationEntries failed with error: %v", err)
	}

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s PackResourceAllocationResponseByAllocationInfo failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}

// dedicatedCoresWithNUMABindingAllocationSidecarHandler currently sets the cpuset of a sidecar to the cpuset of its main container
func (p *DynamicPolicy) dedicatedCoresWithNUMABindingAllocationSidecarHandler(_ context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	_, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
	}

	podEntries := p.state.GetPodEntries()
	if podEntries[req.PodUid] == nil {
		general.Infof("there is no pod entry, pod: %s/%s, sidecar: %s, waiting next reconcile",
			req.PodNamespace, req.PodName, req.ContainerName)
		return &pluginapi.ResourceAllocationResponse{}, nil
	}

	mainContainerAllocationInfo := podEntries[req.PodUid].GetMainContainerEntry()

	// todo: consider sidecar without reconcile in vpa
	if mainContainerAllocationInfo == nil {
		general.Infof("main container is not found for pod: %s/%s, sidecar: %s, waiting next reconcile",
			req.PodNamespace, req.PodName, req.ContainerName)
		return &pluginapi.ResourceAllocationResponse{}, nil
	}

	allocationInfo := &state.AllocationInfo{
		PodUid: req.PodUid,
		PodNamespace: req.PodNamespace,
		PodName: req.PodName,
		ContainerName: req.ContainerName,
		ContainerType: req.ContainerType.String(),
		ContainerIndex: req.ContainerIndex,
		PodRole: req.PodRole,
		PodType: req.PodType,
		AllocationResult: mainContainerAllocationInfo.AllocationResult.Clone(),
		OriginalAllocationResult: mainContainerAllocationInfo.OriginalAllocationResult.Clone(),
		TopologyAwareAssignments: machine.DeepcopyCPUAssignment(mainContainerAllocationInfo.TopologyAwareAssignments),
		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(mainContainerAllocationInfo.OriginalTopologyAwareAssignments),
		InitTimestamp: time.Now().Format(util.QRMTimeFormat),
		QoSLevel: apiconsts.PodAnnotationQoSLevelDedicatedCores,
		Labels: general.DeepCopyMap(req.Labels),
		Annotations: general.DeepCopyMap(req.Annotations),
		RequestQuantity: reqFloat64,
	}

	// update pod entries directly.
	// if one of the subsequent steps fails, we will delete the current allocationInfo from podEntries in the defer function of the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries = p.state.GetPodEntries()

	updatedMachineState, err := generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s packAllocationResponse failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}

func (p *DynamicPolicy) allocateNumaBindingCPUs(numCPUs int, hint *pluginapi.TopologyHint,
	machineState state.NUMANodeMap, reqAnnotations map[string]string,
) (machine.CPUSet, error) {
	if hint == nil {
		return machine.NewCPUSet(), fmt.Errorf("hint is nil")
	} else if len(hint.Nodes) == 0 {
		return machine.NewCPUSet(), fmt.Errorf("hint is empty")
	} else if qosutil.AnnotationsIndicateNUMABinding(reqAnnotations) &&
		!qosutil.AnnotationsIndicateNUMAExclusive(reqAnnotations) &&
		len(hint.Nodes) > 1 {
		return machine.NewCPUSet(), fmt.Errorf("NUMA binding (but not exclusive) container has a request larger than 1 NUMA node")
	}

	result := machine.NewCPUSet()
	alignedAvailableCPUs := machine.CPUSet{}
	for _, numaNode := range hint.Nodes {
		alignedAvailableCPUs = alignedAvailableCPUs.Union(machineState[int(numaNode)].GetAvailableCPUSet(p.reservedCPUs))
	}

	var alignedCPUs machine.CPUSet

	if qosutil.AnnotationsIndicateNUMAExclusive(reqAnnotations) {
		// todo: currently we assume dedicated_cores with NUMA binding takes up whole NUMA nodes,
		// and we will modify the strategy here if that assumption breaks.
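		// NUMA exclusive: take every available CPU on the hinted NUMA nodes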
		alignedCPUs = alignedAvailableCPUs.Clone()
	} else {
		var err error
		alignedCPUs, err = calculator.TakeByTopology(p.machineInfo, alignedAvailableCPUs, numCPUs)
		if err != nil {
			general.ErrorS(err, "take cpus for NUMA binding (but not exclusive) container failed",
				"hints", hint.Nodes,
				"alignedAvailableCPUs", alignedAvailableCPUs.String())

			return machine.NewCPUSet(),
				fmt.Errorf("take cpus for NUMA binding (but not exclusive) container failed with err: %v", err)
		}
	}

	general.InfoS("allocate by hints",
		"hints", hint.Nodes,
		"alignedAvailableCPUs", alignedAvailableCPUs.String(),
		"alignedAllocatedCPUs", alignedCPUs)

	// currently, result equals alignedCPUs;
	// we may extend it with non-aligned cpus to meet the requirement later
	result = result.Union(alignedCPUs)
	leftNumCPUs := numCPUs - result.Size()
	if leftNumCPUs > 0 {
		general.Errorf("result cpus: %s in hint NUMA nodes: %d with size: %d can't meet cpus request: %d",
			result.String(), hint.Nodes, result.Size(), numCPUs)

		return machine.NewCPUSet(), fmt.Errorf("results can't meet cpus request")
	}
	return result, nil
}

// putAllocationsAndAdjustAllocationEntries calculates and generates the latest checkpoint
// - unlike adjustAllocationEntries, it also takes the given allocationInfos into account
func (p *DynamicPolicy) putAllocationsAndAdjustAllocationEntries(allocationInfos []*state.AllocationInfo, incrByReq bool) error {
	if len(allocationInfos) == 0 {
		return nil
	}

	entries := p.state.GetPodEntries()
	machineState := p.state.GetMachineState()

	var poolsQuantityMap map[string]int
	if p.enableCPUAdvisor {
		// if sys advisor is enabled, we trust the pools' ratio that sys advisor indicates
		poolsQuantityMap = machine.ParseCPUAssignmentQuantityMap(entries.GetFilteredPoolsCPUSetMap(state.ResidentPools))
	} else {
		// else we sum up the containers' requests for each pool to get the pools' ratio
		poolsQuantityMap = state.GetSharedQuantityMapFromPodEntries(entries, allocationInfos)
	}

	incrQuantityMap := make(map[string]float64)
	for _, allocationInfo := range allocationInfos {
		if allocationInfo == nil {
			return fmt.Errorf("found nil allocationInfo in input parameter")
		} else if !state.CheckShared(allocationInfo) {
			return fmt.Errorf("put container with invalid qos level: %s into pool", allocationInfo.QoSLevel)
		} else if entries[allocationInfo.PodUid][allocationInfo.ContainerName] == nil {
			return fmt.Errorf("entry %s/%s, %s isn't found in state",
				allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
		}

		poolName := allocationInfo.GetSpecifiedPoolName()
		if poolName == advisorapi.EmptyOwnerPoolName {
			return fmt.Errorf("allocationInfo points to empty poolName")
		}

		if incrByReq {
			requestQuantity := state.GetContainerRequestedCores()(allocationInfo)
			incrQuantityMap[poolName] += requestQuantity
			general.Infof("put allocation with request quantity: %.3f", requestQuantity)
		}
	}

	for poolName, incrQuantity := range incrQuantityMap {
		incrInt := int(math.Ceil(incrQuantity))
		poolsQuantityMap[poolName] += incrInt
		general.Infof("increase pool: %s by %d", poolName, incrInt)
	}

	isolatedQuantityMap := state.GetIsolatedQuantityMapFromPodEntries(entries, allocationInfos)
	err := p.adjustPoolsAndIsolatedEntries(poolsQuantityMap, isolatedQuantityMap, entries, machineState)
	if err != nil {
		return fmt.Errorf("adjustPoolsAndIsolatedEntries failed with error: %v", err)
	}

	return nil
}

// adjustAllocationEntries calculates and generates the latest checkpoint
func (p *DynamicPolicy) adjustAllocationEntries() error {
	entries := p.state.GetPodEntries()
	machineState := p.state.GetMachineState()

	// since adjustAllocationEntries will cause pools to be re-generated,
	// if sys advisor is enabled, we trust the pools' ratio that sys advisor indicates,
	// else we sum up the containers' requests for each pool to get the pools' ratio
	var poolsQuantityMap map[string]int
	if p.enableCPUAdvisor {
		poolsQuantityMap = machine.ParseCPUAssignmentQuantityMap(entries.GetFilteredPoolsCPUSetMap(state.ResidentPools))
	} else {
		poolsQuantityMap = state.GetSharedQuantityMapFromPodEntries(entries, nil)
	}
	isolatedQuantityMap := state.GetIsolatedQuantityMapFromPodEntries(entries, nil)

	err := p.adjustPoolsAndIsolatedEntries(poolsQuantityMap, isolatedQuantityMap, entries, machineState)
	if err != nil {
		return fmt.Errorf("adjustPoolsAndIsolatedEntries failed with error: %v", err)
	}

	return nil
}

// adjustPoolsAndIsolatedEntries works through the following steps
// 1. calculate pools and isolated cpusets according to the expected quantities
// 2. make the reclaimed pool overlap with numa-binding containers
// 3. apply them to local state
// 4. clean pools
func (p *DynamicPolicy) adjustPoolsAndIsolatedEntries(poolsQuantityMap map[string]int,
	isolatedQuantityMap map[string]map[string]int, entries state.PodEntries, machineState state.NUMANodeMap,
) error {
	availableCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs, nil, state.CheckDedicatedNUMABinding)

	poolsCPUSet, isolatedCPUSet, err := p.generatePoolsAndIsolation(poolsQuantityMap, isolatedQuantityMap, availableCPUs)
	if err != nil {
		return fmt.Errorf("generatePoolsAndIsolation failed with error: %v", err)
	}

	err = p.reclaimOverlapNUMABinding(poolsCPUSet, entries)
	if err != nil {
		return fmt.Errorf("reclaimOverlapNUMABinding failed with error: %v", err)
	}

	err = p.applyPoolsAndIsolatedInfo(poolsCPUSet, isolatedCPUSet, entries, machineState)
	if err != nil {
		return fmt.Errorf("applyPoolsAndIsolatedInfo failed with error: %v", err)
	}

	err = p.cleanPools()
	if err != nil {
		return fmt.Errorf("cleanPools failed with error: %v", err)
	}

	return nil
}

// reclaimOverlapNUMABinding unions the calculated reclaim pool on empty NUMAs
// with the intersection of the previous reclaim pool and the cpusets of non-ramp-up dedicated_cores numa_binding containers
func (p *DynamicPolicy) reclaimOverlapNUMABinding(poolsCPUSet map[string]machine.CPUSet, entries state.PodEntries) error {
	// reclaimOverlapNUMABinding only works with cpu advisor and reclaim enabled
	if !(p.enableCPUAdvisor && p.dynamicConfig.GetDynamicConfiguration().EnableReclaim) {
		return nil
	}

	if entries.CheckPoolEmpty(state.PoolNameReclaim) {
		return fmt.Errorf("reclaim pool is missing in current entries")
	}

	curReclaimCPUSet := entries[state.PoolNameReclaim][advisorapi.FakedContainerName].AllocationResult.Clone()
	nonOverlapReclaimCPUSet := poolsCPUSet[state.PoolNameReclaim].Clone()
	general.Infof("curReclaimCPUSet: %s", curReclaimCPUSet.String())

	for _, containerEntries := range entries {
		if containerEntries.IsPoolEntry() {
			continue
		}

		for _, allocationInfo := range containerEntries {
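			// only main containers of non-ramp-up dedicated_cores numa_binding pods are considered;
			// the reclaim pool is allowed to overlap with their cpusets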
			if !(allocationInfo != nil && state.CheckDedicatedNUMABinding(allocationInfo) && allocationInfo.CheckMainContainer()) {
				continue
			} else if allocationInfo.RampUp {
				general.Infof("dedicated numa_binding pod: %s/%s container: %s is in ramp up, not to overlap reclaim pool with it",
					allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
				continue
			}

			poolsCPUSet[state.PoolNameReclaim] = poolsCPUSet[state.PoolNameReclaim].Union(curReclaimCPUSet.Intersection(allocationInfo.AllocationResult))
		}
	}

	if poolsCPUSet[state.PoolNameReclaim].IsEmpty() {
		return fmt.Errorf("reclaim pool is empty after overlapping with dedicated_cores numa_binding containers")
	}

	general.Infof("nonOverlapReclaimCPUSet: %s, finalReclaimCPUSet: %s", nonOverlapReclaimCPUSet.String(), poolsCPUSet[state.PoolNameReclaim].String())
	return nil
}

// applyPoolsAndIsolatedInfo generates the latest checkpoint by pools and isolated cpusets calculation results.
// 1. construct entries for isolated containers (probably dedicated_cores without numa_binding)
// 2. construct entries for all pools
// 3. construct entries for shared and reclaimed containers
func (p *DynamicPolicy) applyPoolsAndIsolatedInfo(poolsCPUSet map[string]machine.CPUSet,
	isolatedCPUSet map[string]map[string]machine.CPUSet, curEntries state.PodEntries, machineState state.NUMANodeMap,
) error {
	newPodEntries := make(state.PodEntries)
	unionDedicatedIsolatedCPUSet := machine.NewCPUSet()

	// walk through the isolated CPUSet map to store those pods/containers in pod entries
	for podUID, containerEntries := range isolatedCPUSet {
		for containerName, isolatedCPUs := range containerEntries {
			allocationInfo := curEntries[podUID][containerName]
			if allocationInfo == nil {
				general.Errorf("isolated pod: %s, container: %s without entry in current checkpoint", podUID, containerName)
				continue
			} else if !state.CheckDedicated(allocationInfo) || state.CheckNUMABinding(allocationInfo) {
				general.Errorf("isolated pod: %s, container: %s isn't dedicated_cores without NUMA binding", podUID, containerName)
				continue
			}

			topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, isolatedCPUs)
			if err != nil {
				general.ErrorS(err, "Unable to calculate topologyAwareAssignments",
					"podNamespace", allocationInfo.PodNamespace,
					"podName", allocationInfo.PodName,
					"containerName", allocationInfo.ContainerName,
					"result cpuset", isolatedCPUs.String())
				continue
			}

			general.InfoS("isolate info",
				"podNamespace", allocationInfo.PodNamespace,
				"podName", allocationInfo.PodName,
				"containerName", allocationInfo.ContainerName,
				"result cpuset", isolatedCPUs.String(),
				"result cpuset size", isolatedCPUs.Size(),
				"qosLevel", allocationInfo.QoSLevel)

			if newPodEntries[podUID] == nil {
				newPodEntries[podUID] = make(state.ContainerEntries)
			}

			newPodEntries[podUID][containerName] = allocationInfo.Clone()
			newPodEntries[podUID][containerName].OwnerPoolName = state.PoolNameDedicated
			newPodEntries[podUID][containerName].AllocationResult = isolatedCPUs.Clone()
			newPodEntries[podUID][containerName].OriginalAllocationResult = isolatedCPUs.Clone()
			newPodEntries[podUID][containerName].TopologyAwareAssignments = topologyAwareAssignments
			newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(topologyAwareAssignments)

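			// remember isolated cpus so that they can be excluded from the ramp-up cpuset below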
			unionDedicatedIsolatedCPUSet = unionDedicatedIsolatedCPUSet.Union(isolatedCPUs)
		}
	}

	if poolsCPUSet[state.PoolNameReclaim].IsEmpty() {
		return fmt.Errorf("entry: %s is empty", state.PoolNameReclaim)
	}

	// walk through the pools CPUSet map to store those pools in pod entries
	for poolName, cset := range poolsCPUSet {
		general.Infof("try to apply pool %s: %s", poolName, cset.String())

		topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, cset)
		if err != nil {
			return fmt.Errorf("unable to calculate topologyAwareAssignments for pool: %s, result cpuset: %s, error: %v",
				poolName, cset.String(), err)
		}

		allocationInfo := curEntries[poolName][advisorapi.FakedContainerName]
		if allocationInfo != nil {
			general.Infof("pool: %s allocation result transform from %s to %s",
				poolName, allocationInfo.AllocationResult.String(), cset.String())
		}

		if newPodEntries[poolName] == nil {
			newPodEntries[poolName] = make(state.ContainerEntries)
		}
		newPodEntries[poolName][advisorapi.FakedContainerName] = &state.AllocationInfo{
			PodUid: poolName,
			OwnerPoolName: poolName,
			AllocationResult: cset.Clone(),
			OriginalAllocationResult: cset.Clone(),
			TopologyAwareAssignments: topologyAwareAssignments,
			OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
		}

		_ = p.emitter.StoreInt64(util.MetricNamePoolSize, int64(cset.Size()), metrics.MetricTypeNameRaw,
			metrics.MetricTag{Key: "poolName", Val: poolName},
			metrics.MetricTag{Key: "pool_type", Val: state.GetPoolType(poolName)})
	}

	// rampUpCPUs includes the common reclaimed pool
	rampUpCPUs := machineState.GetFilteredAvailableCPUSet(p.reservedCPUs,
		nil, state.CheckDedicatedNUMABinding).Difference(unionDedicatedIsolatedCPUSet)
	rampUpCPUsTopologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, rampUpCPUs)
	if err != nil {
		return fmt.Errorf("unable to calculate topologyAwareAssignments for rampUpCPUs, result cpuset: %s, error: %v",
			rampUpCPUs.String(), err)
	}

	// walk through current pod entries to handle container-related entries (besides pooled entries)
	for podUID, containerEntries := range curEntries {
		if containerEntries.IsPoolEntry() {
			continue
		}

	containerLoop:
		for containerName, allocationInfo := range containerEntries {
			if allocationInfo == nil {
				general.Errorf("pod: %s, container: %s has nil allocationInfo", podUID, containerName)
				continue
			}

			if newPodEntries[podUID][containerName] != nil {
				// adapt to old checkpoints without the RequestQuantity property
				newPodEntries[podUID][containerName].RequestQuantity = state.GetContainerRequestedCores()(allocationInfo)
				general.Infof("pod: %s/%s, container: %s, qosLevel: %s is isolated, ignore original allocationInfo",
					allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, allocationInfo.QoSLevel)
				continue
			}

			if newPodEntries[podUID] == nil {
				newPodEntries[podUID] = make(state.ContainerEntries)
			}

			newPodEntries[podUID][containerName] = allocationInfo.Clone()
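			// re-home the container according to its QoS level: dedicated containers keep
			// their own result (or fall back), shared/reclaimed containers inherit their pool's cpuset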
			switch allocationInfo.QoSLevel {
			case apiconsts.PodAnnotationQoSLevelDedicatedCores:
				newPodEntries[podUID][containerName].OwnerPoolName = allocationInfo.GetPoolName()

				// for numa_binding containers, we just clone the checkpoint that already exists
				if state.CheckDedicatedNUMABinding(allocationInfo) {
					continue containerLoop
				}

				// this dedicated_cores without numa_binding container is not isolated, we will try to isolate it in the next adjustment.
				general.Warningf("pod: %s/%s, container: %s is dedicated_cores without numa_binding but not isolated, "+
					"we put it into the fallback pool: %s temporarily",
					allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, rampUpCPUs.String())

				newPodEntries[podUID][containerName].OwnerPoolName = state.PoolNameFallback
				newPodEntries[podUID][containerName].AllocationResult = rampUpCPUs.Clone()
				newPodEntries[podUID][containerName].OriginalAllocationResult = rampUpCPUs.Clone()
				newPodEntries[podUID][containerName].TopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
				newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)

			case apiconsts.PodAnnotationQoSLevelSharedCores, apiconsts.PodAnnotationQoSLevelReclaimedCores:
				ownerPoolName := allocationInfo.GetPoolName()

				if allocationInfo.RampUp {
					general.Infof("pod: %s/%s container: %s is in ramp up, set its allocation result from %s to rampUpCPUs: %s",
						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName,
						allocationInfo.AllocationResult.String(), rampUpCPUs.String())

					newPodEntries[podUID][containerName].OwnerPoolName = advisorapi.EmptyOwnerPoolName
					newPodEntries[podUID][containerName].AllocationResult = rampUpCPUs.Clone()
					newPodEntries[podUID][containerName].OriginalAllocationResult = rampUpCPUs.Clone()
					newPodEntries[podUID][containerName].TopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
					newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(rampUpCPUsTopologyAwareAssignments)
				} else if newPodEntries[ownerPoolName][advisorapi.FakedContainerName] == nil {
					general.Warningf("pod: %s/%s container: %s get owner pool: %s allocationInfo failed, reuse its allocation result: %s",
						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName,
						ownerPoolName, allocationInfo.AllocationResult.String())
				} else {
					poolEntry := newPodEntries[ownerPoolName][advisorapi.FakedContainerName]
					general.Infof("put pod: %s/%s container: %s to pool: %s, set its allocation result from %s to %s",
						allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName,
						ownerPoolName, allocationInfo.AllocationResult.String(), poolEntry.AllocationResult.String())

					newPodEntries[podUID][containerName].OwnerPoolName = ownerPoolName
					newPodEntries[podUID][containerName].AllocationResult = poolEntry.AllocationResult.Clone()
					newPodEntries[podUID][containerName].OriginalAllocationResult = poolEntry.OriginalAllocationResult.Clone()
					newPodEntries[podUID][containerName].TopologyAwareAssignments = machine.DeepcopyCPUAssignment(poolEntry.TopologyAwareAssignments)
					newPodEntries[podUID][containerName].OriginalTopologyAwareAssignments = machine.DeepcopyCPUAssignment(poolEntry.TopologyAwareAssignments)
				}
			default:
				return fmt.Errorf("invalid qosLevel: %s for pod: %s/%s container: %s",
					allocationInfo.QoSLevel, allocationInfo.PodNamespace,
					allocationInfo.PodName, allocationInfo.ContainerName)
			}
		}
	}

	// use the pod entries generated above to generate machine state info, and store it in local state
	machineState, err = generateMachineStateFromPodEntries(p.machineInfo.CPUTopology, newPodEntries)
	if err != nil {
		return fmt.Errorf("calculate machineState by newPodEntries failed with error: %v", err)
	}
	p.state.SetPodEntries(newPodEntries)
	p.state.SetMachineState(machineState)

	return nil
}

// generatePoolsAndIsolation is used to generate cpuset pools and isolated cpusets
// 1. allocate isolated cpusets for pods/containers, and divide the total cores proportionally if not all requests can be satisfied
// 2. use the remaining cores to allocate among different pools
// 3. apportion the reclaimed pool to other pools if reclaim is disabled
func (p *DynamicPolicy) generatePoolsAndIsolation(poolsQuantityMap map[string]int,
	isolatedQuantityMap map[string]map[string]int, availableCPUs machine.CPUSet) (poolsCPUSet map[string]machine.CPUSet,
	isolatedCPUSet map[string]map[string]machine.CPUSet, err error,
) {
	// clear pool entries with zero quantity
	for poolName, quantity := range poolsQuantityMap {
		if quantity == 0 {
			general.Warningf("pool: %s with 0 quantity, skip generating it", poolName)
			delete(poolsQuantityMap, poolName)
		}
	}

	// clear isolated entries with zero quantity
	for podUID, containerEntries := range isolatedQuantityMap {
		for containerName, quantity := range containerEntries {
			if quantity == 0 {
				general.Warningf("isolated pod: %s, container: %s with 0 quantity, skip generating it", podUID, containerName)
				delete(containerEntries, containerName)
			}
		}
		if len(containerEntries) == 0 {
			general.Warningf("isolated pod: %s has all container entries skipped", podUID)
			delete(isolatedQuantityMap, podUID)
		}
	}

	availableSize := availableCPUs.Size()

	poolsCPUSet = make(map[string]machine.CPUSet)
	poolsTotalQuantity := general.SumUpMapValues(poolsQuantityMap)

	isolatedCPUSet = make(map[string]map[string]machine.CPUSet)
	isolatedTotalQuantity := general.SumUpMultipleMapValues(isolatedQuantityMap)

	general.Infof("isolatedTotalQuantity: %d, poolsTotalQuantity: %d, availableSize: %d",
		isolatedTotalQuantity, poolsTotalQuantity, availableSize)

	var tErr error
	if poolsTotalQuantity+isolatedTotalQuantity <= availableSize {
		general.Infof("all pools and isolated containers could be allocated")

		isolatedCPUSet, availableCPUs, tErr = p.takeCPUsForContainers(isolatedQuantityMap, availableCPUs)
		if tErr != nil {
			err = fmt.Errorf("allocate isolated cpus for dedicated_cores failed with error: %v", tErr)
			return
		}

		poolsCPUSet, availableCPUs, tErr = p.takeCPUsForPools(poolsQuantityMap, availableCPUs)
		if tErr != nil {
			err = fmt.Errorf("allocate cpus for pools failed with error: %v", tErr)
			return
		}
	} else if poolsTotalQuantity <= availableSize {
		general.Infof("all pools could be allocated, all isolated containers would be put to pools")

		poolsCPUSet, availableCPUs, tErr = p.takeCPUsForPools(poolsQuantityMap, availableCPUs)
		if tErr != nil {
			err = fmt.Errorf("allocate cpus for pools failed with error: %v", tErr)
			return
		}
	} else if poolsTotalQuantity > 0 {
		general.Infof("can't allocate for all pools")

		totalProportionalPoolsQuantity := 0
		proportionalPoolsQuantityMap := make(map[string]int)

		for poolName, poolQuantity := range poolsQuantityMap {
			proportionalSize := general.Max(getProportionalSize(poolQuantity, poolsTotalQuantity, availableSize, true /*ceil*/), 1)
			proportionalPoolsQuantityMap[poolName] = proportionalSize
			totalProportionalPoolsQuantity += proportionalSize
		}

		poolNames := make([]string, 0, len(proportionalPoolsQuantityMap))

		for poolName := range proportionalPoolsQuantityMap {
			poolNames = append(poolNames, poolName)
		}

		sort.Slice(poolNames, func(x, y int) bool {
			// sort in descending order
			return proportionalPoolsQuantityMap[poolNames[x]] > proportionalPoolsQuantityMap[poolNames[y]]
		})

		// corner case: after the proportional division, the total count may exceed the available total
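		// trim one cpu at a time from the largest pools (keeping at least one cpu per pool)
		// until the total fits into availableSize or no further trimming is possible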
		for totalProportionalPoolsQuantity > availableSize {
			curTotalProportionalPoolsQuantity := totalProportionalPoolsQuantity

			for _, poolName := range poolNames {
				quantity := proportionalPoolsQuantityMap[poolName]

				if quantity > 1 && totalProportionalPoolsQuantity > 0 {
					quantity--
					totalProportionalPoolsQuantity--
					proportionalPoolsQuantityMap[poolName] = quantity

					if totalProportionalPoolsQuantity == availableSize {
						break
					}
				}
			}

			// availableSize can't guarantee at least one cpu for every pool
			if curTotalProportionalPoolsQuantity == totalProportionalPoolsQuantity {
				break
			}
		}

		general.Infof("poolsQuantityMap: %v, proportionalPoolsQuantityMap: %v, availableSize: %d",
			poolsQuantityMap, proportionalPoolsQuantityMap, availableSize)

		// availableSize can't guarantee at least one cpu for every pool,
		// so we make every pool equal to availableCPUs in this case.
		if totalProportionalPoolsQuantity > availableSize {
			for poolName := range poolsQuantityMap {
				poolsCPUSet[poolName] = availableCPUs.Clone()
			}
		} else {
			poolsCPUSet, availableCPUs, tErr = p.takeCPUsForPools(proportionalPoolsQuantityMap, availableCPUs)
			if tErr != nil {
				err = fmt.Errorf("allocate cpus for pools failed with error: %v", tErr)
				return
			}
		}
	}

	if poolsCPUSet[state.PoolNameReserve].IsEmpty() {
		poolsCPUSet[state.PoolNameReserve] = p.reservedCPUs.Clone()
		general.Infof("set pool %s:%s", state.PoolNameReserve, poolsCPUSet[state.PoolNameReserve].String())
	} else {
		err = fmt.Errorf("static pool %s result: %s is generated dynamically", state.PoolNameReserve, poolsCPUSet[state.PoolNameReserve].String())
		return
	}

	poolsCPUSet[state.PoolNameReclaim] = poolsCPUSet[state.PoolNameReclaim].Union(availableCPUs)
	if poolsCPUSet[state.PoolNameReclaim].IsEmpty() {
		// for the reclaimed pool, we must make it exist when the node isn't in hybrid mode, even if that causes overlap
		allAvailableCPUs := p.machineInfo.CPUDetails.CPUs().Difference(p.reservedCPUs)
		reclaimedCPUSet, _, tErr := calculator.TakeByNUMABalance(p.machineInfo, allAvailableCPUs, reservedReclaimedCPUsSize)
		if tErr != nil {
			err = fmt.Errorf("fallback takeByNUMABalance failed in generatePoolsAndIsolation for reclaimedCPUSet with error: %v", tErr)
			return
		}

		general.Infof("fallback takeByNUMABalance in generatePoolsAndIsolation for reclaimedCPUSet: %s", reclaimedCPUSet.String())
		poolsCPUSet[state.PoolNameReclaim] = reclaimedCPUSet
	}

	enableReclaim := p.dynamicConfig.GetDynamicConfiguration().EnableReclaim
	if !enableReclaim && poolsCPUSet[state.PoolNameReclaim].Size() > reservedReclaimedCPUsSize {
		poolsCPUSet[state.PoolNameReclaim] = p.apportionReclaimedPool(poolsCPUSet, poolsCPUSet[state.PoolNameReclaim].Clone())
		general.Infof("apportionReclaimedPool finished, current %s pool: %s",
			state.PoolNameReclaim, poolsCPUSet[state.PoolNameReclaim].String())
	}

	return
}

// apportionReclaimedPool tries to allocate reclaimed cores to non-reclaimed pools.
// if reclaim is disabled on the current node, this can be used as a downgrade strategy
// to disable reclaimed workloads in an emergency.
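// cores are spread proportionally to each pool's current size, aiming to keep reservedReclaimedCPUsSize cores in the reclaim pool.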
func (p *DynamicPolicy) apportionReclaimedPool(poolsCPUSet map[string]machine.CPUSet, reclaimedCPUs machine.CPUSet) machine.CPUSet {
	totalSize := 0
	for poolName, poolCPUs := range poolsCPUSet {
		if state.ResidentPools.Has(poolName) {
			continue
		}
		totalSize += poolCPUs.Size()
	}

	availableSize := reclaimedCPUs.Size() - reservedReclaimedCPUsSize
	if availableSize <= 0 || totalSize == 0 {
		return reclaimedCPUs
	}

	for poolName, poolCPUs := range poolsCPUSet {
		if state.ResidentPools.Has(poolName) {
			continue
		}
		proportionalSize := general.Max(getProportionalSize(poolCPUs.Size(), totalSize, availableSize, false /*ceil*/), 1)

		var err error
		var cpuset machine.CPUSet
		cpuset, reclaimedCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, reclaimedCPUs, proportionalSize)
		if err != nil {
			general.Errorf("take %d cpus from reclaimedCPUs: %s, size: %d failed with error: %v",
				proportionalSize, reclaimedCPUs.String(), reclaimedCPUs.Size(), err)
			return reclaimedCPUs
		}

		poolsCPUSet[poolName] = poolCPUs.Union(cpuset)
		general.Infof("take %s to %s; prev: %s, current: %s", cpuset.String(), poolName, poolCPUs.String(), poolsCPUSet[poolName].String())

		if reclaimedCPUs.Size() <= reservedReclaimedCPUsSize {
			break
		}
	}

	return reclaimedCPUs
}

// takeCPUsForPools tries to allocate a cpuset for each given pool,
// and it will consider the total available cpuset during calculation.
// the returned value includes the cpuset pool map and the remaining available cpuset.
func (p *DynamicPolicy) takeCPUsForPools(poolsQuantityMap map[string]int,
	availableCPUs machine.CPUSet,
) (map[string]machine.CPUSet, machine.CPUSet, error) {
	poolsCPUSet := make(map[string]machine.CPUSet)
	clonedAvailableCPUs := availableCPUs.Clone()

	// iterate in sorted order to avoid generating pools from a random map-iteration sequence
	sortedPoolNames := general.GetSortedMapKeys(poolsQuantityMap)
	for _, poolName := range sortedPoolNames {
		req := poolsQuantityMap[poolName]
		general.Infof("allocated for pool: %s with req: %d", poolName, req)

		var err error
		var cset machine.CPUSet
		cset, availableCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, req)
		if err != nil {
			return nil, clonedAvailableCPUs, fmt.Errorf("take cpu for pool: %s of req: %d failed with error: %v",
				poolName, req, err)
		}
		poolsCPUSet[poolName] = cset
	}
	return poolsCPUSet, availableCPUs, nil
}

// takeCPUsForContainers tries to allocate cpusets for the given pod/container combinations,
// and it will consider the total available cpuset during calculation.
// the returned value includes the cpuset map for pod/container combinations and the remaining available cpuset.
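// on failure the untouched (cloned) available cpuset is returned so the caller's view stays consistent.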
func (p *DynamicPolicy) takeCPUsForContainers(containersQuantityMap map[string]map[string]int,
	availableCPUs machine.CPUSet,
) (map[string]map[string]machine.CPUSet, machine.CPUSet, error) {
	containersCPUSet := make(map[string]map[string]machine.CPUSet)
	clonedAvailableCPUs := availableCPUs.Clone()

	for podUID, containerQuantities := range containersQuantityMap {
		if len(containerQuantities) > 0 {
			containersCPUSet[podUID] = make(map[string]machine.CPUSet)
		}

		for containerName, quantity := range containerQuantities {
			general.Infof("allocated for pod: %s container: %s with req: %d", podUID, containerName, quantity)

			var err error
			var cset machine.CPUSet
			cset, availableCPUs, err = calculator.TakeByNUMABalance(p.machineInfo, availableCPUs, quantity)
			if err != nil {
				return nil, clonedAvailableCPUs, fmt.Errorf("take cpu for pod: %s container: %s of req: %d failed with error: %v",
					podUID, containerName, quantity, err)
			}
			containersCPUSet[podUID][containerName] = cset
		}
	}
	return containersCPUSet, availableCPUs, nil
}

func (p *DynamicPolicy) shouldSharedCoresRampUp(podUID string) bool {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	pod, err := p.metaServer.GetPod(ctx, podUID)

	if err != nil {
		general.Errorf("get pod failed with error: %v, try to ramp it up", err)
		return true
	} else if pod == nil {
		general.Infof("can't get pod: %s from metaServer, try to ramp it up", podUID)
		return true
	} else if !native.PodIsPending(pod) {
		general.Infof("pod: %s/%s isn't pending (not admitted for the first time), don't ramp it up", pod.Namespace, pod.Name)
		return false
	} else {
		general.Infof("pod: %s/%s isn't active, try to ramp it up", pod.Namespace, pod.Name)
		return true
	}
}

func (p *DynamicPolicy) doAndCheckPutAllocationInfo(allocationInfo *state.AllocationInfo, incrByReq bool) error {
	if allocationInfo == nil {
		return fmt.Errorf("doAndCheckPutAllocationInfo got nil allocationInfo")
	}

	// need to adjust pools; putAllocationsAndAdjustAllocationEntries will set the allocationInfo after it is adjusted
	err := p.putAllocationsAndAdjustAllocationEntries([]*state.AllocationInfo{allocationInfo}, incrByReq)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s putAllocationsAndAdjustAllocationEntries failed with error: %v",
			allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName, err)
		return fmt.Errorf("putAllocationsAndAdjustAllocationEntries failed with error: %v", err)
	}

	checkedAllocationInfo := p.state.GetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName)
	if checkedAllocationInfo == nil {
		general.Errorf("pod: %s/%s, container: %s get nil allocationInfo after putAllocationsAndAdjustAllocationEntries",
			allocationInfo.PodNamespace, allocationInfo.PodName, allocationInfo.ContainerName)
		return fmt.Errorf("got nil allocationInfo after putAllocationsAndAdjustAllocationEntries")
	}

	return nil
}