github.com/kubewharf/katalyst-core@v0.5.3/pkg/agent/qrm-plugins/cpu/nativepolicy/policy_allocation_handlers.go

/*
Copyright 2022 The Katalyst Authors.
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nativepolicy

import (
	"context"
	"errors"
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
	pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"

	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/state"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/nativepolicy/calculator"
	nativepolicyutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/nativepolicy/util"
	cpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/util"
	"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
	"github.com/kubewharf/katalyst-core/pkg/util/general"
	"github.com/kubewharf/katalyst-core/pkg/util/machine"
)

const (
	// ErrorSMTAlignment represents the type of an SMTAlignmentError.
	ErrorSMTAlignment = "SMTAlignmentError"
)

// SMTAlignmentError represents an error due to SMT alignment
type SMTAlignmentError struct {
	RequestedCPUs int
	CpusPerCore   int
}

func (e SMTAlignmentError) Error() string {
	return fmt.Sprintf("SMT Alignment Error: requested %d cpus not a multiple of cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}

func (e SMTAlignmentError) Type() string {
	return ErrorSMTAlignment
}
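// exampleCheckSMTAlignment is a hedged, illustrative helper, not part of the
// original policy: it reproduces the guard applied by
// dedicatedCoresAllocationHandler below. On a hypothetical machine with 2
// hardware threads per physical core, a request for 3 CPUs yields an
// SMTAlignmentError, while a request for 4 CPUs passes.
func exampleCheckSMTAlignment(requestedCPUs, cpusPerCore int) error {
	// Reject any request that cannot occupy whole physical cores.
	if requestedCPUs%cpusPerCore != 0 {
		return SMTAlignmentError{
			RequestedCPUs: requestedCPUs,
			CpusPerCore:   cpusPerCore,
		}
	}
	return nil
}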
func (p *NativePolicy) dedicatedCoresAllocationHandler(_ context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("dedicatedCoresAllocationHandler got nil req")
	}

	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
	}

	if p.enableFullPhysicalCPUsOnly && ((reqInt % p.machineInfo.CPUsPerCore()) != 0) {
		// Since the CPU plugin has been enabled with strict SMT alignment, a guaranteed pod can only be
		// admitted if the number of CPUs requested is a multiple of the number of virtual cpus per physical
		// core. If the CPU request is not such a multiple, the pod will be put in the Failed state, with
		// SMTAlignmentError as the reason. Since allocation happens in terms of physical cores, and the
		// scheduler is responsible for ensuring that the workload goes to a node that has enough CPUs, the
		// pod would be placed on a node where there are enough physical cores available to be allocated.
		// Just like the behavior of the static policy, takeByTopology will first try to allocate CPUs from
		// the same socket, and only when the request cannot be satisfied on a single socket is the
		// allocation spread further; either way a workload occupies all CPUs on each physical core it is
		// given, so allocation of individual threads never has to occur.
		return nil, SMTAlignmentError{
			RequestedCPUs: reqInt,
			CpusPerCore:   p.machineInfo.CPUsPerCore(),
		}
	}

	machineState := p.state.GetMachineState()

	// Allocate CPUs according to the NUMA affinity contained in the hint.
	result, err := p.allocateCPUs(machineState, reqInt, req.Hint, p.cpusToReuse[req.PodUid])
	if err != nil {
		general.ErrorS(err, "unable to allocate CPUs",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUs", reqInt)
		return nil, err
	}

	general.InfoS("allocate CPUs successfully",
		"podNamespace", req.PodNamespace,
		"podName", req.PodName,
		"containerName", req.ContainerName,
		"numCPUs", reqInt,
		"result", result.String())

	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, result)
	if err != nil {
		general.ErrorS(err, "unable to calculate topologyAwareAssignments",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUs", reqInt,
			"cpuset", result.String())
		return nil, err
	}

	allocationInfo := &state.AllocationInfo{
		PodUid:                           req.PodUid,
		PodNamespace:                     req.PodNamespace,
		PodName:                          req.PodName,
		ContainerName:                    req.ContainerName,
		ContainerType:                    req.ContainerType.String(),
		ContainerIndex:                   req.ContainerIndex,
		PodType:                          req.PodType,
		OwnerPoolName:                    state.PoolNameDedicated,
		AllocationResult:                 result.Clone(),
		OriginalAllocationResult:         result.Clone(),
		TopologyAwareAssignments:         topologyAwareAssignments,
		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
		InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
		Labels:                           general.DeepCopyMap(req.Labels),
		Annotations:                      general.DeepCopyMap(req.Annotations),
		RequestQuantity:                  reqFloat64,
	}

	// Update pod entries directly. If any of the subsequent steps fails, the current allocationInfo
	// will be removed from podEntries by the deferred cleanup in the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries := p.state.GetPodEntries()

	updatedMachineState, err := nativepolicyutil.GenerateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s PackResourceAllocationResponseByAllocationInfo failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}
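// A hedged usage sketch for the handler above; the literal values are
// invented for illustration, and only fields that this file itself reads
// from pluginapi.ResourceRequest are populated. The CPU quantity is assumed
// to travel in the request's resource list, which
// util.GetQuantityFromResourceReq parses into reqInt/reqFloat64.
//
//	req := &pluginapi.ResourceRequest{
//		PodUid:        "pod-uid-1", // hypothetical
//		PodNamespace:  "default",
//		PodName:       "demo",
//		ContainerName: "main",
//		Hint:          &pluginapi.TopologyHint{Nodes: []uint64{0}, Preferred: true},
//	}
//	resp, err := p.dedicatedCoresAllocationHandler(context.Background(), req)
//	// On success, resp carries the pinned cpuset as the OCI property
//	// named by util.OCIPropertyNameCPUSetCPUs.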
func (p *NativePolicy) sharedPoolAllocationHandler(ctx context.Context,
	req *pluginapi.ResourceRequest,
) (*pluginapi.ResourceAllocationResponse, error) {
	if req == nil {
		return nil, fmt.Errorf("sharedPoolAllocationHandler got nil req")
	}

	reqInt, reqFloat64, err := util.GetQuantityFromResourceReq(req)
	if err != nil {
		return nil, fmt.Errorf("GetQuantityFromResourceReq failed with error: %v", err)
	}

	defaultCPUSet := p.state.GetMachineState().GetDefaultCPUSet()
	if defaultCPUSet.IsEmpty() {
		return nil, errors.New("default cpuset is empty")
	}

	general.InfoS("allocate default cpuset successfully",
		"podNamespace", req.PodNamespace,
		"podName", req.PodName,
		"containerName", req.ContainerName,
		"numCPUs", reqInt,
		"result", defaultCPUSet.String())

	topologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, defaultCPUSet)
	if err != nil {
		general.ErrorS(err, "unable to calculate topologyAwareAssignments",
			"podNamespace", req.PodNamespace,
			"podName", req.PodName,
			"containerName", req.ContainerName,
			"numCPUs", reqInt,
			"cpuset", defaultCPUSet.String())
		return nil, err
	}

	allocationInfo := &state.AllocationInfo{
		PodUid:                           req.PodUid,
		PodNamespace:                     req.PodNamespace,
		PodName:                          req.PodName,
		ContainerName:                    req.ContainerName,
		ContainerType:                    req.ContainerType.String(),
		ContainerIndex:                   req.ContainerIndex,
		PodType:                          req.PodType,
		OwnerPoolName:                    state.PoolNameShare,
		AllocationResult:                 defaultCPUSet.Clone(),
		OriginalAllocationResult:         defaultCPUSet.Clone(),
		TopologyAwareAssignments:         topologyAwareAssignments,
		OriginalTopologyAwareAssignments: machine.DeepcopyCPUAssignment(topologyAwareAssignments),
		InitTimestamp:                    time.Now().Format(util.QRMTimeFormat),
		Labels:                           general.DeepCopyMap(req.Labels),
		Annotations:                      general.DeepCopyMap(req.Annotations),
		RequestQuantity:                  reqFloat64,
	}

	// Update pod entries directly. If any of the subsequent steps fails, the current allocationInfo
	// will be removed from podEntries by the deferred cleanup in the allocation function.
	p.state.SetAllocationInfo(allocationInfo.PodUid, allocationInfo.ContainerName, allocationInfo)
	podEntries := p.state.GetPodEntries()

	updatedMachineState, err := nativepolicyutil.GenerateMachineStateFromPodEntries(p.machineInfo.CPUTopology, podEntries)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s GenerateMachineStateFromPodEntries failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("GenerateMachineStateFromPodEntries failed with error: %v", err)
	}
	p.state.SetMachineState(updatedMachineState)

	resp, err := cpuutil.PackAllocationResponse(allocationInfo, string(v1.ResourceCPU), util.OCIPropertyNameCPUSetCPUs, false, true, req)
	if err != nil {
		general.Errorf("pod: %s/%s, container: %s PackResourceAllocationResponseByAllocationInfo failed with error: %v",
			req.PodNamespace, req.PodName, req.ContainerName, err)
		return nil, fmt.Errorf("PackResourceAllocationResponseByAllocationInfo failed with error: %v", err)
	}
	return resp, nil
}
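// Note the contrast with the dedicated handler above: no CPUs are carved out
// here. Every shared-pool container is pinned to the node's default cpuset,
// so all shared containers float over the same CPUs. A hedged sketch with
// invented cpusets:
//
//	// default cpuset left after reservations and dedicated pods: 2-7
//	// shared container A -> AllocationResult 2-7
//	// shared container B -> AllocationResult 2-7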
func (p *NativePolicy) allocateCPUs(machineState state.NUMANodeMap, numCPUs int, hint *pluginapi.TopologyHint, reusableCPUs machine.CPUSet) (machine.CPUSet, error) {
	klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "hint", hint)

	allocatableCPUs := machineState.GetAvailableCPUSet(p.reservedCPUs).Union(reusableCPUs)

	// If there are aligned CPUs in numaAffinity, attempt to take those first.
	result := machine.NewCPUSet()
	if hint != nil {
		alignedCPUs := machine.NewCPUSet()
		for _, numaNode := range hint.Nodes {
			alignedCPUs = alignedCPUs.Union(machineState[int(numaNode)].GetAvailableCPUSet(p.reservedCPUs))
		}

		numAlignedToAlloc := alignedCPUs.Size()
		if numCPUs < numAlignedToAlloc {
			numAlignedToAlloc = numCPUs
		}

		alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
		if err != nil {
			return machine.NewCPUSet(), err
		}

		result = result.Union(alignedCPUs)
	}

	// Get any remaining CPUs from what's left over after attempting to grab aligned ones.
	remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
	if err != nil {
		return machine.NewCPUSet(), err
	}
	result = result.Union(remainingCPUs)

	klog.InfoS("AllocateCPUs", "result", result)
	return result, nil
}

func (p *NativePolicy) takeByTopology(availableCPUs machine.CPUSet, numCPUs int) (machine.CPUSet, error) {
	if p.cpuAllocationOption == nativepolicyutil.CPUResourcePluginNativePolicyAllocationOptionDistributed {
		cpuGroupSize := 1
		if p.enableFullPhysicalCPUsOnly {
			cpuGroupSize = p.machineInfo.CPUsPerCore()
		}
		return calculator.TakeByTopologyNUMADistributed(p.machineInfo.CPUTopology, availableCPUs, numCPUs, cpuGroupSize)
	}
	return calculator.TakeByTopologyNUMAPacked(p.machineInfo.CPUTopology, availableCPUs, numCPUs)
}
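// A hedged illustration of the two allocation strategies above; the topology
// and numbers are invented. Assume 2 NUMA nodes with CPUs 0-3 on node 0 and
// 4-7 on node 1, 2 hardware threads per core, a request for 4 CPUs, and no
// reusable CPUs:
//
//	// hint.Nodes = [0]: the aligned pass draws all 4 CPUs from NUMA node 0,
//	// so the request is already satisfied -> result 0-3
//	// packed (TakeByTopologyNUMAPacked), no hint: fills whole cores on one
//	// NUMA node first -> e.g. 0-3
//	// distributed (TakeByTopologyNUMADistributed, cpuGroupSize=2 when
//	// enableFullPhysicalCPUsOnly): spreads whole cores evenly across NUMA
//	// nodes -> e.g. 0,1,4,5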
func (p *NativePolicy) updateCPUsToReuse(req *pluginapi.ResourceRequest, cset machine.CPUSet) {
	// If entries for pods other than the current one exist in p.cpusToReuse, delete them.
	for podUID := range p.cpusToReuse {
		if podUID != req.PodUid {
			delete(p.cpusToReuse, podUID)
		}
	}
	// If no cpuset exists in cpusToReuse for this pod yet, create one.
	if _, ok := p.cpusToReuse[req.PodUid]; !ok {
		p.cpusToReuse[req.PodUid] = machine.NewCPUSet()
	}
	// Check if the container is an init container.
	// If so, add its cpuset to the set of CPUs reusable by any new allocations.
	if req.ContainerType == pluginapi.ContainerType_INIT {
		p.cpusToReuse[req.PodUid] = p.cpusToReuse[req.PodUid].Union(cset)
		return
	}
	// Otherwise it is an app container.
	// Remove its cpuset from the set of CPUs reusable by any new allocations.
	p.cpusToReuse[req.PodUid] = p.cpusToReuse[req.PodUid].Difference(cset)
}
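// exampleReuseLifecycle is a hedged, illustrative helper, not part of the
// original policy: it replays the bookkeeping performed by updateCPUsToReuse
// for a hypothetical pod whose init container was given CPUs 0-1 and whose
// app container was then given CPUs 0-3. Init containers run to completion
// before app containers start, so their CPUs can be offered for reuse.
func exampleReuseLifecycle() {
	reuse := machine.NewCPUSet()
	// The init container finishes: its CPUs become reusable.
	reuse = reuse.Union(machine.NewCPUSet(0, 1)) // reuse -> 0-1
	// The app container is allocated CPUs 0-3: they stop being reusable.
	reuse = reuse.Difference(machine.NewCPUSet(0, 1, 2, 3)) // reuse -> empty
	fmt.Println(reuse.String())
}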