volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/numaaware/numaaware.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package numaaware 18 19 import ( 20 "context" 21 "fmt" 22 "sync" 23 24 v1 "k8s.io/api/core/v1" 25 "k8s.io/client-go/util/workqueue" 26 "k8s.io/klog/v2" 27 v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" 28 "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology" 29 "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" 30 "k8s.io/utils/cpuset" 31 32 nodeinfov1alpha1 "volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1" 33 34 "volcano.sh/volcano/pkg/scheduler/api" 35 "volcano.sh/volcano/pkg/scheduler/framework" 36 "volcano.sh/volcano/pkg/scheduler/plugins/numaaware/policy" 37 "volcano.sh/volcano/pkg/scheduler/plugins/numaaware/provider/cpumanager" 38 "volcano.sh/volcano/pkg/scheduler/plugins/util" 39 ) 40 41 const ( 42 // PluginName indicates name of volcano scheduler plugin. 43 PluginName = "numa-aware" 44 // NumaTopoWeight indicates the weight of numa-aware plugin. 45 NumaTopoWeight = "weight" 46 ) 47 48 type numaPlugin struct { 49 sync.Mutex 50 // Arguments given for the plugin 51 pluginArguments framework.Arguments 52 hintProviders []policy.HintProvider 53 assignRes map[api.TaskID]map[string]api.ResNumaSets // map[taskUID]map[nodename][resourceName]cpuset.CPUSet 54 nodeResSets map[string]api.ResNumaSets // map[nodename][resourceName]cpuset.CPUSet 55 taskBindNodeMap map[api.TaskID]string 56 } 57 58 // New function returns prioritize plugin object. 59 func New(arguments framework.Arguments) framework.Plugin { 60 plugin := &numaPlugin{ 61 pluginArguments: arguments, 62 assignRes: make(map[api.TaskID]map[string]api.ResNumaSets), 63 taskBindNodeMap: make(map[api.TaskID]string), 64 } 65 66 plugin.hintProviders = append(plugin.hintProviders, cpumanager.NewProvider()) 67 return plugin 68 } 69 70 func (pp *numaPlugin) Name() string { 71 return PluginName 72 } 73 74 func calculateWeight(args framework.Arguments) int { 75 weight := 1 76 args.GetInt(&weight, NumaTopoWeight) 77 return weight 78 } 79 80 func (pp *numaPlugin) OnSessionOpen(ssn *framework.Session) { 81 weight := calculateWeight(pp.pluginArguments) 82 numaNodes := api.GenerateNumaNodes(ssn.Nodes) 83 pp.nodeResSets = api.GenerateNodeResNumaSets(ssn.Nodes) 84 85 ssn.AddEventHandler(&framework.EventHandler{ 86 AllocateFunc: func(event *framework.Event) { 87 node := pp.nodeResSets[event.Task.NodeName] 88 if _, ok := pp.assignRes[event.Task.UID]; !ok { 89 return 90 } 91 92 resNumaSets, ok := pp.assignRes[event.Task.UID][event.Task.NodeName] 93 if !ok { 94 return 95 } 96 97 node.Allocate(resNumaSets) 98 pp.taskBindNodeMap[event.Task.UID] = event.Task.NodeName 99 }, 100 DeallocateFunc: func(event *framework.Event) { 101 node := pp.nodeResSets[event.Task.NodeName] 102 if _, ok := pp.assignRes[event.Task.UID]; !ok { 103 return 104 } 105 106 resNumaSets, ok := pp.assignRes[event.Task.UID][event.Task.NodeName] 107 if !ok { 108 return 109 } 110 111 delete(pp.taskBindNodeMap, event.Task.UID) 112 node.Release(resNumaSets) 113 }, 114 }) 115 116 predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { 117 predicateStatus := make([]*api.Status, 0) 118 numaStatus := &api.Status{} 119 if v1qos.GetPodQOS(task.Pod) != v1.PodQOSGuaranteed { 120 klog.V(3).Infof("task %s isn't Guaranteed pod", task.Name) 121 return predicateStatus, nil 122 } 123 124 if fit, err := filterNodeByPolicy(task, node, pp.nodeResSets); !fit { 125 return predicateStatus, err 126 } 127 128 resNumaSets := pp.nodeResSets[node.Name].Clone() 129 130 taskPolicy := policy.GetPolicy(node, numaNodes[node.Name]) 131 allResAssignMap := make(map[string]cpuset.CPUSet) 132 for _, container := range task.Pod.Spec.Containers { 133 providersHints := policy.AccumulateProvidersHints(&container, node.NumaSchedulerInfo, resNumaSets, pp.hintProviders) 134 hit, admit := taskPolicy.Predicate(providersHints) 135 if !admit { 136 numaStatus.Code = api.UnschedulableAndUnresolvable 137 numaStatus.Reason = fmt.Sprintf("plugin %s predicates failed for task %s container %s on node %s", 138 pp.Name(), task.Name, container.Name, node.Name) 139 predicateStatus = append(predicateStatus, numaStatus) 140 return predicateStatus, fmt.Errorf("plugin %s predicates failed for task %s container %s on node %s", 141 pp.Name(), task.Name, container.Name, node.Name) 142 } 143 144 klog.V(4).Infof("[numaaware] hits for task %s container '%v': %v on node %s, besthit: %v", 145 task.Name, container.Name, providersHints, node.Name, hit) 146 resAssignMap := policy.Allocate(&container, &hit, node.NumaSchedulerInfo, resNumaSets, pp.hintProviders) 147 for resName, assign := range resAssignMap { 148 allResAssignMap[resName] = allResAssignMap[resName].Union(assign) 149 resNumaSets[resName] = resNumaSets[resName].Difference(assign) 150 } 151 } 152 153 pp.Lock() 154 defer pp.Unlock() 155 if _, ok := pp.assignRes[task.UID]; !ok { 156 pp.assignRes[task.UID] = make(map[string]api.ResNumaSets) 157 } 158 159 pp.assignRes[task.UID][node.Name] = allResAssignMap 160 161 klog.V(4).Infof(" task %s's on node<%s> resAssignMap: %v", 162 task.Name, node.Name, pp.assignRes[task.UID][node.Name]) 163 164 numaStatus.Code = api.Success 165 predicateStatus = append(predicateStatus, numaStatus) 166 return predicateStatus, nil 167 } 168 169 ssn.AddPredicateFn(pp.Name(), predicateFn) 170 171 batchNodeOrderFn := func(task *api.TaskInfo, nodeInfo []*api.NodeInfo) (map[string]float64, error) { 172 nodeScores := make(map[string]float64, len(nodeInfo)) 173 if task.NumaInfo == nil || task.NumaInfo.Policy == "" || task.NumaInfo.Policy == "none" { 174 return nodeScores, nil 175 } 176 177 if _, found := pp.assignRes[task.UID]; !found { 178 return nodeScores, nil 179 } 180 181 scoreList := getNodeNumaNumForTask(nodeInfo, pp.assignRes[task.UID]) 182 util.NormalizeScore(api.DefaultMaxNodeScore, true, scoreList) 183 184 for idx, scoreNode := range scoreList { 185 scoreNode.Score *= int64(weight) 186 nodeName := nodeInfo[idx].Name 187 nodeScores[nodeName] = float64(scoreNode.Score) 188 } 189 190 klog.V(4).Infof("numa-aware plugin Score for task %s/%s is: %v", 191 task.Namespace, task.Name, nodeScores) 192 return nodeScores, nil 193 } 194 195 ssn.AddBatchNodeOrderFn(pp.Name(), batchNodeOrderFn) 196 } 197 198 func filterNodeByPolicy(task *api.TaskInfo, node *api.NodeInfo, nodeResSets map[string]api.ResNumaSets) (fit bool, err error) { 199 if !(task.NumaInfo == nil || task.NumaInfo.Policy == "" || task.NumaInfo.Policy == "none") { 200 if node.NumaSchedulerInfo == nil { 201 return false, fmt.Errorf("numa info is empty") 202 } 203 204 if node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.CPUManagerPolicy] != "static" { 205 return false, fmt.Errorf("cpu manager policy isn't static") 206 } 207 208 if task.NumaInfo.Policy != node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy] { 209 return false, fmt.Errorf("task topology polocy[%s] is different with node[%s]", 210 task.NumaInfo.Policy, node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy]) 211 } 212 213 if _, ok := nodeResSets[node.Name]; !ok { 214 return false, fmt.Errorf("no topo information") 215 } 216 217 if nodeResSets[node.Name][string(v1.ResourceCPU)].Size() == 0 { 218 return false, fmt.Errorf("cpu allocatable map is empty") 219 } 220 } else { 221 if node.NumaSchedulerInfo == nil { 222 return false, nil 223 } 224 225 if node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.CPUManagerPolicy] != "static" { 226 return false, nil 227 } 228 229 if (node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy] == "none") || 230 (node.NumaSchedulerInfo.Policies[nodeinfov1alpha1.TopologyManagerPolicy] == "") { 231 return false, nil 232 } 233 } 234 235 return true, nil 236 } 237 238 func getNodeNumaNumForTask(nodeInfo []*api.NodeInfo, resAssignMap map[string]api.ResNumaSets) []api.ScoredNode { 239 nodeNumaCnts := make([]api.ScoredNode, len(nodeInfo)) 240 workqueue.ParallelizeUntil(context.TODO(), 16, len(nodeInfo), func(index int) { 241 node := nodeInfo[index] 242 assignCpus := resAssignMap[node.Name][string(v1.ResourceCPU)] 243 nodeNumaCnts[index] = api.ScoredNode{ 244 NodeName: node.Name, 245 Score: int64(getNumaNodeCntForCPUID(assignCpus, node.NumaSchedulerInfo.CPUDetail)), 246 } 247 }) 248 249 return nodeNumaCnts 250 } 251 252 func getNumaNodeCntForCPUID(cpus cpuset.CPUSet, cpuDetails topology.CPUDetails) int { 253 mask, _ := bitmask.NewBitMask() 254 s := cpus.List() 255 256 for _, cpuID := range s { 257 mask.Add(cpuDetails[cpuID].NUMANodeID) 258 } 259 260 return mask.Count() 261 } 262 263 func (pp *numaPlugin) OnSessionClose(ssn *framework.Session) { 264 if len(pp.taskBindNodeMap) == 0 { 265 return 266 } 267 268 allocatedResSet := make(map[string]api.ResNumaSets) 269 for taskID, nodeName := range pp.taskBindNodeMap { 270 if _, existed := pp.assignRes[taskID]; !existed { 271 continue 272 } 273 274 if _, existed := pp.assignRes[taskID][nodeName]; !existed { 275 continue 276 } 277 278 if _, existed := allocatedResSet[nodeName]; !existed { 279 allocatedResSet[nodeName] = make(api.ResNumaSets) 280 } 281 282 resSet := pp.assignRes[taskID][nodeName] 283 for resName, set := range resSet { 284 if _, existed := allocatedResSet[nodeName][resName]; !existed { 285 allocatedResSet[nodeName][resName] = cpuset.New() 286 } 287 288 allocatedResSet[nodeName][resName] = allocatedResSet[nodeName][resName].Union(set) 289 } 290 } 291 292 klog.V(4).Infof("[numaPlugin]allocatedResSet: %v", allocatedResSet) 293 ssn.UpdateSchedulerNumaInfo(allocatedResSet) 294 }