volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/nodeorder/nodeorder.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package nodeorder 18 19 import ( 20 "context" 21 "fmt" 22 23 v1 "k8s.io/api/core/v1" 24 utilFeature "k8s.io/apiserver/pkg/util/feature" 25 "k8s.io/client-go/util/workqueue" 26 "k8s.io/klog/v2" 27 "k8s.io/kubernetes/pkg/features" 28 "k8s.io/kubernetes/pkg/scheduler/apis/config" 29 k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" 30 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" 31 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality" 32 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity" 33 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity" 34 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" 35 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread" 36 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration" 37 38 "volcano.sh/volcano/pkg/scheduler/api" 39 "volcano.sh/volcano/pkg/scheduler/framework" 40 "volcano.sh/volcano/pkg/scheduler/plugins/util/k8s" 41 ) 42 43 const ( 44 // PluginName indicates name of volcano scheduler plugin. 45 PluginName = "nodeorder" 46 47 // NodeAffinityWeight is the key for providing Node Affinity Priority Weight in YAML 48 NodeAffinityWeight = "nodeaffinity.weight" 49 // PodAffinityWeight is the key for providing Pod Affinity Priority Weight in YAML 50 PodAffinityWeight = "podaffinity.weight" 51 // LeastRequestedWeight is the key for providing Least Requested Priority Weight in YAML 52 LeastRequestedWeight = "leastrequested.weight" 53 // BalancedResourceWeight is the key for providing Balanced Resource Priority Weight in YAML 54 BalancedResourceWeight = "balancedresource.weight" 55 // MostRequestedWeight is the key for providing Most Requested Priority Weight in YAML 56 MostRequestedWeight = "mostrequested.weight" 57 // TaintTolerationWeight is the key for providing Taint Toleration Priority Weight in YAML 58 TaintTolerationWeight = "tainttoleration.weight" 59 // ImageLocalityWeight is the key for providing Image Locality Priority Weight in YAML 60 ImageLocalityWeight = "imagelocality.weight" 61 // PodTopologySpreadWeight is the key for providing Pod Topology Spread Priority Weight in YAML 62 PodTopologySpreadWeight = "podtopologyspread.weight" 63 ) 64 65 type nodeOrderPlugin struct { 66 // Arguments given for the plugin 67 pluginArguments framework.Arguments 68 } 69 70 // New function returns nodeorder plugin object. 71 func New(arguments framework.Arguments) framework.Plugin { 72 return &nodeOrderPlugin{pluginArguments: arguments} 73 } 74 75 func (pp *nodeOrderPlugin) Name() string { 76 return PluginName 77 } 78 79 type priorityWeight struct { 80 leastReqWeight int 81 mostReqWeight int 82 nodeAffinityWeight int 83 podAffinityWeight int 84 balancedResourceWeight int 85 taintTolerationWeight int 86 imageLocalityWeight int 87 podTopologySpreadWeight int 88 } 89 90 // calculateWeight from the provided arguments. 91 // 92 // Currently only supported priorities are nodeaffinity, podaffinity, leastrequested, 93 // mostrequested, balancedresouce, imagelocality, tainttoleration. 94 // 95 // User should specify priority weights in the config in this format: 96 // 97 // actions: "reclaim, allocate, backfill, preempt" 98 // tiers: 99 // - plugins: 100 // - name: priority 101 // - name: gang 102 // - name: conformance 103 // - plugins: 104 // - name: drf 105 // - name: predicates 106 // - name: proportion 107 // - name: nodeorder 108 // arguments: 109 // leastrequested.weight: 1 110 // mostrequested.weight: 0 111 // nodeaffinity.weight: 2 112 // podaffinity.weight: 2 113 // balancedresource.weight: 1 114 // tainttoleration.weight: 3 115 // imagelocality.weight: 1 116 // podtopologyspread.weight: 2 117 func calculateWeight(args framework.Arguments) priorityWeight { 118 // Initial values for weights. 119 // By default, for backward compatibility and for reasonable scores, 120 // least requested priority is enabled and most requested priority is disabled. 121 weight := priorityWeight{ 122 leastReqWeight: 1, 123 mostReqWeight: 0, 124 nodeAffinityWeight: 2, 125 podAffinityWeight: 2, 126 balancedResourceWeight: 1, 127 taintTolerationWeight: 3, 128 imageLocalityWeight: 1, 129 podTopologySpreadWeight: 2, // be consistent with kubernetes default setting. 130 } 131 132 // Checks whether nodeaffinity.weight is provided or not, if given, modifies the value in weight struct. 133 args.GetInt(&weight.nodeAffinityWeight, NodeAffinityWeight) 134 135 // Checks whether podaffinity.weight is provided or not, if given, modifies the value in weight struct. 136 args.GetInt(&weight.podAffinityWeight, PodAffinityWeight) 137 138 // Checks whether leastrequested.weight is provided or not, if given, modifies the value in weight struct. 139 args.GetInt(&weight.leastReqWeight, LeastRequestedWeight) 140 141 // Checks whether mostrequested.weight is provided or not, if given, modifies the value in weight struct. 142 args.GetInt(&weight.mostReqWeight, MostRequestedWeight) 143 144 // Checks whether balancedresource.weight is provided or not, if given, modifies the value in weight struct. 145 args.GetInt(&weight.balancedResourceWeight, BalancedResourceWeight) 146 147 // Checks whether tainttoleration.weight is provided or not, if given, modifies the value in weight struct. 148 args.GetInt(&weight.taintTolerationWeight, TaintTolerationWeight) 149 150 // Checks whether imagelocality.weight is provided or not, if given, modifies the value in weight struct. 151 args.GetInt(&weight.imageLocalityWeight, ImageLocalityWeight) 152 153 // Checks whether podtopologyspread.weight is provided or not, if given, modifies the value in weight struct. 154 args.GetInt(&weight.podTopologySpreadWeight, PodTopologySpreadWeight) 155 156 return weight 157 } 158 159 func (pp *nodeOrderPlugin) OnSessionOpen(ssn *framework.Session) { 160 weight := calculateWeight(pp.pluginArguments) 161 nodeMap := ssn.NodeMap 162 163 fts := feature.Features{ 164 EnableVolumeCapacityPriority: utilFeature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority), 165 EnableMinDomainsInPodTopologySpread: utilFeature.DefaultFeatureGate.Enabled(features.MinDomainsInPodTopologySpread), 166 EnableNodeInclusionPolicyInPodTopologySpread: utilFeature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread), 167 EnableMatchLabelKeysInPodTopologySpread: utilFeature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread), 168 } 169 170 // Initialize k8s scheduling plugins 171 handle := k8s.NewFrameworkHandle(nodeMap, ssn.KubeClient(), ssn.InformerFactory()) 172 173 // 1. NodeResourcesLeastAllocated 174 leastAllocatedArgs := &config.NodeResourcesFitArgs{ 175 ScoringStrategy: &config.ScoringStrategy{ 176 Type: config.LeastAllocated, 177 Resources: []config.ResourceSpec{{Name: "cpu", Weight: 50}, {Name: "memory", Weight: 50}}, 178 }, 179 } 180 p, _ := noderesources.NewFit(context.TODO(), leastAllocatedArgs, handle, fts) 181 leastAllocated := p.(*noderesources.Fit) 182 183 // 2. NodeResourcesMostAllocated 184 mostAllocatedArgs := &config.NodeResourcesFitArgs{ 185 ScoringStrategy: &config.ScoringStrategy{ 186 Type: config.MostAllocated, 187 Resources: []config.ResourceSpec{{Name: "cpu", Weight: 1}, {Name: "memory", Weight: 1}}, 188 }, 189 } 190 p, _ = noderesources.NewFit(context.TODO(), mostAllocatedArgs, handle, fts) 191 mostAllocation := p.(*noderesources.Fit) 192 193 // 3. NodeResourcesBalancedAllocation 194 blArgs := &config.NodeResourcesBalancedAllocationArgs{ 195 Resources: []config.ResourceSpec{ 196 {Name: string(v1.ResourceCPU), Weight: 1}, 197 {Name: string(v1.ResourceMemory), Weight: 1}, 198 {Name: "nvidia.com/gpu", Weight: 1}, 199 }, 200 } 201 p, _ = noderesources.NewBalancedAllocation(context.TODO(), blArgs, handle, fts) 202 balancedAllocation := p.(*noderesources.BalancedAllocation) 203 204 // 4. NodeAffinity 205 naArgs := &config.NodeAffinityArgs{ 206 AddedAffinity: &v1.NodeAffinity{}, 207 } 208 p, _ = nodeaffinity.New(context.TODO(), naArgs, handle) 209 nodeAffinity := p.(*nodeaffinity.NodeAffinity) 210 211 // 5. ImageLocality 212 p, _ = imagelocality.New(context.TODO(), nil, handle) 213 imageLocality := p.(*imagelocality.ImageLocality) 214 215 nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) { 216 var nodeScore = 0.0 217 218 state := k8sframework.NewCycleState() 219 if weight.imageLocalityWeight != 0 { 220 score, status := imageLocality.Score(context.TODO(), state, task.Pod, node.Name) 221 if !status.IsSuccess() { 222 klog.Warningf("Node: %s, Image Locality Priority Failed because of Error: %v", node.Name, status.AsError()) 223 return 0, status.AsError() 224 } 225 226 // If imageLocalityWeight is provided, host.Score is multiplied with weight, if not, host.Score is added to total score. 227 nodeScore += float64(score) * float64(weight.imageLocalityWeight) 228 klog.V(5).Infof("Node: %s, task<%s/%s> Image Locality weight %d, score: %f", node.Name, task.Namespace, task.Name, weight.imageLocalityWeight, float64(score)*float64(weight.imageLocalityWeight)) 229 } 230 231 // NodeResourcesLeastAllocated 232 if weight.leastReqWeight != 0 { 233 score, status := leastAllocated.Score(context.TODO(), state, task.Pod, node.Name) 234 if !status.IsSuccess() { 235 klog.Warningf("Node: %s, Least Allocated Priority Failed because of Error: %v", node.Name, status.AsError()) 236 return 0, status.AsError() 237 } 238 239 // If leastReqWeight is provided, host.Score is multiplied with weight, if not, host.Score is added to total score. 240 nodeScore += float64(score) * float64(weight.leastReqWeight) 241 klog.V(5).Infof("Node: %s, task<%s/%s> Least Request weight %d, score: %f", node.Name, task.Namespace, task.Name, weight.leastReqWeight, float64(score)*float64(weight.leastReqWeight)) 242 } 243 244 // NodeResourcesMostAllocated 245 if weight.mostReqWeight != 0 { 246 score, status := mostAllocation.Score(context.TODO(), state, task.Pod, node.Name) 247 if !status.IsSuccess() { 248 klog.Warningf("Node: %s, Most Allocated Priority Failed because of Error: %v", node.Name, status.AsError()) 249 return 0, status.AsError() 250 } 251 252 // If mostRequestedWeight is provided, host.Score is multiplied with weight, it's 0 by default 253 nodeScore += float64(score) * float64(weight.mostReqWeight) 254 klog.V(5).Infof("Node: %s, task<%s/%s> Most Request weight %d, score: %f", node.Name, task.Namespace, task.Name, weight.mostReqWeight, float64(score)*float64(weight.mostReqWeight)) 255 } 256 257 // NodeResourcesBalancedAllocation 258 if weight.balancedResourceWeight != 0 { 259 score, status := balancedAllocation.Score(context.TODO(), state, task.Pod, node.Name) 260 if !status.IsSuccess() { 261 klog.Warningf("Node: %s, Balanced Resource Allocation Priority Failed because of Error: %v", node.Name, status.AsError()) 262 return 0, status.AsError() 263 } 264 265 // If balancedResourceWeight is provided, host.Score is multiplied with weight, if not, host.Score is added to total score. 266 nodeScore += float64(score) * float64(weight.balancedResourceWeight) 267 klog.V(5).Infof("Node: %s, task<%s/%s> Balanced Request weight %d, score: %f", node.Name, task.Namespace, task.Name, weight.balancedResourceWeight, float64(score)*float64(weight.balancedResourceWeight)) 268 } 269 270 // NodeAffinity 271 if weight.nodeAffinityWeight != 0 { 272 score, status := nodeAffinity.Score(context.TODO(), state, task.Pod, node.Name) 273 if !status.IsSuccess() { 274 klog.Warningf("Node: %s, Calculate Node Affinity Priority Failed because of Error: %v", node.Name, status.AsError()) 275 return 0, status.AsError() 276 } 277 278 // TODO: should we normalize the score 279 // If nodeAffinityWeight is provided, host.Score is multiplied with weight, if not, host.Score is added to total score. 280 nodeScore += float64(score) * float64(weight.nodeAffinityWeight) 281 klog.V(5).Infof("Node: %s, task<%s/%s> Node Affinity weight %d, score: %f", node.Name, task.Namespace, task.Name, weight.nodeAffinityWeight, float64(score)*float64(weight.nodeAffinityWeight)) 282 } 283 284 klog.V(4).Infof("Nodeorder Total Score for task<%s/%s> on node %s is: %f", task.Namespace, task.Name, node.Name, nodeScore) 285 return nodeScore, nil 286 } 287 ssn.AddNodeOrderFn(pp.Name(), nodeOrderFn) 288 289 plArgs := &config.InterPodAffinityArgs{} 290 p, _ = interpodaffinity.New(context.TODO(), plArgs, handle) 291 interPodAffinity := p.(*interpodaffinity.InterPodAffinity) 292 293 p, _ = tainttoleration.New(context.TODO(), nil, handle) 294 taintToleration := p.(*tainttoleration.TaintToleration) 295 296 ptsArgs := &config.PodTopologySpreadArgs{ 297 DefaultingType: config.SystemDefaulting, 298 } 299 p, _ = podtopologyspread.New(context.TODO(), ptsArgs, handle, fts) 300 podTopologySpread := p.(*podtopologyspread.PodTopologySpread) 301 302 batchNodeOrderFn := func(task *api.TaskInfo, nodeInfo []*api.NodeInfo) (map[string]float64, error) { 303 // InterPodAffinity 304 state := k8sframework.NewCycleState() 305 nodes := make([]*v1.Node, 0, len(nodeInfo)) 306 for _, node := range nodeInfo { 307 nodes = append(nodes, node.Node) 308 } 309 nodeScores := make(map[string]float64, len(nodes)) 310 311 podAffinityScores, podErr := interPodAffinityScore(interPodAffinity, state, task.Pod, nodes, weight.podAffinityWeight) 312 if podErr != nil { 313 return nil, podErr 314 } 315 316 nodeTolerationScores, err := taintTolerationScore(taintToleration, state, task.Pod, nodes, weight.taintTolerationWeight) 317 if err != nil { 318 return nil, err 319 } 320 321 podTopologySpreadScores, err := podTopologySpreadScore(podTopologySpread, state, task.Pod, nodes, weight.podTopologySpreadWeight) 322 if err != nil { 323 return nil, err 324 } 325 326 for _, node := range nodes { 327 nodeScores[node.Name] = podAffinityScores[node.Name] + nodeTolerationScores[node.Name] + podTopologySpreadScores[node.Name] 328 } 329 330 klog.V(4).Infof("Batch Total Score for task %s/%s is: %v", task.Namespace, task.Name, nodeScores) 331 return nodeScores, nil 332 } 333 ssn.AddBatchNodeOrderFn(pp.Name(), batchNodeOrderFn) 334 } 335 336 func interPodAffinityScore( 337 interPodAffinity *interpodaffinity.InterPodAffinity, 338 state *k8sframework.CycleState, 339 pod *v1.Pod, 340 nodes []*v1.Node, 341 podAffinityWeight int, 342 ) (map[string]float64, error) { 343 preScoreStatus := interPodAffinity.PreScore(context.TODO(), state, pod, nodes) 344 if !preScoreStatus.IsSuccess() { 345 return nil, preScoreStatus.AsError() 346 } 347 348 nodeScoreList := make(k8sframework.NodeScoreList, len(nodes)) 349 // the default parallelization worker number is 16. 350 // the whole scoring will fail if one of the processes failed. 351 // so just create a parallelizeContext to control the whole ParallelizeUntil process. 352 // if the parallelizeCancel is invoked, the whole "ParallelizeUntil" goes to the end. 353 // this could avoid extra computation, especially in huge cluster. 354 // and the ParallelizeUntil guarantees only "workerNum" goroutines will be working simultaneously. 355 // so it's enough to allocate workerNum size for errCh. 356 // note that, in such case, size of errCh should be no less than parallelization number 357 workerNum := 16 358 errCh := make(chan error, workerNum) 359 parallelizeContext, parallelizeCancel := context.WithCancel(context.TODO()) 360 defer parallelizeCancel() 361 362 workqueue.ParallelizeUntil(parallelizeContext, workerNum, len(nodes), func(index int) { 363 nodeName := nodes[index].Name 364 ctx, cancel := context.WithCancel(context.Background()) 365 defer cancel() 366 s, status := interPodAffinity.Score(ctx, state, pod, nodeName) 367 if !status.IsSuccess() { 368 parallelizeCancel() 369 errCh <- fmt.Errorf("calculate inter pod affinity priority failed %v", status.Message()) 370 return 371 } 372 nodeScoreList[index] = k8sframework.NodeScore{ 373 Name: nodeName, 374 Score: s, 375 } 376 }) 377 378 select { 379 case err := <-errCh: 380 return nil, err 381 default: 382 } 383 384 interPodAffinity.NormalizeScore(context.TODO(), state, pod, nodeScoreList) 385 386 nodeScores := make(map[string]float64, len(nodes)) 387 for i, nodeScore := range nodeScoreList { 388 // return error if score plugin returns invalid score. 389 if nodeScore.Score > k8sframework.MaxNodeScore || nodeScore.Score < k8sframework.MinNodeScore { 390 return nil, fmt.Errorf("inter pod affinity returns an invalid score %v for node %s", nodeScore.Score, nodeScore.Name) 391 } 392 nodeScore.Score *= int64(podAffinityWeight) 393 nodeScoreList[i] = nodeScore 394 nodeScores[nodeScore.Name] = float64(nodeScore.Score) 395 } 396 397 klog.V(4).Infof("inter pod affinity Score for task %s/%s is: %v", pod.Namespace, pod.Name, nodeScores) 398 return nodeScores, nil 399 } 400 401 func taintTolerationScore( 402 taintToleration *tainttoleration.TaintToleration, 403 cycleState *k8sframework.CycleState, 404 pod *v1.Pod, 405 nodes []*v1.Node, 406 taintTolerationWeight int, 407 ) (map[string]float64, error) { 408 preScoreStatus := taintToleration.PreScore(context.TODO(), cycleState, pod, nodes) 409 if !preScoreStatus.IsSuccess() { 410 return nil, preScoreStatus.AsError() 411 } 412 413 nodeScoreList := make(k8sframework.NodeScoreList, len(nodes)) 414 // size of errCh should be no less than parallelization number, see interPodAffinityScore. 415 workerNum := 16 416 errCh := make(chan error, workerNum) 417 parallelizeContext, parallelizeCancel := context.WithCancel(context.TODO()) 418 defer parallelizeCancel() 419 420 workqueue.ParallelizeUntil(parallelizeContext, workerNum, len(nodes), func(index int) { 421 nodeName := nodes[index].Name 422 ctx, cancel := context.WithCancel(context.Background()) 423 defer cancel() 424 s, status := taintToleration.Score(ctx, cycleState, pod, nodeName) 425 if !status.IsSuccess() { 426 parallelizeCancel() 427 errCh <- fmt.Errorf("calculate taint toleration priority failed %v", status.Message()) 428 return 429 } 430 nodeScoreList[index] = k8sframework.NodeScore{ 431 Name: nodeName, 432 Score: s, 433 } 434 }) 435 436 select { 437 case err := <-errCh: 438 return nil, err 439 default: 440 } 441 442 taintToleration.NormalizeScore(context.TODO(), cycleState, pod, nodeScoreList) 443 444 nodeScores := make(map[string]float64, len(nodes)) 445 for i, nodeScore := range nodeScoreList { 446 // return error if score plugin returns invalid score. 447 if nodeScore.Score > k8sframework.MaxNodeScore || nodeScore.Score < k8sframework.MinNodeScore { 448 return nil, fmt.Errorf("taint toleration returns an invalid score %v for node %s", nodeScore.Score, nodeScore.Name) 449 } 450 nodeScore.Score *= int64(taintTolerationWeight) 451 nodeScoreList[i] = nodeScore 452 nodeScores[nodeScore.Name] = float64(nodeScore.Score) 453 } 454 455 klog.V(4).Infof("taint toleration Score for task %s/%s is: %v", pod.Namespace, pod.Name, nodeScores) 456 return nodeScores, nil 457 } 458 459 func podTopologySpreadScore( 460 podTopologySpread *podtopologyspread.PodTopologySpread, 461 cycleState *k8sframework.CycleState, 462 pod *v1.Pod, 463 nodes []*v1.Node, 464 podTopologySpreadWeight int, 465 ) (map[string]float64, error) { 466 preScoreStatus := podTopologySpread.PreScore(context.TODO(), cycleState, pod, nodes) 467 if !preScoreStatus.IsSuccess() { 468 return nil, preScoreStatus.AsError() 469 } 470 471 nodeScoreList := make(k8sframework.NodeScoreList, len(nodes)) 472 // size of errCh should be no less than parallelization number, see interPodAffinityScore. 473 workerNum := 16 474 errCh := make(chan error, workerNum) 475 parallelizeContext, parallelizeCancel := context.WithCancel(context.TODO()) 476 workqueue.ParallelizeUntil(parallelizeContext, workerNum, len(nodes), func(index int) { 477 nodeName := nodes[index].Name 478 ctx, cancel := context.WithCancel(context.Background()) 479 defer cancel() 480 s, status := podTopologySpread.Score(ctx, cycleState, pod, nodeName) 481 if !status.IsSuccess() { 482 parallelizeCancel() 483 errCh <- fmt.Errorf("calculate pod topology spread priority failed %v", status.Message()) 484 return 485 } 486 nodeScoreList[index] = k8sframework.NodeScore{ 487 Name: nodeName, 488 Score: s, 489 } 490 }) 491 492 select { 493 case err := <-errCh: 494 return nil, err 495 default: 496 } 497 498 podTopologySpread.NormalizeScore(context.TODO(), cycleState, pod, nodeScoreList) 499 500 nodeScores := make(map[string]float64, len(nodes)) 501 for i, nodeScore := range nodeScoreList { 502 // return error if score plugin returns invalid score. 503 if nodeScore.Score > k8sframework.MaxNodeScore || nodeScore.Score < k8sframework.MinNodeScore { 504 return nil, fmt.Errorf("pod topology spread returns an invalid score %v for node %s", nodeScore.Score, nodeScore.Name) 505 } 506 nodeScore.Score *= int64(podTopologySpreadWeight) 507 nodeScoreList[i] = nodeScore 508 nodeScores[nodeScore.Name] = float64(nodeScore.Score) 509 } 510 511 klog.V(4).Infof("pod topology spread Score for task %s/%s is: %v", pod.Namespace, pod.Name, nodeScores) 512 return nodeScores, nil 513 } 514 515 func (pp *nodeOrderPlugin) OnSessionClose(ssn *framework.Session) { 516 }