volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/usage/usage.go (about) 1 /* 2 Copyright 2022 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package usage 18 19 import ( 20 "fmt" 21 "time" 22 23 "volcano.sh/volcano/pkg/scheduler/metrics/source" 24 25 "k8s.io/klog/v2" 26 k8sFramework "k8s.io/kubernetes/pkg/scheduler/framework" 27 28 "volcano.sh/volcano/pkg/scheduler/api" 29 "volcano.sh/volcano/pkg/scheduler/framework" 30 ) 31 32 const ( 33 // PluginName indicates name of volcano scheduler plugin. 34 PluginName = "usage" 35 thresholdSection = "thresholds" 36 MetricsActiveTime = 5 * time.Minute 37 NodeUsageCPUExtend = "the CPU load of the node exceeds the upper limit." 38 NodeUsageMemoryExtend = "the memory load of the node exceeds the upper limit." 39 ) 40 41 /* 42 actions: "enqueue, allocate, backfill" 43 tiers: 44 - plugins: 45 - name: usage 46 enablePredicate: false # If the value is false, new pod scheduling is not disabled when the node load reaches the threshold. If the value is true or left blank, new pod scheduling is disabled. 47 arguments: 48 usage.weight: 5 49 cpu.weight: 1 50 memory.weight: 1 51 thresholds: 52 cpu: 80 53 mem: 80 54 */ 55 56 const AVG string = "average" 57 58 type usagePlugin struct { 59 pluginArguments framework.Arguments 60 usageWeight int 61 cpuWeight int 62 memoryWeight int 63 usageType string 64 cpuThresholds float64 65 memThresholds float64 66 period string 67 } 68 69 // New function returns usagePlugin object 70 func New(args framework.Arguments) framework.Plugin { 71 var plugin = &usagePlugin{ 72 pluginArguments: args, 73 usageWeight: 5, 74 cpuWeight: 1, 75 memoryWeight: 1, 76 usageType: AVG, 77 cpuThresholds: 80, 78 memThresholds: 80, 79 period: source.NODE_METRICS_PERIOD, 80 } 81 args.GetInt(&plugin.usageWeight, "usage.weight") 82 args.GetInt(&plugin.cpuWeight, "cpu.weight") 83 args.GetInt(&plugin.memoryWeight, "memory.weight") 84 85 argsValue, ok := plugin.pluginArguments[thresholdSection] 86 if !ok { 87 klog.Errorf("Failed to obtain thresholds information, usage plugin arguments is %v", plugin.pluginArguments) 88 return plugin 89 } 90 91 thresholdArgs, ok := argsValue.(map[interface{}]interface{}) 92 if !ok { 93 klog.Errorf("Failed to convert the thresholds information, thresholds args values is %v", argsValue) 94 return plugin 95 } 96 for resourceName, threshold := range thresholdArgs { 97 resource, _ := resourceName.(string) 98 value, _ := threshold.(int) 99 switch resource { 100 case "cpu": 101 plugin.cpuThresholds = float64(value) 102 case "mem": 103 plugin.memThresholds = float64(value) 104 } 105 } 106 107 return plugin 108 } 109 110 func (up *usagePlugin) Name() string { 111 return PluginName 112 } 113 114 func (up *usagePlugin) OnSessionOpen(ssn *framework.Session) { 115 klog.V(5).Infof("Enter usage plugin ...") 116 defer func() { 117 klog.V(5).Infof("Leaving usage plugin ...") 118 }() 119 120 if klog.V(4).Enabled() { 121 for node, nodeInfo := range ssn.Nodes { 122 klog.V(4).Infof("node:%v, cpu usage:%v, mem usage:%v, metrics time is %v", 123 node, nodeInfo.ResourceUsage.CPUUsageAvg, nodeInfo.ResourceUsage.MEMUsageAvg, nodeInfo.ResourceUsage.MetricsTime) 124 } 125 } 126 127 predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { 128 predicateStatus := make([]*api.Status, 0) 129 usageStatus := &api.Status{} 130 131 now := time.Now() 132 if up.period == "" || now.Sub(node.ResourceUsage.MetricsTime) > MetricsActiveTime { 133 klog.V(4).Infof("The period(%s) is empty or the usage metrics data is not updated for more than %v minutes, "+ 134 "Usage plugin filter for task %s/%s on node %s pass, metrics time is %v. ", up.period, MetricsActiveTime, task.Namespace, task.Name, node.Name, node.ResourceUsage.MetricsTime) 135 136 usageStatus.Code = api.Success 137 predicateStatus = append(predicateStatus, usageStatus) 138 return predicateStatus, nil 139 } 140 141 klog.V(4).Infof("predicateFn cpuUsageAvg:%v,predicateFn memUsageAvg:%v", up.cpuThresholds, up.memThresholds) 142 if node.ResourceUsage.CPUUsageAvg[up.period] > up.cpuThresholds { 143 klog.V(3).Infof("Node %s cpu usage %f exceeds the threshold %f", node.Name, node.ResourceUsage.CPUUsageAvg[up.period], up.cpuThresholds) 144 usageStatus.Code = api.UnschedulableAndUnresolvable 145 usageStatus.Reason = NodeUsageCPUExtend 146 predicateStatus = append(predicateStatus, usageStatus) 147 return predicateStatus, fmt.Errorf("Plugin %s predicates failed, because of %s", up.Name(), NodeUsageCPUExtend) 148 } 149 if node.ResourceUsage.MEMUsageAvg[up.period] > up.memThresholds { 150 klog.V(3).Infof("Node %s mem usage %f exceeds the threshold %f", node.Name, node.ResourceUsage.MEMUsageAvg[up.period], up.memThresholds) 151 usageStatus.Code = api.UnschedulableAndUnresolvable 152 usageStatus.Reason = NodeUsageMemoryExtend 153 predicateStatus = append(predicateStatus, usageStatus) 154 return predicateStatus, fmt.Errorf("Plugin %s predicates failed, because of %s", up.Name(), NodeUsageMemoryExtend) 155 } 156 157 klog.V(4).Infof("Usage plugin filter for task %s/%s on node %s pass.", task.Namespace, task.Name, node.Name) 158 return predicateStatus, nil 159 } 160 161 nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) { 162 score := 0.0 163 now := time.Now() 164 if up.period == "" || now.Sub(node.ResourceUsage.MetricsTime) > MetricsActiveTime { 165 klog.V(4).Infof("The period(%s) is empty or the usage metrics data is not updated for more than %v minutes, "+ 166 "Usage plugin score for task %s/%s on node %s is 0, metrics time is %v. ", up.period, MetricsActiveTime, task.Namespace, task.Name, node.Name, node.ResourceUsage.MetricsTime) 167 return 0, nil 168 } 169 170 cpuUsage, exist := node.ResourceUsage.CPUUsageAvg[up.period] 171 klog.V(4).Infof("Node %s cpu usage is %f.", node.Name, cpuUsage) 172 if !exist { 173 return 0, nil 174 } 175 cpuScore := (100 - cpuUsage) / 100 * float64(up.cpuWeight) 176 177 memoryUsage, exist := node.ResourceUsage.MEMUsageAvg[up.period] 178 klog.V(4).Infof("Node %s memory usage is %f.", node.Name, memoryUsage) 179 if !exist { 180 return 0, nil 181 } 182 memoryScore := (100 - memoryUsage) / 100 * float64(up.memoryWeight) 183 score = (cpuScore + memoryScore) / float64((up.cpuWeight + up.memoryWeight)) 184 score *= float64(k8sFramework.MaxNodeScore * int64(up.usageWeight)) 185 klog.V(4).Infof("Node %s score for task %s is %f.", node.Name, task.Name, score) 186 return score, nil 187 } 188 189 ssn.AddPredicateFn(up.Name(), predicateFn) 190 ssn.AddNodeOrderFn(up.Name(), nodeOrderFn) 191 } 192 193 func (up *usagePlugin) OnSessionClose(ssn *framework.Session) {}