volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/usage/usage.go (about)

     1  /*
     2  Copyright 2022 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package usage
    18  
    19  import (
    20  	"fmt"
    21  	"time"
    22  
    23  	"volcano.sh/volcano/pkg/scheduler/metrics/source"
    24  
    25  	"k8s.io/klog/v2"
    26  	k8sFramework "k8s.io/kubernetes/pkg/scheduler/framework"
    27  
    28  	"volcano.sh/volcano/pkg/scheduler/api"
    29  	"volcano.sh/volcano/pkg/scheduler/framework"
    30  )
    31  
// Constants used by the usage plugin:
//   - PluginName is the name returned by the plugin's Name method.
//   - thresholdSection is the plugin-argument key under which New looks up
//     the per-resource thresholds.
//   - MetricsActiveTime is the maximum age of a node's usage metrics; when
//     the metrics are older than this, the predicate lets the node pass and
//     the node-order score is 0.
//   - NodeUsageCPUExtend and NodeUsageMemoryExtend are the status reasons
//     reported when a node's CPU or memory usage is above its threshold.
const (
	// PluginName indicates name of volcano scheduler plugin.
	PluginName            = "usage"
	thresholdSection      = "thresholds"
	MetricsActiveTime     = 5 * time.Minute
	NodeUsageCPUExtend    = "the CPU load of the node exceeds the upper limit."
	NodeUsageMemoryExtend = "the memory load of the node exceeds the upper limit."
)
    40  
    41  /*
    42     actions: "enqueue, allocate, backfill"
    43     tiers:
    44     - plugins:
    45       - name: usage
    46         enablePredicate: false  # If the value is false, new pod scheduling is not disabled when the node load reaches the threshold. If the value is true or left blank, new pod scheduling is disabled.
    47         arguments:
    48           usage.weight: 5
    49           cpu.weight: 1
    50           memory.weight: 1
    51           thresholds:
    52             cpu: 80
    53             mem: 80
    54  */
    55  
// AVG is the usage type that New assigns to usagePlugin.usageType.
const AVG string = "average"
    57  
// usagePlugin filters and scores nodes by the CPU and memory usage recorded
// in each node's ResourceUsage. Field defaults are assigned in New.
type usagePlugin struct {
	pluginArguments framework.Arguments // arguments the plugin was created with
	usageWeight     int                 // multiplier applied to the final node score ("usage.weight")
	cpuWeight       int                 // weight of the CPU part of the score ("cpu.weight")
	memoryWeight    int                 // weight of the memory part of the score ("memory.weight")
	usageType       string              // set to AVG by New; not read elsewhere in this file
	cpuThresholds   float64             // nodes whose CPU usage is above this value are rejected by the predicate
	memThresholds   float64             // nodes whose memory usage is above this value are rejected by the predicate
	period          string              // key used to look up CPUUsageAvg/MEMUsageAvg; an empty value disables filtering and scoring
}
    68  
    69  // New function returns usagePlugin object
    70  func New(args framework.Arguments) framework.Plugin {
    71  	var plugin = &usagePlugin{
    72  		pluginArguments: args,
    73  		usageWeight:     5,
    74  		cpuWeight:       1,
    75  		memoryWeight:    1,
    76  		usageType:       AVG,
    77  		cpuThresholds:   80,
    78  		memThresholds:   80,
    79  		period:          source.NODE_METRICS_PERIOD,
    80  	}
    81  	args.GetInt(&plugin.usageWeight, "usage.weight")
    82  	args.GetInt(&plugin.cpuWeight, "cpu.weight")
    83  	args.GetInt(&plugin.memoryWeight, "memory.weight")
    84  
    85  	argsValue, ok := plugin.pluginArguments[thresholdSection]
    86  	if !ok {
    87  		klog.Errorf("Failed to obtain thresholds information, usage plugin arguments is %v", plugin.pluginArguments)
    88  		return plugin
    89  	}
    90  
    91  	thresholdArgs, ok := argsValue.(map[interface{}]interface{})
    92  	if !ok {
    93  		klog.Errorf("Failed to convert the thresholds information, thresholds args values is %v", argsValue)
    94  		return plugin
    95  	}
    96  	for resourceName, threshold := range thresholdArgs {
    97  		resource, _ := resourceName.(string)
    98  		value, _ := threshold.(int)
    99  		switch resource {
   100  		case "cpu":
   101  			plugin.cpuThresholds = float64(value)
   102  		case "mem":
   103  			plugin.memThresholds = float64(value)
   104  		}
   105  	}
   106  
   107  	return plugin
   108  }
   109  
// Name returns the name of the plugin, PluginName.
func (up *usagePlugin) Name() string {
	return PluginName
}
   113  
   114  func (up *usagePlugin) OnSessionOpen(ssn *framework.Session) {
   115  	klog.V(5).Infof("Enter usage plugin ...")
   116  	defer func() {
   117  		klog.V(5).Infof("Leaving usage plugin ...")
   118  	}()
   119  
   120  	if klog.V(4).Enabled() {
   121  		for node, nodeInfo := range ssn.Nodes {
   122  			klog.V(4).Infof("node:%v, cpu usage:%v, mem usage:%v, metrics time is %v",
   123  				node, nodeInfo.ResourceUsage.CPUUsageAvg, nodeInfo.ResourceUsage.MEMUsageAvg, nodeInfo.ResourceUsage.MetricsTime)
   124  		}
   125  	}
   126  
   127  	predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) {
   128  		predicateStatus := make([]*api.Status, 0)
   129  		usageStatus := &api.Status{}
   130  
   131  		now := time.Now()
   132  		if up.period == "" || now.Sub(node.ResourceUsage.MetricsTime) > MetricsActiveTime {
   133  			klog.V(4).Infof("The period(%s) is empty or the usage metrics data is not updated for more than %v minutes, "+
   134  				"Usage plugin filter for task %s/%s on node %s pass, metrics time is %v. ", up.period, MetricsActiveTime, task.Namespace, task.Name, node.Name, node.ResourceUsage.MetricsTime)
   135  
   136  			usageStatus.Code = api.Success
   137  			predicateStatus = append(predicateStatus, usageStatus)
   138  			return predicateStatus, nil
   139  		}
   140  
   141  		klog.V(4).Infof("predicateFn cpuUsageAvg:%v,predicateFn memUsageAvg:%v", up.cpuThresholds, up.memThresholds)
   142  		if node.ResourceUsage.CPUUsageAvg[up.period] > up.cpuThresholds {
   143  			klog.V(3).Infof("Node %s cpu usage %f exceeds the threshold %f", node.Name, node.ResourceUsage.CPUUsageAvg[up.period], up.cpuThresholds)
   144  			usageStatus.Code = api.UnschedulableAndUnresolvable
   145  			usageStatus.Reason = NodeUsageCPUExtend
   146  			predicateStatus = append(predicateStatus, usageStatus)
   147  			return predicateStatus, fmt.Errorf("Plugin %s predicates failed, because of %s", up.Name(), NodeUsageCPUExtend)
   148  		}
   149  		if node.ResourceUsage.MEMUsageAvg[up.period] > up.memThresholds {
   150  			klog.V(3).Infof("Node %s mem usage %f exceeds the threshold %f", node.Name, node.ResourceUsage.MEMUsageAvg[up.period], up.memThresholds)
   151  			usageStatus.Code = api.UnschedulableAndUnresolvable
   152  			usageStatus.Reason = NodeUsageMemoryExtend
   153  			predicateStatus = append(predicateStatus, usageStatus)
   154  			return predicateStatus, fmt.Errorf("Plugin %s predicates failed, because of %s", up.Name(), NodeUsageMemoryExtend)
   155  		}
   156  
   157  		klog.V(4).Infof("Usage plugin filter for task %s/%s on node %s pass.", task.Namespace, task.Name, node.Name)
   158  		return predicateStatus, nil
   159  	}
   160  
   161  	nodeOrderFn := func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
   162  		score := 0.0
   163  		now := time.Now()
   164  		if up.period == "" || now.Sub(node.ResourceUsage.MetricsTime) > MetricsActiveTime {
   165  			klog.V(4).Infof("The period(%s) is empty or the usage metrics data is not updated for more than %v minutes, "+
   166  				"Usage plugin score for task %s/%s on node %s is 0, metrics time is %v. ", up.period, MetricsActiveTime, task.Namespace, task.Name, node.Name, node.ResourceUsage.MetricsTime)
   167  			return 0, nil
   168  		}
   169  
   170  		cpuUsage, exist := node.ResourceUsage.CPUUsageAvg[up.period]
   171  		klog.V(4).Infof("Node %s cpu usage is %f.", node.Name, cpuUsage)
   172  		if !exist {
   173  			return 0, nil
   174  		}
   175  		cpuScore := (100 - cpuUsage) / 100 * float64(up.cpuWeight)
   176  
   177  		memoryUsage, exist := node.ResourceUsage.MEMUsageAvg[up.period]
   178  		klog.V(4).Infof("Node %s memory usage is %f.", node.Name, memoryUsage)
   179  		if !exist {
   180  			return 0, nil
   181  		}
   182  		memoryScore := (100 - memoryUsage) / 100 * float64(up.memoryWeight)
   183  		score = (cpuScore + memoryScore) / float64((up.cpuWeight + up.memoryWeight))
   184  		score *= float64(k8sFramework.MaxNodeScore * int64(up.usageWeight))
   185  		klog.V(4).Infof("Node %s score for task %s is %f.", node.Name, task.Name, score)
   186  		return score, nil
   187  	}
   188  
   189  	ssn.AddPredicateFn(up.Name(), predicateFn)
   190  	ssn.AddNodeOrderFn(up.Name(), nodeOrderFn)
   191  }
   192  
// OnSessionClose is a no-op for the usage plugin; its body is empty.
func (up *usagePlugin) OnSessionClose(ssn *framework.Session) {}