volcano.sh/volcano@v1.9.0/pkg/scheduler/metrics/metrics.go (about)

     1  /*
     2  Copyright 2020 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"time"
    21  
    22  	"github.com/prometheus/client_golang/prometheus"
    23  	"github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry
    24  )
    25  
    26  const (
    27  	// VolcanoNamespace - namespace in prometheus used by volcano
    28  	VolcanoNamespace = "volcano"
    29  
    30  	// OnSessionOpen label
    31  	OnSessionOpen = "OnSessionOpen"
    32  
    33  	// OnSessionClose label
    34  	OnSessionClose = "OnSessionClose"
    35  )
    36  
    37  var (
    38  	e2eSchedulingLatency = promauto.NewHistogram(
    39  		prometheus.HistogramOpts{
    40  			Subsystem: VolcanoNamespace,
    41  			Name:      "e2e_scheduling_latency_milliseconds",
    42  			Help:      "E2e scheduling latency in milliseconds (scheduling algorithm + binding)",
    43  			Buckets:   prometheus.ExponentialBuckets(5, 2, 10),
    44  		},
    45  	)
    46  
    47  	e2eJobSchedulingLatency = promauto.NewHistogram(
    48  		prometheus.HistogramOpts{
    49  			Subsystem: VolcanoNamespace,
    50  			Name:      "e2e_job_scheduling_latency_milliseconds",
    51  			Help:      "E2e job scheduling latency in milliseconds",
    52  			Buckets:   prometheus.ExponentialBuckets(32, 2, 10),
    53  		},
    54  	)
    55  
    56  	e2eJobSchedulingDuration = promauto.NewGaugeVec(
    57  		prometheus.GaugeOpts{
    58  			Subsystem: VolcanoNamespace,
    59  			Name:      "e2e_job_scheduling_duration",
    60  			Help:      "E2E job scheduling duration",
    61  		},
    62  		[]string{"job_name", "queue", "job_namespace"},
    63  	)
    64  
    65  	e2eJobSchedulingStartTime = promauto.NewGaugeVec(
    66  		prometheus.GaugeOpts{
    67  			Subsystem: VolcanoNamespace,
    68  			Name:      "e2e_job_scheduling_start_time",
    69  			Help:      "E2E job scheduling start time",
    70  		},
    71  		[]string{"job_name", "queue", "job_namespace"},
    72  	)
    73  
    74  	e2eJobSchedulingLastTime = promauto.NewGaugeVec(
    75  		prometheus.GaugeOpts{
    76  			Subsystem: VolcanoNamespace,
    77  			Name:      "e2e_job_scheduling_last_time",
    78  			Help:      "E2E job scheduling last time",
    79  		},
    80  		[]string{"job_name", "queue", "job_namespace"},
    81  	)
    82  
    83  	pluginSchedulingLatency = promauto.NewHistogramVec(
    84  		prometheus.HistogramOpts{
    85  			Subsystem: VolcanoNamespace,
    86  			Name:      "plugin_scheduling_latency_microseconds",
    87  			Help:      "Plugin scheduling latency in microseconds",
    88  			Buckets:   prometheus.ExponentialBuckets(5, 2, 10),
    89  		}, []string{"plugin", "OnSession"},
    90  	)
    91  
    92  	actionSchedulingLatency = promauto.NewHistogramVec(
    93  		prometheus.HistogramOpts{
    94  			Subsystem: VolcanoNamespace,
    95  			Name:      "action_scheduling_latency_microseconds",
    96  			Help:      "Action scheduling latency in microseconds",
    97  			Buckets:   prometheus.ExponentialBuckets(5, 2, 10),
    98  		}, []string{"action"},
    99  	)
   100  
   101  	taskSchedulingLatency = promauto.NewHistogram(
   102  		prometheus.HistogramOpts{
   103  			Subsystem: VolcanoNamespace,
   104  			Name:      "task_scheduling_latency_milliseconds",
   105  			Help:      "Task scheduling latency in milliseconds",
   106  			Buckets:   prometheus.ExponentialBuckets(5, 2, 10),
   107  		},
   108  	)
   109  
   110  	scheduleAttempts = promauto.NewCounterVec(
   111  		prometheus.CounterOpts{
   112  			Subsystem: VolcanoNamespace,
   113  			Name:      "schedule_attempts_total",
   114  			Help:      "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
   115  		}, []string{"result"},
   116  	)
   117  
   118  	preemptionVictims = promauto.NewGauge(
   119  		prometheus.GaugeOpts{
   120  			Subsystem: VolcanoNamespace,
   121  			Name:      "pod_preemption_victims",
   122  			Help:      "Number of selected preemption victims",
   123  		},
   124  	)
   125  
   126  	preemptionAttempts = promauto.NewCounter(
   127  		prometheus.CounterOpts{
   128  			Subsystem: VolcanoNamespace,
   129  			Name:      "total_preemption_attempts",
   130  			Help:      "Total preemption attempts in the cluster till now",
   131  		},
   132  	)
   133  
   134  	unscheduleTaskCount = promauto.NewGaugeVec(
   135  		prometheus.GaugeOpts{
   136  			Subsystem: VolcanoNamespace,
   137  			Name:      "unschedule_task_count",
   138  			Help:      "Number of tasks could not be scheduled",
   139  		}, []string{"job_id"},
   140  	)
   141  
   142  	unscheduleJobCount = promauto.NewGauge(
   143  		prometheus.GaugeOpts{
   144  			Subsystem: VolcanoNamespace,
   145  			Name:      "unschedule_job_count",
   146  			Help:      "Number of jobs could not be scheduled",
   147  		},
   148  	)
   149  )
   150  
   151  // UpdatePluginDuration updates latency for every plugin
   152  func UpdatePluginDuration(pluginName, onSessionStatus string, duration time.Duration) {
   153  	pluginSchedulingLatency.WithLabelValues(pluginName, onSessionStatus).Observe(DurationInMicroseconds(duration))
   154  }
   155  
   156  // UpdateActionDuration updates latency for every action
   157  func UpdateActionDuration(actionName string, duration time.Duration) {
   158  	actionSchedulingLatency.WithLabelValues(actionName).Observe(DurationInMicroseconds(duration))
   159  }
   160  
   161  // UpdateE2eDuration updates entire end to end scheduling latency
   162  func UpdateE2eDuration(duration time.Duration) {
   163  	e2eSchedulingLatency.Observe(DurationInMilliseconds(duration))
   164  }
   165  
   166  // UpdateE2eSchedulingDurationByJob updates entire end to end scheduling duration
   167  func UpdateE2eSchedulingDurationByJob(jobName string, queue string, namespace string, duration time.Duration) {
   168  	e2eJobSchedulingDuration.WithLabelValues(jobName, queue, namespace).Set(DurationInMilliseconds(duration))
   169  	e2eJobSchedulingLatency.Observe(DurationInMilliseconds(duration))
   170  }
   171  
   172  // UpdateE2eSchedulingStartTimeByJob updates the start time of scheduling
   173  func UpdateE2eSchedulingStartTimeByJob(jobName string, queue string, namespace string, t time.Time) {
   174  	e2eJobSchedulingStartTime.WithLabelValues(jobName, queue, namespace).Set(ConvertToUnix(t))
   175  }
   176  
   177  // UpdateE2eSchedulingLastTimeByJob updates the last time of scheduling
   178  func UpdateE2eSchedulingLastTimeByJob(jobName string, queue string, namespace string, t time.Time) {
   179  	e2eJobSchedulingLastTime.WithLabelValues(jobName, queue, namespace).Set(ConvertToUnix(t))
   180  }
   181  
   182  // UpdateTaskScheduleDuration updates single task scheduling latency
   183  func UpdateTaskScheduleDuration(duration time.Duration) {
   184  	taskSchedulingLatency.Observe(DurationInMilliseconds(duration))
   185  }
   186  
   187  // UpdatePodScheduleStatus update pod schedule decision, could be Success, Failure, Error
   188  func UpdatePodScheduleStatus(label string, count int) {
   189  	scheduleAttempts.WithLabelValues(label).Add(float64(count))
   190  }
   191  
   192  // UpdatePreemptionVictimsCount updates count of preemption victims
   193  func UpdatePreemptionVictimsCount(victimsCount int) {
   194  	preemptionVictims.Set(float64(victimsCount))
   195  }
   196  
   197  // RegisterPreemptionAttempts records number of attempts for preemtion
   198  func RegisterPreemptionAttempts() {
   199  	preemptionAttempts.Inc()
   200  }
   201  
   202  // UpdateUnscheduleTaskCount records total number of unscheduleable tasks
   203  func UpdateUnscheduleTaskCount(jobID string, taskCount int) {
   204  	unscheduleTaskCount.WithLabelValues(jobID).Set(float64(taskCount))
   205  }
   206  
   207  // UpdateUnscheduleJobCount records total number of unscheduleable jobs
   208  func UpdateUnscheduleJobCount(jobCount int) {
   209  	unscheduleJobCount.Set(float64(jobCount))
   210  }
   211  
   212  // DurationInMicroseconds gets the time in microseconds.
   213  func DurationInMicroseconds(duration time.Duration) float64 {
   214  	return float64(duration.Nanoseconds()) / float64(time.Microsecond.Nanoseconds())
   215  }
   216  
   217  // DurationInMilliseconds gets the time in milliseconds.
   218  func DurationInMilliseconds(duration time.Duration) float64 {
   219  	return float64(duration.Nanoseconds()) / float64(time.Millisecond.Nanoseconds())
   220  }
   221  
   222  // DurationInSeconds gets the time in seconds.
   223  func DurationInSeconds(duration time.Duration) float64 {
   224  	return duration.Seconds()
   225  }
   226  
   227  // Duration get the time since specified start
   228  func Duration(start time.Time) time.Duration {
   229  	return time.Since(start)
   230  }
   231  
   232  // ConvertToUnix convert the time to Unix time
   233  func ConvertToUnix(t time.Time) float64 {
   234  	return float64(t.Unix())
   235  }