volcano.sh/volcano@v1.9.0/pkg/scheduler/metrics/metrics.go (about) 1 /* 2 Copyright 2020 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "time" 21 22 "github.com/prometheus/client_golang/prometheus" 23 "github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry 24 ) 25 26 const ( 27 // VolcanoNamespace - namespace in prometheus used by volcano 28 VolcanoNamespace = "volcano" 29 30 // OnSessionOpen label 31 OnSessionOpen = "OnSessionOpen" 32 33 // OnSessionClose label 34 OnSessionClose = "OnSessionClose" 35 ) 36 37 var ( 38 e2eSchedulingLatency = promauto.NewHistogram( 39 prometheus.HistogramOpts{ 40 Subsystem: VolcanoNamespace, 41 Name: "e2e_scheduling_latency_milliseconds", 42 Help: "E2e scheduling latency in milliseconds (scheduling algorithm + binding)", 43 Buckets: prometheus.ExponentialBuckets(5, 2, 10), 44 }, 45 ) 46 47 e2eJobSchedulingLatency = promauto.NewHistogram( 48 prometheus.HistogramOpts{ 49 Subsystem: VolcanoNamespace, 50 Name: "e2e_job_scheduling_latency_milliseconds", 51 Help: "E2e job scheduling latency in milliseconds", 52 Buckets: prometheus.ExponentialBuckets(32, 2, 10), 53 }, 54 ) 55 56 e2eJobSchedulingDuration = promauto.NewGaugeVec( 57 prometheus.GaugeOpts{ 58 Subsystem: VolcanoNamespace, 59 Name: "e2e_job_scheduling_duration", 60 Help: "E2E job scheduling duration", 61 }, 62 []string{"job_name", "queue", "job_namespace"}, 63 ) 64 65 
e2eJobSchedulingStartTime = promauto.NewGaugeVec( 66 prometheus.GaugeOpts{ 67 Subsystem: VolcanoNamespace, 68 Name: "e2e_job_scheduling_start_time", 69 Help: "E2E job scheduling start time", 70 }, 71 []string{"job_name", "queue", "job_namespace"}, 72 ) 73 74 e2eJobSchedulingLastTime = promauto.NewGaugeVec( 75 prometheus.GaugeOpts{ 76 Subsystem: VolcanoNamespace, 77 Name: "e2e_job_scheduling_last_time", 78 Help: "E2E job scheduling last time", 79 }, 80 []string{"job_name", "queue", "job_namespace"}, 81 ) 82 83 pluginSchedulingLatency = promauto.NewHistogramVec( 84 prometheus.HistogramOpts{ 85 Subsystem: VolcanoNamespace, 86 Name: "plugin_scheduling_latency_microseconds", 87 Help: "Plugin scheduling latency in microseconds", 88 Buckets: prometheus.ExponentialBuckets(5, 2, 10), 89 }, []string{"plugin", "OnSession"}, 90 ) 91 92 actionSchedulingLatency = promauto.NewHistogramVec( 93 prometheus.HistogramOpts{ 94 Subsystem: VolcanoNamespace, 95 Name: "action_scheduling_latency_microseconds", 96 Help: "Action scheduling latency in microseconds", 97 Buckets: prometheus.ExponentialBuckets(5, 2, 10), 98 }, []string{"action"}, 99 ) 100 101 taskSchedulingLatency = promauto.NewHistogram( 102 prometheus.HistogramOpts{ 103 Subsystem: VolcanoNamespace, 104 Name: "task_scheduling_latency_milliseconds", 105 Help: "Task scheduling latency in milliseconds", 106 Buckets: prometheus.ExponentialBuckets(5, 2, 10), 107 }, 108 ) 109 110 scheduleAttempts = promauto.NewCounterVec( 111 prometheus.CounterOpts{ 112 Subsystem: VolcanoNamespace, 113 Name: "schedule_attempts_total", 114 Help: "Number of attempts to schedule pods, by the result. 
'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", 115 }, []string{"result"}, 116 ) 117 118 preemptionVictims = promauto.NewGauge( 119 prometheus.GaugeOpts{ 120 Subsystem: VolcanoNamespace, 121 Name: "pod_preemption_victims", 122 Help: "Number of selected preemption victims", 123 }, 124 ) 125 126 preemptionAttempts = promauto.NewCounter( 127 prometheus.CounterOpts{ 128 Subsystem: VolcanoNamespace, 129 Name: "total_preemption_attempts", 130 Help: "Total preemption attempts in the cluster till now", 131 }, 132 ) 133 134 unscheduleTaskCount = promauto.NewGaugeVec( 135 prometheus.GaugeOpts{ 136 Subsystem: VolcanoNamespace, 137 Name: "unschedule_task_count", 138 Help: "Number of tasks could not be scheduled", 139 }, []string{"job_id"}, 140 ) 141 142 unscheduleJobCount = promauto.NewGauge( 143 prometheus.GaugeOpts{ 144 Subsystem: VolcanoNamespace, 145 Name: "unschedule_job_count", 146 Help: "Number of jobs could not be scheduled", 147 }, 148 ) 149 ) 150 151 // UpdatePluginDuration updates latency for every plugin 152 func UpdatePluginDuration(pluginName, onSessionStatus string, duration time.Duration) { 153 pluginSchedulingLatency.WithLabelValues(pluginName, onSessionStatus).Observe(DurationInMicroseconds(duration)) 154 } 155 156 // UpdateActionDuration updates latency for every action 157 func UpdateActionDuration(actionName string, duration time.Duration) { 158 actionSchedulingLatency.WithLabelValues(actionName).Observe(DurationInMicroseconds(duration)) 159 } 160 161 // UpdateE2eDuration updates entire end to end scheduling latency 162 func UpdateE2eDuration(duration time.Duration) { 163 e2eSchedulingLatency.Observe(DurationInMilliseconds(duration)) 164 } 165 166 // UpdateE2eSchedulingDurationByJob updates entire end to end scheduling duration 167 func UpdateE2eSchedulingDurationByJob(jobName string, queue string, namespace string, duration time.Duration) { 168 e2eJobSchedulingDuration.WithLabelValues(jobName, 
queue, namespace).Set(DurationInMilliseconds(duration)) 169 e2eJobSchedulingLatency.Observe(DurationInMilliseconds(duration)) 170 } 171 172 // UpdateE2eSchedulingStartTimeByJob updates the start time of scheduling 173 func UpdateE2eSchedulingStartTimeByJob(jobName string, queue string, namespace string, t time.Time) { 174 e2eJobSchedulingStartTime.WithLabelValues(jobName, queue, namespace).Set(ConvertToUnix(t)) 175 } 176 177 // UpdateE2eSchedulingLastTimeByJob updates the last time of scheduling 178 func UpdateE2eSchedulingLastTimeByJob(jobName string, queue string, namespace string, t time.Time) { 179 e2eJobSchedulingLastTime.WithLabelValues(jobName, queue, namespace).Set(ConvertToUnix(t)) 180 } 181 182 // UpdateTaskScheduleDuration updates single task scheduling latency 183 func UpdateTaskScheduleDuration(duration time.Duration) { 184 taskSchedulingLatency.Observe(DurationInMilliseconds(duration)) 185 } 186 187 // UpdatePodScheduleStatus update pod schedule decision, could be Success, Failure, Error 188 func UpdatePodScheduleStatus(label string, count int) { 189 scheduleAttempts.WithLabelValues(label).Add(float64(count)) 190 } 191 192 // UpdatePreemptionVictimsCount updates count of preemption victims 193 func UpdatePreemptionVictimsCount(victimsCount int) { 194 preemptionVictims.Set(float64(victimsCount)) 195 } 196 197 // RegisterPreemptionAttempts records number of attempts for preemtion 198 func RegisterPreemptionAttempts() { 199 preemptionAttempts.Inc() 200 } 201 202 // UpdateUnscheduleTaskCount records total number of unscheduleable tasks 203 func UpdateUnscheduleTaskCount(jobID string, taskCount int) { 204 unscheduleTaskCount.WithLabelValues(jobID).Set(float64(taskCount)) 205 } 206 207 // UpdateUnscheduleJobCount records total number of unscheduleable jobs 208 func UpdateUnscheduleJobCount(jobCount int) { 209 unscheduleJobCount.Set(float64(jobCount)) 210 } 211 212 // DurationInMicroseconds gets the time in microseconds. 
// The result is a float64 that keeps any sub-microsecond fraction instead of
// truncating it.
func DurationInMicroseconds(duration time.Duration) float64 {
	// A time.Duration is an integer nanosecond count, so dividing by the unit
	// constant performs the unit conversion without losing the fraction.
	return float64(duration) / float64(time.Microsecond)
}

// DurationInMilliseconds gets the time in milliseconds, keeping the
// fractional part.
func DurationInMilliseconds(duration time.Duration) float64 {
	return float64(duration) / float64(time.Millisecond)
}

// DurationInSeconds gets the time in seconds as a floating-point value.
func DurationInSeconds(duration time.Duration) float64 {
	return duration.Seconds()
}

// Duration gets the time elapsed since the specified start.
func Duration(start time.Time) time.Duration {
	return time.Since(start)
}

// ConvertToUnix converts the time to Unix time: whole seconds since the Unix
// epoch, with any sub-second part of t discarded.
func ConvertToUnix(t time.Time) float64 {
	return float64(t.Unix())
}