sigs.k8s.io/kueue@v0.6.2/pkg/metrics/metrics.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "time" 21 22 "github.com/prometheus/client_golang/prometheus" 23 "sigs.k8s.io/controller-runtime/pkg/metrics" 24 25 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 26 "sigs.k8s.io/kueue/pkg/constants" 27 "sigs.k8s.io/kueue/pkg/features" 28 ) 29 30 type AdmissionResult string 31 type ClusterQueueStatus string 32 33 const ( 34 AdmissionResultSuccess AdmissionResult = "success" 35 AdmissionResultInadmissible AdmissionResult = "inadmissible" 36 37 PendingStatusActive = "active" 38 PendingStatusInadmissible = "inadmissible" 39 40 // CQStatusPending means the ClusterQueue is accepted but not yet active, 41 // this can be because of: 42 // - a missing ResourceFlavor referenced by the ClusterQueue 43 // - a missing or inactive AdmissionCheck referenced by the ClusterQueue 44 // - the ClusterQueue is stopped 45 // In this state, the ClusterQueue can't admit new workloads and its quota can't be borrowed 46 // by other active ClusterQueues in the cohort. 47 CQStatusPending ClusterQueueStatus = "pending" 48 // CQStatusActive means the ClusterQueue can admit new workloads and its quota 49 // can be borrowed by other ClusterQueues in the cohort. 50 CQStatusActive ClusterQueueStatus = "active" 51 // CQStatusTerminating means the clusterQueue is in pending deletion. 52 CQStatusTerminating ClusterQueueStatus = "terminating" 53 ) 54 55 var ( 56 CQStatuses = []ClusterQueueStatus{CQStatusPending, CQStatusActive, CQStatusTerminating} 57 58 AdmissionAttemptsTotal = prometheus.NewCounterVec( 59 prometheus.CounterOpts{ 60 Subsystem: constants.KueueName, 61 Name: "admission_attempts_total", 62 Help: `The total number of attempts to admit workloads. 63 Each admission attempt might try to admit more than one workload. 64 The label 'result' can have the following values: 65 - 'success' means that at least one workload was admitted., 66 - 'inadmissible' means that no workload was admitted.`, 67 }, []string{"result"}, 68 ) 69 70 admissionAttemptDuration = prometheus.NewHistogramVec( 71 prometheus.HistogramOpts{ 72 Subsystem: constants.KueueName, 73 Name: "admission_attempt_duration_seconds", 74 Help: `The latency of an admission attempt. 75 The label 'result' can have the following values: 76 - 'success' means that at least one workload was admitted., 77 - 'inadmissible' means that no workload was admitted.`, 78 }, []string{"result"}, 79 ) 80 81 // Metrics tied to the queue system. 82 83 PendingWorkloads = prometheus.NewGaugeVec( 84 prometheus.GaugeOpts{ 85 Subsystem: constants.KueueName, 86 Name: "pending_workloads", 87 Help: `The number of pending workloads, per 'cluster_queue' and 'status'. 88 'status' can have the following values: 89 - "active" means that the workloads are in the admission queue. 90 - "inadmissible" means there was a failed admission attempt for these workloads and they won't be retried until cluster conditions, which could make this workload admissible, change`, 91 }, []string{"cluster_queue", "status"}, 92 ) 93 94 AdmittedWorkloadsTotal = prometheus.NewCounterVec( 95 prometheus.CounterOpts{ 96 Subsystem: constants.KueueName, 97 Name: "admitted_workloads_total", 98 Help: "The total number of admitted workloads per 'cluster_queue'", 99 }, []string{"cluster_queue"}, 100 ) 101 102 admissionWaitTime = prometheus.NewHistogramVec( 103 prometheus.HistogramOpts{ 104 Subsystem: constants.KueueName, 105 Name: "admission_wait_time_seconds", 106 Help: "The time between a Workload was created until it was admitted, per 'cluster_queue'", 107 }, []string{"cluster_queue"}, 108 ) 109 110 // Metrics tied to the cache. 111 112 ReservingActiveWorkloads = prometheus.NewGaugeVec( 113 prometheus.GaugeOpts{ 114 Subsystem: constants.KueueName, 115 Name: "reserving_active_workloads", 116 Help: "The number of Workloads that are reserving quota, per 'cluster_queue'", 117 }, []string{"cluster_queue"}, 118 ) 119 120 AdmittedActiveWorkloads = prometheus.NewGaugeVec( 121 prometheus.GaugeOpts{ 122 Subsystem: constants.KueueName, 123 Name: "admitted_active_workloads", 124 Help: "The number of admitted Workloads that are active (unsuspended and not finished), per 'cluster_queue'", 125 }, []string{"cluster_queue"}, 126 ) 127 128 ClusterQueueByStatus = prometheus.NewGaugeVec( 129 prometheus.GaugeOpts{ 130 Subsystem: constants.KueueName, 131 Name: "cluster_queue_status", 132 Help: `Reports 'cluster_queue' with its 'status' (with possible values 'pending', 'active' or 'terminated'). 133 For a ClusterQueue, the metric only reports a value of 1 for one of the statuses.`, 134 }, []string{"cluster_queue", "status"}, 135 ) 136 137 // Optional cluster queue metrics 138 ClusterQueueResourceReservations = prometheus.NewGaugeVec( 139 prometheus.GaugeOpts{ 140 Subsystem: constants.KueueName, 141 Name: "cluster_queue_resource_reservation", 142 Help: `Reports the cluster_queue's total resource reservation within all the flavors`, 143 }, []string{"cohort", "cluster_queue", "flavor", "resource"}, 144 ) 145 146 ClusterQueueResourceUsage = prometheus.NewGaugeVec( 147 prometheus.GaugeOpts{ 148 Subsystem: constants.KueueName, 149 Name: "cluster_queue_resource_usage", 150 Help: `Reports the cluster_queue's total resource usage within all the flavors`, 151 }, []string{"cohort", "cluster_queue", "flavor", "resource"}, 152 ) 153 154 ClusterQueueResourceNominalQuota = prometheus.NewGaugeVec( 155 prometheus.GaugeOpts{ 156 Subsystem: constants.KueueName, 157 Name: "cluster_queue_nominal_quota", 158 Help: `Reports the cluster_queue's resource nominal quota within all the flavors`, 159 }, []string{"cohort", "cluster_queue", "flavor", "resource"}, 160 ) 161 162 ClusterQueueResourceBorrowingLimit = prometheus.NewGaugeVec( 163 prometheus.GaugeOpts{ 164 Subsystem: constants.KueueName, 165 Name: "cluster_queue_borrowing_limit", 166 Help: `Reports the cluster_queue's resource borrowing limit within all the flavors`, 167 }, []string{"cohort", "cluster_queue", "flavor", "resource"}, 168 ) 169 170 ClusterQueueResourceLendingLimit = prometheus.NewGaugeVec( 171 prometheus.GaugeOpts{ 172 Subsystem: constants.KueueName, 173 Name: "cluster_queue_lending_limit", 174 Help: `Reports the cluster_queue's resource lending limit within all the flavors`, 175 }, []string{"cohort", "cluster_queue", "flavor", "resource"}, 176 ) 177 ) 178 179 func AdmissionAttempt(result AdmissionResult, duration time.Duration) { 180 AdmissionAttemptsTotal.WithLabelValues(string(result)).Inc() 181 admissionAttemptDuration.WithLabelValues(string(result)).Observe(duration.Seconds()) 182 } 183 184 func AdmittedWorkload(cqName kueue.ClusterQueueReference, waitTime time.Duration) { 185 AdmittedWorkloadsTotal.WithLabelValues(string(cqName)).Inc() 186 admissionWaitTime.WithLabelValues(string(cqName)).Observe(waitTime.Seconds()) 187 } 188 189 func ReportPendingWorkloads(cqName string, active, inadmissible int) { 190 PendingWorkloads.WithLabelValues(cqName, PendingStatusActive).Set(float64(active)) 191 PendingWorkloads.WithLabelValues(cqName, PendingStatusInadmissible).Set(float64(inadmissible)) 192 } 193 194 func ClearQueueSystemMetrics(cqName string) { 195 PendingWorkloads.DeleteLabelValues(cqName, PendingStatusActive) 196 PendingWorkloads.DeleteLabelValues(cqName, PendingStatusInadmissible) 197 AdmittedWorkloadsTotal.DeleteLabelValues(cqName) 198 admissionWaitTime.DeleteLabelValues(cqName) 199 } 200 201 func ReportClusterQueueStatus(cqName string, cqStatus ClusterQueueStatus) { 202 for _, status := range CQStatuses { 203 var v float64 204 if status == cqStatus { 205 v = 1 206 } 207 ClusterQueueByStatus.WithLabelValues(cqName, string(status)).Set(v) 208 } 209 } 210 211 func ClearCacheMetrics(cqName string) { 212 ReservingActiveWorkloads.DeleteLabelValues(cqName) 213 AdmittedActiveWorkloads.DeleteLabelValues(cqName) 214 for _, status := range CQStatuses { 215 ClusterQueueByStatus.DeleteLabelValues(cqName, string(status)) 216 } 217 } 218 219 func ReportClusterQueueQuotas(cohort, queue, flavor, resource string, nominal, borrowing, lending float64) { 220 ClusterQueueResourceNominalQuota.WithLabelValues(cohort, queue, flavor, resource).Set(nominal) 221 ClusterQueueResourceBorrowingLimit.WithLabelValues(cohort, queue, flavor, resource).Set(borrowing) 222 if features.Enabled(features.LendingLimit) { 223 ClusterQueueResourceLendingLimit.WithLabelValues(cohort, queue, flavor, resource).Set(lending) 224 } 225 } 226 227 func ReportClusterQueueResourceReservations(cohort, queue, flavor, resource string, usage float64) { 228 ClusterQueueResourceReservations.WithLabelValues(cohort, queue, flavor, resource).Set(usage) 229 } 230 231 func ReportClusterQueueResourceUsage(cohort, queue, flavor, resource string, usage float64) { 232 ClusterQueueResourceUsage.WithLabelValues(cohort, queue, flavor, resource).Set(usage) 233 } 234 235 func ClearClusterQueueResourceMetrics(cqName string) { 236 lbls := prometheus.Labels{ 237 "cluster_queue": cqName, 238 } 239 ClusterQueueResourceNominalQuota.DeletePartialMatch(lbls) 240 ClusterQueueResourceBorrowingLimit.DeletePartialMatch(lbls) 241 if features.Enabled(features.LendingLimit) { 242 ClusterQueueResourceLendingLimit.DeletePartialMatch(lbls) 243 } 244 ClusterQueueResourceUsage.DeletePartialMatch(lbls) 245 ClusterQueueResourceReservations.DeletePartialMatch(lbls) 246 } 247 248 func ClearClusterQueueResourceQuotas(cqName, flavor, resource string) { 249 lbls := prometheus.Labels{ 250 "cluster_queue": cqName, 251 "flavor": flavor, 252 } 253 254 if len(resource) != 0 { 255 lbls["resource"] = resource 256 } 257 258 ClusterQueueResourceNominalQuota.DeletePartialMatch(lbls) 259 ClusterQueueResourceBorrowingLimit.DeletePartialMatch(lbls) 260 if features.Enabled(features.LendingLimit) { 261 ClusterQueueResourceLendingLimit.DeletePartialMatch(lbls) 262 } 263 } 264 265 func ClearClusterQueueResourceUsage(cqName, flavor, resource string) { 266 lbls := prometheus.Labels{ 267 "cluster_queue": cqName, 268 "flavor": flavor, 269 } 270 271 if len(resource) != 0 { 272 lbls["resource"] = resource 273 } 274 275 ClusterQueueResourceUsage.DeletePartialMatch(lbls) 276 } 277 278 func ClearClusterQueueResourceReservations(cqName, flavor, resource string) { 279 lbls := prometheus.Labels{ 280 "cluster_queue": cqName, 281 "flavor": flavor, 282 } 283 284 if len(resource) != 0 { 285 lbls["resource"] = resource 286 } 287 288 ClusterQueueResourceReservations.DeletePartialMatch(lbls) 289 } 290 291 func Register() { 292 metrics.Registry.MustRegister( 293 AdmissionAttemptsTotal, 294 admissionAttemptDuration, 295 PendingWorkloads, 296 ReservingActiveWorkloads, 297 AdmittedActiveWorkloads, 298 AdmittedWorkloadsTotal, 299 admissionWaitTime, 300 ClusterQueueResourceUsage, 301 ClusterQueueResourceReservations, 302 ClusterQueueResourceNominalQuota, 303 ClusterQueueResourceBorrowingLimit, 304 ClusterQueueResourceLendingLimit, 305 ) 306 }