sigs.k8s.io/kueue@v0.6.2/pkg/metrics/metrics.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"time"
    21  
    22  	"github.com/prometheus/client_golang/prometheus"
    23  	"sigs.k8s.io/controller-runtime/pkg/metrics"
    24  
    25  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    26  	"sigs.k8s.io/kueue/pkg/constants"
    27  	"sigs.k8s.io/kueue/pkg/features"
    28  )
    29  
    30  type AdmissionResult string
    31  type ClusterQueueStatus string
    32  
    33  const (
    34  	AdmissionResultSuccess      AdmissionResult = "success"
    35  	AdmissionResultInadmissible AdmissionResult = "inadmissible"
    36  
    37  	PendingStatusActive       = "active"
    38  	PendingStatusInadmissible = "inadmissible"
    39  
    40  	// CQStatusPending means the ClusterQueue is accepted but not yet active,
    41  	// this can be because of:
    42  	// - a missing ResourceFlavor referenced by the ClusterQueue
    43  	// - a missing or inactive AdmissionCheck referenced by the ClusterQueue
    44  	// - the ClusterQueue is stopped
    45  	// In this state, the ClusterQueue can't admit new workloads and its quota can't be borrowed
    46  	// by other active ClusterQueues in the cohort.
    47  	CQStatusPending ClusterQueueStatus = "pending"
    48  	// CQStatusActive means the ClusterQueue can admit new workloads and its quota
    49  	// can be borrowed by other ClusterQueues in the cohort.
    50  	CQStatusActive ClusterQueueStatus = "active"
    51  	// CQStatusTerminating means the clusterQueue is in pending deletion.
    52  	CQStatusTerminating ClusterQueueStatus = "terminating"
    53  )
    54  
    55  var (
    56  	CQStatuses = []ClusterQueueStatus{CQStatusPending, CQStatusActive, CQStatusTerminating}
    57  
    58  	AdmissionAttemptsTotal = prometheus.NewCounterVec(
    59  		prometheus.CounterOpts{
    60  			Subsystem: constants.KueueName,
    61  			Name:      "admission_attempts_total",
    62  			Help: `The total number of attempts to admit workloads.
    63  Each admission attempt might try to admit more than one workload.
    64  The label 'result' can have the following values:
    65  - 'success' means that at least one workload was admitted.,
    66  - 'inadmissible' means that no workload was admitted.`,
    67  		}, []string{"result"},
    68  	)
    69  
    70  	admissionAttemptDuration = prometheus.NewHistogramVec(
    71  		prometheus.HistogramOpts{
    72  			Subsystem: constants.KueueName,
    73  			Name:      "admission_attempt_duration_seconds",
    74  			Help: `The latency of an admission attempt.
    75  The label 'result' can have the following values:
    76  - 'success' means that at least one workload was admitted.,
    77  - 'inadmissible' means that no workload was admitted.`,
    78  		}, []string{"result"},
    79  	)
    80  
    81  	// Metrics tied to the queue system.
    82  
    83  	PendingWorkloads = prometheus.NewGaugeVec(
    84  		prometheus.GaugeOpts{
    85  			Subsystem: constants.KueueName,
    86  			Name:      "pending_workloads",
    87  			Help: `The number of pending workloads, per 'cluster_queue' and 'status'.
    88  'status' can have the following values:
    89  - "active" means that the workloads are in the admission queue.
    90  - "inadmissible" means there was a failed admission attempt for these workloads and they won't be retried until cluster conditions, which could make this workload admissible, change`,
    91  		}, []string{"cluster_queue", "status"},
    92  	)
    93  
    94  	AdmittedWorkloadsTotal = prometheus.NewCounterVec(
    95  		prometheus.CounterOpts{
    96  			Subsystem: constants.KueueName,
    97  			Name:      "admitted_workloads_total",
    98  			Help:      "The total number of admitted workloads per 'cluster_queue'",
    99  		}, []string{"cluster_queue"},
   100  	)
   101  
   102  	admissionWaitTime = prometheus.NewHistogramVec(
   103  		prometheus.HistogramOpts{
   104  			Subsystem: constants.KueueName,
   105  			Name:      "admission_wait_time_seconds",
   106  			Help:      "The time between a Workload was created until it was admitted, per 'cluster_queue'",
   107  		}, []string{"cluster_queue"},
   108  	)
   109  
   110  	// Metrics tied to the cache.
   111  
   112  	ReservingActiveWorkloads = prometheus.NewGaugeVec(
   113  		prometheus.GaugeOpts{
   114  			Subsystem: constants.KueueName,
   115  			Name:      "reserving_active_workloads",
   116  			Help:      "The number of Workloads that are reserving quota, per 'cluster_queue'",
   117  		}, []string{"cluster_queue"},
   118  	)
   119  
   120  	AdmittedActiveWorkloads = prometheus.NewGaugeVec(
   121  		prometheus.GaugeOpts{
   122  			Subsystem: constants.KueueName,
   123  			Name:      "admitted_active_workloads",
   124  			Help:      "The number of admitted Workloads that are active (unsuspended and not finished), per 'cluster_queue'",
   125  		}, []string{"cluster_queue"},
   126  	)
   127  
   128  	ClusterQueueByStatus = prometheus.NewGaugeVec(
   129  		prometheus.GaugeOpts{
   130  			Subsystem: constants.KueueName,
   131  			Name:      "cluster_queue_status",
   132  			Help: `Reports 'cluster_queue' with its 'status' (with possible values 'pending', 'active' or 'terminated').
   133  For a ClusterQueue, the metric only reports a value of 1 for one of the statuses.`,
   134  		}, []string{"cluster_queue", "status"},
   135  	)
   136  
   137  	// Optional cluster queue metrics
   138  	ClusterQueueResourceReservations = prometheus.NewGaugeVec(
   139  		prometheus.GaugeOpts{
   140  			Subsystem: constants.KueueName,
   141  			Name:      "cluster_queue_resource_reservation",
   142  			Help:      `Reports the cluster_queue's total resource reservation within all the flavors`,
   143  		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
   144  	)
   145  
   146  	ClusterQueueResourceUsage = prometheus.NewGaugeVec(
   147  		prometheus.GaugeOpts{
   148  			Subsystem: constants.KueueName,
   149  			Name:      "cluster_queue_resource_usage",
   150  			Help:      `Reports the cluster_queue's total resource usage within all the flavors`,
   151  		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
   152  	)
   153  
   154  	ClusterQueueResourceNominalQuota = prometheus.NewGaugeVec(
   155  		prometheus.GaugeOpts{
   156  			Subsystem: constants.KueueName,
   157  			Name:      "cluster_queue_nominal_quota",
   158  			Help:      `Reports the cluster_queue's resource nominal quota within all the flavors`,
   159  		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
   160  	)
   161  
   162  	ClusterQueueResourceBorrowingLimit = prometheus.NewGaugeVec(
   163  		prometheus.GaugeOpts{
   164  			Subsystem: constants.KueueName,
   165  			Name:      "cluster_queue_borrowing_limit",
   166  			Help:      `Reports the cluster_queue's resource borrowing limit within all the flavors`,
   167  		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
   168  	)
   169  
   170  	ClusterQueueResourceLendingLimit = prometheus.NewGaugeVec(
   171  		prometheus.GaugeOpts{
   172  			Subsystem: constants.KueueName,
   173  			Name:      "cluster_queue_lending_limit",
   174  			Help:      `Reports the cluster_queue's resource lending limit within all the flavors`,
   175  		}, []string{"cohort", "cluster_queue", "flavor", "resource"},
   176  	)
   177  )
   178  
   179  func AdmissionAttempt(result AdmissionResult, duration time.Duration) {
   180  	AdmissionAttemptsTotal.WithLabelValues(string(result)).Inc()
   181  	admissionAttemptDuration.WithLabelValues(string(result)).Observe(duration.Seconds())
   182  }
   183  
   184  func AdmittedWorkload(cqName kueue.ClusterQueueReference, waitTime time.Duration) {
   185  	AdmittedWorkloadsTotal.WithLabelValues(string(cqName)).Inc()
   186  	admissionWaitTime.WithLabelValues(string(cqName)).Observe(waitTime.Seconds())
   187  }
   188  
   189  func ReportPendingWorkloads(cqName string, active, inadmissible int) {
   190  	PendingWorkloads.WithLabelValues(cqName, PendingStatusActive).Set(float64(active))
   191  	PendingWorkloads.WithLabelValues(cqName, PendingStatusInadmissible).Set(float64(inadmissible))
   192  }
   193  
   194  func ClearQueueSystemMetrics(cqName string) {
   195  	PendingWorkloads.DeleteLabelValues(cqName, PendingStatusActive)
   196  	PendingWorkloads.DeleteLabelValues(cqName, PendingStatusInadmissible)
   197  	AdmittedWorkloadsTotal.DeleteLabelValues(cqName)
   198  	admissionWaitTime.DeleteLabelValues(cqName)
   199  }
   200  
   201  func ReportClusterQueueStatus(cqName string, cqStatus ClusterQueueStatus) {
   202  	for _, status := range CQStatuses {
   203  		var v float64
   204  		if status == cqStatus {
   205  			v = 1
   206  		}
   207  		ClusterQueueByStatus.WithLabelValues(cqName, string(status)).Set(v)
   208  	}
   209  }
   210  
   211  func ClearCacheMetrics(cqName string) {
   212  	ReservingActiveWorkloads.DeleteLabelValues(cqName)
   213  	AdmittedActiveWorkloads.DeleteLabelValues(cqName)
   214  	for _, status := range CQStatuses {
   215  		ClusterQueueByStatus.DeleteLabelValues(cqName, string(status))
   216  	}
   217  }
   218  
   219  func ReportClusterQueueQuotas(cohort, queue, flavor, resource string, nominal, borrowing, lending float64) {
   220  	ClusterQueueResourceNominalQuota.WithLabelValues(cohort, queue, flavor, resource).Set(nominal)
   221  	ClusterQueueResourceBorrowingLimit.WithLabelValues(cohort, queue, flavor, resource).Set(borrowing)
   222  	if features.Enabled(features.LendingLimit) {
   223  		ClusterQueueResourceLendingLimit.WithLabelValues(cohort, queue, flavor, resource).Set(lending)
   224  	}
   225  }
   226  
   227  func ReportClusterQueueResourceReservations(cohort, queue, flavor, resource string, usage float64) {
   228  	ClusterQueueResourceReservations.WithLabelValues(cohort, queue, flavor, resource).Set(usage)
   229  }
   230  
   231  func ReportClusterQueueResourceUsage(cohort, queue, flavor, resource string, usage float64) {
   232  	ClusterQueueResourceUsage.WithLabelValues(cohort, queue, flavor, resource).Set(usage)
   233  }
   234  
   235  func ClearClusterQueueResourceMetrics(cqName string) {
   236  	lbls := prometheus.Labels{
   237  		"cluster_queue": cqName,
   238  	}
   239  	ClusterQueueResourceNominalQuota.DeletePartialMatch(lbls)
   240  	ClusterQueueResourceBorrowingLimit.DeletePartialMatch(lbls)
   241  	if features.Enabled(features.LendingLimit) {
   242  		ClusterQueueResourceLendingLimit.DeletePartialMatch(lbls)
   243  	}
   244  	ClusterQueueResourceUsage.DeletePartialMatch(lbls)
   245  	ClusterQueueResourceReservations.DeletePartialMatch(lbls)
   246  }
   247  
   248  func ClearClusterQueueResourceQuotas(cqName, flavor, resource string) {
   249  	lbls := prometheus.Labels{
   250  		"cluster_queue": cqName,
   251  		"flavor":        flavor,
   252  	}
   253  
   254  	if len(resource) != 0 {
   255  		lbls["resource"] = resource
   256  	}
   257  
   258  	ClusterQueueResourceNominalQuota.DeletePartialMatch(lbls)
   259  	ClusterQueueResourceBorrowingLimit.DeletePartialMatch(lbls)
   260  	if features.Enabled(features.LendingLimit) {
   261  		ClusterQueueResourceLendingLimit.DeletePartialMatch(lbls)
   262  	}
   263  }
   264  
   265  func ClearClusterQueueResourceUsage(cqName, flavor, resource string) {
   266  	lbls := prometheus.Labels{
   267  		"cluster_queue": cqName,
   268  		"flavor":        flavor,
   269  	}
   270  
   271  	if len(resource) != 0 {
   272  		lbls["resource"] = resource
   273  	}
   274  
   275  	ClusterQueueResourceUsage.DeletePartialMatch(lbls)
   276  }
   277  
   278  func ClearClusterQueueResourceReservations(cqName, flavor, resource string) {
   279  	lbls := prometheus.Labels{
   280  		"cluster_queue": cqName,
   281  		"flavor":        flavor,
   282  	}
   283  
   284  	if len(resource) != 0 {
   285  		lbls["resource"] = resource
   286  	}
   287  
   288  	ClusterQueueResourceReservations.DeletePartialMatch(lbls)
   289  }
   290  
   291  func Register() {
   292  	metrics.Registry.MustRegister(
   293  		AdmissionAttemptsTotal,
   294  		admissionAttemptDuration,
   295  		PendingWorkloads,
   296  		ReservingActiveWorkloads,
   297  		AdmittedActiveWorkloads,
   298  		AdmittedWorkloadsTotal,
   299  		admissionWaitTime,
   300  		ClusterQueueResourceUsage,
   301  		ClusterQueueResourceReservations,
   302  		ClusterQueueResourceNominalQuota,
   303  		ClusterQueueResourceBorrowingLimit,
   304  		ClusterQueueResourceLendingLimit,
   305  	)
   306  }