k8s.io/kubernetes@v1.29.3/pkg/scheduler/metrics/metrics.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"sync"
    21  	"time"
    22  
    23  	"k8s.io/component-base/metrics"
    24  	"k8s.io/component-base/metrics/legacyregistry"
    25  	volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
    26  )
    27  
const (
	// SchedulerSubsystem is the subsystem name used by the scheduler. It is
	// set as the Subsystem of every metric declared in this file.
	SchedulerSubsystem = "scheduler"
)
    32  
// Below are possible values for the work and operation labels
// (for example, the "operation" label of the Goroutines metric).
const (
	// PrioritizingExtender is the work/operation label value for
	// prioritizing extender work.
	PrioritizingExtender = "prioritizing_extender"
	// Binding is the work/operation label value for binding work.
	Binding = "binding"
)
    40  
// Below are possible values for the extension_point label, which is carried
// by FrameworkExtensionPointDuration, PluginExecutionDuration and
// PluginEvaluationTotal. Each constant's value is identical to its name.
const (
	PreFilter                   = "PreFilter"
	Filter                      = "Filter"
	PreFilterExtensionAddPod    = "PreFilterExtensionAddPod"
	PreFilterExtensionRemovePod = "PreFilterExtensionRemovePod"
	PostFilter                  = "PostFilter"
	PreScore                    = "PreScore"
	Score                       = "Score"
	ScoreExtensionNormalize     = "ScoreExtensionNormalize"
	PreBind                     = "PreBind"
	Bind                        = "Bind"
	PostBind                    = "PostBind"
	Reserve                     = "Reserve"
	Unreserve                   = "Unreserve"
	Permit                      = "Permit"
)
    58  
    59  // All the histogram based metrics have 1ms as size for the smallest bucket.
    60  var (
    61  	scheduleAttempts = metrics.NewCounterVec(
    62  		&metrics.CounterOpts{
    63  			Subsystem:      SchedulerSubsystem,
    64  			Name:           "schedule_attempts_total",
    65  			Help:           "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
    66  			StabilityLevel: metrics.STABLE,
    67  		}, []string{"result", "profile"})
    68  
    69  	schedulingLatency = metrics.NewHistogramVec(
    70  		&metrics.HistogramOpts{
    71  			Subsystem:      SchedulerSubsystem,
    72  			Name:           "scheduling_attempt_duration_seconds",
    73  			Help:           "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
    74  			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
    75  			StabilityLevel: metrics.STABLE,
    76  		}, []string{"result", "profile"})
    77  	SchedulingAlgorithmLatency = metrics.NewHistogram(
    78  		&metrics.HistogramOpts{
    79  			Subsystem:      SchedulerSubsystem,
    80  			Name:           "scheduling_algorithm_duration_seconds",
    81  			Help:           "Scheduling algorithm latency in seconds",
    82  			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
    83  			StabilityLevel: metrics.ALPHA,
    84  		},
    85  	)
    86  	PreemptionVictims = metrics.NewHistogram(
    87  		&metrics.HistogramOpts{
    88  			Subsystem: SchedulerSubsystem,
    89  			Name:      "preemption_victims",
    90  			Help:      "Number of selected preemption victims",
    91  			// we think #victims>64 is pretty rare, therefore [64, +Inf) is considered a single bucket.
    92  			Buckets:        metrics.ExponentialBuckets(1, 2, 7),
    93  			StabilityLevel: metrics.STABLE,
    94  		})
    95  	PreemptionAttempts = metrics.NewCounter(
    96  		&metrics.CounterOpts{
    97  			Subsystem:      SchedulerSubsystem,
    98  			Name:           "preemption_attempts_total",
    99  			Help:           "Total preemption attempts in the cluster till now",
   100  			StabilityLevel: metrics.STABLE,
   101  		})
   102  	pendingPods = metrics.NewGaugeVec(
   103  		&metrics.GaugeOpts{
   104  			Subsystem:      SchedulerSubsystem,
   105  			Name:           "pending_pods",
   106  			Help:           "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods that the scheduler attempted to schedule and failed; 'gated' is the number of unschedulable pods that the scheduler never attempted to schedule because they are gated.",
   107  			StabilityLevel: metrics.STABLE,
   108  		}, []string{"queue"})
   109  	Goroutines = metrics.NewGaugeVec(
   110  		&metrics.GaugeOpts{
   111  			Subsystem:      SchedulerSubsystem,
   112  			Name:           "goroutines",
   113  			Help:           "Number of running goroutines split by the work they do such as binding.",
   114  			StabilityLevel: metrics.ALPHA,
   115  		}, []string{"operation"})
   116  
   117  	// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
   118  	// in v1.31. Please use PodSchedulingSLIDuration instead.
   119  	PodSchedulingDuration = metrics.NewHistogramVec(
   120  		&metrics.HistogramOpts{
   121  			Subsystem: SchedulerSubsystem,
   122  			Name:      "pod_scheduling_duration_seconds",
   123  			Help:      "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
   124  			// Start with 10ms with the last bucket being [~88m, Inf).
   125  			Buckets:           metrics.ExponentialBuckets(0.01, 2, 20),
   126  			StabilityLevel:    metrics.STABLE,
   127  			DeprecatedVersion: "1.29.0",
   128  		},
   129  		[]string{"attempts"})
   130  
   131  	PodSchedulingSLIDuration = metrics.NewHistogramVec(
   132  		&metrics.HistogramOpts{
   133  			Subsystem: SchedulerSubsystem,
   134  			Name:      "pod_scheduling_sli_duration_seconds",
   135  			Help:      "E2e latency for a pod being scheduled, from the time the pod enters the scheduling queue an d might involve multiple scheduling attempts.",
   136  			// Start with 10ms with the last bucket being [~88m, Inf).
   137  			Buckets:        metrics.ExponentialBuckets(0.01, 2, 20),
   138  			StabilityLevel: metrics.BETA,
   139  		},
   140  		[]string{"attempts"})
   141  
   142  	PodSchedulingAttempts = metrics.NewHistogram(
   143  		&metrics.HistogramOpts{
   144  			Subsystem:      SchedulerSubsystem,
   145  			Name:           "pod_scheduling_attempts",
   146  			Help:           "Number of attempts to successfully schedule a pod.",
   147  			Buckets:        metrics.ExponentialBuckets(1, 2, 5),
   148  			StabilityLevel: metrics.STABLE,
   149  		})
   150  
   151  	FrameworkExtensionPointDuration = metrics.NewHistogramVec(
   152  		&metrics.HistogramOpts{
   153  			Subsystem: SchedulerSubsystem,
   154  			Name:      "framework_extension_point_duration_seconds",
   155  			Help:      "Latency for running all plugins of a specific extension point.",
   156  			// Start with 0.1ms with the last bucket being [~200ms, Inf)
   157  			Buckets:        metrics.ExponentialBuckets(0.0001, 2, 12),
   158  			StabilityLevel: metrics.STABLE,
   159  		},
   160  		[]string{"extension_point", "status", "profile"})
   161  
   162  	PluginExecutionDuration = metrics.NewHistogramVec(
   163  		&metrics.HistogramOpts{
   164  			Subsystem: SchedulerSubsystem,
   165  			Name:      "plugin_execution_duration_seconds",
   166  			Help:      "Duration for running a plugin at a specific extension point.",
   167  			// Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5)
   168  			// so that we have better granularity since plugin latency is very sensitive.
   169  			Buckets:        metrics.ExponentialBuckets(0.00001, 1.5, 20),
   170  			StabilityLevel: metrics.ALPHA,
   171  		},
   172  		[]string{"plugin", "extension_point", "status"})
   173  
   174  	SchedulerQueueIncomingPods = metrics.NewCounterVec(
   175  		&metrics.CounterOpts{
   176  			Subsystem:      SchedulerSubsystem,
   177  			Name:           "queue_incoming_pods_total",
   178  			Help:           "Number of pods added to scheduling queues by event and queue type.",
   179  			StabilityLevel: metrics.STABLE,
   180  		}, []string{"queue", "event"})
   181  
   182  	PermitWaitDuration = metrics.NewHistogramVec(
   183  		&metrics.HistogramOpts{
   184  			Subsystem:      SchedulerSubsystem,
   185  			Name:           "permit_wait_duration_seconds",
   186  			Help:           "Duration of waiting on permit.",
   187  			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
   188  			StabilityLevel: metrics.ALPHA,
   189  		},
   190  		[]string{"result"})
   191  
   192  	CacheSize = metrics.NewGaugeVec(
   193  		&metrics.GaugeOpts{
   194  			Subsystem:      SchedulerSubsystem,
   195  			Name:           "scheduler_cache_size",
   196  			Help:           "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
   197  			StabilityLevel: metrics.ALPHA,
   198  		}, []string{"type"})
   199  
   200  	unschedulableReasons = metrics.NewGaugeVec(
   201  		&metrics.GaugeOpts{
   202  			Subsystem:      SchedulerSubsystem,
   203  			Name:           "unschedulable_pods",
   204  			Help:           "The number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.",
   205  			StabilityLevel: metrics.ALPHA,
   206  		}, []string{"plugin", "profile"})
   207  
   208  	PluginEvaluationTotal = metrics.NewCounterVec(
   209  		&metrics.CounterOpts{
   210  			Subsystem:      SchedulerSubsystem,
   211  			Name:           "plugin_evaluation_total",
   212  			Help:           "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter and Filter.).",
   213  			StabilityLevel: metrics.ALPHA,
   214  		}, []string{"plugin", "extension_point", "profile"})
   215  
   216  	metricsList = []metrics.Registerable{
   217  		scheduleAttempts,
   218  		schedulingLatency,
   219  		SchedulingAlgorithmLatency,
   220  		PreemptionVictims,
   221  		PreemptionAttempts,
   222  		pendingPods,
   223  		PodSchedulingDuration,
   224  		PodSchedulingSLIDuration,
   225  		PodSchedulingAttempts,
   226  		FrameworkExtensionPointDuration,
   227  		PluginExecutionDuration,
   228  		SchedulerQueueIncomingPods,
   229  		Goroutines,
   230  		PermitWaitDuration,
   231  		CacheSize,
   232  		unschedulableReasons,
   233  		PluginEvaluationTotal,
   234  	}
   235  )
   236  
// registerMetrics ensures the registration performed by Register runs at most once.
var registerMetrics sync.Once
   238  
   239  // Register all metrics.
   240  func Register() {
   241  	// Register the metrics.
   242  	registerMetrics.Do(func() {
   243  		RegisterMetrics(metricsList...)
   244  		volumebindingmetrics.RegisterVolumeSchedulingMetrics()
   245  	})
   246  }
   247  
   248  // RegisterMetrics registers a list of metrics.
   249  // This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics.
   250  func RegisterMetrics(extraMetrics ...metrics.Registerable) {
   251  	for _, metric := range extraMetrics {
   252  		legacyregistry.MustRegister(metric)
   253  	}
   254  }
   255  
   256  // GetGather returns the gatherer. It used by test case outside current package.
   257  func GetGather() metrics.Gatherer {
   258  	return legacyregistry.DefaultGatherer
   259  }
   260  
   261  // ActivePods returns the pending pods metrics with the label active
   262  func ActivePods() metrics.GaugeMetric {
   263  	return pendingPods.With(metrics.Labels{"queue": "active"})
   264  }
   265  
   266  // BackoffPods returns the pending pods metrics with the label backoff
   267  func BackoffPods() metrics.GaugeMetric {
   268  	return pendingPods.With(metrics.Labels{"queue": "backoff"})
   269  }
   270  
   271  // UnschedulablePods returns the pending pods metrics with the label unschedulable
   272  func UnschedulablePods() metrics.GaugeMetric {
   273  	return pendingPods.With(metrics.Labels{"queue": "unschedulable"})
   274  }
   275  
   276  // GatedPods returns the pending pods metrics with the label gated
   277  func GatedPods() metrics.GaugeMetric {
   278  	return pendingPods.With(metrics.Labels{"queue": "gated"})
   279  }
   280  
   281  // SinceInSeconds gets the time since the specified start in seconds.
   282  func SinceInSeconds(start time.Time) float64 {
   283  	return time.Since(start).Seconds()
   284  }
   285  
   286  func UnschedulableReason(plugin string, profile string) metrics.GaugeMetric {
   287  	return unschedulableReasons.With(metrics.Labels{"plugin": plugin, "profile": profile})
   288  }