k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/metrics/metrics.go

k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/metrics/metrics.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"sync"
    21  	"time"
    22  
    23  	"k8s.io/component-base/metrics"
    24  	"k8s.io/component-base/metrics/legacyregistry"
    25  	volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
    26  )
    27  
const (
	// SchedulerSubsystem is the subsystem name used by the scheduler; every
	// metric defined in this file sets it as its Subsystem.
	SchedulerSubsystem = "scheduler"
)
    32  
// Below are the possible values for the work and operation labels.
const (
	// PrioritizingExtender is the work/operation label value for prioritizing extender work.
	PrioritizingExtender = "prioritizing_extender"
	// Binding is the work/operation label value for binding work.
	Binding = "binding"
)
    40  
// ExtentionPoints is a list of possible values for the extension_point label.
// Each entry is one of the extension point name constants declared in this file.
// NOTE(review): "Extention" is a misspelling of "Extension", but it is part of
// the exported identifier, so renaming it would break callers.
var ExtentionPoints = []string{
	PreFilter,
	Filter,
	PreFilterExtensionAddPod,
	PreFilterExtensionRemovePod,
	PostFilter,
	PreScore,
	Score,
	ScoreExtensionNormalize,
	PreBind,
	Bind,
	PostBind,
	Reserve,
	Unreserve,
	Permit,
}
    58  
// Extension point names. These are the strings collected in ExtentionPoints
// and used as values for the extension_point label.
const (
	PreFilter                   = "PreFilter"
	Filter                      = "Filter"
	PreFilterExtensionAddPod    = "PreFilterExtensionAddPod"
	PreFilterExtensionRemovePod = "PreFilterExtensionRemovePod"
	PostFilter                  = "PostFilter"
	PreScore                    = "PreScore"
	Score                       = "Score"
	ScoreExtensionNormalize     = "ScoreExtensionNormalize"
	PreBind                     = "PreBind"
	Bind                        = "Bind"
	PostBind                    = "PostBind"
	Reserve                     = "Reserve"
	Unreserve                   = "Unreserve"
	Permit                      = "Permit"
)
    75  
// Duration histograms below use 1ms as the smallest bucket unless the metric's
// own Buckets comment states a different starting point. The count histograms
// (preemption_victims and pod_scheduling_attempts) start at 1 instead.
var (
	// scheduleAttempts counts scheduling attempts, labeled by result and profile.
	scheduleAttempts = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "schedule_attempts_total",
			Help:           "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
			StabilityLevel: metrics.STABLE,
		}, []string{"result", "profile"})

	// schedulingLatency records the latency of one scheduling attempt
	// (scheduling algorithm + binding), labeled by result and profile.
	schedulingLatency = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "scheduling_attempt_duration_seconds",
			Help:           "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.STABLE,
		}, []string{"result", "profile"})
	// SchedulingAlgorithmLatency records the latency of the scheduling algorithm in seconds.
	SchedulingAlgorithmLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "scheduling_algorithm_duration_seconds",
			Help:           "Scheduling algorithm latency in seconds",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)
	// PreemptionVictims records the number of selected preemption victims.
	PreemptionVictims = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem: SchedulerSubsystem,
			Name:      "preemption_victims",
			Help:      "Number of selected preemption victims",
			// We think #victims > 64 is pretty rare, therefore [64, +Inf) is considered a single bucket.
			Buckets:        metrics.ExponentialBuckets(1, 2, 7),
			StabilityLevel: metrics.STABLE,
		})
	// PreemptionAttempts counts the total preemption attempts in the cluster.
	PreemptionAttempts = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "preemption_attempts_total",
			Help:           "Total preemption attempts in the cluster till now",
			StabilityLevel: metrics.STABLE,
		})
	// pendingPods tracks the number of pending pods per queue type. The
	// ActivePods, BackoffPods, UnschedulablePods and GatedPods accessors in
	// this file each return it with one fixed "queue" label value.
	pendingPods = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "pending_pods",
			Help:           "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods that the scheduler attempted to schedule and failed; 'gated' is the number of unschedulable pods that the scheduler never attempted to schedule because they are gated.",
			StabilityLevel: metrics.STABLE,
		}, []string{"queue"})
	// Goroutines tracks the number of running goroutines, labeled by the
	// operation they perform (for example Binding).
	Goroutines = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "goroutines",
			Help:           "Number of running goroutines split by the work they do such as binding.",
			StabilityLevel: metrics.ALPHA,
		}, []string{"operation"})

	// PodSchedulingDuration is deprecated; please use PodSchedulingSLIDuration instead.
	// NOTE(review): this comment used to say "deprecated as of Kubernetes v1.28,
	// and will be removed in v1.31", which disagrees with DeprecatedVersion
	// ("1.29.0") below — confirm the intended deprecation and removal releases.
	PodSchedulingDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem: SchedulerSubsystem,
			Name:      "pod_scheduling_duration_seconds",
			Help:      "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
			// Start with 10ms with the last bucket being [~88m, Inf).
			Buckets:           metrics.ExponentialBuckets(0.01, 2, 20),
			StabilityLevel:    metrics.STABLE,
			DeprecatedVersion: "1.29.0",
		},
		[]string{"attempts"})

	// PodSchedulingSLIDuration records the end-to-end latency for a pod being
	// scheduled, measured from the time the pod enters the scheduling queue,
	// labeled by the number of attempts.
	PodSchedulingSLIDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem: SchedulerSubsystem,
			Name:      "pod_scheduling_sli_duration_seconds",
			Help:      "E2e latency for a pod being scheduled, from the time the pod enters the scheduling queue and might involve multiple scheduling attempts.",
			// Start with 10ms with the last bucket being [~88m, Inf).
			Buckets:        metrics.ExponentialBuckets(0.01, 2, 20),
			StabilityLevel: metrics.BETA,
		},
		[]string{"attempts"})

	// PodSchedulingAttempts records the number of attempts needed to
	// successfully schedule a pod.
	PodSchedulingAttempts = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "pod_scheduling_attempts",
			Help:           "Number of attempts to successfully schedule a pod.",
			Buckets:        metrics.ExponentialBuckets(1, 2, 5),
			StabilityLevel: metrics.STABLE,
		})

	// FrameworkExtensionPointDuration records the latency for running all
	// plugins of one extension point, labeled by extension point, status and profile.
	FrameworkExtensionPointDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem: SchedulerSubsystem,
			Name:      "framework_extension_point_duration_seconds",
			Help:      "Latency for running all plugins of a specific extension point.",
			// Start with 0.1ms with the last bucket being [~200ms, Inf)
			Buckets:        metrics.ExponentialBuckets(0.0001, 2, 12),
			StabilityLevel: metrics.STABLE,
		},
		[]string{"extension_point", "status", "profile"})

	// PluginExecutionDuration records the duration for running a single plugin
	// at a specific extension point, labeled by plugin, extension point and status.
	PluginExecutionDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem: SchedulerSubsystem,
			Name:      "plugin_execution_duration_seconds",
			Help:      "Duration for running a plugin at a specific extension point.",
			// Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5)
			// so that we have better granularity since plugin latency is very sensitive.
			Buckets:        metrics.ExponentialBuckets(0.00001, 1.5, 20),
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"plugin", "extension_point", "status"})

	// SchedulerQueueIncomingPods counts pods added to the scheduling queues,
	// labeled by queue type and event.
	SchedulerQueueIncomingPods = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "queue_incoming_pods_total",
			Help:           "Number of pods added to scheduling queues by event and queue type.",
			StabilityLevel: metrics.STABLE,
		}, []string{"queue", "event"})

	// PermitWaitDuration records the duration of waiting on permit, labeled by result.
	PermitWaitDuration = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "permit_wait_duration_seconds",
			Help:           "Duration of waiting on permit.",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"result"})

	// CacheSize tracks the number of nodes, pods, and assumed (bound) pods in
	// the scheduler cache, labeled by type.
	// NOTE(review): Name already starts with "scheduler_" while Subsystem is also
	// "scheduler", so the exposed name presumably carries the prefix twice —
	// confirm before relying on the exact metric name.
	CacheSize = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "scheduler_cache_size",
			Help:           "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
			StabilityLevel: metrics.ALPHA,
		}, []string{"type"})

	// unschedulableReasons tracks the number of unschedulable pods per plugin
	// and profile; it is exposed through UnschedulableReason.
	unschedulableReasons = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "unschedulable_pods",
			Help:           "The number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.",
			StabilityLevel: metrics.ALPHA,
		}, []string{"plugin", "profile"})

	// PluginEvaluationTotal counts scheduling attempts per plugin and
	// extension point, labeled by plugin, extension point and profile.
	PluginEvaluationTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      SchedulerSubsystem,
			Name:           "plugin_evaluation_total",
			Help:           "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).",
			StabilityLevel: metrics.ALPHA,
		}, []string{"plugin", "extension_point", "profile"})

	// metricsList holds every metric that Register passes to RegisterMetrics.
	metricsList = []metrics.Registerable{
		scheduleAttempts,
		schedulingLatency,
		SchedulingAlgorithmLatency,
		PreemptionVictims,
		PreemptionAttempts,
		pendingPods,
		PodSchedulingDuration,
		PodSchedulingSLIDuration,
		PodSchedulingAttempts,
		FrameworkExtensionPointDuration,
		PluginExecutionDuration,
		SchedulerQueueIncomingPods,
		Goroutines,
		PermitWaitDuration,
		CacheSize,
		unschedulableReasons,
		PluginEvaluationTotal,
	}
)
   253  
   254  var registerMetrics sync.Once
   255  
   256  // Register all metrics.
   257  func Register() {
   258  	// Register the metrics.
   259  	registerMetrics.Do(func() {
   260  		RegisterMetrics(metricsList...)
   261  		volumebindingmetrics.RegisterVolumeSchedulingMetrics()
   262  	})
   263  }
   264  
   265  // RegisterMetrics registers a list of metrics.
   266  // This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics.
   267  func RegisterMetrics(extraMetrics ...metrics.Registerable) {
   268  	for _, metric := range extraMetrics {
   269  		legacyregistry.MustRegister(metric)
   270  	}
   271  }
   272  
   273  // GetGather returns the gatherer. It used by test case outside current package.
   274  func GetGather() metrics.Gatherer {
   275  	return legacyregistry.DefaultGatherer
   276  }
   277  
   278  // ActivePods returns the pending pods metrics with the label active
   279  func ActivePods() metrics.GaugeMetric {
   280  	return pendingPods.With(metrics.Labels{"queue": "active"})
   281  }
   282  
   283  // BackoffPods returns the pending pods metrics with the label backoff
   284  func BackoffPods() metrics.GaugeMetric {
   285  	return pendingPods.With(metrics.Labels{"queue": "backoff"})
   286  }
   287  
   288  // UnschedulablePods returns the pending pods metrics with the label unschedulable
   289  func UnschedulablePods() metrics.GaugeMetric {
   290  	return pendingPods.With(metrics.Labels{"queue": "unschedulable"})
   291  }
   292  
   293  // GatedPods returns the pending pods metrics with the label gated
   294  func GatedPods() metrics.GaugeMetric {
   295  	return pendingPods.With(metrics.Labels{"queue": "gated"})
   296  }
   297  
   298  // SinceInSeconds gets the time since the specified start in seconds.
   299  func SinceInSeconds(start time.Time) float64 {
   300  	return time.Since(start).Seconds()
   301  }
   302  
   303  func UnschedulableReason(plugin string, profile string) metrics.GaugeMetric {
   304  	return unschedulableReasons.With(metrics.Labels{"plugin": plugin, "profile": profile})
   305  }