k8s.io/kubernetes@v1.29.3/pkg/scheduler/metrics/metrics.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "sync" 21 "time" 22 23 "k8s.io/component-base/metrics" 24 "k8s.io/component-base/metrics/legacyregistry" 25 volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics" 26 ) 27 28 const ( 29 // SchedulerSubsystem - subsystem name used by scheduler. 30 SchedulerSubsystem = "scheduler" 31 ) 32 33 // Below are possible values for the work and operation label. 34 const ( 35 // PrioritizingExtender - prioritizing extender work/operation label value. 36 PrioritizingExtender = "prioritizing_extender" 37 // Binding - binding work/operation label value. 38 Binding = "binding" 39 ) 40 41 // Below are possible values for the extension_point label. 42 const ( 43 PreFilter = "PreFilter" 44 Filter = "Filter" 45 PreFilterExtensionAddPod = "PreFilterExtensionAddPod" 46 PreFilterExtensionRemovePod = "PreFilterExtensionRemovePod" 47 PostFilter = "PostFilter" 48 PreScore = "PreScore" 49 Score = "Score" 50 ScoreExtensionNormalize = "ScoreExtensionNormalize" 51 PreBind = "PreBind" 52 Bind = "Bind" 53 PostBind = "PostBind" 54 Reserve = "Reserve" 55 Unreserve = "Unreserve" 56 Permit = "Permit" 57 ) 58 59 // All the histogram based metrics have 1ms as size for the smallest bucket. 60 var ( 61 scheduleAttempts = metrics.NewCounterVec( 62 &metrics.CounterOpts{ 63 Subsystem: SchedulerSubsystem, 64 Name: "schedule_attempts_total", 65 Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.", 66 StabilityLevel: metrics.STABLE, 67 }, []string{"result", "profile"}) 68 69 schedulingLatency = metrics.NewHistogramVec( 70 &metrics.HistogramOpts{ 71 Subsystem: SchedulerSubsystem, 72 Name: "scheduling_attempt_duration_seconds", 73 Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)", 74 Buckets: metrics.ExponentialBuckets(0.001, 2, 15), 75 StabilityLevel: metrics.STABLE, 76 }, []string{"result", "profile"}) 77 SchedulingAlgorithmLatency = metrics.NewHistogram( 78 &metrics.HistogramOpts{ 79 Subsystem: SchedulerSubsystem, 80 Name: "scheduling_algorithm_duration_seconds", 81 Help: "Scheduling algorithm latency in seconds", 82 Buckets: metrics.ExponentialBuckets(0.001, 2, 15), 83 StabilityLevel: metrics.ALPHA, 84 }, 85 ) 86 PreemptionVictims = metrics.NewHistogram( 87 &metrics.HistogramOpts{ 88 Subsystem: SchedulerSubsystem, 89 Name: "preemption_victims", 90 Help: "Number of selected preemption victims", 91 // we think #victims>64 is pretty rare, therefore [64, +Inf) is considered a single bucket. 92 Buckets: metrics.ExponentialBuckets(1, 2, 7), 93 StabilityLevel: metrics.STABLE, 94 }) 95 PreemptionAttempts = metrics.NewCounter( 96 &metrics.CounterOpts{ 97 Subsystem: SchedulerSubsystem, 98 Name: "preemption_attempts_total", 99 Help: "Total preemption attempts in the cluster till now", 100 StabilityLevel: metrics.STABLE, 101 }) 102 pendingPods = metrics.NewGaugeVec( 103 &metrics.GaugeOpts{ 104 Subsystem: SchedulerSubsystem, 105 Name: "pending_pods", 106 Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods that the scheduler attempted to schedule and failed; 'gated' is the number of unschedulable pods that the scheduler never attempted to schedule because they are gated.", 107 StabilityLevel: metrics.STABLE, 108 }, []string{"queue"}) 109 Goroutines = metrics.NewGaugeVec( 110 &metrics.GaugeOpts{ 111 Subsystem: SchedulerSubsystem, 112 Name: "goroutines", 113 Help: "Number of running goroutines split by the work they do such as binding.", 114 StabilityLevel: metrics.ALPHA, 115 }, []string{"operation"}) 116 117 // PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed 118 // in v1.31. Please use PodSchedulingSLIDuration instead. 119 PodSchedulingDuration = metrics.NewHistogramVec( 120 &metrics.HistogramOpts{ 121 Subsystem: SchedulerSubsystem, 122 Name: "pod_scheduling_duration_seconds", 123 Help: "E2e latency for a pod being scheduled which may include multiple scheduling attempts.", 124 // Start with 10ms with the last bucket being [~88m, Inf). 125 Buckets: metrics.ExponentialBuckets(0.01, 2, 20), 126 StabilityLevel: metrics.STABLE, 127 DeprecatedVersion: "1.29.0", 128 }, 129 []string{"attempts"}) 130 131 PodSchedulingSLIDuration = metrics.NewHistogramVec( 132 &metrics.HistogramOpts{ 133 Subsystem: SchedulerSubsystem, 134 Name: "pod_scheduling_sli_duration_seconds", 135 Help: "E2e latency for a pod being scheduled, from the time the pod enters the scheduling queue an d might involve multiple scheduling attempts.", 136 // Start with 10ms with the last bucket being [~88m, Inf). 137 Buckets: metrics.ExponentialBuckets(0.01, 2, 20), 138 StabilityLevel: metrics.BETA, 139 }, 140 []string{"attempts"}) 141 142 PodSchedulingAttempts = metrics.NewHistogram( 143 &metrics.HistogramOpts{ 144 Subsystem: SchedulerSubsystem, 145 Name: "pod_scheduling_attempts", 146 Help: "Number of attempts to successfully schedule a pod.", 147 Buckets: metrics.ExponentialBuckets(1, 2, 5), 148 StabilityLevel: metrics.STABLE, 149 }) 150 151 FrameworkExtensionPointDuration = metrics.NewHistogramVec( 152 &metrics.HistogramOpts{ 153 Subsystem: SchedulerSubsystem, 154 Name: "framework_extension_point_duration_seconds", 155 Help: "Latency for running all plugins of a specific extension point.", 156 // Start with 0.1ms with the last bucket being [~200ms, Inf) 157 Buckets: metrics.ExponentialBuckets(0.0001, 2, 12), 158 StabilityLevel: metrics.STABLE, 159 }, 160 []string{"extension_point", "status", "profile"}) 161 162 PluginExecutionDuration = metrics.NewHistogramVec( 163 &metrics.HistogramOpts{ 164 Subsystem: SchedulerSubsystem, 165 Name: "plugin_execution_duration_seconds", 166 Help: "Duration for running a plugin at a specific extension point.", 167 // Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5) 168 // so that we have better granularity since plugin latency is very sensitive. 169 Buckets: metrics.ExponentialBuckets(0.00001, 1.5, 20), 170 StabilityLevel: metrics.ALPHA, 171 }, 172 []string{"plugin", "extension_point", "status"}) 173 174 SchedulerQueueIncomingPods = metrics.NewCounterVec( 175 &metrics.CounterOpts{ 176 Subsystem: SchedulerSubsystem, 177 Name: "queue_incoming_pods_total", 178 Help: "Number of pods added to scheduling queues by event and queue type.", 179 StabilityLevel: metrics.STABLE, 180 }, []string{"queue", "event"}) 181 182 PermitWaitDuration = metrics.NewHistogramVec( 183 &metrics.HistogramOpts{ 184 Subsystem: SchedulerSubsystem, 185 Name: "permit_wait_duration_seconds", 186 Help: "Duration of waiting on permit.", 187 Buckets: metrics.ExponentialBuckets(0.001, 2, 15), 188 StabilityLevel: metrics.ALPHA, 189 }, 190 []string{"result"}) 191 192 CacheSize = metrics.NewGaugeVec( 193 &metrics.GaugeOpts{ 194 Subsystem: SchedulerSubsystem, 195 Name: "scheduler_cache_size", 196 Help: "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.", 197 StabilityLevel: metrics.ALPHA, 198 }, []string{"type"}) 199 200 unschedulableReasons = metrics.NewGaugeVec( 201 &metrics.GaugeOpts{ 202 Subsystem: SchedulerSubsystem, 203 Name: "unschedulable_pods", 204 Help: "The number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.", 205 StabilityLevel: metrics.ALPHA, 206 }, []string{"plugin", "profile"}) 207 208 PluginEvaluationTotal = metrics.NewCounterVec( 209 &metrics.CounterOpts{ 210 Subsystem: SchedulerSubsystem, 211 Name: "plugin_evaluation_total", 212 Help: "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter and Filter.).", 213 StabilityLevel: metrics.ALPHA, 214 }, []string{"plugin", "extension_point", "profile"}) 215 216 metricsList = []metrics.Registerable{ 217 scheduleAttempts, 218 schedulingLatency, 219 SchedulingAlgorithmLatency, 220 PreemptionVictims, 221 PreemptionAttempts, 222 pendingPods, 223 PodSchedulingDuration, 224 PodSchedulingSLIDuration, 225 PodSchedulingAttempts, 226 FrameworkExtensionPointDuration, 227 PluginExecutionDuration, 228 SchedulerQueueIncomingPods, 229 Goroutines, 230 PermitWaitDuration, 231 CacheSize, 232 unschedulableReasons, 233 PluginEvaluationTotal, 234 } 235 ) 236 237 var registerMetrics sync.Once 238 239 // Register all metrics. 240 func Register() { 241 // Register the metrics. 242 registerMetrics.Do(func() { 243 RegisterMetrics(metricsList...) 244 volumebindingmetrics.RegisterVolumeSchedulingMetrics() 245 }) 246 } 247 248 // RegisterMetrics registers a list of metrics. 249 // This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics. 250 func RegisterMetrics(extraMetrics ...metrics.Registerable) { 251 for _, metric := range extraMetrics { 252 legacyregistry.MustRegister(metric) 253 } 254 } 255 256 // GetGather returns the gatherer. It used by test case outside current package. 257 func GetGather() metrics.Gatherer { 258 return legacyregistry.DefaultGatherer 259 } 260 261 // ActivePods returns the pending pods metrics with the label active 262 func ActivePods() metrics.GaugeMetric { 263 return pendingPods.With(metrics.Labels{"queue": "active"}) 264 } 265 266 // BackoffPods returns the pending pods metrics with the label backoff 267 func BackoffPods() metrics.GaugeMetric { 268 return pendingPods.With(metrics.Labels{"queue": "backoff"}) 269 } 270 271 // UnschedulablePods returns the pending pods metrics with the label unschedulable 272 func UnschedulablePods() metrics.GaugeMetric { 273 return pendingPods.With(metrics.Labels{"queue": "unschedulable"}) 274 } 275 276 // GatedPods returns the pending pods metrics with the label gated 277 func GatedPods() metrics.GaugeMetric { 278 return pendingPods.With(metrics.Labels{"queue": "gated"}) 279 } 280 281 // SinceInSeconds gets the time since the specified start in seconds. 282 func SinceInSeconds(start time.Time) float64 { 283 return time.Since(start).Seconds() 284 } 285 286 func UnschedulableReason(plugin string, profile string) metrics.GaugeMetric { 287 return unschedulableReasons.With(metrics.Labels{"plugin": plugin, "profile": profile}) 288 }