k8s.io/kubernetes@v1.29.3/pkg/kubelet/metrics/metrics.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "sync" 21 "time" 22 23 "k8s.io/component-base/metrics" 24 "k8s.io/component-base/metrics/legacyregistry" 25 26 "k8s.io/apimachinery/pkg/types" 27 utilfeature "k8s.io/apiserver/pkg/util/feature" 28 "k8s.io/kubernetes/pkg/features" 29 ) 30 31 // This const block defines the metric names for the kubelet metrics. 32 const ( 33 KubeletSubsystem = "kubelet" 34 NodeNameKey = "node_name" 35 NodeLabelKey = "node" 36 NodeStartupPreKubeletKey = "node_startup_pre_kubelet_duration_seconds" 37 NodeStartupPreRegistrationKey = "node_startup_pre_registration_duration_seconds" 38 NodeStartupRegistrationKey = "node_startup_registration_duration_seconds" 39 NodeStartupPostRegistrationKey = "node_startup_post_registration_duration_seconds" 40 NodeStartupKey = "node_startup_duration_seconds" 41 PodWorkerDurationKey = "pod_worker_duration_seconds" 42 PodStartDurationKey = "pod_start_duration_seconds" 43 PodStartSLIDurationKey = "pod_start_sli_duration_seconds" 44 PodStartTotalDurationKey = "pod_start_total_duration_seconds" 45 CgroupManagerOperationsKey = "cgroup_manager_duration_seconds" 46 PodWorkerStartDurationKey = "pod_worker_start_duration_seconds" 47 PodStatusSyncDurationKey = "pod_status_sync_duration_seconds" 48 PLEGRelistDurationKey = "pleg_relist_duration_seconds" 49 PLEGDiscardEventsKey = "pleg_discard_events" 50 PLEGRelistIntervalKey = "pleg_relist_interval_seconds" 51 PLEGLastSeenKey = "pleg_last_seen_seconds" 52 EventedPLEGConnErrKey = "evented_pleg_connection_error_count" 53 EventedPLEGConnKey = "evented_pleg_connection_success_count" 54 EventedPLEGConnLatencyKey = "evented_pleg_connection_latency_seconds" 55 EvictionsKey = "evictions" 56 EvictionStatsAgeKey = "eviction_stats_age_seconds" 57 PreemptionsKey = "preemptions" 58 VolumeStatsCapacityBytesKey = "volume_stats_capacity_bytes" 59 VolumeStatsAvailableBytesKey = "volume_stats_available_bytes" 60 VolumeStatsUsedBytesKey = "volume_stats_used_bytes" 61 VolumeStatsInodesKey = "volume_stats_inodes" 62 VolumeStatsInodesFreeKey = "volume_stats_inodes_free" 63 VolumeStatsInodesUsedKey = "volume_stats_inodes_used" 64 VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal" 65 RunningPodsKey = "running_pods" 66 RunningContainersKey = "running_containers" 67 DesiredPodCountKey = "desired_pods" 68 ActivePodCountKey = "active_pods" 69 MirrorPodCountKey = "mirror_pods" 70 WorkingPodCountKey = "working_pods" 71 OrphanedRuntimePodTotalKey = "orphaned_runtime_pods_total" 72 RestartedPodTotalKey = "restarted_pods_total" 73 74 // Metrics keys of remote runtime operations 75 RuntimeOperationsKey = "runtime_operations_total" 76 RuntimeOperationsDurationKey = "runtime_operations_duration_seconds" 77 RuntimeOperationsErrorsKey = "runtime_operations_errors_total" 78 // Metrics keys of device plugin operations 79 DevicePluginRegistrationCountKey = "device_plugin_registration_total" 80 DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds" 81 // Metrics keys of pod resources operations 82 PodResourcesEndpointRequestsTotalKey = "pod_resources_endpoint_requests_total" 83 PodResourcesEndpointRequestsListKey = "pod_resources_endpoint_requests_list" 84 PodResourcesEndpointRequestsGetAllocatableKey = "pod_resources_endpoint_requests_get_allocatable" 85 PodResourcesEndpointErrorsListKey = "pod_resources_endpoint_errors_list" 86 PodResourcesEndpointErrorsGetAllocatableKey = "pod_resources_endpoint_errors_get_allocatable" 87 PodResourcesEndpointRequestsGetKey = "pod_resources_endpoint_requests_get" 88 PodResourcesEndpointErrorsGetKey = "pod_resources_endpoint_errors_get" 89 90 // Metrics keys for RuntimeClass 91 RunPodSandboxDurationKey = "run_podsandbox_duration_seconds" 92 RunPodSandboxErrorsKey = "run_podsandbox_errors_total" 93 94 // Metrics to keep track of total number of Pods and Containers started 95 StartedPodsTotalKey = "started_pods_total" 96 StartedPodsErrorsTotalKey = "started_pods_errors_total" 97 StartedContainersTotalKey = "started_containers_total" 98 StartedContainersErrorsTotalKey = "started_containers_errors_total" 99 100 // Metrics to track HostProcess container usage by this kubelet 101 StartedHostProcessContainersTotalKey = "started_host_process_containers_total" 102 StartedHostProcessContainersErrorsTotalKey = "started_host_process_containers_errors_total" 103 104 // Metrics to track ephemeral container usage by this kubelet 105 ManagedEphemeralContainersKey = "managed_ephemeral_containers" 106 107 // Metrics to track the CPU manager behavior 108 CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total" 109 CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total" 110 111 // Metrics to track the Topology manager behavior 112 TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total" 113 TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total" 114 TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms" 115 116 // Metrics to track orphan pod cleanup 117 orphanPodCleanedVolumesKey = "orphan_pod_cleaned_volumes" 118 orphanPodCleanedVolumesErrorsKey = "orphan_pod_cleaned_volumes_errors" 119 120 // Metric for tracking garbage collected images 121 ImageGarbageCollectedTotalKey = "image_garbage_collected_total" 122 123 // Values used in metric labels 124 Container = "container" 125 InitContainer = "init_container" 126 EphemeralContainer = "ephemeral_container" 127 ) 128 129 var ( 130 podStartupDurationBuckets = []float64{0.5, 1, 2, 3, 4, 5, 6, 8, 10, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600} 131 ) 132 133 var ( 134 // NodeName is a Gauge that tracks the ode's name. The count is always 1. 135 NodeName = metrics.NewGaugeVec( 136 &metrics.GaugeOpts{ 137 Subsystem: KubeletSubsystem, 138 Name: NodeNameKey, 139 Help: "The node's name. The count is always 1.", 140 StabilityLevel: metrics.ALPHA, 141 }, 142 []string{NodeLabelKey}, 143 ) 144 // ContainersPerPodCount is a Histogram that tracks the number of containers per pod. 145 ContainersPerPodCount = metrics.NewHistogram( 146 &metrics.HistogramOpts{ 147 Subsystem: KubeletSubsystem, 148 Name: "containers_per_pod_count", 149 Help: "The number of containers per pod.", 150 Buckets: metrics.ExponentialBuckets(1, 2, 5), 151 StabilityLevel: metrics.ALPHA, 152 }, 153 ) 154 // PodWorkerDuration is a Histogram that tracks the duration (in seconds) in takes to sync a single pod. 155 // Broken down by the operation type. 156 PodWorkerDuration = metrics.NewHistogramVec( 157 &metrics.HistogramOpts{ 158 Subsystem: KubeletSubsystem, 159 Name: PodWorkerDurationKey, 160 Help: "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync", 161 Buckets: metrics.DefBuckets, 162 StabilityLevel: metrics.ALPHA, 163 }, 164 []string{"operation_type"}, 165 ) 166 // PodStartDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to run since it's 167 // first time seen by kubelet. 168 PodStartDuration = metrics.NewHistogram( 169 &metrics.HistogramOpts{ 170 Subsystem: KubeletSubsystem, 171 Name: PodStartDurationKey, 172 Help: "Duration in seconds from kubelet seeing a pod for the first time to the pod starting to run", 173 Buckets: podStartupDurationBuckets, 174 StabilityLevel: metrics.ALPHA, 175 }, 176 ) 177 // PodStartSLIDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to run, 178 // excluding the time for image pulling. This metric should reflect the "Pod startup latency SLI" definition 179 // ref: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md 180 // 181 // The histogram bucket boundaries for pod startup latency metrics, measured in seconds. These are hand-picked 182 // so as to be roughly exponential but still round numbers in everyday units. This is to minimise the number 183 // of buckets while allowing accurate measurement of thresholds which might be used in SLOs 184 // e.g. x% of pods start up within 30 seconds, or 15 minutes, etc. 185 PodStartSLIDuration = metrics.NewHistogramVec( 186 &metrics.HistogramOpts{ 187 Subsystem: KubeletSubsystem, 188 Name: PodStartSLIDurationKey, 189 Help: "Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch", 190 Buckets: podStartupDurationBuckets, 191 StabilityLevel: metrics.ALPHA, 192 }, 193 []string{}, 194 ) 195 196 // PodStartTotalDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to run 197 // since creation, including the time for image pulling. 198 // 199 // The histogram bucket boundaries for pod startup latency metrics, measured in seconds. These are hand-picked 200 // so as to be roughly exponential but still round numbers in everyday units. This is to minimise the number 201 // of buckets while allowing accurate measurement of thresholds which might be used in SLOs 202 // e.g. x% of pods start up within 30 seconds, or 15 minutes, etc. 203 PodStartTotalDuration = metrics.NewHistogramVec( 204 &metrics.HistogramOpts{ 205 Subsystem: KubeletSubsystem, 206 Name: PodStartTotalDurationKey, 207 Help: "Duration in seconds to start a pod since creation, including time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch", 208 Buckets: podStartupDurationBuckets, 209 StabilityLevel: metrics.ALPHA, 210 }, 211 []string{}, 212 ) 213 214 // CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete. 215 // Broken down by method. 216 CgroupManagerDuration = metrics.NewHistogramVec( 217 &metrics.HistogramOpts{ 218 Subsystem: KubeletSubsystem, 219 Name: CgroupManagerOperationsKey, 220 Help: "Duration in seconds for cgroup manager operations. Broken down by method.", 221 Buckets: metrics.DefBuckets, 222 StabilityLevel: metrics.ALPHA, 223 }, 224 []string{"operation_type"}, 225 ) 226 // PodWorkerStartDuration is a Histogram that tracks the duration (in seconds) it takes from kubelet seeing a pod to starting a worker. 227 PodWorkerStartDuration = metrics.NewHistogram( 228 &metrics.HistogramOpts{ 229 Subsystem: KubeletSubsystem, 230 Name: PodWorkerStartDurationKey, 231 Help: "Duration in seconds from kubelet seeing a pod to starting a worker.", 232 Buckets: metrics.DefBuckets, 233 StabilityLevel: metrics.ALPHA, 234 }, 235 ) 236 // PodStatusSyncDuration is a Histogram that tracks the duration (in seconds) in takes from the time a pod 237 // status is generated to the time it is synced with the apiserver. If multiple status changes are generated 238 // on a pod before it is written to the API, the latency is from the first update to the last event. 239 PodStatusSyncDuration = metrics.NewHistogram( 240 &metrics.HistogramOpts{ 241 Subsystem: KubeletSubsystem, 242 Name: PodStatusSyncDurationKey, 243 Help: "Duration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intevening changes to pod status occur.", 244 Buckets: []float64{0.010, 0.050, 0.100, 0.500, 1, 5, 10, 20, 30, 45, 60}, 245 StabilityLevel: metrics.ALPHA, 246 }, 247 ) 248 // PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's 249 // Pod Lifecycle Event Generator (PLEG). 250 PLEGRelistDuration = metrics.NewHistogram( 251 &metrics.HistogramOpts{ 252 Subsystem: KubeletSubsystem, 253 Name: PLEGRelistDurationKey, 254 Help: "Duration in seconds for relisting pods in PLEG.", 255 Buckets: metrics.DefBuckets, 256 StabilityLevel: metrics.ALPHA, 257 }, 258 ) 259 // PLEGDiscardEvents is a Counter that tracks the number of discarding events in the Kubelet's Pod Lifecycle Event Generator (PLEG). 260 PLEGDiscardEvents = metrics.NewCounter( 261 &metrics.CounterOpts{ 262 Subsystem: KubeletSubsystem, 263 Name: PLEGDiscardEventsKey, 264 Help: "The number of discard events in PLEG.", 265 StabilityLevel: metrics.ALPHA, 266 }, 267 ) 268 269 // PLEGRelistInterval is a Histogram that tracks the intervals (in seconds) between relisting in the Kubelet's 270 // Pod Lifecycle Event Generator (PLEG). 271 PLEGRelistInterval = metrics.NewHistogram( 272 &metrics.HistogramOpts{ 273 Subsystem: KubeletSubsystem, 274 Name: PLEGRelistIntervalKey, 275 Help: "Interval in seconds between relisting in PLEG.", 276 Buckets: metrics.DefBuckets, 277 StabilityLevel: metrics.ALPHA, 278 }, 279 ) 280 // PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's 281 // Pod Lifecycle Event Generator (PLEG) was last seen active. 282 PLEGLastSeen = metrics.NewGauge( 283 &metrics.GaugeOpts{ 284 Subsystem: KubeletSubsystem, 285 Name: PLEGLastSeenKey, 286 Help: "Timestamp in seconds when PLEG was last seen active.", 287 StabilityLevel: metrics.ALPHA, 288 }, 289 ) 290 291 // EventedPLEGConnErr is a Counter that tracks the number of errors encountered during 292 // the establishment of streaming connection with the CRI runtime. 293 EventedPLEGConnErr = metrics.NewCounter( 294 &metrics.CounterOpts{ 295 Subsystem: KubeletSubsystem, 296 Name: EventedPLEGConnErrKey, 297 Help: "The number of errors encountered during the establishment of streaming connection with the CRI runtime.", 298 StabilityLevel: metrics.ALPHA, 299 }, 300 ) 301 302 // EventedPLEGConn is a Counter that tracks the number of times a streaming client 303 // was obtained to receive CRI Events. 304 EventedPLEGConn = metrics.NewCounter( 305 &metrics.CounterOpts{ 306 Subsystem: KubeletSubsystem, 307 Name: EventedPLEGConnKey, 308 Help: "The number of times a streaming client was obtained to receive CRI Events.", 309 StabilityLevel: metrics.ALPHA, 310 }, 311 ) 312 313 // EventedPLEGConnLatency is a Histogram that tracks the latency of streaming connection 314 // with the CRI runtime, measured in seconds. 315 EventedPLEGConnLatency = metrics.NewHistogram( 316 &metrics.HistogramOpts{ 317 Subsystem: KubeletSubsystem, 318 Name: EventedPLEGConnLatencyKey, 319 Help: "The latency of streaming connection with the CRI runtime, measured in seconds.", 320 Buckets: metrics.DefBuckets, 321 StabilityLevel: metrics.ALPHA, 322 }, 323 ) 324 325 // RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations. 326 // Broken down by operation type. 327 RuntimeOperations = metrics.NewCounterVec( 328 &metrics.CounterOpts{ 329 Subsystem: KubeletSubsystem, 330 Name: RuntimeOperationsKey, 331 Help: "Cumulative number of runtime operations by operation type.", 332 StabilityLevel: metrics.ALPHA, 333 }, 334 []string{"operation_type"}, 335 ) 336 // RuntimeOperationsDuration is a Histogram that tracks the duration (in seconds) for remote runtime operations to complete. 337 // Broken down by operation type. 338 RuntimeOperationsDuration = metrics.NewHistogramVec( 339 &metrics.HistogramOpts{ 340 Subsystem: KubeletSubsystem, 341 Name: RuntimeOperationsDurationKey, 342 Help: "Duration in seconds of runtime operations. Broken down by operation type.", 343 Buckets: metrics.ExponentialBuckets(.005, 2.5, 14), 344 StabilityLevel: metrics.ALPHA, 345 }, 346 []string{"operation_type"}, 347 ) 348 // RuntimeOperationsErrors is a Counter that tracks the cumulative number of remote runtime operations errors. 349 // Broken down by operation type. 350 RuntimeOperationsErrors = metrics.NewCounterVec( 351 &metrics.CounterOpts{ 352 Subsystem: KubeletSubsystem, 353 Name: RuntimeOperationsErrorsKey, 354 Help: "Cumulative number of runtime operation errors by operation type.", 355 StabilityLevel: metrics.ALPHA, 356 }, 357 []string{"operation_type"}, 358 ) 359 // Evictions is a Counter that tracks the cumulative number of pod evictions initiated by the kubelet. 360 // Broken down by eviction signal. 361 Evictions = metrics.NewCounterVec( 362 &metrics.CounterOpts{ 363 Subsystem: KubeletSubsystem, 364 Name: EvictionsKey, 365 Help: "Cumulative number of pod evictions by eviction signal", 366 StabilityLevel: metrics.ALPHA, 367 }, 368 []string{"eviction_signal"}, 369 ) 370 // EvictionStatsAge is a Histogram that tracks the time (in seconds) between when stats are collected and when a pod is evicted 371 // based on those stats. Broken down by eviction signal. 372 EvictionStatsAge = metrics.NewHistogramVec( 373 &metrics.HistogramOpts{ 374 Subsystem: KubeletSubsystem, 375 Name: EvictionStatsAgeKey, 376 Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", 377 Buckets: metrics.DefBuckets, 378 StabilityLevel: metrics.ALPHA, 379 }, 380 []string{"eviction_signal"}, 381 ) 382 // Preemptions is a Counter that tracks the cumulative number of pod preemptions initiated by the kubelet. 383 // Broken down by preemption signal. A preemption is only recorded for one resource, the sum of all signals 384 // is the number of preemptions on the given node. 385 Preemptions = metrics.NewCounterVec( 386 &metrics.CounterOpts{ 387 Subsystem: KubeletSubsystem, 388 Name: PreemptionsKey, 389 Help: "Cumulative number of pod preemptions by preemption resource", 390 StabilityLevel: metrics.ALPHA, 391 }, 392 []string{"preemption_signal"}, 393 ) 394 // DevicePluginRegistrationCount is a Counter that tracks the cumulative number of device plugin registrations. 395 // Broken down by resource name. 396 DevicePluginRegistrationCount = metrics.NewCounterVec( 397 &metrics.CounterOpts{ 398 Subsystem: KubeletSubsystem, 399 Name: DevicePluginRegistrationCountKey, 400 Help: "Cumulative number of device plugin registrations. Broken down by resource name.", 401 StabilityLevel: metrics.ALPHA, 402 }, 403 []string{"resource_name"}, 404 ) 405 // DevicePluginAllocationDuration is a Histogram that tracks the duration (in seconds) to serve a device plugin allocation request. 406 // Broken down by resource name. 407 DevicePluginAllocationDuration = metrics.NewHistogramVec( 408 &metrics.HistogramOpts{ 409 Subsystem: KubeletSubsystem, 410 Name: DevicePluginAllocationDurationKey, 411 Help: "Duration in seconds to serve a device plugin Allocation request. Broken down by resource name.", 412 Buckets: metrics.DefBuckets, 413 StabilityLevel: metrics.ALPHA, 414 }, 415 []string{"resource_name"}, 416 ) 417 418 // PodResourcesEndpointRequestsTotalCount is a Counter that tracks the cumulative number of requests to the PodResource endpoints. 419 // Broken down by server API version. 420 PodResourcesEndpointRequestsTotalCount = metrics.NewCounterVec( 421 &metrics.CounterOpts{ 422 Subsystem: KubeletSubsystem, 423 Name: PodResourcesEndpointRequestsTotalKey, 424 Help: "Cumulative number of requests to the PodResource endpoint. Broken down by server api version.", 425 StabilityLevel: metrics.ALPHA, 426 }, 427 []string{"server_api_version"}, 428 ) 429 430 // PodResourcesEndpointRequestsListCount is a Counter that tracks the number of requests to the PodResource List() endpoint. 431 // Broken down by server API version. 432 PodResourcesEndpointRequestsListCount = metrics.NewCounterVec( 433 &metrics.CounterOpts{ 434 Subsystem: KubeletSubsystem, 435 Name: PodResourcesEndpointRequestsListKey, 436 Help: "Number of requests to the PodResource List endpoint. Broken down by server api version.", 437 StabilityLevel: metrics.ALPHA, 438 }, 439 []string{"server_api_version"}, 440 ) 441 442 // PodResourcesEndpointRequestsGetAllocatableCount is a Counter that tracks the number of requests to the PodResource GetAllocatableResources() endpoint. 443 // Broken down by server API version. 444 PodResourcesEndpointRequestsGetAllocatableCount = metrics.NewCounterVec( 445 &metrics.CounterOpts{ 446 Subsystem: KubeletSubsystem, 447 Name: PodResourcesEndpointRequestsGetAllocatableKey, 448 Help: "Number of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.", 449 StabilityLevel: metrics.ALPHA, 450 }, 451 []string{"server_api_version"}, 452 ) 453 454 // PodResourcesEndpointErrorsListCount is a Counter that tracks the number of errors returned by he PodResource List() endpoint. 455 // Broken down by server API version. 456 PodResourcesEndpointErrorsListCount = metrics.NewCounterVec( 457 &metrics.CounterOpts{ 458 Subsystem: KubeletSubsystem, 459 Name: PodResourcesEndpointErrorsListKey, 460 Help: "Number of requests to the PodResource List endpoint which returned error. Broken down by server api version.", 461 StabilityLevel: metrics.ALPHA, 462 }, 463 []string{"server_api_version"}, 464 ) 465 466 // PodResourcesEndpointErrorsGetAllocatableCount is a Counter that tracks the number of errors returned by the PodResource GetAllocatableResources() endpoint. 467 // Broken down by server API version. 468 PodResourcesEndpointErrorsGetAllocatableCount = metrics.NewCounterVec( 469 &metrics.CounterOpts{ 470 Subsystem: KubeletSubsystem, 471 Name: PodResourcesEndpointErrorsGetAllocatableKey, 472 Help: "Number of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.", 473 StabilityLevel: metrics.ALPHA, 474 }, 475 []string{"server_api_version"}, 476 ) 477 478 // PodResourcesEndpointRequestsGetCount is a Counter that tracks the number of requests to the PodResource Get() endpoint. 479 // Broken down by server API version. 480 PodResourcesEndpointRequestsGetCount = metrics.NewCounterVec( 481 &metrics.CounterOpts{ 482 Subsystem: KubeletSubsystem, 483 Name: PodResourcesEndpointRequestsGetKey, 484 Help: "Number of requests to the PodResource Get endpoint. Broken down by server api version.", 485 StabilityLevel: metrics.ALPHA, 486 }, 487 []string{"server_api_version"}, 488 ) 489 490 // PodResourcesEndpointErrorsGetCount is a Counter that tracks the number of errors returned by he PodResource List() endpoint. 491 // Broken down by server API version. 492 PodResourcesEndpointErrorsGetCount = metrics.NewCounterVec( 493 &metrics.CounterOpts{ 494 Subsystem: KubeletSubsystem, 495 Name: PodResourcesEndpointErrorsGetKey, 496 Help: "Number of requests to the PodResource Get endpoint which returned error. Broken down by server api version.", 497 StabilityLevel: metrics.ALPHA, 498 }, 499 []string{"server_api_version"}, 500 ) 501 502 // RunPodSandboxDuration is a Histogram that tracks the duration (in seconds) it takes to run Pod Sandbox operations. 503 // Broken down by RuntimeClass.Handler. 504 RunPodSandboxDuration = metrics.NewHistogramVec( 505 &metrics.HistogramOpts{ 506 Subsystem: KubeletSubsystem, 507 Name: RunPodSandboxDurationKey, 508 Help: "Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.", 509 // Use DefBuckets for now, will customize the buckets if necessary. 510 Buckets: metrics.DefBuckets, 511 StabilityLevel: metrics.ALPHA, 512 }, 513 []string{"runtime_handler"}, 514 ) 515 // RunPodSandboxErrors is a Counter that tracks the cumulative number of Pod Sandbox operations errors. 516 // Broken down by RuntimeClass.Handler. 517 RunPodSandboxErrors = metrics.NewCounterVec( 518 &metrics.CounterOpts{ 519 Subsystem: KubeletSubsystem, 520 Name: RunPodSandboxErrorsKey, 521 Help: "Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.", 522 StabilityLevel: metrics.ALPHA, 523 }, 524 []string{"runtime_handler"}, 525 ) 526 527 // RunningPodCount is a gauge that tracks the number of Pods currently with a running sandbox 528 // It is used to expose the kubelet internal state: how many pods have running containers in the container runtime, and mainly for debugging purpose. 529 RunningPodCount = metrics.NewGauge( 530 &metrics.GaugeOpts{ 531 Subsystem: KubeletSubsystem, 532 Name: RunningPodsKey, 533 Help: "Number of pods that have a running pod sandbox", 534 StabilityLevel: metrics.ALPHA, 535 }, 536 ) 537 // RunningContainerCount is a gauge that tracks the number of containers currently running 538 RunningContainerCount = metrics.NewGaugeVec( 539 &metrics.GaugeOpts{ 540 Subsystem: KubeletSubsystem, 541 Name: RunningContainersKey, 542 Help: "Number of containers currently running", 543 StabilityLevel: metrics.ALPHA, 544 }, 545 []string{"container_state"}, 546 ) 547 // DesiredPodCount tracks the count of pods the Kubelet thinks it should be running 548 DesiredPodCount = metrics.NewGaugeVec( 549 &metrics.GaugeOpts{ 550 Subsystem: KubeletSubsystem, 551 Name: DesiredPodCountKey, 552 Help: "The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver.", 553 StabilityLevel: metrics.ALPHA, 554 }, 555 []string{"static"}, 556 ) 557 // ActivePodCount tracks the count of pods the Kubelet considers as active when deciding to admit a new pod 558 ActivePodCount = metrics.NewGaugeVec( 559 &metrics.GaugeOpts{ 560 Subsystem: KubeletSubsystem, 561 Name: ActivePodCountKey, 562 Help: "The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver.", 563 StabilityLevel: metrics.ALPHA, 564 }, 565 []string{"static"}, 566 ) 567 // MirrorPodCount tracks the number of mirror pods the Kubelet should have created for static pods 568 MirrorPodCount = metrics.NewGauge( 569 &metrics.GaugeOpts{ 570 Subsystem: KubeletSubsystem, 571 Name: MirrorPodCountKey, 572 Help: "The number of mirror pods the kubelet will try to create (one per admitted static pod)", 573 StabilityLevel: metrics.ALPHA, 574 }, 575 ) 576 // WorkingPodCount tracks the count of pods in each lifecycle phase, whether they are static pods, and whether they are desired, orphaned, or runtime_only 577 WorkingPodCount = metrics.NewGaugeVec( 578 &metrics.GaugeOpts{ 579 Subsystem: KubeletSubsystem, 580 Name: WorkingPodCountKey, 581 Help: "Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible.", 582 StabilityLevel: metrics.ALPHA, 583 }, 584 []string{"lifecycle", "config", "static"}, 585 ) 586 // OrphanedRuntimePodTotal is incremented every time a pod is detected in the runtime without being known to the pod worker first 587 OrphanedRuntimePodTotal = metrics.NewCounter( 588 &metrics.CounterOpts{ 589 Subsystem: KubeletSubsystem, 590 Name: OrphanedRuntimePodTotalKey, 591 Help: "Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual.", 592 StabilityLevel: metrics.ALPHA, 593 }, 594 ) 595 // RestartedPodTotal is incremented every time a pod with the same UID is deleted and recreated 596 RestartedPodTotal = metrics.NewCounterVec( 597 &metrics.CounterOpts{ 598 Subsystem: KubeletSubsystem, 599 Name: RestartedPodTotalKey, 600 Help: "Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods)", 601 StabilityLevel: metrics.ALPHA, 602 }, 603 []string{"static"}, 604 ) 605 // StartedPodsTotal is a counter that tracks pod sandbox creation operations 606 StartedPodsTotal = metrics.NewCounter( 607 &metrics.CounterOpts{ 608 Subsystem: KubeletSubsystem, 609 Name: StartedPodsTotalKey, 610 Help: "Cumulative number of pods started", 611 StabilityLevel: metrics.ALPHA, 612 }, 613 ) 614 // StartedPodsErrorsTotal is a counter that tracks the number of errors creating pod sandboxes 615 StartedPodsErrorsTotal = metrics.NewCounter( 616 &metrics.CounterOpts{ 617 Subsystem: KubeletSubsystem, 618 Name: StartedPodsErrorsTotalKey, 619 Help: "Cumulative number of errors when starting pods", 620 StabilityLevel: metrics.ALPHA, 621 }, 622 ) 623 // StartedContainersTotal is a counter that tracks the number of container creation operations 624 StartedContainersTotal = metrics.NewCounterVec( 625 &metrics.CounterOpts{ 626 Subsystem: KubeletSubsystem, 627 Name: StartedContainersTotalKey, 628 Help: "Cumulative number of containers started", 629 StabilityLevel: metrics.ALPHA, 630 }, 631 []string{"container_type"}, 632 ) 633 // StartedContainersTotal is a counter that tracks the number of errors creating containers 634 StartedContainersErrorsTotal = metrics.NewCounterVec( 635 &metrics.CounterOpts{ 636 Subsystem: KubeletSubsystem, 637 Name: StartedContainersErrorsTotalKey, 638 Help: "Cumulative number of errors when starting containers", 639 StabilityLevel: metrics.ALPHA, 640 }, 641 []string{"container_type", "code"}, 642 ) 643 // StartedHostProcessContainersTotal is a counter that tracks the number of hostprocess container creation operations 644 StartedHostProcessContainersTotal = metrics.NewCounterVec( 645 &metrics.CounterOpts{ 646 Subsystem: KubeletSubsystem, 647 Name: StartedHostProcessContainersTotalKey, 648 Help: "Cumulative number of hostprocess containers started. This metric will only be collected on Windows.", 649 StabilityLevel: metrics.ALPHA, 650 }, 651 []string{"container_type"}, 652 ) 653 // StartedHostProcessContainersErrorsTotal is a counter that tracks the number of errors creating hostprocess containers 654 StartedHostProcessContainersErrorsTotal = metrics.NewCounterVec( 655 &metrics.CounterOpts{ 656 Subsystem: KubeletSubsystem, 657 Name: StartedHostProcessContainersErrorsTotalKey, 658 Help: "Cumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows.", 659 StabilityLevel: metrics.ALPHA, 660 }, 661 []string{"container_type", "code"}, 662 ) 663 // ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet. 664 ManagedEphemeralContainers = metrics.NewGauge( 665 &metrics.GaugeOpts{ 666 Subsystem: KubeletSubsystem, 667 Name: ManagedEphemeralContainersKey, 668 Help: "Current number of ephemeral containers in pods managed by this kubelet.", 669 StabilityLevel: metrics.ALPHA, 670 }, 671 ) 672 673 // GracefulShutdownStartTime is a gauge that records the time at which the kubelet started graceful shutdown. 674 GracefulShutdownStartTime = metrics.NewGauge( 675 &metrics.GaugeOpts{ 676 Subsystem: KubeletSubsystem, 677 Name: "graceful_shutdown_start_time_seconds", 678 Help: "Last graceful shutdown start time since unix epoch in seconds", 679 StabilityLevel: metrics.ALPHA, 680 }, 681 ) 682 683 // GracefulShutdownEndTime is a gauge that records the time at which the kubelet completed graceful shutdown. 684 GracefulShutdownEndTime = metrics.NewGauge( 685 &metrics.GaugeOpts{ 686 Subsystem: KubeletSubsystem, 687 Name: "graceful_shutdown_end_time_seconds", 688 Help: "Last graceful shutdown start time since unix epoch in seconds", 689 StabilityLevel: metrics.ALPHA, 690 }, 691 ) 692 693 LifecycleHandlerHTTPFallbacks = metrics.NewCounter( 694 &metrics.CounterOpts{ 695 Subsystem: KubeletSubsystem, 696 Name: "lifecycle_handler_http_fallbacks_total", 697 Help: "The number of times lifecycle handlers successfully fell back to http from https.", 698 StabilityLevel: metrics.ALPHA, 699 }, 700 ) 701 702 // CPUManagerPinningRequestsTotal tracks the number of times the pod spec will cause the cpu manager to pin cores 703 CPUManagerPinningRequestsTotal = metrics.NewCounter( 704 &metrics.CounterOpts{ 705 Subsystem: KubeletSubsystem, 706 Name: CPUManagerPinningRequestsTotalKey, 707 Help: "The number of cpu core allocations which required pinning.", 708 StabilityLevel: metrics.ALPHA, 709 }, 710 ) 711 712 // CPUManagerPinningErrorsTotal tracks the number of times the pod spec required the cpu manager to pin cores, but the allocation failed 713 CPUManagerPinningErrorsTotal = metrics.NewCounter( 714 &metrics.CounterOpts{ 715 Subsystem: KubeletSubsystem, 716 Name: CPUManagerPinningErrorsTotalKey, 717 Help: "The number of cpu core allocations which required pinning failed.", 718 StabilityLevel: metrics.ALPHA, 719 }, 720 ) 721 722 // TopologyManagerAdmissionRequestsTotal tracks the number of times the pod spec will cause the topology manager to admit a pod 723 TopologyManagerAdmissionRequestsTotal = metrics.NewCounter( 724 &metrics.CounterOpts{ 725 Subsystem: KubeletSubsystem, 726 Name: TopologyManagerAdmissionRequestsTotalKey, 727 Help: "The number of admission requests where resources have to be aligned.", 728 StabilityLevel: metrics.ALPHA, 729 }, 730 ) 731 732 // TopologyManagerAdmissionErrorsTotal tracks the number of times the pod spec required the topology manager to admit a pod, but the admission failed 733 TopologyManagerAdmissionErrorsTotal = metrics.NewCounter( 734 &metrics.CounterOpts{ 735 Subsystem: KubeletSubsystem, 736 Name: TopologyManagerAdmissionErrorsTotalKey, 737 Help: "The number of admission request failures where resources could not be aligned.", 738 StabilityLevel: metrics.ALPHA, 739 }, 740 ) 741 742 // TopologyManagerAdmissionDuration is a Histogram that tracks the duration (in seconds) to serve a pod admission request. 743 TopologyManagerAdmissionDuration = metrics.NewHistogram( 744 &metrics.HistogramOpts{ 745 Subsystem: KubeletSubsystem, 746 Name: TopologyManagerAdmissionDurationKey, 747 Help: "Duration in milliseconds to serve a pod admission request.", 748 Buckets: metrics.ExponentialBuckets(.05, 2, 15), 749 StabilityLevel: metrics.ALPHA, 750 }, 751 ) 752 753 // OrphanPodCleanedVolumes is number of orphaned Pods that times that removeOrphanedPodVolumeDirs was called during the last sweep. 754 OrphanPodCleanedVolumes = metrics.NewGauge( 755 &metrics.GaugeOpts{ 756 Subsystem: KubeletSubsystem, 757 Name: orphanPodCleanedVolumesKey, 758 Help: "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.", 759 StabilityLevel: metrics.ALPHA, 760 }, 761 ) 762 // OrphanPodCleanedVolumes is number of times that removeOrphanedPodVolumeDirs failed. 763 OrphanPodCleanedVolumesErrors = metrics.NewGauge( 764 &metrics.GaugeOpts{ 765 Subsystem: KubeletSubsystem, 766 Name: orphanPodCleanedVolumesErrorsKey, 767 Help: "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.", 768 StabilityLevel: metrics.ALPHA, 769 }, 770 ) 771 772 NodeStartupPreKubeletDuration = metrics.NewGauge( 773 &metrics.GaugeOpts{ 774 Subsystem: KubeletSubsystem, 775 Name: NodeStartupPreKubeletKey, 776 Help: "Duration in seconds of node startup before kubelet starts.", 777 StabilityLevel: metrics.ALPHA, 778 }, 779 ) 780 781 NodeStartupPreRegistrationDuration = metrics.NewGauge( 782 &metrics.GaugeOpts{ 783 Subsystem: KubeletSubsystem, 784 Name: NodeStartupPreRegistrationKey, 785 Help: "Duration in seconds of node startup before registration.", 786 StabilityLevel: metrics.ALPHA, 787 }, 788 ) 789 790 NodeStartupRegistrationDuration = metrics.NewGauge( 791 &metrics.GaugeOpts{ 792 Subsystem: KubeletSubsystem, 793 Name: NodeStartupRegistrationKey, 794 Help: "Duration in seconds of node startup during registration.", 795 StabilityLevel: metrics.ALPHA, 796 }, 797 ) 798 799 NodeStartupPostRegistrationDuration = metrics.NewGauge( 800 &metrics.GaugeOpts{ 801 Subsystem: KubeletSubsystem, 802 Name: NodeStartupPostRegistrationKey, 803 Help: "Duration in seconds of node startup after registration.", 804 StabilityLevel: metrics.ALPHA, 805 }, 806 ) 807 808 NodeStartupDuration = metrics.NewGauge( 809 &metrics.GaugeOpts{ 810 Subsystem: KubeletSubsystem, 811 Name: NodeStartupKey, 812 Help: "Duration in seconds of node startup in total.", 813 StabilityLevel: metrics.ALPHA, 814 }, 815 ) 816 817 ImageGarbageCollectedTotal = metrics.NewCounter( 818 &metrics.CounterOpts{ 819 Subsystem: KubeletSubsystem, 820 Name: ImageGarbageCollectedTotalKey, 821 Help: "Total number of images garbage collected by the kubelet, whether through disk usage or image age.", 822 StabilityLevel: metrics.ALPHA, 823 }, 824 ) 825 ) 826 827 var registerMetrics sync.Once 828 829 // Register registers all metrics. 830 func Register(collectors ...metrics.StableCollector) { 831 // Register the metrics. 832 registerMetrics.Do(func() { 833 legacyregistry.MustRegister(NodeName) 834 legacyregistry.MustRegister(PodWorkerDuration) 835 legacyregistry.MustRegister(PodStartDuration) 836 legacyregistry.MustRegister(PodStartSLIDuration) 837 legacyregistry.MustRegister(PodStartTotalDuration) 838 legacyregistry.MustRegister(NodeStartupPreKubeletDuration) 839 legacyregistry.MustRegister(NodeStartupPreRegistrationDuration) 840 legacyregistry.MustRegister(NodeStartupRegistrationDuration) 841 legacyregistry.MustRegister(NodeStartupPostRegistrationDuration) 842 legacyregistry.MustRegister(NodeStartupDuration) 843 legacyregistry.MustRegister(CgroupManagerDuration) 844 legacyregistry.MustRegister(PodWorkerStartDuration) 845 legacyregistry.MustRegister(PodStatusSyncDuration) 846 legacyregistry.MustRegister(ContainersPerPodCount) 847 legacyregistry.MustRegister(PLEGRelistDuration) 848 legacyregistry.MustRegister(PLEGDiscardEvents) 849 legacyregistry.MustRegister(PLEGRelistInterval) 850 legacyregistry.MustRegister(PLEGLastSeen) 851 legacyregistry.MustRegister(EventedPLEGConnErr) 852 legacyregistry.MustRegister(EventedPLEGConn) 853 legacyregistry.MustRegister(EventedPLEGConnLatency) 854 legacyregistry.MustRegister(RuntimeOperations) 855 legacyregistry.MustRegister(RuntimeOperationsDuration) 856 legacyregistry.MustRegister(RuntimeOperationsErrors) 857 legacyregistry.MustRegister(Evictions) 858 legacyregistry.MustRegister(EvictionStatsAge) 859 legacyregistry.MustRegister(Preemptions) 860 legacyregistry.MustRegister(DevicePluginRegistrationCount) 861 legacyregistry.MustRegister(DevicePluginAllocationDuration) 862 legacyregistry.MustRegister(RunningContainerCount) 863 legacyregistry.MustRegister(RunningPodCount) 864 legacyregistry.MustRegister(DesiredPodCount) 865 legacyregistry.MustRegister(ActivePodCount) 866 legacyregistry.MustRegister(MirrorPodCount) 867 legacyregistry.MustRegister(WorkingPodCount) 868 legacyregistry.MustRegister(OrphanedRuntimePodTotal) 869 legacyregistry.MustRegister(RestartedPodTotal) 870 legacyregistry.MustRegister(ManagedEphemeralContainers) 871 legacyregistry.MustRegister(PodResourcesEndpointRequestsTotalCount) 872 legacyregistry.MustRegister(PodResourcesEndpointRequestsListCount) 873 legacyregistry.MustRegister(PodResourcesEndpointRequestsGetAllocatableCount) 874 legacyregistry.MustRegister(PodResourcesEndpointErrorsListCount) 875 legacyregistry.MustRegister(PodResourcesEndpointErrorsGetAllocatableCount) 876 if utilfeature.DefaultFeatureGate.Enabled(features.KubeletPodResourcesGet) { 877 legacyregistry.MustRegister(PodResourcesEndpointRequestsGetCount) 878 legacyregistry.MustRegister(PodResourcesEndpointErrorsGetCount) 879 } 880 legacyregistry.MustRegister(StartedPodsTotal) 881 legacyregistry.MustRegister(StartedPodsErrorsTotal) 882 legacyregistry.MustRegister(StartedContainersTotal) 883 legacyregistry.MustRegister(StartedContainersErrorsTotal) 884 legacyregistry.MustRegister(StartedHostProcessContainersTotal) 885 legacyregistry.MustRegister(StartedHostProcessContainersErrorsTotal) 886 legacyregistry.MustRegister(RunPodSandboxDuration) 887 legacyregistry.MustRegister(RunPodSandboxErrors) 888 legacyregistry.MustRegister(CPUManagerPinningRequestsTotal) 889 legacyregistry.MustRegister(CPUManagerPinningErrorsTotal) 890 legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal) 891 legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal) 892 legacyregistry.MustRegister(TopologyManagerAdmissionDuration) 893 legacyregistry.MustRegister(OrphanPodCleanedVolumes) 894 legacyregistry.MustRegister(OrphanPodCleanedVolumesErrors) 895 896 for _, collector := range collectors { 897 legacyregistry.CustomMustRegister(collector) 898 } 899 900 if utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) && 901 utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) { 902 legacyregistry.MustRegister(GracefulShutdownStartTime) 903 legacyregistry.MustRegister(GracefulShutdownEndTime) 904 } 905 906 if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentHTTPGetHandlers) { 907 legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks) 908 } 909 }) 910 } 911 912 // GetGather returns the gatherer. It used by test case outside current package. 913 func GetGather() metrics.Gatherer { 914 return legacyregistry.DefaultGatherer 915 } 916 917 // SinceInSeconds gets the time since the specified start in seconds. 918 func SinceInSeconds(start time.Time) float64 { 919 return time.Since(start).Seconds() 920 } 921 922 // SetNodeName sets the NodeName Gauge to 1. 923 func SetNodeName(name types.NodeName) { 924 NodeName.WithLabelValues(string(name)).Set(1) 925 }