k8s.io/kubernetes@v1.29.3/pkg/kubelet/metrics/collectors/resource_metrics.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package collectors
    18  
    19  import (
    20  	"context"
    21  	"time"
    22  
    23  	"k8s.io/component-base/metrics"
    24  	"k8s.io/klog/v2"
    25  	summary "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    26  	"k8s.io/kubernetes/pkg/kubelet/server/stats"
    27  )
    28  
// Descriptors for every metric emitted by resourceMetricsCollector. Each one is
// announced by DescribeWithStability and populated by CollectWithStability (or one
// of its collect* helpers) on every scrape.
var (
	// nodeCPUUsageDesc: node-level CPU time, emitted as a counter with no labels.
	nodeCPUUsageDesc = metrics.NewDesc("node_cpu_usage_seconds_total",
		"Cumulative cpu time consumed by the node in core-seconds",
		nil,
		nil,
		metrics.STABLE,
		"")

	// nodeMemoryUsageDesc: node-level working set, emitted as a gauge with no labels.
	nodeMemoryUsageDesc = metrics.NewDesc("node_memory_working_set_bytes",
		"Current working set of the node in bytes",
		nil,
		nil,
		metrics.STABLE,
		"")

	// nodeSwapUsageDesc: node-level swap usage, emitted as a gauge with no labels.
	nodeSwapUsageDesc = metrics.NewDesc("node_swap_usage_bytes",
		"Current swap usage of the node in bytes. Reported only on non-windows systems",
		nil,
		nil,
		metrics.ALPHA,
		"")

	// containerCPUUsageDesc: per-container CPU time counter, labelled by
	// container name, pod name and namespace (in that order).
	containerCPUUsageDesc = metrics.NewDesc("container_cpu_usage_seconds_total",
		"Cumulative cpu time consumed by the container in core-seconds",
		[]string{"container", "pod", "namespace"},
		nil,
		metrics.STABLE,
		"")

	// containerMemoryUsageDesc: per-container working set gauge, labelled by
	// container name, pod name and namespace.
	containerMemoryUsageDesc = metrics.NewDesc("container_memory_working_set_bytes",
		"Current working set of the container in bytes",
		[]string{"container", "pod", "namespace"},
		nil,
		metrics.STABLE,
		"")

	// containerSwapUsageDesc: per-container swap usage gauge, labelled by
	// container name, pod name and namespace.
	containerSwapUsageDesc = metrics.NewDesc("container_swap_usage_bytes",
		"Current amount of the container swap usage in bytes. Reported only on non-windows systems",
		[]string{"container", "pod", "namespace"},
		nil,
		metrics.ALPHA,
		"")

	// podCPUUsageDesc: per-pod CPU time counter, labelled by pod name and namespace.
	podCPUUsageDesc = metrics.NewDesc("pod_cpu_usage_seconds_total",
		"Cumulative cpu time consumed by the pod in core-seconds",
		[]string{"pod", "namespace"},
		nil,
		metrics.STABLE,
		"")

	// podMemoryUsageDesc: per-pod working set gauge, labelled by pod name and namespace.
	podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
		"Current working set of the pod in bytes",
		[]string{"pod", "namespace"},
		nil,
		metrics.STABLE,
		"")

	// podSwapUsageDesc: per-pod swap usage gauge, labelled by pod name and namespace.
	podSwapUsageDesc = metrics.NewDesc("pod_swap_usage_bytes",
		"Current amount of the pod swap usage in bytes. Reported only on non-windows systems",
		[]string{"pod", "namespace"},
		nil,
		metrics.ALPHA,
		"")

	// resourceScrapeResultDesc: scrape status gauge under the name "scrape_error".
	// It is always emitted with the same value as resourceScrapeErrorResultDesc.
	// NOTE(review): the trailing "1.29.0" argument looks like NewDesc's
	// deprecated-version field, i.e. this name is the deprecated twin of
	// "resource_scrape_error" — confirm against k8s.io/component-base/metrics.
	resourceScrapeResultDesc = metrics.NewDesc("scrape_error",
		"1 if there was an error while getting container metrics, 0 otherwise",
		nil,
		nil,
		metrics.ALPHA,
		"1.29.0")

	// resourceScrapeErrorResultDesc: scrape status gauge under the name
	// "resource_scrape_error"; 1 when fetching the stats summary failed, 0 otherwise.
	resourceScrapeErrorResultDesc = metrics.NewDesc("resource_scrape_error",
		"1 if there was an error while getting container metrics, 0 otherwise",
		nil,
		nil,
		metrics.STABLE,
		"")

	// containerStartTimeDesc: per-container start time gauge (unix seconds),
	// labelled by container name, pod name and namespace. Unlike the usage
	// metrics it is emitted without an explicit sample timestamp.
	containerStartTimeDesc = metrics.NewDesc("container_start_time_seconds",
		"Start time of the container since unix epoch in seconds",
		[]string{"container", "pod", "namespace"},
		nil,
		metrics.STABLE,
		"")
)
   114  
   115  // NewResourceMetricsCollector returns a metrics.StableCollector which exports resource metrics
   116  func NewResourceMetricsCollector(provider stats.SummaryProvider) metrics.StableCollector {
   117  	return &resourceMetricsCollector{
   118  		provider: provider,
   119  	}
   120  }
   121  
// resourceMetricsCollector is the collector returned by NewResourceMetricsCollector.
// It holds no per-pod or per-container state: every metric is rebuilt from a fresh
// stats summary each time CollectWithStability runs.
type resourceMetricsCollector struct {
	// Embedded base collector; presumably supplies the StableCollector methods
	// that this type does not define itself — confirm in component-base/metrics.
	metrics.BaseStableCollector

	// provider is queried for CPU and memory stats on every collection.
	provider stats.SummaryProvider
}
   127  
   128  // Check if resourceMetricsCollector implements necessary interface
   129  var _ metrics.StableCollector = &resourceMetricsCollector{}
   130  
   131  // DescribeWithStability implements metrics.StableCollector
   132  func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
   133  	ch <- nodeCPUUsageDesc
   134  	ch <- nodeMemoryUsageDesc
   135  	ch <- nodeSwapUsageDesc
   136  	ch <- containerStartTimeDesc
   137  	ch <- containerCPUUsageDesc
   138  	ch <- containerMemoryUsageDesc
   139  	ch <- containerSwapUsageDesc
   140  	ch <- podCPUUsageDesc
   141  	ch <- podMemoryUsageDesc
   142  	ch <- podSwapUsageDesc
   143  	ch <- resourceScrapeResultDesc
   144  	ch <- resourceScrapeErrorResultDesc
   145  }
   146  
   147  // CollectWithStability implements metrics.StableCollector
   148  // Since new containers are frequently created and removed, using the Gauge would
   149  // leak metric collectors for containers or pods that no longer exist.  Instead, implement
   150  // custom collector in a way that only collects metrics for active containers.
   151  func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
   152  	ctx := context.Background()
   153  	var errorCount float64
   154  	defer func() {
   155  		ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
   156  		ch <- metrics.NewLazyConstMetric(resourceScrapeErrorResultDesc, metrics.GaugeValue, errorCount)
   157  	}()
   158  	statsSummary, err := rc.provider.GetCPUAndMemoryStats(ctx)
   159  	if err != nil {
   160  		errorCount = 1
   161  		klog.ErrorS(err, "Error getting summary for resourceMetric prometheus endpoint")
   162  		return
   163  	}
   164  
   165  	rc.collectNodeCPUMetrics(ch, statsSummary.Node)
   166  	rc.collectNodeMemoryMetrics(ch, statsSummary.Node)
   167  	rc.collectNodeSwapMetrics(ch, statsSummary.Node)
   168  
   169  	for _, pod := range statsSummary.Pods {
   170  		for _, container := range pod.Containers {
   171  			rc.collectContainerStartTime(ch, pod, container)
   172  			rc.collectContainerCPUMetrics(ch, pod, container)
   173  			rc.collectContainerMemoryMetrics(ch, pod, container)
   174  			rc.collectContainerSwapMetrics(ch, pod, container)
   175  		}
   176  		rc.collectPodCPUMetrics(ch, pod)
   177  		rc.collectPodMemoryMetrics(ch, pod)
   178  		rc.collectPodSwapMetrics(ch, pod)
   179  	}
   180  }
   181  
   182  func (rc *resourceMetricsCollector) collectNodeCPUMetrics(ch chan<- metrics.Metric, s summary.NodeStats) {
   183  	if s.CPU == nil || s.CPU.UsageCoreNanoSeconds == nil {
   184  		return
   185  	}
   186  
   187  	ch <- metrics.NewLazyMetricWithTimestamp(s.CPU.Time.Time,
   188  		metrics.NewLazyConstMetric(nodeCPUUsageDesc, metrics.CounterValue, float64(*s.CPU.UsageCoreNanoSeconds)/float64(time.Second)))
   189  }
   190  
   191  func (rc *resourceMetricsCollector) collectNodeMemoryMetrics(ch chan<- metrics.Metric, s summary.NodeStats) {
   192  	if s.Memory == nil || s.Memory.WorkingSetBytes == nil {
   193  		return
   194  	}
   195  
   196  	ch <- metrics.NewLazyMetricWithTimestamp(s.Memory.Time.Time,
   197  		metrics.NewLazyConstMetric(nodeMemoryUsageDesc, metrics.GaugeValue, float64(*s.Memory.WorkingSetBytes)))
   198  }
   199  
   200  func (rc *resourceMetricsCollector) collectNodeSwapMetrics(ch chan<- metrics.Metric, s summary.NodeStats) {
   201  	if s.Swap == nil || s.Swap.SwapUsageBytes == nil {
   202  		return
   203  	}
   204  
   205  	ch <- metrics.NewLazyMetricWithTimestamp(s.Memory.Time.Time,
   206  		metrics.NewLazyConstMetric(nodeSwapUsageDesc, metrics.GaugeValue, float64(*s.Swap.SwapUsageBytes)))
   207  }
   208  
   209  func (rc *resourceMetricsCollector) collectContainerStartTime(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) {
   210  	if s.StartTime.Unix() <= 0 {
   211  		return
   212  	}
   213  
   214  	ch <- metrics.NewLazyConstMetric(containerStartTimeDesc, metrics.GaugeValue, float64(s.StartTime.UnixNano())/float64(time.Second), s.Name, pod.PodRef.Name, pod.PodRef.Namespace)
   215  }
   216  
   217  func (rc *resourceMetricsCollector) collectContainerCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) {
   218  	if s.CPU == nil || s.CPU.UsageCoreNanoSeconds == nil {
   219  		return
   220  	}
   221  
   222  	ch <- metrics.NewLazyMetricWithTimestamp(s.CPU.Time.Time,
   223  		metrics.NewLazyConstMetric(containerCPUUsageDesc, metrics.CounterValue,
   224  			float64(*s.CPU.UsageCoreNanoSeconds)/float64(time.Second), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
   225  }
   226  
   227  func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) {
   228  	if s.Memory == nil || s.Memory.WorkingSetBytes == nil {
   229  		return
   230  	}
   231  
   232  	ch <- metrics.NewLazyMetricWithTimestamp(s.Memory.Time.Time,
   233  		metrics.NewLazyConstMetric(containerMemoryUsageDesc, metrics.GaugeValue,
   234  			float64(*s.Memory.WorkingSetBytes), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
   235  }
   236  
   237  func (rc *resourceMetricsCollector) collectContainerSwapMetrics(ch chan<- metrics.Metric, pod summary.PodStats, s summary.ContainerStats) {
   238  	if s.Swap == nil || s.Swap.SwapUsageBytes == nil {
   239  		return
   240  	}
   241  
   242  	ch <- metrics.NewLazyMetricWithTimestamp(s.Swap.Time.Time,
   243  		metrics.NewLazyConstMetric(containerSwapUsageDesc, metrics.GaugeValue,
   244  			float64(*s.Swap.SwapUsageBytes), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
   245  }
   246  
   247  func (rc *resourceMetricsCollector) collectPodCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
   248  	if pod.CPU == nil || pod.CPU.UsageCoreNanoSeconds == nil {
   249  		return
   250  	}
   251  
   252  	ch <- metrics.NewLazyMetricWithTimestamp(pod.CPU.Time.Time,
   253  		metrics.NewLazyConstMetric(podCPUUsageDesc, metrics.CounterValue,
   254  			float64(*pod.CPU.UsageCoreNanoSeconds)/float64(time.Second), pod.PodRef.Name, pod.PodRef.Namespace))
   255  }
   256  
   257  func (rc *resourceMetricsCollector) collectPodMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
   258  	if pod.Memory == nil || pod.Memory.WorkingSetBytes == nil {
   259  		return
   260  	}
   261  
   262  	ch <- metrics.NewLazyMetricWithTimestamp(pod.Memory.Time.Time,
   263  		metrics.NewLazyConstMetric(podMemoryUsageDesc, metrics.GaugeValue,
   264  			float64(*pod.Memory.WorkingSetBytes), pod.PodRef.Name, pod.PodRef.Namespace))
   265  }
   266  
   267  func (rc *resourceMetricsCollector) collectPodSwapMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
   268  	if pod.Swap == nil || pod.Swap.SwapUsageBytes == nil {
   269  		return
   270  	}
   271  
   272  	ch <- metrics.NewLazyMetricWithTimestamp(pod.Swap.Time.Time,
   273  		metrics.NewLazyConstMetric(podSwapUsageDesc, metrics.GaugeValue,
   274  			float64(*pod.Swap.SwapUsageBytes), pod.PodRef.Name, pod.PodRef.Namespace))
   275  }