github.com/google/cadvisor@v0.49.1/metrics/prometheus_machine.go (about)

     1  // Copyright 2020 Google Inc. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"strconv"
    19  
    20  	"github.com/prometheus/client_golang/prometheus"
    21  
    22  	"github.com/google/cadvisor/container"
    23  	info "github.com/google/cadvisor/info/v1"
    24  
    25  	"k8s.io/klog/v2"
    26  )
    27  
// baseLabelsNames holds the names of the labels attached to every machine
// metric. The matching values are taken, in this order, from the MachineID,
// SystemUUID and BootID fields of the collected machine info.
var baseLabelsNames = []string{"machine_id", "system_uuid", "boot_id"}

const (
	// Names of the additional, metric-specific labels.
	prometheusModeLabelName       = "mode"
	prometheusTypeLabelName       = "type"
	prometheusLevelLabelName      = "level"
	prometheusNodeLabelName       = "node_id"
	prometheusCoreLabelName       = "core_id"
	prometheusThreadLabelName     = "thread_id"
	prometheusPageSizeLabelName   = "page_size"
	prometheusTargetNodeLabelName = "target_node_id"

	// Values of the "mode" label on the machine_nvm_capacity metric.
	nvmMemoryMode    = "memory_mode"
	nvmAppDirectMode = "app_direct_mode"

	// Property selectors accepted by getMemoryByType.
	memoryByTypeDimmCountKey    = "DimmCount"
	memoryByTypeDimmCapacityKey = "Capacity"

	// emptyLabelValue is the label value used where a label does not apply,
	// e.g. core_id for a cache that belongs to a whole NUMA node.
	emptyLabelValue = ""
)
    48  
// machineMetric describes a multi-dimensional metric used for exposing a
// certain type of machine statistic.
//
// name and help are used to build the Prometheus descriptor and valueType is
// the value type of the emitted samples. extraLabels names the labels the
// metric carries in addition to the base labels; the label values of a sample
// are only attached when extraLabels is non-empty. condition, when non-nil, is
// evaluated against the current machine info and the metric is skipped if it
// returns false. getValues extracts the samples to emit from the machine info.
type machineMetric struct {
	name        string
	help        string
	valueType   prometheus.ValueType
	extraLabels []string
	condition   func(machineInfo *info.MachineInfo) bool
	getValues   func(machineInfo *info.MachineInfo) metricValues
}
    59  
    60  func (metric *machineMetric) desc(baseLabels []string) *prometheus.Desc {
    61  	return prometheus.NewDesc(metric.name, metric.help, append(baseLabels, metric.extraLabels...), nil)
    62  }
    63  
// PrometheusMachineCollector implements prometheus.Collector.
//
// infoProvider is the source of the machine info that gets exported, errors
// is the scrape-error gauge (set to 1 when machine info could not be obtained
// during a collection, 0 otherwise) and machineMetrics is the list of metrics
// evaluated on every collection.
type PrometheusMachineCollector struct {
	infoProvider   infoProvider
	errors         prometheus.Gauge
	machineMetrics []machineMetric
}
    70  
    71  // NewPrometheusMachineCollector returns a new PrometheusCollector.
    72  func NewPrometheusMachineCollector(i infoProvider, includedMetrics container.MetricSet) *PrometheusMachineCollector {
    73  	c := &PrometheusMachineCollector{
    74  
    75  		infoProvider: i,
    76  		errors: prometheus.NewGauge(prometheus.GaugeOpts{
    77  			Namespace: "machine",
    78  			Name:      "scrape_error",
    79  			Help:      "1 if there was an error while getting machine metrics, 0 otherwise.",
    80  		}),
    81  		machineMetrics: []machineMetric{
    82  			{
    83  				name:      "machine_cpu_physical_cores",
    84  				help:      "Number of physical CPU cores.",
    85  				valueType: prometheus.GaugeValue,
    86  				getValues: func(machineInfo *info.MachineInfo) metricValues {
    87  					return metricValues{{value: float64(machineInfo.NumPhysicalCores), timestamp: machineInfo.Timestamp}}
    88  				},
    89  			},
    90  			{
    91  				name:      "machine_cpu_cores",
    92  				help:      "Number of logical CPU cores.",
    93  				valueType: prometheus.GaugeValue,
    94  				getValues: func(machineInfo *info.MachineInfo) metricValues {
    95  					return metricValues{{value: float64(machineInfo.NumCores), timestamp: machineInfo.Timestamp}}
    96  				},
    97  			},
    98  			{
    99  				name:      "machine_cpu_sockets",
   100  				help:      "Number of CPU sockets.",
   101  				valueType: prometheus.GaugeValue,
   102  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   103  					return metricValues{{value: float64(machineInfo.NumSockets), timestamp: machineInfo.Timestamp}}
   104  				},
   105  			},
   106  			{
   107  				name:      "machine_memory_bytes",
   108  				help:      "Amount of memory installed on the machine.",
   109  				valueType: prometheus.GaugeValue,
   110  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   111  					return metricValues{{value: float64(machineInfo.MemoryCapacity), timestamp: machineInfo.Timestamp}}
   112  				},
   113  			},
   114  			{
   115  				name:      "machine_swap_bytes",
   116  				help:      "Amount of swap memory available on the machine.",
   117  				valueType: prometheus.GaugeValue,
   118  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   119  					return metricValues{{value: float64(machineInfo.SwapCapacity), timestamp: machineInfo.Timestamp}}
   120  				},
   121  			},
   122  			{
   123  				name:        "machine_dimm_count",
   124  				help:        "Number of RAM DIMM (all types memory modules) value labeled by dimm type.",
   125  				valueType:   prometheus.GaugeValue,
   126  				extraLabels: []string{prometheusTypeLabelName},
   127  				condition:   func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 },
   128  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   129  					return getMemoryByType(machineInfo, memoryByTypeDimmCountKey)
   130  				},
   131  			},
   132  			{
   133  				name:        "machine_dimm_capacity_bytes",
   134  				help:        "Total RAM DIMM capacity (all types memory modules) value labeled by dimm type.",
   135  				valueType:   prometheus.GaugeValue,
   136  				extraLabels: []string{prometheusTypeLabelName},
   137  				condition:   func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 },
   138  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   139  					return getMemoryByType(machineInfo, memoryByTypeDimmCapacityKey)
   140  				},
   141  			},
   142  			{
   143  				name:        "machine_nvm_capacity",
   144  				help:        "NVM capacity value labeled by NVM mode (memory mode or app direct mode).",
   145  				valueType:   prometheus.GaugeValue,
   146  				extraLabels: []string{prometheusModeLabelName},
   147  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   148  					return metricValues{
   149  						{value: float64(machineInfo.NVMInfo.MemoryModeCapacity), labels: []string{nvmMemoryMode}, timestamp: machineInfo.Timestamp},
   150  						{value: float64(machineInfo.NVMInfo.AppDirectModeCapacity), labels: []string{nvmAppDirectMode}, timestamp: machineInfo.Timestamp},
   151  					}
   152  				},
   153  			},
   154  			{
   155  				name:      "machine_nvm_avg_power_budget_watts",
   156  				help:      "NVM power budget.",
   157  				valueType: prometheus.GaugeValue,
   158  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   159  					return metricValues{{value: float64(machineInfo.NVMInfo.AvgPowerBudget), timestamp: machineInfo.Timestamp}}
   160  				},
   161  			},
   162  		},
   163  	}
   164  
   165  	if includedMetrics.Has(container.CPUTopologyMetrics) {
   166  		c.machineMetrics = append(c.machineMetrics, []machineMetric{
   167  			{
   168  				name:        "machine_cpu_cache_capacity_bytes",
   169  				help:        "Cache size in bytes assigned to NUMA node and CPU core.",
   170  				valueType:   prometheus.GaugeValue,
   171  				extraLabels: []string{prometheusNodeLabelName, prometheusCoreLabelName, prometheusTypeLabelName, prometheusLevelLabelName},
   172  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   173  					return getCaches(machineInfo)
   174  				},
   175  			},
   176  			{
   177  				name:        "machine_thread_siblings_count",
   178  				help:        "Number of CPU thread siblings.",
   179  				valueType:   prometheus.GaugeValue,
   180  				extraLabels: []string{prometheusNodeLabelName, prometheusCoreLabelName, prometheusThreadLabelName},
   181  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   182  					return getThreadsSiblingsCount(machineInfo)
   183  				},
   184  			},
   185  			{
   186  				name:        "machine_node_memory_capacity_bytes",
   187  				help:        "Amount of memory assigned to NUMA node.",
   188  				valueType:   prometheus.GaugeValue,
   189  				extraLabels: []string{prometheusNodeLabelName},
   190  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   191  					return getNodeMemory(machineInfo)
   192  				},
   193  			},
   194  			{
   195  				name:        "machine_node_hugepages_count",
   196  				help:        "Numer of hugepages assigned to NUMA node.",
   197  				valueType:   prometheus.GaugeValue,
   198  				extraLabels: []string{prometheusNodeLabelName, prometheusPageSizeLabelName},
   199  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   200  					return getHugePagesCount(machineInfo)
   201  				},
   202  			},
   203  			{
   204  				name:        "machine_node_distance",
   205  				help:        "Distance between NUMA node and target NUMA node.",
   206  				valueType:   prometheus.GaugeValue,
   207  				extraLabels: []string{prometheusNodeLabelName, prometheusTargetNodeLabelName},
   208  				getValues: func(machineInfo *info.MachineInfo) metricValues {
   209  					return getDistance(machineInfo)
   210  				},
   211  			},
   212  		}...)
   213  	}
   214  	return c
   215  }
   216  
   217  // Describe describes all the machine metrics ever exported by cadvisor. It
   218  // implements prometheus.PrometheusCollector.
   219  func (collector *PrometheusMachineCollector) Describe(ch chan<- *prometheus.Desc) {
   220  	collector.errors.Describe(ch)
   221  	for _, metric := range collector.machineMetrics {
   222  		ch <- metric.desc([]string{})
   223  	}
   224  }
   225  
   226  // Collect fetches information about machine and delivers them as
   227  // Prometheus metrics. It implements prometheus.PrometheusCollector.
   228  func (collector *PrometheusMachineCollector) Collect(ch chan<- prometheus.Metric) {
   229  	collector.errors.Set(0)
   230  	collector.collectMachineInfo(ch)
   231  	collector.errors.Collect(ch)
   232  }
   233  
   234  func (collector *PrometheusMachineCollector) collectMachineInfo(ch chan<- prometheus.Metric) {
   235  	machineInfo, err := collector.infoProvider.GetMachineInfo()
   236  	if err != nil {
   237  		collector.errors.Set(1)
   238  		klog.Warningf("Couldn't get machine info: %s", err)
   239  		return
   240  	}
   241  
   242  	baseLabelsValues := []string{machineInfo.MachineID, machineInfo.SystemUUID, machineInfo.BootID}
   243  
   244  	for _, metric := range collector.machineMetrics {
   245  		if metric.condition != nil && !metric.condition(machineInfo) {
   246  			continue
   247  		}
   248  
   249  		for _, metricValue := range metric.getValues(machineInfo) {
   250  			labelValues := make([]string, len(baseLabelsValues))
   251  			copy(labelValues, baseLabelsValues)
   252  			if len(metric.extraLabels) != 0 {
   253  				labelValues = append(labelValues, metricValue.labels...)
   254  			}
   255  
   256  			prometheusMetric := prometheus.MustNewConstMetric(metric.desc(baseLabelsNames),
   257  				metric.valueType, metricValue.value, labelValues...)
   258  
   259  			if metricValue.timestamp.IsZero() {
   260  				ch <- prometheusMetric
   261  			} else {
   262  				ch <- prometheus.NewMetricWithTimestamp(metricValue.timestamp, prometheusMetric)
   263  			}
   264  		}
   265  
   266  	}
   267  }
   268  
   269  func getMemoryByType(machineInfo *info.MachineInfo, property string) metricValues {
   270  	mValues := make(metricValues, 0, len(machineInfo.MemoryByType))
   271  	for memoryType, memoryInfo := range machineInfo.MemoryByType {
   272  		propertyValue := 0.0
   273  		switch property {
   274  		case memoryByTypeDimmCapacityKey:
   275  			propertyValue = float64(memoryInfo.Capacity)
   276  		case memoryByTypeDimmCountKey:
   277  			propertyValue = float64(memoryInfo.DimmCount)
   278  		default:
   279  			klog.Warningf("Incorrect propery name for MemoryByType, property %s", property)
   280  			return metricValues{}
   281  		}
   282  		mValues = append(mValues, metricValue{value: propertyValue, labels: []string{memoryType}, timestamp: machineInfo.Timestamp})
   283  	}
   284  	return mValues
   285  }
   286  
   287  func getThreadsSiblingsCount(machineInfo *info.MachineInfo) metricValues {
   288  	mValues := make(metricValues, 0, machineInfo.NumCores)
   289  	for _, node := range machineInfo.Topology {
   290  		nodeID := strconv.Itoa(node.Id)
   291  
   292  		for _, core := range node.Cores {
   293  			coreID := strconv.Itoa(core.Id)
   294  			siblingsCount := len(core.Threads)
   295  
   296  			for _, thread := range core.Threads {
   297  				mValues = append(mValues,
   298  					metricValue{
   299  						value:     float64(siblingsCount),
   300  						labels:    []string{nodeID, coreID, strconv.Itoa(thread)},
   301  						timestamp: machineInfo.Timestamp,
   302  					})
   303  			}
   304  		}
   305  	}
   306  	return mValues
   307  }
   308  
   309  func getNodeMemory(machineInfo *info.MachineInfo) metricValues {
   310  	mValues := make(metricValues, 0, len(machineInfo.Topology))
   311  	for _, node := range machineInfo.Topology {
   312  		nodeID := strconv.Itoa(node.Id)
   313  		mValues = append(mValues,
   314  			metricValue{
   315  				value:     float64(node.Memory),
   316  				labels:    []string{nodeID},
   317  				timestamp: machineInfo.Timestamp,
   318  			})
   319  	}
   320  	return mValues
   321  }
   322  
   323  func getHugePagesCount(machineInfo *info.MachineInfo) metricValues {
   324  	mValues := make(metricValues, 0)
   325  	for _, node := range machineInfo.Topology {
   326  		nodeID := strconv.Itoa(node.Id)
   327  
   328  		for _, hugePage := range node.HugePages {
   329  			mValues = append(mValues,
   330  				metricValue{
   331  					value:     float64(hugePage.NumPages),
   332  					labels:    []string{nodeID, strconv.FormatUint(hugePage.PageSize, 10)},
   333  					timestamp: machineInfo.Timestamp,
   334  				})
   335  		}
   336  	}
   337  	return mValues
   338  }
   339  
   340  func getCaches(machineInfo *info.MachineInfo) metricValues {
   341  	mValues := make(metricValues, 0)
   342  	for _, node := range machineInfo.Topology {
   343  		nodeID := strconv.Itoa(node.Id)
   344  
   345  		for _, core := range node.Cores {
   346  			coreID := strconv.Itoa(core.Id)
   347  
   348  			for _, cache := range core.Caches {
   349  				mValues = append(mValues,
   350  					metricValue{
   351  						value:     float64(cache.Size),
   352  						labels:    []string{nodeID, coreID, cache.Type, strconv.Itoa(cache.Level)},
   353  						timestamp: machineInfo.Timestamp,
   354  					})
   355  			}
   356  			for _, cache := range core.UncoreCaches {
   357  				mValues = append(mValues,
   358  					metricValue{
   359  						value:     float64(cache.Size),
   360  						labels:    []string{nodeID, coreID, cache.Type, strconv.Itoa(cache.Level)},
   361  						timestamp: machineInfo.Timestamp,
   362  					})
   363  			}
   364  		}
   365  
   366  		for _, cache := range node.Caches {
   367  			mValues = append(mValues,
   368  				metricValue{
   369  					value:     float64(cache.Size),
   370  					labels:    []string{nodeID, emptyLabelValue, cache.Type, strconv.Itoa(cache.Level)},
   371  					timestamp: machineInfo.Timestamp,
   372  				})
   373  		}
   374  	}
   375  	return mValues
   376  }
   377  
   378  func getDistance(machineInfo *info.MachineInfo) metricValues {
   379  	mValues := make(metricValues, 0, len(machineInfo.Topology)^2)
   380  	for _, node := range machineInfo.Topology {
   381  		nodeID := strconv.Itoa(node.Id)
   382  		for i, target := range node.Distances {
   383  			mValues = append(mValues,
   384  				metricValue{
   385  					value:     float64(target),
   386  					labels:    []string{nodeID, strconv.Itoa(i)},
   387  					timestamp: machineInfo.Timestamp,
   388  				})
   389  		}
   390  	}
   391  	return mValues
   392  }