github.com/google/cadvisor@v0.49.1/metrics/prometheus_machine.go (about) 1 // Copyright 2020 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package metrics 16 17 import ( 18 "strconv" 19 20 "github.com/prometheus/client_golang/prometheus" 21 22 "github.com/google/cadvisor/container" 23 info "github.com/google/cadvisor/info/v1" 24 25 "k8s.io/klog/v2" 26 ) 27 28 var baseLabelsNames = []string{"machine_id", "system_uuid", "boot_id"} 29 30 const ( 31 prometheusModeLabelName = "mode" 32 prometheusTypeLabelName = "type" 33 prometheusLevelLabelName = "level" 34 prometheusNodeLabelName = "node_id" 35 prometheusCoreLabelName = "core_id" 36 prometheusThreadLabelName = "thread_id" 37 prometheusPageSizeLabelName = "page_size" 38 prometheusTargetNodeLabelName = "target_node_id" 39 40 nvmMemoryMode = "memory_mode" 41 nvmAppDirectMode = "app_direct_mode" 42 43 memoryByTypeDimmCountKey = "DimmCount" 44 memoryByTypeDimmCapacityKey = "Capacity" 45 46 emptyLabelValue = "" 47 ) 48 49 // machineMetric describes a multi-dimensional metric used for exposing a 50 // certain type of machine statistic. 51 type machineMetric struct { 52 name string 53 help string 54 valueType prometheus.ValueType 55 extraLabels []string 56 condition func(machineInfo *info.MachineInfo) bool 57 getValues func(machineInfo *info.MachineInfo) metricValues 58 } 59 60 func (metric *machineMetric) desc(baseLabels []string) *prometheus.Desc { 61 return prometheus.NewDesc(metric.name, metric.help, append(baseLabels, metric.extraLabels...), nil) 62 } 63 64 // PrometheusMachineCollector implements prometheus.Collector. 65 type PrometheusMachineCollector struct { 66 infoProvider infoProvider 67 errors prometheus.Gauge 68 machineMetrics []machineMetric 69 } 70 71 // NewPrometheusMachineCollector returns a new PrometheusCollector. 72 func NewPrometheusMachineCollector(i infoProvider, includedMetrics container.MetricSet) *PrometheusMachineCollector { 73 c := &PrometheusMachineCollector{ 74 75 infoProvider: i, 76 errors: prometheus.NewGauge(prometheus.GaugeOpts{ 77 Namespace: "machine", 78 Name: "scrape_error", 79 Help: "1 if there was an error while getting machine metrics, 0 otherwise.", 80 }), 81 machineMetrics: []machineMetric{ 82 { 83 name: "machine_cpu_physical_cores", 84 help: "Number of physical CPU cores.", 85 valueType: prometheus.GaugeValue, 86 getValues: func(machineInfo *info.MachineInfo) metricValues { 87 return metricValues{{value: float64(machineInfo.NumPhysicalCores), timestamp: machineInfo.Timestamp}} 88 }, 89 }, 90 { 91 name: "machine_cpu_cores", 92 help: "Number of logical CPU cores.", 93 valueType: prometheus.GaugeValue, 94 getValues: func(machineInfo *info.MachineInfo) metricValues { 95 return metricValues{{value: float64(machineInfo.NumCores), timestamp: machineInfo.Timestamp}} 96 }, 97 }, 98 { 99 name: "machine_cpu_sockets", 100 help: "Number of CPU sockets.", 101 valueType: prometheus.GaugeValue, 102 getValues: func(machineInfo *info.MachineInfo) metricValues { 103 return metricValues{{value: float64(machineInfo.NumSockets), timestamp: machineInfo.Timestamp}} 104 }, 105 }, 106 { 107 name: "machine_memory_bytes", 108 help: "Amount of memory installed on the machine.", 109 valueType: prometheus.GaugeValue, 110 getValues: func(machineInfo *info.MachineInfo) metricValues { 111 return metricValues{{value: float64(machineInfo.MemoryCapacity), timestamp: machineInfo.Timestamp}} 112 }, 113 }, 114 { 115 name: "machine_swap_bytes", 116 help: "Amount of swap memory available on the machine.", 117 valueType: prometheus.GaugeValue, 118 getValues: func(machineInfo *info.MachineInfo) metricValues { 119 return metricValues{{value: float64(machineInfo.SwapCapacity), timestamp: machineInfo.Timestamp}} 120 }, 121 }, 122 { 123 name: "machine_dimm_count", 124 help: "Number of RAM DIMM (all types memory modules) value labeled by dimm type.", 125 valueType: prometheus.GaugeValue, 126 extraLabels: []string{prometheusTypeLabelName}, 127 condition: func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 }, 128 getValues: func(machineInfo *info.MachineInfo) metricValues { 129 return getMemoryByType(machineInfo, memoryByTypeDimmCountKey) 130 }, 131 }, 132 { 133 name: "machine_dimm_capacity_bytes", 134 help: "Total RAM DIMM capacity (all types memory modules) value labeled by dimm type.", 135 valueType: prometheus.GaugeValue, 136 extraLabels: []string{prometheusTypeLabelName}, 137 condition: func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 }, 138 getValues: func(machineInfo *info.MachineInfo) metricValues { 139 return getMemoryByType(machineInfo, memoryByTypeDimmCapacityKey) 140 }, 141 }, 142 { 143 name: "machine_nvm_capacity", 144 help: "NVM capacity value labeled by NVM mode (memory mode or app direct mode).", 145 valueType: prometheus.GaugeValue, 146 extraLabels: []string{prometheusModeLabelName}, 147 getValues: func(machineInfo *info.MachineInfo) metricValues { 148 return metricValues{ 149 {value: float64(machineInfo.NVMInfo.MemoryModeCapacity), labels: []string{nvmMemoryMode}, timestamp: machineInfo.Timestamp}, 150 {value: float64(machineInfo.NVMInfo.AppDirectModeCapacity), labels: []string{nvmAppDirectMode}, timestamp: machineInfo.Timestamp}, 151 } 152 }, 153 }, 154 { 155 name: "machine_nvm_avg_power_budget_watts", 156 help: "NVM power budget.", 157 valueType: prometheus.GaugeValue, 158 getValues: func(machineInfo *info.MachineInfo) metricValues { 159 return metricValues{{value: float64(machineInfo.NVMInfo.AvgPowerBudget), timestamp: machineInfo.Timestamp}} 160 }, 161 }, 162 }, 163 } 164 165 if includedMetrics.Has(container.CPUTopologyMetrics) { 166 c.machineMetrics = append(c.machineMetrics, []machineMetric{ 167 { 168 name: "machine_cpu_cache_capacity_bytes", 169 help: "Cache size in bytes assigned to NUMA node and CPU core.", 170 valueType: prometheus.GaugeValue, 171 extraLabels: []string{prometheusNodeLabelName, prometheusCoreLabelName, prometheusTypeLabelName, prometheusLevelLabelName}, 172 getValues: func(machineInfo *info.MachineInfo) metricValues { 173 return getCaches(machineInfo) 174 }, 175 }, 176 { 177 name: "machine_thread_siblings_count", 178 help: "Number of CPU thread siblings.", 179 valueType: prometheus.GaugeValue, 180 extraLabels: []string{prometheusNodeLabelName, prometheusCoreLabelName, prometheusThreadLabelName}, 181 getValues: func(machineInfo *info.MachineInfo) metricValues { 182 return getThreadsSiblingsCount(machineInfo) 183 }, 184 }, 185 { 186 name: "machine_node_memory_capacity_bytes", 187 help: "Amount of memory assigned to NUMA node.", 188 valueType: prometheus.GaugeValue, 189 extraLabels: []string{prometheusNodeLabelName}, 190 getValues: func(machineInfo *info.MachineInfo) metricValues { 191 return getNodeMemory(machineInfo) 192 }, 193 }, 194 { 195 name: "machine_node_hugepages_count", 196 help: "Numer of hugepages assigned to NUMA node.", 197 valueType: prometheus.GaugeValue, 198 extraLabels: []string{prometheusNodeLabelName, prometheusPageSizeLabelName}, 199 getValues: func(machineInfo *info.MachineInfo) metricValues { 200 return getHugePagesCount(machineInfo) 201 }, 202 }, 203 { 204 name: "machine_node_distance", 205 help: "Distance between NUMA node and target NUMA node.", 206 valueType: prometheus.GaugeValue, 207 extraLabels: []string{prometheusNodeLabelName, prometheusTargetNodeLabelName}, 208 getValues: func(machineInfo *info.MachineInfo) metricValues { 209 return getDistance(machineInfo) 210 }, 211 }, 212 }...) 213 } 214 return c 215 } 216 217 // Describe describes all the machine metrics ever exported by cadvisor. It 218 // implements prometheus.PrometheusCollector. 219 func (collector *PrometheusMachineCollector) Describe(ch chan<- *prometheus.Desc) { 220 collector.errors.Describe(ch) 221 for _, metric := range collector.machineMetrics { 222 ch <- metric.desc([]string{}) 223 } 224 } 225 226 // Collect fetches information about machine and delivers them as 227 // Prometheus metrics. It implements prometheus.PrometheusCollector. 228 func (collector *PrometheusMachineCollector) Collect(ch chan<- prometheus.Metric) { 229 collector.errors.Set(0) 230 collector.collectMachineInfo(ch) 231 collector.errors.Collect(ch) 232 } 233 234 func (collector *PrometheusMachineCollector) collectMachineInfo(ch chan<- prometheus.Metric) { 235 machineInfo, err := collector.infoProvider.GetMachineInfo() 236 if err != nil { 237 collector.errors.Set(1) 238 klog.Warningf("Couldn't get machine info: %s", err) 239 return 240 } 241 242 baseLabelsValues := []string{machineInfo.MachineID, machineInfo.SystemUUID, machineInfo.BootID} 243 244 for _, metric := range collector.machineMetrics { 245 if metric.condition != nil && !metric.condition(machineInfo) { 246 continue 247 } 248 249 for _, metricValue := range metric.getValues(machineInfo) { 250 labelValues := make([]string, len(baseLabelsValues)) 251 copy(labelValues, baseLabelsValues) 252 if len(metric.extraLabels) != 0 { 253 labelValues = append(labelValues, metricValue.labels...) 254 } 255 256 prometheusMetric := prometheus.MustNewConstMetric(metric.desc(baseLabelsNames), 257 metric.valueType, metricValue.value, labelValues...) 258 259 if metricValue.timestamp.IsZero() { 260 ch <- prometheusMetric 261 } else { 262 ch <- prometheus.NewMetricWithTimestamp(metricValue.timestamp, prometheusMetric) 263 } 264 } 265 266 } 267 } 268 269 func getMemoryByType(machineInfo *info.MachineInfo, property string) metricValues { 270 mValues := make(metricValues, 0, len(machineInfo.MemoryByType)) 271 for memoryType, memoryInfo := range machineInfo.MemoryByType { 272 propertyValue := 0.0 273 switch property { 274 case memoryByTypeDimmCapacityKey: 275 propertyValue = float64(memoryInfo.Capacity) 276 case memoryByTypeDimmCountKey: 277 propertyValue = float64(memoryInfo.DimmCount) 278 default: 279 klog.Warningf("Incorrect propery name for MemoryByType, property %s", property) 280 return metricValues{} 281 } 282 mValues = append(mValues, metricValue{value: propertyValue, labels: []string{memoryType}, timestamp: machineInfo.Timestamp}) 283 } 284 return mValues 285 } 286 287 func getThreadsSiblingsCount(machineInfo *info.MachineInfo) metricValues { 288 mValues := make(metricValues, 0, machineInfo.NumCores) 289 for _, node := range machineInfo.Topology { 290 nodeID := strconv.Itoa(node.Id) 291 292 for _, core := range node.Cores { 293 coreID := strconv.Itoa(core.Id) 294 siblingsCount := len(core.Threads) 295 296 for _, thread := range core.Threads { 297 mValues = append(mValues, 298 metricValue{ 299 value: float64(siblingsCount), 300 labels: []string{nodeID, coreID, strconv.Itoa(thread)}, 301 timestamp: machineInfo.Timestamp, 302 }) 303 } 304 } 305 } 306 return mValues 307 } 308 309 func getNodeMemory(machineInfo *info.MachineInfo) metricValues { 310 mValues := make(metricValues, 0, len(machineInfo.Topology)) 311 for _, node := range machineInfo.Topology { 312 nodeID := strconv.Itoa(node.Id) 313 mValues = append(mValues, 314 metricValue{ 315 value: float64(node.Memory), 316 labels: []string{nodeID}, 317 timestamp: machineInfo.Timestamp, 318 }) 319 } 320 return mValues 321 } 322 323 func getHugePagesCount(machineInfo *info.MachineInfo) metricValues { 324 mValues := make(metricValues, 0) 325 for _, node := range machineInfo.Topology { 326 nodeID := strconv.Itoa(node.Id) 327 328 for _, hugePage := range node.HugePages { 329 mValues = append(mValues, 330 metricValue{ 331 value: float64(hugePage.NumPages), 332 labels: []string{nodeID, strconv.FormatUint(hugePage.PageSize, 10)}, 333 timestamp: machineInfo.Timestamp, 334 }) 335 } 336 } 337 return mValues 338 } 339 340 func getCaches(machineInfo *info.MachineInfo) metricValues { 341 mValues := make(metricValues, 0) 342 for _, node := range machineInfo.Topology { 343 nodeID := strconv.Itoa(node.Id) 344 345 for _, core := range node.Cores { 346 coreID := strconv.Itoa(core.Id) 347 348 for _, cache := range core.Caches { 349 mValues = append(mValues, 350 metricValue{ 351 value: float64(cache.Size), 352 labels: []string{nodeID, coreID, cache.Type, strconv.Itoa(cache.Level)}, 353 timestamp: machineInfo.Timestamp, 354 }) 355 } 356 for _, cache := range core.UncoreCaches { 357 mValues = append(mValues, 358 metricValue{ 359 value: float64(cache.Size), 360 labels: []string{nodeID, coreID, cache.Type, strconv.Itoa(cache.Level)}, 361 timestamp: machineInfo.Timestamp, 362 }) 363 } 364 } 365 366 for _, cache := range node.Caches { 367 mValues = append(mValues, 368 metricValue{ 369 value: float64(cache.Size), 370 labels: []string{nodeID, emptyLabelValue, cache.Type, strconv.Itoa(cache.Level)}, 371 timestamp: machineInfo.Timestamp, 372 }) 373 } 374 } 375 return mValues 376 } 377 378 func getDistance(machineInfo *info.MachineInfo) metricValues { 379 mValues := make(metricValues, 0, len(machineInfo.Topology)^2) 380 for _, node := range machineInfo.Topology { 381 nodeID := strconv.Itoa(node.Id) 382 for i, target := range node.Distances { 383 mValues = append(mValues, 384 metricValue{ 385 value: float64(target), 386 labels: []string{nodeID, strconv.Itoa(i)}, 387 timestamp: machineInfo.Timestamp, 388 }) 389 } 390 } 391 return mValues 392 }