github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/metrics-resource.go

// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"fmt"
	"math"
	"net/http"
	"sync"
	"time"

	"github.com/minio/madmin-go/v3"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	resourceMetricsCollectionInterval = time.Minute
	resourceMetricsCacheInterval      = time.Minute

	// drive stats
	totalInodes    MetricName = "total_inodes"
	readsPerSec    MetricName = "reads_per_sec"
	writesPerSec   MetricName = "writes_per_sec"
	readsKBPerSec  MetricName = "reads_kb_per_sec"
	writesKBPerSec MetricName = "writes_kb_per_sec"
	readsAwait     MetricName = "reads_await"
	writesAwait    MetricName = "writes_await"
	percUtil       MetricName = "perc_util"
	usedInodes     MetricName = "used_inodes"

	// network stats
	interfaceRxBytes  MetricName = "rx_bytes"
	interfaceRxErrors MetricName = "rx_errors"
	interfaceTxBytes  MetricName = "tx_bytes"
	interfaceTxErrors MetricName = "tx_errors"

	// memory stats
	memUsed      MetricName = "used"
	memUsedPerc  MetricName = "used_perc"
	memFree      MetricName = "free"
	memShared    MetricName = "shared"
	memBuffers   MetricName = "buffers"
	memCache     MetricName = "cache"
	memAvailable MetricName = "available"

	// cpu stats
	cpuUser       MetricName = "user"
	cpuSystem     MetricName = "system"
	cpuIOWait     MetricName = "iowait"
	cpuIdle       MetricName = "idle"
	cpuNice       MetricName = "nice"
	cpuSteal      MetricName = "steal"
	cpuLoad1      MetricName = "load1"
	cpuLoad5      MetricName = "load5"
	cpuLoad15     MetricName = "load15"
	cpuLoad1Perc  MetricName = "load1_perc"
	cpuLoad5Perc  MetricName = "load5_perc"
	cpuLoad15Perc MetricName = "load15_perc"
)

var (
	resourceCollector *minioResourceCollector
	// resourceMetricsMap is a map of subsystem to its metrics
	resourceMetricsMap   map[MetricSubsystem]ResourceMetrics
	resourceMetricsMapMu sync.RWMutex
	// resourceMetricsHelpMap maps a metric name to its help string
	resourceMetricsHelpMap map[MetricName]string
	resourceMetricsGroups  []*MetricsGroupV2
	// latestDriveStats holds the most recently recorded per-drive IO stats
	// (seeded at server startup); used for calculating deltas and
	// average values for drive metrics
	latestDriveStats      map[string]madmin.DiskIOStats
	latestDriveStatsMu    sync.RWMutex
	lastDriveStatsRefresh time.Time
)

// PeerResourceMetrics represents the resource metrics
// retrieved from a peer, along with errors if any
type PeerResourceMetrics struct {
	Metrics map[MetricSubsystem]ResourceMetrics
	Errors  []string
}
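
// Illustrative sketch (not part of the build): the shape of one populated
// entry in a peer's ResourceMetrics map, using the ResourceMetric type
// defined just below. All values are invented.
//
//	pm := PeerResourceMetrics{
//		Metrics: map[MetricSubsystem]ResourceMetrics{
//			driveSubsystem: {
//				"reads_per_sec_/mnt/drive1": ResourceMetric{
//					Name:    readsPerSec,
//					Labels:  map[string]string{"drive": "/mnt/drive1"},
//					Current: 12.5, Max: 40, Avg: 18.2, Sum: 910, Count: 50,
//				},
//			},
//		},
//	}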

// ResourceMetrics is a map from a unique key identifying
// a resource metric (e.g. reads_per_sec_{node}_{drive})
// to its data
type ResourceMetrics map[string]ResourceMetric

// ResourceMetric represents a single resource metric.
// The metrics are collected from all servers periodically
// and stored in the resource metrics map.
// It also maintains the number of times this metric
// was collected since the server started, and the sum,
// average and max values across those collections.
type ResourceMetric struct {
	Name   MetricName
	Labels map[string]string

	// value captured in current cycle
	Current float64

	// Used when the system provides cumulative (since uptime) values;
	// helps in calculating the current value by comparing the new
	// cumulative value with the previous one
	Cumulative float64

	Max   float64
	Avg   float64
	Sum   float64
	Count uint64
}

func init() {
	interval := fmt.Sprintf("%ds", int(resourceMetricsCollectionInterval.Seconds()))
	resourceMetricsHelpMap = map[MetricName]string{
		interfaceRxBytes:  "Bytes received on the interface in " + interval,
		interfaceRxErrors: "Receive errors in " + interval,
		interfaceTxBytes:  "Bytes transmitted in " + interval,
		interfaceTxErrors: "Transmit errors in " + interval,
		total:             "Total memory on the node",
		memUsed:           "Used memory on the node",
		memUsedPerc:       "Used memory percentage on the node",
		memFree:           "Free memory on the node",
		memShared:         "Shared memory on the node",
		memBuffers:        "Buffers memory on the node",
		memCache:          "Cache memory on the node",
		memAvailable:      "Available memory on the node",
		readsPerSec:       "Reads per second on a drive",
		writesPerSec:      "Writes per second on a drive",
		readsKBPerSec:     "Kilobytes read per second on a drive",
		writesKBPerSec:    "Kilobytes written per second on a drive",
		readsAwait:        "Average time for read requests to be served on a drive",
		writesAwait:       "Average time for write requests to be served on a drive",
		percUtil:          "Percentage of time the disk was busy",
		usedBytes:         "Used bytes on a drive",
		totalBytes:        "Total bytes on a drive",
		usedInodes:        "Total inodes used on a drive",
		totalInodes:       "Total inodes on a drive",
		cpuUser:           "CPU user time",
		cpuSystem:         "CPU system time",
		cpuIdle:           "CPU idle time",
		cpuIOWait:         "CPU ioWait time",
		cpuSteal:          "CPU steal time",
		cpuNice:           "CPU nice time",
		cpuLoad1:          "CPU load average 1min",
		cpuLoad5:          "CPU load average 5min",
		cpuLoad15:         "CPU load average 15min",
		cpuLoad1Perc:      "CPU load average 1min (percentage)",
		cpuLoad5Perc:      "CPU load average 5min (percentage)",
		cpuLoad15Perc:     "CPU load average 15min (percentage)",
	}
	resourceMetricsGroups = []*MetricsGroupV2{
		getResourceMetrics(),
	}

	resourceCollector = newMinioResourceCollector(resourceMetricsGroups)
}
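
// Illustrative sketch (not part of the build): a single update call and the
// map key it produces inside resourceMetricsMap. Values are invented.
//
//	labels := map[string]string{"drive": "/mnt/drive1"}
//	updateResourceMetrics(driveSubsystem, readsPerSec, 12.5, labels, false)
//	// stored as resourceMetricsMap[driveSubsystem]["reads_per_sec_/mnt/drive1"]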

func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64, labels map[string]string, isCumulative bool) {
	resourceMetricsMapMu.Lock()
	defer resourceMetricsMapMu.Unlock()
	subsysMetrics, found := resourceMetricsMap[subSys]
	if !found {
		subsysMetrics = ResourceMetrics{}
	}

	// labels are used to uniquely identify a metric inside the map,
	// e.g. reads_per_sec_{drive}.
	// Note: map iteration order is not deterministic, so this relies on
	// callers passing at most one label per metric (as they do today).
	sfx := ""
	for _, v := range labels {
		if len(sfx) > 0 {
			sfx += "_"
		}
		sfx += v
	}

	key := string(name) + "_" + sfx
	metric, found := subsysMetrics[key]
	if !found {
		metric = ResourceMetric{
			Name:   name,
			Labels: labels,
		}
	}

	if isCumulative {
		metric.Current = val - metric.Cumulative
		metric.Cumulative = val
	} else {
		metric.Current = val
	}

	if metric.Current > metric.Max {
		// track the max of the per-cycle value, not the raw input: for
		// cumulative metrics val is the ever-growing counter, which
		// would otherwise dwarf the per-cycle maximum
		metric.Max = metric.Current
	}

	metric.Sum += metric.Current
	metric.Count++

	metric.Avg = metric.Sum / float64(metric.Count)
	subsysMetrics[key] = metric

	resourceMetricsMap[subSys] = subsysMetrics
}

// updateDriveIOStats - updates the drive IO stats by calculating the
// difference between the current and the previously recorded values.
func updateDriveIOStats(currentStats madmin.DiskIOStats, latestStats madmin.DiskIOStats, labels map[string]string) {
	sectorSize := uint64(512)
	kib := float64(1 << 10)
	diffInSeconds := time.Now().UTC().Sub(lastDriveStatsRefresh).Seconds()
	if diffInSeconds == 0 {
		// too soon to update the stats
		return
	}
	diffStats := madmin.DiskIOStats{
		ReadIOs:      currentStats.ReadIOs - latestStats.ReadIOs,
		WriteIOs:     currentStats.WriteIOs - latestStats.WriteIOs,
		ReadTicks:    currentStats.ReadTicks - latestStats.ReadTicks,
		WriteTicks:   currentStats.WriteTicks - latestStats.WriteTicks,
		TotalTicks:   currentStats.TotalTicks - latestStats.TotalTicks,
		ReadSectors:  currentStats.ReadSectors - latestStats.ReadSectors,
		WriteSectors: currentStats.WriteSectors - latestStats.WriteSectors,
	}

	updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/diffInSeconds, labels, false)
	readKib := float64(diffStats.ReadSectors*sectorSize) / kib
	updateResourceMetrics(driveSubsystem, readsKBPerSec, readKib/diffInSeconds, labels, false)

	updateResourceMetrics(driveSubsystem, writesPerSec, float64(diffStats.WriteIOs)/diffInSeconds, labels, false)
	writeKib := float64(diffStats.WriteSectors*sectorSize) / kib
	updateResourceMetrics(driveSubsystem, writesKBPerSec, writeKib/diffInSeconds, labels, false)

	rdAwait := 0.0
	if diffStats.ReadIOs > 0 {
		rdAwait = float64(diffStats.ReadTicks) / float64(diffStats.ReadIOs)
	}
	updateResourceMetrics(driveSubsystem, readsAwait, rdAwait, labels, false)

	wrAwait := 0.0
	if diffStats.WriteIOs > 0 {
		wrAwait = float64(diffStats.WriteTicks) / float64(diffStats.WriteIOs)
	}
	updateResourceMetrics(driveSubsystem, writesAwait, wrAwait, labels, false)

	// TotalTicks is in milliseconds; milliseconds busy per elapsed second,
	// divided by 10, yields the percentage of time the drive was busy
	updateResourceMetrics(driveSubsystem, percUtil, float64(diffStats.TotalTicks)/(diffInSeconds*10), labels, false)
}
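
// Worked example for the perc_util math above (invented numbers): over a 60s
// collection window, a TotalTicks delta of 30000 (milliseconds of drive
// activity) corresponds to a drive that was busy half the time:
//
//	30000 / (60 * 10) = 50.0 // percent utilization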

func collectDriveMetrics(m madmin.RealtimeMetrics) {
	latestDriveStatsMu.Lock()
	for d, dm := range m.ByDisk {
		labels := map[string]string{"drive": d}
		latestStats, ok := latestDriveStats[d]
		if !ok {
			latestDriveStats[d] = dm.IOStats
			continue
		}
		updateDriveIOStats(dm.IOStats, latestStats, labels)
		latestDriveStats[d] = dm.IOStats
	}
	lastDriveStatsRefresh = time.Now().UTC()
	latestDriveStatsMu.Unlock()

	globalLocalDrivesMu.RLock()
	localDrives := cloneDrives(globalLocalDrives)
	globalLocalDrivesMu.RUnlock()

	for _, d := range localDrives {
		di, err := d.DiskInfo(GlobalContext, DiskInfoOptions{})
		labels := map[string]string{"drive": di.Endpoint}
		if err == nil {
			updateResourceMetrics(driveSubsystem, usedBytes, float64(di.Used), labels, false)
			updateResourceMetrics(driveSubsystem, totalBytes, float64(di.Total), labels, false)
			updateResourceMetrics(driveSubsystem, usedInodes, float64(di.UsedInodes), labels, false)
			updateResourceMetrics(driveSubsystem, totalInodes, float64(di.FreeInodes+di.UsedInodes), labels, false)
		}
	}
}
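
// Illustrative sketch (not part of the build): how the cumulative counters in
// the network path below become per-interval values. With isCumulative=true,
// updateResourceMetrics reports the delta between successive readings of a
// counter that only ever grows. Numbers are invented.
//
//	labels := map[string]string{"interface": "eth0"}
//	updateResourceMetrics(interfaceSubsystem, interfaceRxBytes, 1000, labels, true) // Current=1000, Cumulative=1000
//	updateResourceMetrics(interfaceSubsystem, interfaceRxBytes, 1800, labels, true) // Current=800, Cumulative=1800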

func collectLocalResourceMetrics() {
	var types madmin.MetricType = madmin.MetricsDisk | madmin.MetricNet | madmin.MetricsMem | madmin.MetricsCPU

	m := collectLocalMetrics(types, collectMetricsOpts{
		hosts: map[string]struct{}{
			globalLocalNodeName: {},
		},
	})

	for host, hm := range m.ByHost {
		if len(host) > 0 {
			if hm.Net != nil && len(hm.Net.NetStats.Name) > 0 {
				stats := hm.Net.NetStats
				labels := map[string]string{"interface": stats.Name}
				updateResourceMetrics(interfaceSubsystem, interfaceRxBytes, float64(stats.RxBytes), labels, true)
				updateResourceMetrics(interfaceSubsystem, interfaceRxErrors, float64(stats.RxErrors), labels, true)
				updateResourceMetrics(interfaceSubsystem, interfaceTxBytes, float64(stats.TxBytes), labels, true)
				updateResourceMetrics(interfaceSubsystem, interfaceTxErrors, float64(stats.TxErrors), labels, true)
			}
			if hm.Mem != nil && len(hm.Mem.Info.Addr) > 0 {
				labels := map[string]string{}
				stats := hm.Mem.Info
				updateResourceMetrics(memSubsystem, total, float64(stats.Total), labels, false)
				updateResourceMetrics(memSubsystem, memUsed, float64(stats.Used), labels, false)
				// used percentage, rounded to two decimal places
				perc := math.Round(float64(stats.Used*100*100)/float64(stats.Total)) / 100
				updateResourceMetrics(memSubsystem, memUsedPerc, perc, labels, false)
				updateResourceMetrics(memSubsystem, memFree, float64(stats.Free), labels, false)
				updateResourceMetrics(memSubsystem, memShared, float64(stats.Shared), labels, false)
				updateResourceMetrics(memSubsystem, memBuffers, float64(stats.Buffers), labels, false)
				updateResourceMetrics(memSubsystem, memAvailable, float64(stats.Available), labels, false)
				updateResourceMetrics(memSubsystem, memCache, float64(stats.Cache), labels, false)
			}
			if hm.CPU != nil {
				labels := map[string]string{}
				ts := hm.CPU.TimesStat
				if ts != nil {
					// each CPU time bucket as a percentage of the
					// total, rounded to two decimal places
					tot := ts.User + ts.System + ts.Idle + ts.Iowait + ts.Nice + ts.Steal
					cpuUserVal := math.Round(ts.User/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuUser, cpuUserVal, labels, false)
					cpuSystemVal := math.Round(ts.System/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuSystem, cpuSystemVal, labels, false)
					cpuIdleVal := math.Round(ts.Idle/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuIdle, cpuIdleVal, labels, false)
					cpuIOWaitVal := math.Round(ts.Iowait/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuIOWait, cpuIOWaitVal, labels, false)
					cpuNiceVal := math.Round(ts.Nice/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuNice, cpuNiceVal, labels, false)
					cpuStealVal := math.Round(ts.Steal/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuSteal, cpuStealVal, labels, false)
				}
				ls := hm.CPU.LoadStat
				if ls != nil {
					updateResourceMetrics(cpuSubsystem, cpuLoad1, ls.Load1, labels, false)
					updateResourceMetrics(cpuSubsystem, cpuLoad5, ls.Load5, labels, false)
					updateResourceMetrics(cpuSubsystem, cpuLoad15, ls.Load15, labels, false)
					if hm.CPU.CPUCount > 0 {
						// load averages as a percentage of available CPUs
						perc := math.Round(ls.Load1*100*100/float64(hm.CPU.CPUCount)) / 100
						updateResourceMetrics(cpuSubsystem, cpuLoad1Perc, perc, labels, false)
						perc = math.Round(ls.Load5*100*100/float64(hm.CPU.CPUCount)) / 100
						updateResourceMetrics(cpuSubsystem, cpuLoad5Perc, perc, labels, false)
						perc = math.Round(ls.Load15*100*100/float64(hm.CPU.CPUCount)) / 100
						updateResourceMetrics(cpuSubsystem, cpuLoad15Perc, perc, labels, false)
					}
				}
			}
			break // only one host expected
		}
	}

	collectDriveMetrics(m)
}

func initLatestValues() {
	m := collectLocalMetrics(madmin.MetricsDisk, collectMetricsOpts{
		hosts: map[string]struct{}{
			globalLocalNodeName: {},
		},
	})

	latestDriveStatsMu.Lock()
	latestDriveStats = map[string]madmin.DiskIOStats{}
	for d, dm := range m.ByDisk {
		latestDriveStats[d] = dm.IOStats
	}
	lastDriveStatsRefresh = time.Now().UTC()
	latestDriveStatsMu.Unlock()
}

// startResourceMetricsCollection - starts the job for collecting resource metrics
func startResourceMetricsCollection() {
	initLatestValues()

	resourceMetricsMapMu.Lock()
	resourceMetricsMap = map[MetricSubsystem]ResourceMetrics{}
	resourceMetricsMapMu.Unlock()
	metricsTimer := time.NewTimer(resourceMetricsCollectionInterval)
	defer metricsTimer.Stop()

	collectLocalResourceMetrics()

	for {
		select {
		case <-GlobalContext.Done():
			return
		case <-metricsTimer.C:
			collectLocalResourceMetrics()

			// Reset the timer for the next cycle.
			metricsTimer.Reset(resourceMetricsCollectionInterval)
		}
	}
}

// minioResourceCollector is the Collector for resource metrics
type minioResourceCollector struct {
	metricsGroups []*MetricsGroupV2
	desc          *prometheus.Desc
}

// Describe sends the super-set of all possible descriptors of metrics
func (c *minioResourceCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.desc
}
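
// Usage sketch (illustrative, not part of the build): how a collector like
// this one is typically exposed via a dedicated registry. metricsResourceHandler
// at the bottom of this file is the server's own wiring, via metricsHTTPHandler.
//
//	import "github.com/prometheus/client_golang/prometheus/promhttp"
//
//	registry := prometheus.NewRegistry()
//	if err := registry.Register(resourceCollector); err != nil {
//		// handle duplicate registration, etc.
//	}
//	handler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{})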

// Collect is called by the Prometheus registry when collecting metrics.
func (c *minioResourceCollector) Collect(out chan<- prometheus.Metric) {
	var wg sync.WaitGroup
	publish := func(in <-chan MetricV2) {
		defer wg.Done()
		for metric := range in {
			labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
			collectMetric(metric, labels, values, "resource", out)
		}
	}

	// Publish local metrics and call the peer API to fetch metrics from
	// other nodes, fanning both into the output channel.
	wg.Add(2)
	go publish(ReportMetrics(GlobalContext, c.metricsGroups))
	go publish(globalNotificationSys.GetResourceMetrics(GlobalContext))
	wg.Wait()
}

// newMinioResourceCollector describes the collector
// and returns a reference to the minio resource Collector.
// It creates the Prometheus Description which is used
// to define the Metric and the help string.
func newMinioResourceCollector(metricsGroups []*MetricsGroupV2) *minioResourceCollector {
	return &minioResourceCollector{
		metricsGroups: metricsGroups,
		desc:          prometheus.NewDesc("minio_resource_stats", "Resource statistics exposed by MinIO server", nil, nil),
	}
}

func prepareResourceMetrics(rm ResourceMetric, subSys MetricSubsystem, requireAvgMax bool) []MetricV2 {
	help := resourceMetricsHelpMap[rm.Name]
	name := rm.Name
	metrics := make([]MetricV2, 0, 3)
	metrics = append(metrics, MetricV2{
		Description:    getResourceMetricDescription(subSys, name, help),
		Value:          rm.Current,
		VariableLabels: cloneMSS(rm.Labels),
	})

	if requireAvgMax {
		avgName := MetricName(fmt.Sprintf("%s_avg", name))
		avgHelp := fmt.Sprintf("%s (avg)", help)
		metrics = append(metrics, MetricV2{
			Description:    getResourceMetricDescription(subSys, avgName, avgHelp),
			Value:          math.Round(rm.Avg*100) / 100,
			VariableLabels: cloneMSS(rm.Labels),
		})

		maxName := MetricName(fmt.Sprintf("%s_max", name))
		maxHelp := fmt.Sprintf("%s (max)", help)
		metrics = append(metrics, MetricV2{
			Description:    getResourceMetricDescription(subSys, maxName, maxHelp),
			Value:          rm.Max,
			VariableLabels: cloneMSS(rm.Labels),
		})
	}

	return metrics
}

func getResourceMetricDescription(subSys MetricSubsystem, name MetricName, help string) MetricDescription {
	return MetricDescription{
		Namespace: nodeMetricNamespace,
		Subsystem: subSys,
		Name:      name,
		Help:      help,
		Type:      gaugeMetric,
	}
}

func getResourceMetrics() *MetricsGroupV2 {
	mg := &MetricsGroupV2{
		cacheInterval: resourceMetricsCacheInterval,
	}
	mg.RegisterRead(func(ctx context.Context) []MetricV2 {
		metrics := []MetricV2{}

		subSystems := []MetricSubsystem{interfaceSubsystem, memSubsystem, driveSubsystem, cpuSubsystem}
		resourceMetricsMapMu.RLock()
		defer resourceMetricsMapMu.RUnlock()
		for _, subSys := range subSystems {
			stats, found := resourceMetricsMap[subSys]
			if found {
				// the drive subsystem does not export the derived
				// avg/max series
				requireAvgMax := subSys != driveSubsystem
				for _, m := range stats {
					metrics = append(metrics, prepareResourceMetrics(m, subSys, requireAvgMax)...)
				}
			}
		}

		return metrics
	})
	return mg
}

// metricsResourceHandler is the prometheus handler for resource metrics
func metricsResourceHandler() http.Handler {
	return metricsHTTPHandler(resourceCollector, "handler.MetricsResource")
}
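
// Illustrative scrape output (invented values): assuming nodeMetricNamespace
// renders as "minio_node" and the exporter adds a server label, a memory
// metric and the avg/max companions built by prepareResourceMetrics would
// appear on the endpoint roughly as:
//
//	minio_node_mem_used_perc{server="node1:9000"} 34.12
//	minio_node_mem_used_perc_avg{server="node1:9000"} 33.80
//	minio_node_mem_used_perc_max{server="node1:9000"} 41.25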