storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/metrics-v2.go (about) 1 /* 2 * MinIO Cloud Storage, (C) 2018-2020 MinIO, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package cmd 18 19 import ( 20 "context" 21 "fmt" 22 "net/http" 23 "runtime" 24 "strings" 25 "sync" 26 "time" 27 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/prometheus/client_golang/prometheus/promhttp" 30 dto "github.com/prometheus/client_model/go" 31 "github.com/prometheus/procfs" 32 33 "storj.io/minio/cmd/logger" 34 ) 35 36 // MetricNamespace is top level grouping of metrics to create the metric name. 37 type MetricNamespace string 38 39 // MetricSubsystem is the sub grouping for metrics within a namespace. 40 type MetricSubsystem string 41 42 const ( 43 bucketMetricNamespace MetricNamespace = "minio_bucket" 44 clusterMetricNamespace MetricNamespace = "minio_cluster" 45 healMetricNamespace MetricNamespace = "minio_heal" 46 interNodeMetricNamespace MetricNamespace = "minio_inter_node" 47 nodeMetricNamespace MetricNamespace = "minio_node" 48 minioMetricNamespace MetricNamespace = "minio" 49 s3MetricNamespace MetricNamespace = "minio_s3" 50 ) 51 52 const ( 53 cacheSubsystem MetricSubsystem = "cache" 54 capacityRawSubsystem MetricSubsystem = "capacity_raw" 55 capacityUsableSubsystem MetricSubsystem = "capacity_usable" 56 diskSubsystem MetricSubsystem = "disk" 57 fileDescriptorSubsystem MetricSubsystem = "file_descriptor" 58 goRoutines MetricSubsystem = "go_routine" 59 ioSubsystem MetricSubsystem = "io" 60 nodesSubsystem MetricSubsystem = "nodes" 61 objectsSubsystem MetricSubsystem = "objects" 62 processSubsystem MetricSubsystem = "process" 63 replicationSubsystem MetricSubsystem = "replication" 64 requestsSubsystem MetricSubsystem = "requests" 65 requestsRejectedSubsystem MetricSubsystem = "requests_rejected" 66 timeSubsystem MetricSubsystem = "time" 67 trafficSubsystem MetricSubsystem = "traffic" 68 softwareSubsystem MetricSubsystem = "software" 69 sysCallSubsystem MetricSubsystem = "syscall" 70 usageSubsystem MetricSubsystem = "usage" 71 ) 72 73 // MetricName are the individual names for the metric. 74 type MetricName string 75 76 const ( 77 authTotal MetricName = "auth_total" 78 canceledTotal MetricName = "canceled_total" 79 errorsTotal MetricName = "errors_total" 80 headerTotal MetricName = "header_total" 81 healTotal MetricName = "heal_total" 82 hitsTotal MetricName = "hits_total" 83 inflightTotal MetricName = "inflight_total" 84 invalidTotal MetricName = "invalid_total" 85 limitTotal MetricName = "limit_total" 86 missedTotal MetricName = "missed_total" 87 waitingTotal MetricName = "waiting_total" 88 objectTotal MetricName = "object_total" 89 offlineTotal MetricName = "offline_total" 90 onlineTotal MetricName = "online_total" 91 openTotal MetricName = "open_total" 92 readTotal MetricName = "read_total" 93 timestampTotal MetricName = "timestamp_total" 94 writeTotal MetricName = "write_total" 95 total MetricName = "total" 96 97 failedCount MetricName = "failed_count" 98 failedBytes MetricName = "failed_bytes" 99 freeBytes MetricName = "free_bytes" 100 pendingBytes MetricName = "pending_bytes" 101 pendingCount MetricName = "pending_count" 102 readBytes MetricName = "read_bytes" 103 rcharBytes MetricName = "rchar_bytes" 104 receivedBytes MetricName = "received_bytes" 105 sentBytes MetricName = "sent_bytes" 106 totalBytes MetricName = "total_bytes" 107 usedBytes MetricName = "used_bytes" 108 writeBytes MetricName = "write_bytes" 109 wcharBytes MetricName = "wchar_bytes" 110 111 usagePercent MetricName = "update_percent" 112 113 commitInfo MetricName = "commit_info" 114 usageInfo MetricName = "usage_info" 115 versionInfo MetricName = "version_info" 116 117 sizeDistribution = "size_distribution" 118 ttfbDistribution = "ttfb_seconds_distribution" 119 120 lastActivityTime = "last_activity_nano_seconds" 121 startTime = "starttime_seconds" 122 upTime = "uptime_seconds" 123 ) 124 125 const ( 126 serverName = "server" 127 ) 128 129 // MetricType for the types of metrics supported 130 type MetricType string 131 132 const ( 133 gaugeMetric = "gaugeMetric" 134 counterMetric = "counterMetric" 135 histogramMetric = "histogramMetric" 136 ) 137 138 // MetricDescription describes the metric 139 type MetricDescription struct { 140 Namespace MetricNamespace `json:"MetricNamespace"` 141 Subsystem MetricSubsystem `json:"Subsystem"` 142 Name MetricName `json:"MetricName"` 143 Help string `json:"Help"` 144 Type MetricType `json:"Type"` 145 } 146 147 // Metric captures the details for a metric 148 type Metric struct { 149 Description MetricDescription `json:"Description"` 150 StaticLabels map[string]string `json:"StaticLabels"` 151 Value float64 `json:"Value"` 152 VariableLabels map[string]string `json:"VariableLabels"` 153 HistogramBucketLabel string `json:"HistogramBucketLabel"` 154 Histogram map[string]uint64 `json:"Histogram"` 155 } 156 157 func (m *Metric) copyMetric() Metric { 158 metric := Metric{ 159 Description: m.Description, 160 Value: m.Value, 161 HistogramBucketLabel: m.HistogramBucketLabel, 162 StaticLabels: make(map[string]string), 163 VariableLabels: make(map[string]string), 164 Histogram: make(map[string]uint64), 165 } 166 for k, v := range m.StaticLabels { 167 metric.StaticLabels[k] = v 168 } 169 for k, v := range m.VariableLabels { 170 metric.VariableLabels[k] = v 171 } 172 for k, v := range m.Histogram { 173 metric.Histogram[k] = v 174 } 175 return metric 176 } 177 178 // MetricsGroup are a group of metrics that are initialized together. 179 type MetricsGroup struct { 180 id string 181 cacheInterval time.Duration 182 cachedRead func(ctx context.Context, mg *MetricsGroup) []Metric 183 read func(ctx context.Context) []Metric 184 } 185 186 var metricsGroupCache = make(map[string]*timedValue) 187 var cacheLock sync.Mutex 188 189 func cachedRead(ctx context.Context, mg *MetricsGroup) (metrics []Metric) { 190 cacheLock.Lock() 191 defer cacheLock.Unlock() 192 v, ok := metricsGroupCache[mg.id] 193 if !ok { 194 interval := mg.cacheInterval 195 if interval == 0 { 196 interval = 30 * time.Second 197 } 198 v = &timedValue{} 199 v.Once.Do(func() { 200 v.Update = func() (interface{}, error) { 201 c := mg.read(ctx) 202 return c, nil 203 } 204 v.TTL = interval 205 }) 206 metricsGroupCache[mg.id] = v 207 } 208 c, err := v.Get() 209 if err != nil { 210 return []Metric{} 211 } 212 m := c.([]Metric) 213 for i := range m { 214 metrics = append(metrics, m[i].copyMetric()) 215 } 216 return metrics 217 } 218 219 // MetricsGenerator are functions that generate metric groups. 220 type MetricsGenerator func() MetricsGroup 221 222 // GetGlobalGenerators gets all the generators the report global metrics pre calculated. 223 func GetGlobalGenerators() []MetricsGenerator { 224 g := []MetricsGenerator{ 225 getBucketUsageMetrics, 226 getMinioHealingMetrics, 227 getNodeHealthMetrics, 228 getClusterStorageMetrics, 229 } 230 return g 231 } 232 233 // GetAllGenerators gets all the metric generators. 234 func GetAllGenerators() []MetricsGenerator { 235 g := GetGlobalGenerators() 236 g = append(g, GetGeneratorsForPeer()...) 237 return g 238 } 239 240 // GetGeneratorsForPeer - gets the generators to report to peer. 241 func GetGeneratorsForPeer() []MetricsGenerator { 242 g := []MetricsGenerator{ 243 getCacheMetrics, 244 getGoMetrics, 245 getHTTPMetrics, 246 getLocalStorageMetrics, 247 getMinioProcMetrics, 248 getMinioVersionMetrics, 249 getNetworkMetrics, 250 getS3TTFBMetric, 251 } 252 return g 253 } 254 255 // GetSingleNodeGenerators gets the metrics that are local 256 func GetSingleNodeGenerators() []MetricsGenerator { 257 g := []MetricsGenerator{ 258 getNodeHealthMetrics, 259 getCacheMetrics, 260 getHTTPMetrics, 261 getNetworkMetrics, 262 getMinioVersionMetrics, 263 getS3TTFBMetric, 264 } 265 return g 266 } 267 268 func getClusterCapacityTotalBytesMD() MetricDescription { 269 return MetricDescription{ 270 Namespace: clusterMetricNamespace, 271 Subsystem: capacityRawSubsystem, 272 Name: totalBytes, 273 Help: "Total capacity online in the cluster.", 274 Type: gaugeMetric, 275 } 276 } 277 func getClusterCapacityFreeBytesMD() MetricDescription { 278 return MetricDescription{ 279 Namespace: clusterMetricNamespace, 280 Subsystem: capacityRawSubsystem, 281 Name: freeBytes, 282 Help: "Total free capacity online in the cluster.", 283 Type: gaugeMetric, 284 } 285 } 286 func getClusterCapacityUsageBytesMD() MetricDescription { 287 return MetricDescription{ 288 Namespace: clusterMetricNamespace, 289 Subsystem: capacityUsableSubsystem, 290 Name: totalBytes, 291 Help: "Total usable capacity online in the cluster.", 292 Type: gaugeMetric, 293 } 294 } 295 func getClusterCapacityUsageFreeBytesMD() MetricDescription { 296 return MetricDescription{ 297 Namespace: clusterMetricNamespace, 298 Subsystem: capacityUsableSubsystem, 299 Name: freeBytes, 300 Help: "Total free usable capacity online in the cluster.", 301 Type: gaugeMetric, 302 } 303 } 304 305 func getNodeDiskUsedBytesMD() MetricDescription { 306 return MetricDescription{ 307 Namespace: nodeMetricNamespace, 308 Subsystem: diskSubsystem, 309 Name: usedBytes, 310 Help: "Total storage used on a disk.", 311 Type: gaugeMetric, 312 } 313 } 314 func getNodeDiskFreeBytesMD() MetricDescription { 315 return MetricDescription{ 316 Namespace: nodeMetricNamespace, 317 Subsystem: diskSubsystem, 318 Name: freeBytes, 319 Help: "Total storage available on a disk.", 320 Type: gaugeMetric, 321 } 322 } 323 func getClusterDisksOfflineTotalMD() MetricDescription { 324 return MetricDescription{ 325 Namespace: clusterMetricNamespace, 326 Subsystem: diskSubsystem, 327 Name: offlineTotal, 328 Help: "Total disks offline.", 329 Type: gaugeMetric, 330 } 331 } 332 333 func getClusterDisksOnlineTotalMD() MetricDescription { 334 return MetricDescription{ 335 Namespace: clusterMetricNamespace, 336 Subsystem: diskSubsystem, 337 Name: onlineTotal, 338 Help: "Total disks online.", 339 Type: gaugeMetric, 340 } 341 } 342 343 func getClusterDisksTotalMD() MetricDescription { 344 return MetricDescription{ 345 Namespace: clusterMetricNamespace, 346 Subsystem: diskSubsystem, 347 Name: total, 348 Help: "Total disks.", 349 Type: gaugeMetric, 350 } 351 } 352 353 func getNodeDiskTotalBytesMD() MetricDescription { 354 return MetricDescription{ 355 Namespace: nodeMetricNamespace, 356 Subsystem: diskSubsystem, 357 Name: totalBytes, 358 Help: "Total storage on a disk.", 359 Type: gaugeMetric, 360 } 361 } 362 func getUsageLastScanActivityMD() MetricDescription { 363 return MetricDescription{ 364 Namespace: minioMetricNamespace, 365 Subsystem: usageSubsystem, 366 Name: lastActivityTime, 367 Help: "Time elapsed (in nano seconds) since last scan activity. This is set to 0 until first scan cycle", 368 Type: gaugeMetric, 369 } 370 } 371 372 func getBucketUsageTotalBytesMD() MetricDescription { 373 return MetricDescription{ 374 Namespace: bucketMetricNamespace, 375 Subsystem: usageSubsystem, 376 Name: totalBytes, 377 Help: "Total bucket size in bytes", 378 Type: gaugeMetric, 379 } 380 } 381 func getBucketUsageObjectsTotalMD() MetricDescription { 382 return MetricDescription{ 383 Namespace: bucketMetricNamespace, 384 Subsystem: usageSubsystem, 385 Name: objectTotal, 386 Help: "Total number of objects", 387 Type: gaugeMetric, 388 } 389 } 390 func getBucketRepPendingBytesMD() MetricDescription { 391 return MetricDescription{ 392 Namespace: bucketMetricNamespace, 393 Subsystem: replicationSubsystem, 394 Name: pendingBytes, 395 Help: "Total bytes pending to replicate.", 396 Type: gaugeMetric, 397 } 398 } 399 func getBucketRepFailedBytesMD() MetricDescription { 400 return MetricDescription{ 401 Namespace: bucketMetricNamespace, 402 Subsystem: replicationSubsystem, 403 Name: failedBytes, 404 Help: "Total number of bytes failed at least once to replicate.", 405 Type: gaugeMetric, 406 } 407 } 408 func getBucketRepSentBytesMD() MetricDescription { 409 return MetricDescription{ 410 Namespace: bucketMetricNamespace, 411 Subsystem: replicationSubsystem, 412 Name: sentBytes, 413 Help: "Total number of bytes replicated to the target bucket.", 414 Type: gaugeMetric, 415 } 416 } 417 func getBucketRepReceivedBytesMD() MetricDescription { 418 return MetricDescription{ 419 Namespace: bucketMetricNamespace, 420 Subsystem: replicationSubsystem, 421 Name: receivedBytes, 422 Help: "Total number of bytes replicated to this bucket from another source bucket.", 423 Type: gaugeMetric, 424 } 425 } 426 func getBucketRepPendingOperationsMD() MetricDescription { 427 return MetricDescription{ 428 Namespace: bucketMetricNamespace, 429 Subsystem: replicationSubsystem, 430 Name: pendingCount, 431 Help: "Total number of objects pending replication", 432 Type: gaugeMetric, 433 } 434 } 435 func getBucketRepFailedOperationsMD() MetricDescription { 436 return MetricDescription{ 437 Namespace: bucketMetricNamespace, 438 Subsystem: replicationSubsystem, 439 Name: failedCount, 440 Help: "Total number of objects which failed replication", 441 Type: gaugeMetric, 442 } 443 } 444 func getBucketObjectDistributionMD() MetricDescription { 445 return MetricDescription{ 446 Namespace: bucketMetricNamespace, 447 Subsystem: objectsSubsystem, 448 Name: sizeDistribution, 449 Help: "Distribution of object sizes in the bucket, includes label for the bucket name.", 450 Type: histogramMetric, 451 } 452 } 453 func getInternodeFailedRequests() MetricDescription { 454 return MetricDescription{ 455 Namespace: interNodeMetricNamespace, 456 Subsystem: trafficSubsystem, 457 Name: errorsTotal, 458 Help: "Total number of failed internode calls.", 459 Type: counterMetric, 460 } 461 } 462 463 func getInterNodeSentBytesMD() MetricDescription { 464 return MetricDescription{ 465 Namespace: interNodeMetricNamespace, 466 Subsystem: trafficSubsystem, 467 Name: sentBytes, 468 Help: "Total number of bytes sent to the other peer nodes.", 469 Type: counterMetric, 470 } 471 } 472 func getInterNodeReceivedBytesMD() MetricDescription { 473 return MetricDescription{ 474 Namespace: interNodeMetricNamespace, 475 Subsystem: trafficSubsystem, 476 Name: receivedBytes, 477 Help: "Total number of bytes received from other peer nodes.", 478 Type: counterMetric, 479 } 480 } 481 func getS3SentBytesMD() MetricDescription { 482 return MetricDescription{ 483 Namespace: s3MetricNamespace, 484 Subsystem: trafficSubsystem, 485 Name: sentBytes, 486 Help: "Total number of s3 bytes sent", 487 Type: counterMetric, 488 } 489 } 490 func getS3ReceivedBytesMD() MetricDescription { 491 return MetricDescription{ 492 Namespace: s3MetricNamespace, 493 Subsystem: trafficSubsystem, 494 Name: receivedBytes, 495 Help: "Total number of s3 bytes received.", 496 Type: counterMetric, 497 } 498 } 499 func getS3RequestsInFlightMD() MetricDescription { 500 return MetricDescription{ 501 Namespace: s3MetricNamespace, 502 Subsystem: requestsSubsystem, 503 Name: inflightTotal, 504 Help: "Total number of S3 requests currently in flight", 505 Type: gaugeMetric, 506 } 507 } 508 func getS3RequestsInQueueMD() MetricDescription { 509 return MetricDescription{ 510 Namespace: s3MetricNamespace, 511 Subsystem: requestsSubsystem, 512 Name: waitingTotal, 513 Help: "Number of S3 requests in the waiting queue", 514 Type: gaugeMetric, 515 } 516 } 517 func getS3RequestsTotalMD() MetricDescription { 518 return MetricDescription{ 519 Namespace: s3MetricNamespace, 520 Subsystem: requestsSubsystem, 521 Name: total, 522 Help: "Total number S3 requests", 523 Type: counterMetric, 524 } 525 } 526 func getS3RequestsErrorsMD() MetricDescription { 527 return MetricDescription{ 528 Namespace: s3MetricNamespace, 529 Subsystem: requestsSubsystem, 530 Name: errorsTotal, 531 Help: "Total number S3 requests with errors", 532 Type: counterMetric, 533 } 534 } 535 func getS3RequestsCanceledMD() MetricDescription { 536 return MetricDescription{ 537 Namespace: s3MetricNamespace, 538 Subsystem: requestsSubsystem, 539 Name: canceledTotal, 540 Help: "Total number S3 requests that were canceled from the client while processing", 541 Type: counterMetric, 542 } 543 } 544 func getS3RejectedAuthRequestsTotalMD() MetricDescription { 545 return MetricDescription{ 546 Namespace: s3MetricNamespace, 547 Subsystem: requestsRejectedSubsystem, 548 Name: authTotal, 549 Help: "Total number S3 requests rejected for auth failure.", 550 Type: counterMetric, 551 } 552 } 553 func getS3RejectedHeaderRequestsTotalMD() MetricDescription { 554 return MetricDescription{ 555 Namespace: s3MetricNamespace, 556 Subsystem: requestsRejectedSubsystem, 557 Name: headerTotal, 558 Help: "Total number S3 requests rejected for invalid header.", 559 Type: counterMetric, 560 } 561 } 562 func getS3RejectedTimestampRequestsTotalMD() MetricDescription { 563 return MetricDescription{ 564 Namespace: s3MetricNamespace, 565 Subsystem: requestsRejectedSubsystem, 566 Name: timestampTotal, 567 Help: "Total number S3 requests rejected for invalid timestamp.", 568 Type: counterMetric, 569 } 570 } 571 func getS3RejectedInvalidRequestsTotalMD() MetricDescription { 572 return MetricDescription{ 573 Namespace: s3MetricNamespace, 574 Subsystem: requestsRejectedSubsystem, 575 Name: invalidTotal, 576 Help: "Total number S3 invalid requests.", 577 Type: counterMetric, 578 } 579 } 580 func getCacheHitsTotalMD() MetricDescription { 581 return MetricDescription{ 582 Namespace: minioNamespace, 583 Subsystem: cacheSubsystem, 584 Name: hitsTotal, 585 Help: "Total number of disk cache hits", 586 Type: counterMetric, 587 } 588 } 589 func getCacheHitsMissedTotalMD() MetricDescription { 590 return MetricDescription{ 591 Namespace: minioNamespace, 592 Subsystem: cacheSubsystem, 593 Name: missedTotal, 594 Help: "Total number of disk cache misses", 595 Type: counterMetric, 596 } 597 } 598 func getCacheUsagePercentMD() MetricDescription { 599 return MetricDescription{ 600 Namespace: minioNamespace, 601 Subsystem: minioNamespace, 602 Name: usagePercent, 603 Help: "Total percentage cache usage", 604 Type: gaugeMetric, 605 } 606 } 607 func getCacheUsageInfoMD() MetricDescription { 608 return MetricDescription{ 609 Namespace: minioNamespace, 610 Subsystem: cacheSubsystem, 611 Name: usageInfo, 612 Help: "Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well", 613 Type: gaugeMetric, 614 } 615 } 616 func getCacheUsedBytesMD() MetricDescription { 617 return MetricDescription{ 618 Namespace: minioNamespace, 619 Subsystem: cacheSubsystem, 620 Name: usedBytes, 621 Help: "Current cache usage in bytes", 622 Type: gaugeMetric, 623 } 624 } 625 func getCacheTotalBytesMD() MetricDescription { 626 return MetricDescription{ 627 Namespace: minioNamespace, 628 Subsystem: cacheSubsystem, 629 Name: totalBytes, 630 Help: "Total size of cache disk in bytes", 631 Type: gaugeMetric, 632 } 633 } 634 func getCacheSentBytesMD() MetricDescription { 635 return MetricDescription{ 636 Namespace: minioNamespace, 637 Subsystem: cacheSubsystem, 638 Name: sentBytes, 639 Help: "Total number of bytes served from cache", 640 Type: counterMetric, 641 } 642 } 643 func getHealObjectsTotalMD() MetricDescription { 644 return MetricDescription{ 645 Namespace: healMetricNamespace, 646 Subsystem: objectsSubsystem, 647 Name: total, 648 Help: "Objects scanned in current self healing run", 649 Type: gaugeMetric, 650 } 651 } 652 func getHealObjectsHealTotalMD() MetricDescription { 653 return MetricDescription{ 654 Namespace: healMetricNamespace, 655 Subsystem: objectsSubsystem, 656 Name: healTotal, 657 Help: "Objects healed in current self healing run", 658 Type: gaugeMetric, 659 } 660 } 661 662 func getHealObjectsFailTotalMD() MetricDescription { 663 return MetricDescription{ 664 Namespace: healMetricNamespace, 665 Subsystem: objectsSubsystem, 666 Name: errorsTotal, 667 Help: "Objects for which healing failed in current self healing run", 668 Type: gaugeMetric, 669 } 670 } 671 func getHealLastActivityTimeMD() MetricDescription { 672 return MetricDescription{ 673 Namespace: healMetricNamespace, 674 Subsystem: timeSubsystem, 675 Name: lastActivityTime, 676 Help: "Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity", 677 Type: gaugeMetric, 678 } 679 } 680 func getNodeOnlineTotalMD() MetricDescription { 681 return MetricDescription{ 682 Namespace: clusterMetricNamespace, 683 Subsystem: nodesSubsystem, 684 Name: onlineTotal, 685 Help: "Total number of MinIO nodes online.", 686 Type: gaugeMetric, 687 } 688 } 689 func getNodeOfflineTotalMD() MetricDescription { 690 return MetricDescription{ 691 Namespace: clusterMetricNamespace, 692 Subsystem: nodesSubsystem, 693 Name: offlineTotal, 694 Help: "Total number of MinIO nodes offline.", 695 Type: gaugeMetric, 696 } 697 } 698 func getMinIOVersionMD() MetricDescription { 699 return MetricDescription{ 700 Namespace: minioMetricNamespace, 701 Subsystem: softwareSubsystem, 702 Name: versionInfo, 703 Help: "MinIO Release tag for the server", 704 Type: gaugeMetric, 705 } 706 } 707 func getMinIOCommitMD() MetricDescription { 708 return MetricDescription{ 709 Namespace: minioMetricNamespace, 710 Subsystem: softwareSubsystem, 711 Name: commitInfo, 712 Help: "Git commit hash for the MinIO release.", 713 Type: gaugeMetric, 714 } 715 } 716 func getS3TTFBDistributionMD() MetricDescription { 717 return MetricDescription{ 718 Namespace: s3MetricNamespace, 719 Subsystem: timeSubsystem, 720 Name: ttfbDistribution, 721 Help: "Distribution of the time to first byte across API calls.", 722 Type: gaugeMetric, 723 } 724 } 725 func getMinioFDOpenMD() MetricDescription { 726 return MetricDescription{ 727 Namespace: nodeMetricNamespace, 728 Subsystem: fileDescriptorSubsystem, 729 Name: openTotal, 730 Help: "Total number of open file descriptors by the MinIO Server process.", 731 Type: gaugeMetric, 732 } 733 } 734 func getMinioFDLimitMD() MetricDescription { 735 return MetricDescription{ 736 Namespace: nodeMetricNamespace, 737 Subsystem: fileDescriptorSubsystem, 738 Name: limitTotal, 739 Help: "Limit on total number of open file descriptors for the MinIO Server process.", 740 Type: gaugeMetric, 741 } 742 } 743 func getMinioProcessIOWriteBytesMD() MetricDescription { 744 return MetricDescription{ 745 Namespace: nodeMetricNamespace, 746 Subsystem: ioSubsystem, 747 Name: writeBytes, 748 Help: "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes", 749 Type: counterMetric, 750 } 751 } 752 func getMinioProcessIOReadBytesMD() MetricDescription { 753 return MetricDescription{ 754 Namespace: nodeMetricNamespace, 755 Subsystem: ioSubsystem, 756 Name: readBytes, 757 Help: "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes", 758 Type: counterMetric, 759 } 760 } 761 func getMinioProcessIOWriteCachedBytesMD() MetricDescription { 762 return MetricDescription{ 763 Namespace: nodeMetricNamespace, 764 Subsystem: ioSubsystem, 765 Name: wcharBytes, 766 Help: "Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar", 767 Type: counterMetric, 768 } 769 } 770 func getMinioProcessIOReadCachedBytesMD() MetricDescription { 771 return MetricDescription{ 772 Namespace: nodeMetricNamespace, 773 Subsystem: ioSubsystem, 774 Name: rcharBytes, 775 Help: "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar", 776 Type: counterMetric, 777 } 778 } 779 func getMinIOProcessSysCallRMD() MetricDescription { 780 return MetricDescription{ 781 Namespace: nodeMetricNamespace, 782 Subsystem: sysCallSubsystem, 783 Name: readTotal, 784 Help: "Total read SysCalls to the kernel. /proc/[pid]/io syscr", 785 Type: counterMetric, 786 } 787 } 788 func getMinIOProcessSysCallWMD() MetricDescription { 789 return MetricDescription{ 790 Namespace: nodeMetricNamespace, 791 Subsystem: sysCallSubsystem, 792 Name: writeTotal, 793 Help: "Total write SysCalls to the kernel. /proc/[pid]/io syscw", 794 Type: counterMetric, 795 } 796 } 797 func getMinIOGORoutineCountMD() MetricDescription { 798 return MetricDescription{ 799 Namespace: nodeMetricNamespace, 800 Subsystem: goRoutines, 801 Name: total, 802 Help: "Total number of go routines running.", 803 Type: gaugeMetric, 804 } 805 } 806 func getMinIOProcessStartTimeMD() MetricDescription { 807 return MetricDescription{ 808 Namespace: nodeMetricNamespace, 809 Subsystem: processSubsystem, 810 Name: startTime, 811 Help: "Start time for MinIO process per node, time in seconds since Unix epoc.", 812 Type: gaugeMetric, 813 } 814 } 815 func getMinIOProcessUptimeMD() MetricDescription { 816 return MetricDescription{ 817 Namespace: nodeMetricNamespace, 818 Subsystem: processSubsystem, 819 Name: upTime, 820 Help: "Uptime for MinIO process per node in seconds.", 821 Type: gaugeMetric, 822 } 823 } 824 func getMinioProcMetrics() MetricsGroup { 825 return MetricsGroup{ 826 id: "MinioProcMetrics", 827 cachedRead: cachedRead, 828 read: func(ctx context.Context) (metrics []Metric) { 829 if runtime.GOOS == "windows" { 830 return nil 831 } 832 metrics = make([]Metric, 0, 20) 833 p, err := procfs.Self() 834 if err != nil { 835 logger.LogOnceIf(ctx, err, nodeMetricNamespace) 836 return 837 } 838 var openFDs int 839 openFDs, err = p.FileDescriptorsLen() 840 if err != nil { 841 logger.LogOnceIf(ctx, err, getMinioFDOpenMD()) 842 return 843 } 844 l, err := p.Limits() 845 if err != nil { 846 logger.LogOnceIf(ctx, err, getMinioFDLimitMD()) 847 return 848 } 849 io, err := p.IO() 850 if err != nil { 851 logger.LogOnceIf(ctx, err, ioSubsystem) 852 return 853 } 854 stat, err := p.Stat() 855 if err != nil { 856 logger.LogOnceIf(ctx, err, processSubsystem) 857 return 858 } 859 startTime, err := stat.StartTime() 860 if err != nil { 861 logger.LogOnceIf(ctx, err, startTime) 862 return 863 } 864 865 metrics = append(metrics, 866 Metric{ 867 Description: getMinioFDOpenMD(), 868 Value: float64(openFDs), 869 }, 870 ) 871 metrics = append(metrics, 872 Metric{ 873 Description: getMinioFDLimitMD(), 874 Value: float64(l.OpenFiles), 875 }) 876 metrics = append(metrics, 877 Metric{ 878 Description: getMinIOProcessSysCallRMD(), 879 Value: float64(io.SyscR), 880 }) 881 metrics = append(metrics, 882 Metric{ 883 Description: getMinIOProcessSysCallWMD(), 884 Value: float64(io.SyscW), 885 }) 886 metrics = append(metrics, 887 Metric{ 888 Description: getMinioProcessIOReadBytesMD(), 889 Value: float64(io.ReadBytes), 890 }) 891 metrics = append(metrics, 892 Metric{ 893 Description: getMinioProcessIOWriteBytesMD(), 894 Value: float64(io.WriteBytes), 895 }) 896 metrics = append(metrics, 897 Metric{ 898 Description: getMinioProcessIOReadCachedBytesMD(), 899 Value: float64(io.RChar), 900 }) 901 metrics = append(metrics, 902 Metric{ 903 Description: getMinioProcessIOWriteCachedBytesMD(), 904 Value: float64(io.WChar), 905 }) 906 metrics = append(metrics, 907 Metric{ 908 Description: getMinIOProcessStartTimeMD(), 909 Value: startTime, 910 }) 911 metrics = append(metrics, 912 Metric{ 913 Description: getMinIOProcessUptimeMD(), 914 Value: time.Since(globalBootTime).Seconds(), 915 }) 916 return 917 }, 918 } 919 } 920 func getGoMetrics() MetricsGroup { 921 return MetricsGroup{ 922 id: "GoMetrics", 923 cachedRead: cachedRead, 924 read: func(ctx context.Context) (metrics []Metric) { 925 metrics = append(metrics, Metric{ 926 Description: getMinIOGORoutineCountMD(), 927 Value: float64(runtime.NumGoroutine()), 928 }) 929 return 930 }, 931 } 932 } 933 func getS3TTFBMetric() MetricsGroup { 934 return MetricsGroup{ 935 id: "s3TTFBMetric", 936 cachedRead: cachedRead, 937 read: func(ctx context.Context) (metrics []Metric) { 938 939 // Read prometheus metric on this channel 940 ch := make(chan prometheus.Metric) 941 var wg sync.WaitGroup 942 wg.Add(1) 943 944 // Read prometheus histogram data and convert it to internal metric data 945 go func() { 946 defer wg.Done() 947 for promMetric := range ch { 948 dtoMetric := &dto.Metric{} 949 err := promMetric.Write(dtoMetric) 950 if err != nil { 951 logger.LogIf(GlobalContext, err) 952 return 953 } 954 h := dtoMetric.GetHistogram() 955 for _, b := range h.Bucket { 956 labels := make(map[string]string) 957 for _, lp := range dtoMetric.GetLabel() { 958 labels[*lp.Name] = *lp.Value 959 } 960 labels["le"] = fmt.Sprintf("%.3f", *b.UpperBound) 961 metric := Metric{ 962 Description: getS3TTFBDistributionMD(), 963 VariableLabels: labels, 964 Value: float64(b.GetCumulativeCount()), 965 } 966 metrics = append(metrics, metric) 967 } 968 } 969 970 }() 971 972 httpRequestsDuration.Collect(ch) 973 close(ch) 974 wg.Wait() 975 return 976 }, 977 } 978 } 979 980 func getMinioVersionMetrics() MetricsGroup { 981 return MetricsGroup{ 982 id: "MinioVersionMetrics", 983 cachedRead: cachedRead, 984 read: func(_ context.Context) (metrics []Metric) { 985 metrics = append(metrics, Metric{ 986 Description: getMinIOCommitMD(), 987 VariableLabels: map[string]string{"commit": CommitID}, 988 }) 989 metrics = append(metrics, Metric{ 990 Description: getMinIOVersionMD(), 991 VariableLabels: map[string]string{"version": Version}, 992 }) 993 return 994 }, 995 } 996 } 997 998 func getNodeHealthMetrics() MetricsGroup { 999 return MetricsGroup{ 1000 id: "NodeHealthMetrics", 1001 cachedRead: cachedRead, 1002 read: func(_ context.Context) (metrics []Metric) { 1003 nodesUp, nodesDown := GetPeerOnlineCount() 1004 metrics = append(metrics, Metric{ 1005 Description: getNodeOnlineTotalMD(), 1006 Value: float64(nodesUp), 1007 }) 1008 metrics = append(metrics, Metric{ 1009 Description: getNodeOfflineTotalMD(), 1010 Value: float64(nodesDown), 1011 }) 1012 return 1013 }, 1014 } 1015 } 1016 1017 func getMinioHealingMetrics() MetricsGroup { 1018 return MetricsGroup{ 1019 id: "minioHealingMetrics", 1020 cachedRead: cachedRead, 1021 read: func(_ context.Context) (metrics []Metric) { 1022 metrics = make([]Metric, 0, 5) 1023 if !globalIsErasure { 1024 return 1025 } 1026 bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) 1027 if !exists { 1028 return 1029 } 1030 1031 if bgSeq.lastHealActivity.IsZero() { 1032 return 1033 } 1034 1035 metrics = append(metrics, Metric{ 1036 Description: getHealLastActivityTimeMD(), 1037 Value: float64(time.Since(bgSeq.lastHealActivity)), 1038 }) 1039 metrics = append(metrics, getObjectsScanned(bgSeq)...) 1040 metrics = append(metrics, getScannedItems(bgSeq)...) 1041 metrics = append(metrics, getFailedItems(bgSeq)...) 1042 return 1043 }, 1044 } 1045 } 1046 1047 func getFailedItems(seq *healSequence) (m []Metric) { 1048 m = make([]Metric, 0, 1) 1049 for k, v := range seq.gethealFailedItemsMap() { 1050 s := strings.Split(k, ",") 1051 m = append(m, Metric{ 1052 Description: getHealObjectsFailTotalMD(), 1053 VariableLabels: map[string]string{ 1054 "mount_path": s[0], 1055 "volume_status": s[1], 1056 }, 1057 Value: float64(v), 1058 }) 1059 } 1060 return 1061 } 1062 1063 func getScannedItems(seq *healSequence) (m []Metric) { 1064 items := seq.getHealedItemsMap() 1065 m = make([]Metric, 0, len(items)) 1066 for k, v := range items { 1067 m = append(m, Metric{ 1068 Description: getHealObjectsHealTotalMD(), 1069 VariableLabels: map[string]string{"type": string(k)}, 1070 Value: float64(v), 1071 }) 1072 } 1073 return 1074 } 1075 1076 func getObjectsScanned(seq *healSequence) (m []Metric) { 1077 items := seq.getHealedItemsMap() 1078 m = make([]Metric, 0, len(items)) 1079 for k, v := range seq.getScannedItemsMap() { 1080 m = append(m, Metric{ 1081 Description: getHealObjectsTotalMD(), 1082 VariableLabels: map[string]string{"type": string(k)}, 1083 Value: float64(v), 1084 }) 1085 } 1086 return 1087 } 1088 func getCacheMetrics() MetricsGroup { 1089 return MetricsGroup{ 1090 id: "CacheMetrics", 1091 cachedRead: cachedRead, 1092 read: func(ctx context.Context) (metrics []Metric) { 1093 metrics = make([]Metric, 0, 20) 1094 cacheObjLayer := newCachedObjectLayerFn() 1095 // Service not initialized yet 1096 if cacheObjLayer == nil { 1097 return 1098 } 1099 metrics = append(metrics, Metric{ 1100 Description: getCacheHitsTotalMD(), 1101 Value: float64(cacheObjLayer.CacheStats().getHits()), 1102 }) 1103 metrics = append(metrics, Metric{ 1104 Description: getCacheHitsMissedTotalMD(), 1105 Value: float64(cacheObjLayer.CacheStats().getMisses()), 1106 }) 1107 metrics = append(metrics, Metric{ 1108 Description: getCacheSentBytesMD(), 1109 Value: float64(cacheObjLayer.CacheStats().getBytesServed()), 1110 }) 1111 for _, cdStats := range cacheObjLayer.CacheStats().GetDiskStats() { 1112 metrics = append(metrics, Metric{ 1113 Description: getCacheUsagePercentMD(), 1114 Value: float64(cdStats.UsagePercent), 1115 VariableLabels: map[string]string{"disk": cdStats.Dir}, 1116 }) 1117 metrics = append(metrics, Metric{ 1118 Description: getCacheUsageInfoMD(), 1119 Value: float64(cdStats.UsageState), 1120 VariableLabels: map[string]string{"disk": cdStats.Dir, "level": cdStats.GetUsageLevelString()}, 1121 }) 1122 metrics = append(metrics, Metric{ 1123 Description: getCacheUsedBytesMD(), 1124 Value: float64(cdStats.UsageSize), 1125 VariableLabels: map[string]string{"disk": cdStats.Dir}, 1126 }) 1127 metrics = append(metrics, Metric{ 1128 Description: getCacheTotalBytesMD(), 1129 Value: float64(cdStats.TotalCapacity), 1130 VariableLabels: map[string]string{"disk": cdStats.Dir}, 1131 }) 1132 } 1133 return 1134 }, 1135 } 1136 } 1137 1138 func getHTTPMetrics() MetricsGroup { 1139 return MetricsGroup{ 1140 id: "httpMetrics", 1141 cachedRead: cachedRead, 1142 read: func(ctx context.Context) (metrics []Metric) { 1143 httpStats := globalHTTPStats.toServerHTTPStats() 1144 metrics = make([]Metric, 0, 3+ 1145 len(httpStats.CurrentS3Requests.APIStats)+ 1146 len(httpStats.TotalS3Requests.APIStats)+ 1147 len(httpStats.TotalS3Errors.APIStats)) 1148 metrics = append(metrics, Metric{ 1149 Description: getS3RejectedAuthRequestsTotalMD(), 1150 Value: float64(httpStats.TotalS3RejectedAuth), 1151 }) 1152 metrics = append(metrics, Metric{ 1153 Description: getS3RejectedTimestampRequestsTotalMD(), 1154 Value: float64(httpStats.TotalS3RejectedTime), 1155 }) 1156 metrics = append(metrics, Metric{ 1157 Description: getS3RejectedHeaderRequestsTotalMD(), 1158 Value: float64(httpStats.TotalS3RejectedHeader), 1159 }) 1160 metrics = append(metrics, Metric{ 1161 Description: getS3RejectedInvalidRequestsTotalMD(), 1162 Value: float64(httpStats.TotalS3RejectedInvalid), 1163 }) 1164 metrics = append(metrics, Metric{ 1165 Description: getS3RequestsInQueueMD(), 1166 Value: float64(httpStats.S3RequestsInQueue), 1167 }) 1168 for api, value := range httpStats.CurrentS3Requests.APIStats { 1169 metrics = append(metrics, Metric{ 1170 Description: getS3RequestsInFlightMD(), 1171 Value: float64(value), 1172 VariableLabels: map[string]string{"api": api}, 1173 }) 1174 } 1175 for api, value := range httpStats.TotalS3Requests.APIStats { 1176 metrics = append(metrics, Metric{ 1177 Description: getS3RequestsTotalMD(), 1178 Value: float64(value), 1179 VariableLabels: map[string]string{"api": api}, 1180 }) 1181 } 1182 for api, value := range httpStats.TotalS3Errors.APIStats { 1183 metrics = append(metrics, Metric{ 1184 Description: getS3RequestsErrorsMD(), 1185 Value: float64(value), 1186 VariableLabels: map[string]string{"api": api}, 1187 }) 1188 } 1189 for api, value := range httpStats.TotalS3Canceled.APIStats { 1190 metrics = append(metrics, Metric{ 1191 Description: getS3RequestsCanceledMD(), 1192 Value: float64(value), 1193 VariableLabels: map[string]string{"api": api}, 1194 }) 1195 } 1196 return 1197 }, 1198 } 1199 } 1200 1201 func getNetworkMetrics() MetricsGroup { 1202 return MetricsGroup{ 1203 id: "networkMetrics", 1204 cachedRead: cachedRead, 1205 read: func(ctx context.Context) (metrics []Metric) { 1206 metrics = make([]Metric, 0, 10) 1207 metrics = append(metrics, Metric{ 1208 Description: getInternodeFailedRequests(), 1209 Value: float64(loadAndResetRPCNetworkErrsCounter()), 1210 }) 1211 connStats := globalConnStats.toServerConnStats() 1212 metrics = append(metrics, Metric{ 1213 Description: getInterNodeSentBytesMD(), 1214 Value: float64(connStats.TotalOutputBytes), 1215 }) 1216 metrics = append(metrics, Metric{ 1217 Description: getInterNodeReceivedBytesMD(), 1218 Value: float64(connStats.TotalInputBytes), 1219 }) 1220 metrics = append(metrics, Metric{ 1221 Description: getS3SentBytesMD(), 1222 Value: float64(connStats.S3OutputBytes), 1223 }) 1224 metrics = append(metrics, Metric{ 1225 Description: getS3ReceivedBytesMD(), 1226 Value: float64(connStats.S3InputBytes), 1227 }) 1228 return 1229 }, 1230 } 1231 } 1232 1233 func getBucketUsageMetrics() MetricsGroup { 1234 return MetricsGroup{ 1235 id: "BucketUsageMetrics", 1236 cachedRead: cachedRead, 1237 read: func(ctx context.Context) (metrics []Metric) { 1238 metrics = make([]Metric, 0, 50) 1239 objLayer := newObjectLayerFn() 1240 // Service not initialized yet 1241 if objLayer == nil { 1242 return 1243 } 1244 1245 if GlobalIsGateway { 1246 return 1247 } 1248 1249 dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer) 1250 if err != nil { 1251 return 1252 } 1253 1254 // data usage has not captured any data yet. 1255 if dataUsageInfo.LastUpdate.IsZero() { 1256 return 1257 } 1258 1259 metrics = append(metrics, Metric{ 1260 Description: getUsageLastScanActivityMD(), 1261 Value: float64(time.Since(dataUsageInfo.LastUpdate)), 1262 }) 1263 1264 for bucket, usage := range dataUsageInfo.BucketsUsage { 1265 stat := getLatestReplicationStats(bucket, usage) 1266 1267 metrics = append(metrics, Metric{ 1268 Description: getBucketUsageTotalBytesMD(), 1269 Value: float64(usage.Size), 1270 VariableLabels: map[string]string{"bucket": bucket}, 1271 }) 1272 1273 metrics = append(metrics, Metric{ 1274 Description: getBucketUsageObjectsTotalMD(), 1275 Value: float64(usage.ObjectsCount), 1276 VariableLabels: map[string]string{"bucket": bucket}, 1277 }) 1278 1279 if stat.hasReplicationUsage() { 1280 metrics = append(metrics, Metric{ 1281 Description: getBucketRepPendingBytesMD(), 1282 Value: float64(stat.PendingSize), 1283 VariableLabels: map[string]string{"bucket": bucket}, 1284 }) 1285 metrics = append(metrics, Metric{ 1286 Description: getBucketRepFailedBytesMD(), 1287 Value: float64(stat.FailedSize), 1288 VariableLabels: map[string]string{"bucket": bucket}, 1289 }) 1290 metrics = append(metrics, Metric{ 1291 Description: getBucketRepSentBytesMD(), 1292 Value: float64(stat.ReplicatedSize), 1293 VariableLabels: map[string]string{"bucket": bucket}, 1294 }) 1295 metrics = append(metrics, Metric{ 1296 Description: getBucketRepReceivedBytesMD(), 1297 Value: float64(stat.ReplicaSize), 1298 VariableLabels: map[string]string{"bucket": bucket}, 1299 }) 1300 metrics = append(metrics, Metric{ 1301 Description: getBucketRepPendingOperationsMD(), 1302 Value: float64(stat.PendingCount), 1303 VariableLabels: map[string]string{"bucket": bucket}, 1304 }) 1305 metrics = append(metrics, Metric{ 1306 Description: getBucketRepFailedOperationsMD(), 1307 Value: float64(stat.FailedCount), 1308 VariableLabels: map[string]string{"bucket": bucket}, 1309 }) 1310 } 1311 1312 metrics = append(metrics, Metric{ 1313 Description: getBucketObjectDistributionMD(), 1314 Histogram: usage.ObjectSizesHistogram, 1315 HistogramBucketLabel: "range", 1316 VariableLabels: map[string]string{"bucket": bucket}, 1317 }) 1318 1319 } 1320 return 1321 }, 1322 } 1323 } 1324 func getLocalStorageMetrics() MetricsGroup { 1325 return MetricsGroup{ 1326 id: "localStorageMetrics", 1327 cachedRead: cachedRead, 1328 read: func(ctx context.Context) (metrics []Metric) { 1329 objLayer := newObjectLayerFn() 1330 // Service not initialized yet 1331 if objLayer == nil { 1332 return 1333 } 1334 1335 if GlobalIsGateway { 1336 return 1337 } 1338 1339 metrics = make([]Metric, 0, 50) 1340 storageInfo, _ := objLayer.LocalStorageInfo(ctx) 1341 for _, disk := range storageInfo.Disks { 1342 metrics = append(metrics, Metric{ 1343 Description: getNodeDiskUsedBytesMD(), 1344 Value: float64(disk.UsedSpace), 1345 VariableLabels: map[string]string{"disk": disk.DrivePath}, 1346 }) 1347 1348 metrics = append(metrics, Metric{ 1349 Description: getNodeDiskFreeBytesMD(), 1350 Value: float64(disk.AvailableSpace), 1351 VariableLabels: map[string]string{"disk": disk.DrivePath}, 1352 }) 1353 1354 metrics = append(metrics, Metric{ 1355 Description: getNodeDiskTotalBytesMD(), 1356 Value: float64(disk.TotalSpace), 1357 VariableLabels: map[string]string{"disk": disk.DrivePath}, 1358 }) 1359 } 1360 return 1361 }, 1362 } 1363 } 1364 func getClusterStorageMetrics() MetricsGroup { 1365 return MetricsGroup{ 1366 id: "ClusterStorageMetrics", 1367 cachedRead: cachedRead, 1368 read: func(ctx context.Context) (metrics []Metric) { 1369 objLayer := newObjectLayerFn() 1370 // Service not initialized yet 1371 if objLayer == nil { 1372 return 1373 } 1374 1375 if GlobalIsGateway { 1376 return 1377 } 1378 1379 // Fetch disk space info, ignore errors 1380 metrics = make([]Metric, 0, 10) 1381 storageInfo, _ := objLayer.StorageInfo(ctx) 1382 onlineDisks, offlineDisks := getOnlineOfflineDisksStats(storageInfo.Disks) 1383 totalDisks := onlineDisks.Merge(offlineDisks) 1384 1385 metrics = append(metrics, Metric{ 1386 Description: getClusterCapacityTotalBytesMD(), 1387 Value: float64(GetTotalCapacity(storageInfo.Disks)), 1388 }) 1389 1390 metrics = append(metrics, Metric{ 1391 Description: getClusterCapacityFreeBytesMD(), 1392 Value: float64(GetTotalCapacityFree(storageInfo.Disks)), 1393 }) 1394 1395 metrics = append(metrics, Metric{ 1396 Description: getClusterCapacityUsageBytesMD(), 1397 Value: GetTotalUsableCapacity(storageInfo.Disks, storageInfo), 1398 }) 1399 1400 metrics = append(metrics, Metric{ 1401 Description: getClusterCapacityUsageFreeBytesMD(), 1402 Value: GetTotalUsableCapacityFree(storageInfo.Disks, storageInfo), 1403 }) 1404 1405 metrics = append(metrics, Metric{ 1406 Description: getClusterDisksOfflineTotalMD(), 1407 Value: float64(offlineDisks.Sum()), 1408 }) 1409 1410 metrics = append(metrics, Metric{ 1411 Description: getClusterDisksOnlineTotalMD(), 1412 Value: float64(onlineDisks.Sum()), 1413 }) 1414 1415 metrics = append(metrics, Metric{ 1416 Description: getClusterDisksTotalMD(), 1417 Value: float64(totalDisks.Sum()), 1418 }) 1419 return 1420 }, 1421 } 1422 } 1423 1424 type minioClusterCollector struct { 1425 desc *prometheus.Desc 1426 } 1427 1428 func newMinioClusterCollector() *minioClusterCollector { 1429 return &minioClusterCollector{ 1430 desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil), 1431 } 1432 } 1433 1434 // Describe sends the super-set of all possible descriptors of metrics 1435 func (c *minioClusterCollector) Describe(ch chan<- *prometheus.Desc) { 1436 ch <- c.desc 1437 } 1438 1439 // Collect is called by the Prometheus registry when collecting metrics. 1440 func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) { 1441 1442 var wg sync.WaitGroup 1443 publish := func(in <-chan Metric) { 1444 defer wg.Done() 1445 for metric := range in { 1446 labels, values := getOrderedLabelValueArrays(metric.VariableLabels) 1447 if metric.Description.Type == histogramMetric { 1448 if metric.Histogram == nil { 1449 continue 1450 } 1451 for k, v := range metric.Histogram { 1452 l := append(labels, metric.HistogramBucketLabel) 1453 lv := append(values, k) 1454 out <- prometheus.MustNewConstMetric( 1455 prometheus.NewDesc( 1456 prometheus.BuildFQName(string(metric.Description.Namespace), 1457 string(metric.Description.Subsystem), 1458 string(metric.Description.Name)), 1459 metric.Description.Help, 1460 l, 1461 metric.StaticLabels, 1462 ), 1463 prometheus.GaugeValue, 1464 float64(v), 1465 lv...) 1466 } 1467 continue 1468 } 1469 metricType := prometheus.GaugeValue 1470 switch metric.Description.Type { 1471 case counterMetric: 1472 metricType = prometheus.CounterValue 1473 } 1474 toPost := prometheus.MustNewConstMetric( 1475 prometheus.NewDesc( 1476 prometheus.BuildFQName(string(metric.Description.Namespace), 1477 string(metric.Description.Subsystem), 1478 string(metric.Description.Name)), 1479 metric.Description.Help, 1480 labels, 1481 metric.StaticLabels, 1482 ), 1483 metricType, 1484 metric.Value, 1485 values...) 1486 out <- toPost 1487 } 1488 } 1489 1490 // Call peer api to fetch metrics 1491 peerCh := GlobalNotificationSys.GetClusterMetrics(GlobalContext) 1492 selfCh := ReportMetrics(GlobalContext, GetAllGenerators) 1493 wg.Add(2) 1494 go publish(peerCh) 1495 go publish(selfCh) 1496 wg.Wait() 1497 } 1498 1499 // ReportMetrics reports serialized metrics to the channel passed for the metrics generated. 1500 func ReportMetrics(ctx context.Context, generators func() []MetricsGenerator) <-chan Metric { 1501 ch := make(chan Metric) 1502 go func() { 1503 defer close(ch) 1504 populateAndPublish(generators, func(m Metric) bool { 1505 if m.VariableLabels == nil { 1506 m.VariableLabels = make(map[string]string) 1507 } 1508 m.VariableLabels[serverName] = globalLocalNodeName 1509 for { 1510 select { 1511 case ch <- m: 1512 return true 1513 case <-ctx.Done(): 1514 return false 1515 } 1516 } 1517 }) 1518 }() 1519 return ch 1520 } 1521 1522 // minioCollectorV2 is the Custom Collector 1523 type minioCollectorV2 struct { 1524 generator func() []MetricsGenerator 1525 desc *prometheus.Desc 1526 } 1527 1528 // Describe sends the super-set of all possible descriptors of metrics 1529 func (c *minioCollectorV2) Describe(ch chan<- *prometheus.Desc) { 1530 ch <- c.desc 1531 } 1532 1533 // populateAndPublish populates and then publishes the metrics generated by the generator function. 1534 func populateAndPublish(generatorFn func() []MetricsGenerator, publish func(m Metric) bool) { 1535 generators := generatorFn() 1536 for _, g := range generators { 1537 metricsGroup := g() 1538 metrics := metricsGroup.cachedRead(GlobalContext, &metricsGroup) 1539 for _, metric := range metrics { 1540 if !publish(metric) { 1541 return 1542 } 1543 } 1544 } 1545 } 1546 1547 // Collect is called by the Prometheus registry when collecting metrics. 1548 func (c *minioCollectorV2) Collect(ch chan<- prometheus.Metric) { 1549 1550 // Expose MinIO's version information 1551 minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0) 1552 1553 populateAndPublish(c.generator, func(metric Metric) bool { 1554 labels, values := getOrderedLabelValueArrays(metric.VariableLabels) 1555 values = append(values, globalLocalNodeName) 1556 labels = append(labels, serverName) 1557 1558 if metric.Description.Type == histogramMetric { 1559 if metric.Histogram == nil { 1560 return true 1561 } 1562 for k, v := range metric.Histogram { 1563 labels = append(labels, metric.HistogramBucketLabel) 1564 values = append(values, k) 1565 ch <- prometheus.MustNewConstMetric( 1566 prometheus.NewDesc( 1567 prometheus.BuildFQName(string(metric.Description.Namespace), 1568 string(metric.Description.Subsystem), 1569 string(metric.Description.Name)), 1570 metric.Description.Help, 1571 labels, 1572 metric.StaticLabels, 1573 ), 1574 prometheus.GaugeValue, 1575 float64(v), 1576 values...) 1577 } 1578 return true 1579 } 1580 1581 metricType := prometheus.GaugeValue 1582 switch metric.Description.Type { 1583 case counterMetric: 1584 metricType = prometheus.CounterValue 1585 } 1586 ch <- prometheus.MustNewConstMetric( 1587 prometheus.NewDesc( 1588 prometheus.BuildFQName(string(metric.Description.Namespace), 1589 string(metric.Description.Subsystem), 1590 string(metric.Description.Name)), 1591 metric.Description.Help, 1592 labels, 1593 metric.StaticLabels, 1594 ), 1595 metricType, 1596 metric.Value, 1597 values...) 1598 return true 1599 }) 1600 } 1601 1602 func getOrderedLabelValueArrays(labelsWithValue map[string]string) (labels, values []string) { 1603 labels = make([]string, 0) 1604 values = make([]string, 0) 1605 for l, v := range labelsWithValue { 1606 labels = append(labels, l) 1607 values = append(values, v) 1608 } 1609 return 1610 } 1611 1612 // newMinioCollectorV2 describes the collector 1613 // and returns reference of minioCollector for version 2 1614 // It creates the Prometheus Description which is used 1615 // to define Metric and help string 1616 func newMinioCollectorV2(generator func() []MetricsGenerator) *minioCollectorV2 { 1617 return &minioCollectorV2{ 1618 generator: generator, 1619 desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil), 1620 } 1621 } 1622 1623 func metricsServerHandler() http.Handler { 1624 1625 registry := prometheus.NewRegistry() 1626 1627 // Report all other metrics 1628 err := registry.Register(newMinioClusterCollector()) 1629 if err != nil { 1630 logger.CriticalIf(GlobalContext, err) 1631 } 1632 // DefaultGatherers include golang metrics and process metrics. 1633 gatherers := prometheus.Gatherers{ 1634 registry, 1635 } 1636 // Delegate http serving to Prometheus client library, which will call collector.Collect. 1637 return promhttp.InstrumentMetricHandler( 1638 registry, 1639 promhttp.HandlerFor(gatherers, 1640 promhttp.HandlerOpts{ 1641 ErrorHandling: promhttp.ContinueOnError, 1642 }), 1643 ) 1644 } 1645 1646 func metricsNodeHandler() http.Handler { 1647 registry := prometheus.NewRegistry() 1648 1649 err := registry.Register(newMinioCollectorV2(GetSingleNodeGenerators)) 1650 if err != nil { 1651 logger.CriticalIf(GlobalContext, err) 1652 } 1653 err = registry.Register(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{ 1654 Namespace: minioNamespace, 1655 ReportErrors: true, 1656 })) 1657 if err != nil { 1658 logger.CriticalIf(GlobalContext, err) 1659 } 1660 err = registry.Register(prometheus.NewGoCollector()) 1661 if err != nil { 1662 logger.CriticalIf(GlobalContext, err) 1663 } 1664 gatherers := prometheus.Gatherers{ 1665 registry, 1666 } 1667 // Delegate http serving to Prometheus client library, which will call collector.Collect. 1668 return promhttp.InstrumentMetricHandler( 1669 registry, 1670 promhttp.HandlerFor(gatherers, 1671 promhttp.HandlerOpts{ 1672 ErrorHandling: promhttp.ContinueOnError, 1673 }), 1674 ) 1675 }