github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/metrics.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "net/http" 22 "strings" 23 "time" 24 25 "github.com/minio/minio/internal/auth" 26 "github.com/minio/minio/internal/logger" 27 "github.com/minio/minio/internal/mcontext" 28 "github.com/minio/pkg/v2/policy" 29 "github.com/prometheus/client_golang/prometheus" 30 "github.com/prometheus/common/expfmt" 31 ) 32 33 var ( 34 httpRequestsDuration = prometheus.NewHistogramVec( 35 prometheus.HistogramOpts{ 36 Name: "s3_ttfb_seconds", 37 Help: "Time taken by requests served by current MinIO server instance", 38 Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10}, 39 }, 40 []string{"api"}, 41 ) 42 bucketHTTPRequestsDuration = prometheus.NewHistogramVec( 43 prometheus.HistogramOpts{ 44 Name: "s3_ttfb_seconds", 45 Help: "Time taken by requests served by current MinIO server instance per bucket", 46 Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10}, 47 }, 48 []string{"api", "bucket"}, 49 ) 50 minioVersionInfo = prometheus.NewGaugeVec( 51 prometheus.GaugeOpts{ 52 Namespace: "minio", 53 Name: "version_info", 54 Help: "Version of current MinIO server instance", 55 }, 56 []string{ 57 // current version 58 "version", 59 // commit-id of the current version 60 "commit", 61 }, 62 ) 63 ) 64 65 const ( 66 healMetricsNamespace = "self_heal" 67 cacheNamespace = "cache" 68 s3Namespace = "s3" 69 bucketNamespace = "bucket" 70 minioNamespace = "minio" 71 diskNamespace = "disk" 72 interNodeNamespace = "internode" 73 ) 74 75 func init() { 76 prometheus.MustRegister(httpRequestsDuration) 77 prometheus.MustRegister(newMinioCollector()) 78 prometheus.MustRegister(minioVersionInfo) 79 } 80 81 // newMinioCollector describes the collector 82 // and returns reference of minioCollector 83 // It creates the Prometheus Description which is used 84 // to define metric and help string 85 func newMinioCollector() *minioCollector { 86 return &minioCollector{ 87 desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil), 88 } 89 } 90 91 // minioCollector is the Custom Collector 92 type minioCollector struct { 93 desc *prometheus.Desc 94 } 95 96 // Describe sends the super-set of all possible descriptors of metrics 97 func (c *minioCollector) Describe(ch chan<- *prometheus.Desc) { 98 ch <- c.desc 99 } 100 101 // Collect is called by the Prometheus registry when collecting metrics. 102 func (c *minioCollector) Collect(ch chan<- prometheus.Metric) { 103 // Expose MinIO's version information 104 minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0) 105 106 storageMetricsPrometheus(ch) 107 nodeHealthMetricsPrometheus(ch) 108 bucketUsageMetricsPrometheus(ch) 109 networkMetricsPrometheus(ch) 110 httpMetricsPrometheus(ch) 111 healingMetricsPrometheus(ch) 112 } 113 114 func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) { 115 nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount() 116 ch <- prometheus.MustNewConstMetric( 117 prometheus.NewDesc( 118 prometheus.BuildFQName(minioNamespace, "nodes", "online"), 119 "Total number of MinIO nodes online", 120 nil, nil), 121 prometheus.GaugeValue, 122 float64(nodesUp), 123 ) 124 ch <- prometheus.MustNewConstMetric( 125 prometheus.NewDesc( 126 prometheus.BuildFQName(minioNamespace, "nodes", "offline"), 127 "Total number of MinIO nodes offline", 128 nil, nil), 129 prometheus.GaugeValue, 130 float64(nodesDown), 131 ) 132 } 133 134 // collects healing specific metrics for MinIO instance in Prometheus specific format 135 // and sends to given channel 136 func healingMetricsPrometheus(ch chan<- prometheus.Metric) { 137 bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) 138 if !exists { 139 return 140 } 141 142 var dur time.Duration 143 if !bgSeq.lastHealActivity.IsZero() { 144 dur = time.Since(bgSeq.lastHealActivity) 145 } 146 147 ch <- prometheus.MustNewConstMetric( 148 prometheus.NewDesc( 149 prometheus.BuildFQName(healMetricsNamespace, "time", "since_last_activity"), 150 "Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity", 151 nil, nil), 152 prometheus.GaugeValue, 153 float64(dur), 154 ) 155 for k, v := range bgSeq.getScannedItemsMap() { 156 ch <- prometheus.MustNewConstMetric( 157 prometheus.NewDesc( 158 prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"), 159 "Objects scanned in current self healing run", 160 []string{"type"}, nil), 161 prometheus.GaugeValue, 162 float64(v), string(k), 163 ) 164 } 165 for k, v := range bgSeq.getHealedItemsMap() { 166 ch <- prometheus.MustNewConstMetric( 167 prometheus.NewDesc( 168 prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"), 169 "Objects healed in current self healing run", 170 []string{"type"}, nil), 171 prometheus.GaugeValue, 172 float64(v), string(k), 173 ) 174 } 175 for k, v := range bgSeq.gethealFailedItemsMap() { 176 // healFailedItemsMap stores the endpoint and volume state separated by comma, 177 // split the fields and pass to channel at correct index 178 s := strings.Split(k, ",") 179 ch <- prometheus.MustNewConstMetric( 180 prometheus.NewDesc( 181 prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"), 182 "Objects for which healing failed in current self healing run", 183 []string{"mount_path", "volume_status"}, nil), 184 prometheus.GaugeValue, 185 float64(v), s[0], s[1], 186 ) 187 } 188 } 189 190 // collects http metrics for MinIO server in Prometheus specific format 191 // and sends to given channel 192 func httpMetricsPrometheus(ch chan<- prometheus.Metric) { 193 httpStats := globalHTTPStats.toServerHTTPStats(true) 194 195 for api, value := range httpStats.CurrentS3Requests.APIStats { 196 ch <- prometheus.MustNewConstMetric( 197 prometheus.NewDesc( 198 prometheus.BuildFQName(s3Namespace, "requests", "current"), 199 "Total number of running s3 requests in current MinIO server instance", 200 []string{"api"}, nil), 201 prometheus.CounterValue, 202 float64(value), 203 api, 204 ) 205 } 206 207 for api, value := range httpStats.TotalS3Requests.APIStats { 208 ch <- prometheus.MustNewConstMetric( 209 prometheus.NewDesc( 210 prometheus.BuildFQName(s3Namespace, "requests", "total"), 211 "Total number of s3 requests in current MinIO server instance", 212 []string{"api"}, nil), 213 prometheus.CounterValue, 214 float64(value), 215 api, 216 ) 217 } 218 219 for api, value := range httpStats.TotalS3Errors.APIStats { 220 ch <- prometheus.MustNewConstMetric( 221 prometheus.NewDesc( 222 prometheus.BuildFQName(s3Namespace, "errors", "total"), 223 "Total number of s3 errors in current MinIO server instance", 224 []string{"api"}, nil), 225 prometheus.CounterValue, 226 float64(value), 227 api, 228 ) 229 } 230 231 for api, value := range httpStats.TotalS3Canceled.APIStats { 232 ch <- prometheus.MustNewConstMetric( 233 prometheus.NewDesc( 234 prometheus.BuildFQName(s3Namespace, "canceled", "total"), 235 "Total number of client canceled s3 request in current MinIO server instance", 236 []string{"api"}, nil), 237 prometheus.CounterValue, 238 float64(value), 239 api, 240 ) 241 } 242 } 243 244 // collects network metrics for MinIO server in Prometheus specific format 245 // and sends to given channel 246 func networkMetricsPrometheus(ch chan<- prometheus.Metric) { 247 connStats := globalConnStats.toServerConnStats() 248 249 // Network Sent/Received Bytes (internode) 250 ch <- prometheus.MustNewConstMetric( 251 prometheus.NewDesc( 252 prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"), 253 "Total number of bytes sent to the other peer nodes by current MinIO server instance", 254 nil, nil), 255 prometheus.CounterValue, 256 float64(connStats.internodeOutputBytes), 257 ) 258 259 ch <- prometheus.MustNewConstMetric( 260 prometheus.NewDesc( 261 prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"), 262 "Total number of internode bytes received by current MinIO server instance", 263 nil, nil), 264 prometheus.CounterValue, 265 float64(connStats.internodeInputBytes), 266 ) 267 268 // Network Sent/Received Bytes (Outbound) 269 ch <- prometheus.MustNewConstMetric( 270 prometheus.NewDesc( 271 prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"), 272 "Total number of s3 bytes sent by current MinIO server instance", 273 nil, nil), 274 prometheus.CounterValue, 275 float64(connStats.s3OutputBytes), 276 ) 277 278 ch <- prometheus.MustNewConstMetric( 279 prometheus.NewDesc( 280 prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"), 281 "Total number of s3 bytes received by current MinIO server instance", 282 nil, nil), 283 prometheus.CounterValue, 284 float64(connStats.s3InputBytes), 285 ) 286 } 287 288 // Populates prometheus with bucket usage metrics, this metrics 289 // is only enabled if scanner is enabled. 290 func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) { 291 objLayer := newObjectLayerFn() 292 // Service not initialized yet 293 if objLayer == nil { 294 return 295 } 296 297 dataUsageInfo, err := loadDataUsageFromBackend(GlobalContext, objLayer) 298 if err != nil { 299 return 300 } 301 // data usage has not captured any data yet. 302 if dataUsageInfo.LastUpdate.IsZero() { 303 return 304 } 305 306 for bucket, usageInfo := range dataUsageInfo.BucketsUsage { 307 stat := globalReplicationStats.getLatestReplicationStats(bucket) 308 // Total space used by bucket 309 ch <- prometheus.MustNewConstMetric( 310 prometheus.NewDesc( 311 prometheus.BuildFQName(bucketNamespace, "usage", "size"), 312 "Total bucket size", 313 []string{"bucket"}, nil), 314 prometheus.GaugeValue, 315 float64(usageInfo.Size), 316 bucket, 317 ) 318 ch <- prometheus.MustNewConstMetric( 319 prometheus.NewDesc( 320 prometheus.BuildFQName(bucketNamespace, "objects", "count"), 321 "Total number of objects in a bucket", 322 []string{"bucket"}, nil), 323 prometheus.GaugeValue, 324 float64(usageInfo.ObjectsCount), 325 bucket, 326 ) 327 ch <- prometheus.MustNewConstMetric( 328 prometheus.NewDesc( 329 prometheus.BuildFQName("bucket", "replication", "successful_size"), 330 "Total capacity replicated to destination", 331 []string{"bucket"}, nil), 332 prometheus.GaugeValue, 333 float64(stat.ReplicationStats.ReplicatedSize), 334 bucket, 335 ) 336 ch <- prometheus.MustNewConstMetric( 337 prometheus.NewDesc( 338 prometheus.BuildFQName("bucket", "replication", "received_size"), 339 "Total capacity replicated to this instance", 340 []string{"bucket"}, nil), 341 prometheus.GaugeValue, 342 float64(stat.ReplicationStats.ReplicaSize), 343 bucket, 344 ) 345 346 for k, v := range usageInfo.ObjectSizesHistogram { 347 ch <- prometheus.MustNewConstMetric( 348 prometheus.NewDesc( 349 prometheus.BuildFQName(bucketNamespace, "objects", "histogram"), 350 "Total number of objects of different sizes in a bucket", 351 []string{"bucket", "object_size"}, nil), 352 prometheus.GaugeValue, 353 float64(v), 354 bucket, 355 k, 356 ) 357 } 358 for k, v := range usageInfo.ObjectVersionsHistogram { 359 ch <- prometheus.MustNewConstMetric( 360 prometheus.NewDesc( 361 prometheus.BuildFQName(bucketNamespace, "objects", "histogram"), 362 "Total number of versions of objects in a bucket", 363 []string{"bucket", "object_versions"}, nil), 364 prometheus.GaugeValue, 365 float64(v), 366 bucket, 367 k, 368 ) 369 } 370 } 371 } 372 373 // collects storage metrics for MinIO server in Prometheus specific format 374 // and sends to given channel 375 func storageMetricsPrometheus(ch chan<- prometheus.Metric) { 376 objLayer := newObjectLayerFn() 377 // Service not initialized yet 378 if objLayer == nil { 379 return 380 } 381 382 server := getLocalServerProperty(globalEndpoints, &http.Request{ 383 Host: globalLocalNodeName, 384 }, true) 385 386 onlineDisks, offlineDisks := getOnlineOfflineDisksStats(server.Disks) 387 totalDisks := offlineDisks.Merge(onlineDisks) 388 389 // Report total capacity 390 ch <- prometheus.MustNewConstMetric( 391 prometheus.NewDesc( 392 prometheus.BuildFQName(minioNamespace, "capacity_raw", "total"), 393 "Total capacity online in the cluster", 394 nil, nil), 395 prometheus.GaugeValue, 396 float64(GetTotalCapacity(server.Disks)), 397 ) 398 399 // Report total capacity free 400 ch <- prometheus.MustNewConstMetric( 401 prometheus.NewDesc( 402 prometheus.BuildFQName(minioNamespace, "capacity_raw_free", "total"), 403 "Total free capacity online in the cluster", 404 nil, nil), 405 prometheus.GaugeValue, 406 float64(GetTotalCapacityFree(server.Disks)), 407 ) 408 409 sinfo := objLayer.StorageInfo(GlobalContext, true) 410 411 // Report total usable capacity 412 ch <- prometheus.MustNewConstMetric( 413 prometheus.NewDesc( 414 prometheus.BuildFQName(minioNamespace, "capacity_usable", "total"), 415 "Total usable capacity online in the cluster", 416 nil, nil), 417 prometheus.GaugeValue, 418 float64(GetTotalUsableCapacity(server.Disks, sinfo)), 419 ) 420 421 // Report total usable capacity free 422 ch <- prometheus.MustNewConstMetric( 423 prometheus.NewDesc( 424 prometheus.BuildFQName(minioNamespace, "capacity_usable_free", "total"), 425 "Total free usable capacity online in the cluster", 426 nil, nil), 427 prometheus.GaugeValue, 428 float64(GetTotalUsableCapacityFree(server.Disks, sinfo)), 429 ) 430 431 // MinIO Offline Disks per node 432 ch <- prometheus.MustNewConstMetric( 433 prometheus.NewDesc( 434 prometheus.BuildFQName(minioNamespace, "disks", "offline"), 435 "Total number of offline drives in current MinIO server instance", 436 nil, nil), 437 prometheus.GaugeValue, 438 float64(offlineDisks.Sum()), 439 ) 440 441 // MinIO Total Disks per node 442 ch <- prometheus.MustNewConstMetric( 443 prometheus.NewDesc( 444 prometheus.BuildFQName(minioNamespace, "drives", "total"), 445 "Total number of drives for current MinIO server instance", 446 nil, nil), 447 prometheus.GaugeValue, 448 float64(totalDisks.Sum()), 449 ) 450 451 for _, disk := range server.Disks { 452 // Total disk usage by the disk 453 ch <- prometheus.MustNewConstMetric( 454 prometheus.NewDesc( 455 prometheus.BuildFQName(diskNamespace, "storage", "used"), 456 "Total disk storage used on the drive", 457 []string{"disk"}, nil), 458 prometheus.GaugeValue, 459 float64(disk.UsedSpace), 460 disk.DrivePath, 461 ) 462 463 // Total available space in the disk 464 ch <- prometheus.MustNewConstMetric( 465 prometheus.NewDesc( 466 prometheus.BuildFQName(diskNamespace, "storage", "available"), 467 "Total available space left on the drive", 468 []string{"disk"}, nil), 469 prometheus.GaugeValue, 470 float64(disk.AvailableSpace), 471 disk.DrivePath, 472 ) 473 474 // Total storage space of the disk 475 ch <- prometheus.MustNewConstMetric( 476 prometheus.NewDesc( 477 prometheus.BuildFQName(diskNamespace, "storage", "total"), 478 "Total space on the drive", 479 []string{"disk"}, nil), 480 prometheus.GaugeValue, 481 float64(disk.TotalSpace), 482 disk.DrivePath, 483 ) 484 } 485 } 486 487 func metricsHandler() http.Handler { 488 registry := prometheus.NewRegistry() 489 490 logger.CriticalIf(GlobalContext, registry.Register(minioVersionInfo)) 491 492 logger.CriticalIf(GlobalContext, registry.Register(newMinioCollector())) 493 494 gatherers := prometheus.Gatherers{ 495 prometheus.DefaultGatherer, 496 registry, 497 } 498 499 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 500 tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt) 501 if ok { 502 tc.FuncName = "handler.MetricsLegacy" 503 tc.ResponseRecorder.LogErrBody = true 504 } 505 506 mfs, err := gatherers.Gather() 507 if err != nil { 508 if len(mfs) == 0 { 509 writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL) 510 return 511 } 512 } 513 514 contentType := expfmt.Negotiate(r.Header) 515 w.Header().Set("Content-Type", string(contentType)) 516 517 enc := expfmt.NewEncoder(w, contentType) 518 for _, mf := range mfs { 519 if err := enc.Encode(mf); err != nil { 520 // client may disconnect for any reasons 521 // we do not have to log this. 522 return 523 } 524 } 525 if closer, ok := enc.(expfmt.Closer); ok { 526 closer.Close() 527 } 528 }) 529 } 530 531 // NoAuthMiddleware no auth middle ware. 532 func NoAuthMiddleware(h http.Handler) http.Handler { 533 return h 534 } 535 536 // AuthMiddleware checks if the bearer token is valid and authorized. 537 func AuthMiddleware(h http.Handler) http.Handler { 538 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 539 tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt) 540 541 claims, groups, owner, authErr := metricsRequestAuthenticate(r) 542 if authErr != nil || (claims != nil && !claims.VerifyIssuer("prometheus", true)) { 543 if ok { 544 tc.FuncName = "handler.MetricsAuth" 545 tc.ResponseRecorder.LogErrBody = true 546 } 547 548 writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), errAuthentication), r.URL) 549 return 550 } 551 552 cred := auth.Credentials{ 553 AccessKey: claims.AccessKey, 554 Claims: claims.Map(), 555 Groups: groups, 556 } 557 558 // For authenticated users apply IAM policy. 559 if !globalIAMSys.IsAllowed(policy.Args{ 560 AccountName: cred.AccessKey, 561 Groups: cred.Groups, 562 Action: policy.PrometheusAdminAction, 563 ConditionValues: getConditionValues(r, "", cred), 564 IsOwner: owner, 565 Claims: cred.Claims, 566 }) { 567 if ok { 568 tc.FuncName = "handler.MetricsAuth" 569 tc.ResponseRecorder.LogErrBody = true 570 } 571 572 writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), errAuthentication), r.URL) 573 return 574 } 575 h.ServeHTTP(w, r) 576 }) 577 }