github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/metrics.go (about)

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"net/http"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/minio/minio/internal/auth"
    26  	"github.com/minio/minio/internal/logger"
    27  	"github.com/minio/minio/internal/mcontext"
    28  	"github.com/minio/pkg/v2/policy"
    29  	"github.com/prometheus/client_golang/prometheus"
    30  	"github.com/prometheus/common/expfmt"
    31  )
    32  
// Prometheus metric vectors declared by this file.
var (
	// httpRequestsDuration is a histogram vector partitioned by the "api"
	// label. It is registered with the default Prometheus registry in init().
	httpRequestsDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "s3_ttfb_seconds",
			Help:    "Time taken by requests served by current MinIO server instance",
			Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10},
		},
		[]string{"api"},
	)
	// bucketHTTPRequestsDuration uses the same metric name and buckets as
	// httpRequestsDuration but adds a "bucket" label. It is not registered
	// in this file's init().
	// NOTE(review): presumably collected through another path - confirm, since
	// two vectors with the same name cannot share one registry.
	bucketHTTPRequestsDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "s3_ttfb_seconds",
			Help:    "Time taken by requests served by current MinIO server instance per bucket",
			Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10},
		},
		[]string{"api", "bucket"},
	)
	// minioVersionInfo is a gauge vector labelled with the server version
	// and commit id; minioCollector.Collect sets it to 1 for the running
	// build.
	minioVersionInfo = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "minio",
			Name:      "version_info",
			Help:      "Version of current MinIO server instance",
		},
		[]string{
			// current version
			"version",
			// commit-id of the current version
			"commit",
		},
	)
)
    64  
// Namespace strings used as the first component of the fully-qualified
// metric names built with prometheus.BuildFQName in this file.
const (
	healMetricsNamespace = "self_heal" // healing metrics
	cacheNamespace       = "cache"     // not referenced in this file
	s3Namespace          = "s3"        // S3 request and traffic metrics
	bucketNamespace      = "bucket"    // per-bucket usage metrics
	minioNamespace       = "minio"     // node, capacity and drive count metrics
	diskNamespace        = "disk"      // per-drive storage metrics
	interNodeNamespace   = "internode" // peer-to-peer traffic metrics
)
    74  
    75  func init() {
    76  	prometheus.MustRegister(httpRequestsDuration)
    77  	prometheus.MustRegister(newMinioCollector())
    78  	prometheus.MustRegister(minioVersionInfo)
    79  }
    80  
    81  // newMinioCollector describes the collector
    82  // and returns reference of minioCollector
    83  // It creates the Prometheus Description which is used
    84  // to define metric and  help string
    85  func newMinioCollector() *minioCollector {
    86  	return &minioCollector{
    87  		desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil),
    88  	}
    89  }
    90  
// minioCollector is the custom Prometheus collector for the metrics
// produced by this file. Its Collect method builds every metric on demand
// rather than keeping long-lived metric objects.
type minioCollector struct {
	// desc is the descriptor sent by Describe.
	desc *prometheus.Desc
}
    95  
// Describe implements the Describe half of the Prometheus collector
// contract by sending the collector's single descriptor on ch. The
// descriptors of the individual metrics emitted by Collect are not sent.
func (c *minioCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.desc
}
   100  
   101  // Collect is called by the Prometheus registry when collecting metrics.
   102  func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
   103  	// Expose MinIO's version information
   104  	minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)
   105  
   106  	storageMetricsPrometheus(ch)
   107  	nodeHealthMetricsPrometheus(ch)
   108  	bucketUsageMetricsPrometheus(ch)
   109  	networkMetricsPrometheus(ch)
   110  	httpMetricsPrometheus(ch)
   111  	healingMetricsPrometheus(ch)
   112  }
   113  
   114  func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
   115  	nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
   116  	ch <- prometheus.MustNewConstMetric(
   117  		prometheus.NewDesc(
   118  			prometheus.BuildFQName(minioNamespace, "nodes", "online"),
   119  			"Total number of MinIO nodes online",
   120  			nil, nil),
   121  		prometheus.GaugeValue,
   122  		float64(nodesUp),
   123  	)
   124  	ch <- prometheus.MustNewConstMetric(
   125  		prometheus.NewDesc(
   126  			prometheus.BuildFQName(minioNamespace, "nodes", "offline"),
   127  			"Total number of MinIO nodes offline",
   128  			nil, nil),
   129  		prometheus.GaugeValue,
   130  		float64(nodesDown),
   131  	)
   132  }
   133  
   134  // collects healing specific metrics for MinIO instance in Prometheus specific format
   135  // and sends to given channel
   136  func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
   137  	bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
   138  	if !exists {
   139  		return
   140  	}
   141  
   142  	var dur time.Duration
   143  	if !bgSeq.lastHealActivity.IsZero() {
   144  		dur = time.Since(bgSeq.lastHealActivity)
   145  	}
   146  
   147  	ch <- prometheus.MustNewConstMetric(
   148  		prometheus.NewDesc(
   149  			prometheus.BuildFQName(healMetricsNamespace, "time", "since_last_activity"),
   150  			"Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity",
   151  			nil, nil),
   152  		prometheus.GaugeValue,
   153  		float64(dur),
   154  	)
   155  	for k, v := range bgSeq.getScannedItemsMap() {
   156  		ch <- prometheus.MustNewConstMetric(
   157  			prometheus.NewDesc(
   158  				prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
   159  				"Objects scanned in current self healing run",
   160  				[]string{"type"}, nil),
   161  			prometheus.GaugeValue,
   162  			float64(v), string(k),
   163  		)
   164  	}
   165  	for k, v := range bgSeq.getHealedItemsMap() {
   166  		ch <- prometheus.MustNewConstMetric(
   167  			prometheus.NewDesc(
   168  				prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
   169  				"Objects healed in current self healing run",
   170  				[]string{"type"}, nil),
   171  			prometheus.GaugeValue,
   172  			float64(v), string(k),
   173  		)
   174  	}
   175  	for k, v := range bgSeq.gethealFailedItemsMap() {
   176  		// healFailedItemsMap stores the endpoint and volume state separated by comma,
   177  		// split the fields and pass to channel at correct index
   178  		s := strings.Split(k, ",")
   179  		ch <- prometheus.MustNewConstMetric(
   180  			prometheus.NewDesc(
   181  				prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
   182  				"Objects for which healing failed in current self healing run",
   183  				[]string{"mount_path", "volume_status"}, nil),
   184  			prometheus.GaugeValue,
   185  			float64(v), s[0], s[1],
   186  		)
   187  	}
   188  }
   189  
   190  // collects http metrics for MinIO server in Prometheus specific format
   191  // and sends to given channel
   192  func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
   193  	httpStats := globalHTTPStats.toServerHTTPStats(true)
   194  
   195  	for api, value := range httpStats.CurrentS3Requests.APIStats {
   196  		ch <- prometheus.MustNewConstMetric(
   197  			prometheus.NewDesc(
   198  				prometheus.BuildFQName(s3Namespace, "requests", "current"),
   199  				"Total number of running s3 requests in current MinIO server instance",
   200  				[]string{"api"}, nil),
   201  			prometheus.CounterValue,
   202  			float64(value),
   203  			api,
   204  		)
   205  	}
   206  
   207  	for api, value := range httpStats.TotalS3Requests.APIStats {
   208  		ch <- prometheus.MustNewConstMetric(
   209  			prometheus.NewDesc(
   210  				prometheus.BuildFQName(s3Namespace, "requests", "total"),
   211  				"Total number of s3 requests in current MinIO server instance",
   212  				[]string{"api"}, nil),
   213  			prometheus.CounterValue,
   214  			float64(value),
   215  			api,
   216  		)
   217  	}
   218  
   219  	for api, value := range httpStats.TotalS3Errors.APIStats {
   220  		ch <- prometheus.MustNewConstMetric(
   221  			prometheus.NewDesc(
   222  				prometheus.BuildFQName(s3Namespace, "errors", "total"),
   223  				"Total number of s3 errors in current MinIO server instance",
   224  				[]string{"api"}, nil),
   225  			prometheus.CounterValue,
   226  			float64(value),
   227  			api,
   228  		)
   229  	}
   230  
   231  	for api, value := range httpStats.TotalS3Canceled.APIStats {
   232  		ch <- prometheus.MustNewConstMetric(
   233  			prometheus.NewDesc(
   234  				prometheus.BuildFQName(s3Namespace, "canceled", "total"),
   235  				"Total number of client canceled s3 request in current MinIO server instance",
   236  				[]string{"api"}, nil),
   237  			prometheus.CounterValue,
   238  			float64(value),
   239  			api,
   240  		)
   241  	}
   242  }
   243  
   244  // collects network metrics for MinIO server in Prometheus specific format
   245  // and sends to given channel
   246  func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
   247  	connStats := globalConnStats.toServerConnStats()
   248  
   249  	// Network Sent/Received Bytes (internode)
   250  	ch <- prometheus.MustNewConstMetric(
   251  		prometheus.NewDesc(
   252  			prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"),
   253  			"Total number of bytes sent to the other peer nodes by current MinIO server instance",
   254  			nil, nil),
   255  		prometheus.CounterValue,
   256  		float64(connStats.internodeOutputBytes),
   257  	)
   258  
   259  	ch <- prometheus.MustNewConstMetric(
   260  		prometheus.NewDesc(
   261  			prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"),
   262  			"Total number of internode bytes received by current MinIO server instance",
   263  			nil, nil),
   264  		prometheus.CounterValue,
   265  		float64(connStats.internodeInputBytes),
   266  	)
   267  
   268  	// Network Sent/Received Bytes (Outbound)
   269  	ch <- prometheus.MustNewConstMetric(
   270  		prometheus.NewDesc(
   271  			prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"),
   272  			"Total number of s3 bytes sent by current MinIO server instance",
   273  			nil, nil),
   274  		prometheus.CounterValue,
   275  		float64(connStats.s3OutputBytes),
   276  	)
   277  
   278  	ch <- prometheus.MustNewConstMetric(
   279  		prometheus.NewDesc(
   280  			prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"),
   281  			"Total number of s3 bytes received by current MinIO server instance",
   282  			nil, nil),
   283  		prometheus.CounterValue,
   284  		float64(connStats.s3InputBytes),
   285  	)
   286  }
   287  
   288  // Populates prometheus with bucket usage metrics, this metrics
   289  // is only enabled if scanner is enabled.
   290  func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
   291  	objLayer := newObjectLayerFn()
   292  	// Service not initialized yet
   293  	if objLayer == nil {
   294  		return
   295  	}
   296  
   297  	dataUsageInfo, err := loadDataUsageFromBackend(GlobalContext, objLayer)
   298  	if err != nil {
   299  		return
   300  	}
   301  	// data usage has not captured any data yet.
   302  	if dataUsageInfo.LastUpdate.IsZero() {
   303  		return
   304  	}
   305  
   306  	for bucket, usageInfo := range dataUsageInfo.BucketsUsage {
   307  		stat := globalReplicationStats.getLatestReplicationStats(bucket)
   308  		// Total space used by bucket
   309  		ch <- prometheus.MustNewConstMetric(
   310  			prometheus.NewDesc(
   311  				prometheus.BuildFQName(bucketNamespace, "usage", "size"),
   312  				"Total bucket size",
   313  				[]string{"bucket"}, nil),
   314  			prometheus.GaugeValue,
   315  			float64(usageInfo.Size),
   316  			bucket,
   317  		)
   318  		ch <- prometheus.MustNewConstMetric(
   319  			prometheus.NewDesc(
   320  				prometheus.BuildFQName(bucketNamespace, "objects", "count"),
   321  				"Total number of objects in a bucket",
   322  				[]string{"bucket"}, nil),
   323  			prometheus.GaugeValue,
   324  			float64(usageInfo.ObjectsCount),
   325  			bucket,
   326  		)
   327  		ch <- prometheus.MustNewConstMetric(
   328  			prometheus.NewDesc(
   329  				prometheus.BuildFQName("bucket", "replication", "successful_size"),
   330  				"Total capacity replicated to destination",
   331  				[]string{"bucket"}, nil),
   332  			prometheus.GaugeValue,
   333  			float64(stat.ReplicationStats.ReplicatedSize),
   334  			bucket,
   335  		)
   336  		ch <- prometheus.MustNewConstMetric(
   337  			prometheus.NewDesc(
   338  				prometheus.BuildFQName("bucket", "replication", "received_size"),
   339  				"Total capacity replicated to this instance",
   340  				[]string{"bucket"}, nil),
   341  			prometheus.GaugeValue,
   342  			float64(stat.ReplicationStats.ReplicaSize),
   343  			bucket,
   344  		)
   345  
   346  		for k, v := range usageInfo.ObjectSizesHistogram {
   347  			ch <- prometheus.MustNewConstMetric(
   348  				prometheus.NewDesc(
   349  					prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
   350  					"Total number of objects of different sizes in a bucket",
   351  					[]string{"bucket", "object_size"}, nil),
   352  				prometheus.GaugeValue,
   353  				float64(v),
   354  				bucket,
   355  				k,
   356  			)
   357  		}
   358  		for k, v := range usageInfo.ObjectVersionsHistogram {
   359  			ch <- prometheus.MustNewConstMetric(
   360  				prometheus.NewDesc(
   361  					prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
   362  					"Total number of versions of objects in a bucket",
   363  					[]string{"bucket", "object_versions"}, nil),
   364  				prometheus.GaugeValue,
   365  				float64(v),
   366  				bucket,
   367  				k,
   368  			)
   369  		}
   370  	}
   371  }
   372  
   373  // collects storage metrics for MinIO server in Prometheus specific format
   374  // and sends to given channel
   375  func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
   376  	objLayer := newObjectLayerFn()
   377  	// Service not initialized yet
   378  	if objLayer == nil {
   379  		return
   380  	}
   381  
   382  	server := getLocalServerProperty(globalEndpoints, &http.Request{
   383  		Host: globalLocalNodeName,
   384  	}, true)
   385  
   386  	onlineDisks, offlineDisks := getOnlineOfflineDisksStats(server.Disks)
   387  	totalDisks := offlineDisks.Merge(onlineDisks)
   388  
   389  	// Report total capacity
   390  	ch <- prometheus.MustNewConstMetric(
   391  		prometheus.NewDesc(
   392  			prometheus.BuildFQName(minioNamespace, "capacity_raw", "total"),
   393  			"Total capacity online in the cluster",
   394  			nil, nil),
   395  		prometheus.GaugeValue,
   396  		float64(GetTotalCapacity(server.Disks)),
   397  	)
   398  
   399  	// Report total capacity free
   400  	ch <- prometheus.MustNewConstMetric(
   401  		prometheus.NewDesc(
   402  			prometheus.BuildFQName(minioNamespace, "capacity_raw_free", "total"),
   403  			"Total free capacity online in the cluster",
   404  			nil, nil),
   405  		prometheus.GaugeValue,
   406  		float64(GetTotalCapacityFree(server.Disks)),
   407  	)
   408  
   409  	sinfo := objLayer.StorageInfo(GlobalContext, true)
   410  
   411  	// Report total usable capacity
   412  	ch <- prometheus.MustNewConstMetric(
   413  		prometheus.NewDesc(
   414  			prometheus.BuildFQName(minioNamespace, "capacity_usable", "total"),
   415  			"Total usable capacity online in the cluster",
   416  			nil, nil),
   417  		prometheus.GaugeValue,
   418  		float64(GetTotalUsableCapacity(server.Disks, sinfo)),
   419  	)
   420  
   421  	// Report total usable capacity free
   422  	ch <- prometheus.MustNewConstMetric(
   423  		prometheus.NewDesc(
   424  			prometheus.BuildFQName(minioNamespace, "capacity_usable_free", "total"),
   425  			"Total free usable capacity online in the cluster",
   426  			nil, nil),
   427  		prometheus.GaugeValue,
   428  		float64(GetTotalUsableCapacityFree(server.Disks, sinfo)),
   429  	)
   430  
   431  	// MinIO Offline Disks per node
   432  	ch <- prometheus.MustNewConstMetric(
   433  		prometheus.NewDesc(
   434  			prometheus.BuildFQName(minioNamespace, "disks", "offline"),
   435  			"Total number of offline drives in current MinIO server instance",
   436  			nil, nil),
   437  		prometheus.GaugeValue,
   438  		float64(offlineDisks.Sum()),
   439  	)
   440  
   441  	// MinIO Total Disks per node
   442  	ch <- prometheus.MustNewConstMetric(
   443  		prometheus.NewDesc(
   444  			prometheus.BuildFQName(minioNamespace, "drives", "total"),
   445  			"Total number of drives for current MinIO server instance",
   446  			nil, nil),
   447  		prometheus.GaugeValue,
   448  		float64(totalDisks.Sum()),
   449  	)
   450  
   451  	for _, disk := range server.Disks {
   452  		// Total disk usage by the disk
   453  		ch <- prometheus.MustNewConstMetric(
   454  			prometheus.NewDesc(
   455  				prometheus.BuildFQName(diskNamespace, "storage", "used"),
   456  				"Total disk storage used on the drive",
   457  				[]string{"disk"}, nil),
   458  			prometheus.GaugeValue,
   459  			float64(disk.UsedSpace),
   460  			disk.DrivePath,
   461  		)
   462  
   463  		// Total available space in the disk
   464  		ch <- prometheus.MustNewConstMetric(
   465  			prometheus.NewDesc(
   466  				prometheus.BuildFQName(diskNamespace, "storage", "available"),
   467  				"Total available space left on the drive",
   468  				[]string{"disk"}, nil),
   469  			prometheus.GaugeValue,
   470  			float64(disk.AvailableSpace),
   471  			disk.DrivePath,
   472  		)
   473  
   474  		// Total storage space of the disk
   475  		ch <- prometheus.MustNewConstMetric(
   476  			prometheus.NewDesc(
   477  				prometheus.BuildFQName(diskNamespace, "storage", "total"),
   478  				"Total space on the drive",
   479  				[]string{"disk"}, nil),
   480  			prometheus.GaugeValue,
   481  			float64(disk.TotalSpace),
   482  			disk.DrivePath,
   483  		)
   484  	}
   485  }
   486  
   487  func metricsHandler() http.Handler {
   488  	registry := prometheus.NewRegistry()
   489  
   490  	logger.CriticalIf(GlobalContext, registry.Register(minioVersionInfo))
   491  
   492  	logger.CriticalIf(GlobalContext, registry.Register(newMinioCollector()))
   493  
   494  	gatherers := prometheus.Gatherers{
   495  		prometheus.DefaultGatherer,
   496  		registry,
   497  	}
   498  
   499  	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
   500  		tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
   501  		if ok {
   502  			tc.FuncName = "handler.MetricsLegacy"
   503  			tc.ResponseRecorder.LogErrBody = true
   504  		}
   505  
   506  		mfs, err := gatherers.Gather()
   507  		if err != nil {
   508  			if len(mfs) == 0 {
   509  				writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
   510  				return
   511  			}
   512  		}
   513  
   514  		contentType := expfmt.Negotiate(r.Header)
   515  		w.Header().Set("Content-Type", string(contentType))
   516  
   517  		enc := expfmt.NewEncoder(w, contentType)
   518  		for _, mf := range mfs {
   519  			if err := enc.Encode(mf); err != nil {
   520  				// client may disconnect for any reasons
   521  				// we do not have to log this.
   522  				return
   523  			}
   524  		}
   525  		if closer, ok := enc.(expfmt.Closer); ok {
   526  			closer.Close()
   527  		}
   528  	})
   529  }
   530  
// NoAuthMiddleware is a pass-through middleware: it returns h unchanged, so
// requests reach the wrapped handler without any authentication check.
func NoAuthMiddleware(h http.Handler) http.Handler {
	return h
}
   535  
   536  // AuthMiddleware checks if the bearer token is valid and authorized.
   537  func AuthMiddleware(h http.Handler) http.Handler {
   538  	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
   539  		tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
   540  
   541  		claims, groups, owner, authErr := metricsRequestAuthenticate(r)
   542  		if authErr != nil || (claims != nil && !claims.VerifyIssuer("prometheus", true)) {
   543  			if ok {
   544  				tc.FuncName = "handler.MetricsAuth"
   545  				tc.ResponseRecorder.LogErrBody = true
   546  			}
   547  
   548  			writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), errAuthentication), r.URL)
   549  			return
   550  		}
   551  
   552  		cred := auth.Credentials{
   553  			AccessKey: claims.AccessKey,
   554  			Claims:    claims.Map(),
   555  			Groups:    groups,
   556  		}
   557  
   558  		// For authenticated users apply IAM policy.
   559  		if !globalIAMSys.IsAllowed(policy.Args{
   560  			AccountName:     cred.AccessKey,
   561  			Groups:          cred.Groups,
   562  			Action:          policy.PrometheusAdminAction,
   563  			ConditionValues: getConditionValues(r, "", cred),
   564  			IsOwner:         owner,
   565  			Claims:          cred.Claims,
   566  		}) {
   567  			if ok {
   568  				tc.FuncName = "handler.MetricsAuth"
   569  				tc.ResponseRecorder.LogErrBody = true
   570  			}
   571  
   572  			writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), errAuthentication), r.URL)
   573  			return
   574  		}
   575  		h.ServeHTTP(w, r)
   576  	})
   577  }