github.com/minio/madmin-go@v1.7.5/metrics.go (about)

     1  //
     2  // MinIO Object Storage (c) 2022 MinIO, Inc.
     3  //
     4  // Licensed under the Apache License, Version 2.0 (the "License");
     5  // you may not use this file except in compliance with the License.
     6  // You may obtain a copy of the License at
     7  //
     8  //      http://www.apache.org/licenses/LICENSE-2.0
     9  //
    10  // Unless required by applicable law or agreed to in writing, software
    11  // distributed under the License is distributed on an "AS IS" BASIS,
    12  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  // See the License for the specific language governing permissions and
    14  // limitations under the License.
    15  //
    16  
    17  package madmin
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"fmt"
    24  	"io"
    25  	"net/http"
    26  	"net/url"
    27  	"sort"
    28  	"strconv"
    29  	"strings"
    30  	"time"
    31  )
    32  
    33  // MetricType is a bitfield representation of different metric types.
    34  type MetricType uint32
    35  
    36  // MetricsNone indicates no metrics.
    37  const MetricsNone MetricType = 0
    38  
    39  const (
    40  	MetricsScanner MetricType = 1 << (iota)
    41  	MetricsDisk
    42  	MetricsOS
    43  	MetricsBatchJobs
    44  	MetricsSiteResync
    45  
    46  	// MetricsAll must be last.
    47  	// Enables all metrics.
    48  	MetricsAll = 1<<(iota) - 1
    49  )
    50  
    51  // MetricsOptions are options provided to Metrics call.
    52  type MetricsOptions struct {
    53  	Type     MetricType    // Return only these metric types. Several types can be combined using |. Leave at 0 to return all.
    54  	N        int           // Maximum number of samples to return. 0 will return endless stream.
    55  	Interval time.Duration // Interval between samples. Will be rounded up to 1s.
    56  	Hosts    []string      // Leave empty for all
    57  	ByHost   bool          // Return metrics by host.
    58  	Disks    []string
    59  	ByDisk   bool
    60  	ByJobID  string
    61  	ByDepID  string
    62  }
    63  
    64  // Metrics makes an admin call to retrieve metrics.
    65  // The provided function is called for each received entry.
    66  func (adm *AdminClient) Metrics(ctx context.Context, o MetricsOptions, out func(RealtimeMetrics)) (err error) {
    67  	path := fmt.Sprintf(adminAPIPrefix + "/metrics")
    68  	q := make(url.Values)
    69  	q.Set("types", strconv.FormatUint(uint64(o.Type), 10))
    70  	q.Set("n", strconv.Itoa(o.N))
    71  	q.Set("interval", o.Interval.String())
    72  	q.Set("hosts", strings.Join(o.Hosts, ","))
    73  	if o.ByHost {
    74  		q.Set("by-host", "true")
    75  	}
    76  	q.Set("disks", strings.Join(o.Disks, ","))
    77  	if o.ByDisk {
    78  		q.Set("by-disk", "true")
    79  	}
    80  	if o.ByJobID != "" {
    81  		q.Set("by-jobID", o.ByJobID)
    82  	}
    83  	if o.ByDepID != "" {
    84  		q.Set("by-depID", o.ByDepID)
    85  	}
    86  
    87  	resp, err := adm.executeMethod(ctx,
    88  		http.MethodGet, requestData{
    89  			relPath:     path,
    90  			queryValues: q,
    91  		},
    92  	)
    93  	if err != nil {
    94  		return err
    95  	}
    96  
    97  	if resp.StatusCode != http.StatusOK {
    98  		closeResponse(resp)
    99  		return httpRespToErrorResponse(resp)
   100  	}
   101  	defer closeResponse(resp)
   102  	dec := json.NewDecoder(resp.Body)
   103  	for {
   104  		var m RealtimeMetrics
   105  		err := dec.Decode(&m)
   106  		if err != nil {
   107  			if errors.Is(err, io.EOF) {
   108  				err = io.ErrUnexpectedEOF
   109  			}
   110  			return err
   111  		}
   112  		out(m)
   113  		if m.Final {
   114  			break
   115  		}
   116  	}
   117  	return nil
   118  }
   119  
   120  // Contains returns whether m contains all of x.
   121  func (m MetricType) Contains(x MetricType) bool {
   122  	return m&x == x
   123  }
   124  
   125  // RealtimeMetrics provides realtime metrics.
   126  // This is intended to be expanded over time to cover more types.
   127  type RealtimeMetrics struct {
   128  	// Error indicates an error occurred.
   129  	Errors []string `json:"errors,omitempty"`
   130  	// Hosts indicates the scanned hosts
   131  	Hosts      []string              `json:"hosts"`
   132  	Aggregated Metrics               `json:"aggregated"`
   133  	ByHost     map[string]Metrics    `json:"by_host,omitempty"`
   134  	ByDisk     map[string]DiskMetric `json:"by_disk,omitempty"`
   135  	// Final indicates whether this is the final packet and the receiver can exit.
   136  	Final bool `json:"final"`
   137  }
   138  
   139  // Metrics contains all metric types.
   140  type Metrics struct {
   141  	Scanner    *ScannerMetrics    `json:"scanner,omitempty"`
   142  	Disk       *DiskMetric        `json:"disk,omitempty"`
   143  	OS         *OSMetrics         `json:"os,omitempty"`
   144  	BatchJobs  *BatchJobMetrics   `json:"batchJobs,omitempty"`
   145  	SiteResync *SiteResyncMetrics `json:"siteResync,omitempty"`
   146  }
   147  
   148  // Merge other into r.
   149  func (r *Metrics) Merge(other *Metrics) {
   150  	if other == nil {
   151  		return
   152  	}
   153  	if r.Scanner == nil && other.Scanner != nil {
   154  		r.Scanner = &ScannerMetrics{}
   155  	}
   156  	r.Scanner.Merge(other.Scanner)
   157  
   158  	if r.Disk == nil && other.Disk != nil {
   159  		r.Disk = &DiskMetric{}
   160  	}
   161  	r.Disk.Merge(other.Disk)
   162  
   163  	if r.OS == nil && other.OS != nil {
   164  		r.OS = &OSMetrics{}
   165  	}
   166  	r.OS.Merge(other.OS)
   167  	if r.BatchJobs == nil && other.BatchJobs != nil {
   168  		r.BatchJobs = &BatchJobMetrics{}
   169  	}
   170  	r.BatchJobs.Merge(other.BatchJobs)
   171  
   172  	if r.SiteResync == nil && other.SiteResync != nil {
   173  		r.SiteResync = &SiteResyncMetrics{}
   174  	}
   175  	r.SiteResync.Merge(other.SiteResync)
   176  }
   177  
   178  // Merge will merge other into r.
   179  func (r *RealtimeMetrics) Merge(other *RealtimeMetrics) {
   180  	if other == nil {
   181  		return
   182  	}
   183  
   184  	if len(other.Errors) > 0 {
   185  		r.Errors = append(r.Errors, other.Errors...)
   186  	}
   187  
   188  	if r.ByHost == nil && len(other.ByHost) > 0 {
   189  		r.ByHost = make(map[string]Metrics, len(other.ByHost))
   190  	}
   191  	for host, metrics := range other.ByHost {
   192  		r.ByHost[host] = metrics
   193  	}
   194  
   195  	r.Hosts = append(r.Hosts, other.Hosts...)
   196  	r.Aggregated.Merge(&other.Aggregated)
   197  	sort.Strings(r.Hosts)
   198  
   199  	// Gather per disk metrics
   200  	if r.ByDisk == nil && len(other.ByDisk) > 0 {
   201  		r.ByDisk = make(map[string]DiskMetric, len(other.ByDisk))
   202  	}
   203  	for disk, metrics := range other.ByDisk {
   204  		r.ByDisk[disk] = metrics
   205  	}
   206  }
   207  
   208  // ScannerMetrics contains scanner information.
   209  type ScannerMetrics struct {
   210  	// Time these metrics were collected
   211  	CollectedAt time.Time `json:"collected"`
   212  
   213  	// Current scanner cycle
   214  	CurrentCycle uint64 `json:"current_cycle"`
   215  
   216  	// Start time of current cycle
   217  	CurrentStarted time.Time `json:"current_started"`
   218  
   219  	// History of when last cycles completed
   220  	CyclesCompletedAt []time.Time `json:"cycle_complete_times"`
   221  
   222  	// Number of accumulated operations by type since server restart.
   223  	LifeTimeOps map[string]uint64 `json:"life_time_ops,omitempty"`
   224  
   225  	// Number of accumulated ILM operations by type since server restart.
   226  	LifeTimeILM map[string]uint64 `json:"ilm_ops,omitempty"`
   227  
   228  	// Last minute operation statistics.
   229  	LastMinute struct {
   230  		// Scanner actions.
   231  		Actions map[string]TimedAction `json:"actions,omitempty"`
   232  		// ILM actions.
   233  		ILM map[string]TimedAction `json:"ilm,omitempty"`
   234  	} `json:"last_minute"`
   235  
   236  	// Currently active path(s) being scanned.
   237  	ActivePaths []string `json:"active,omitempty"`
   238  }
   239  
   240  // TimedAction contains a number of actions and their accumulated duration in nanoseconds.
   241  type TimedAction struct {
   242  	Count   uint64 `json:"count"`
   243  	AccTime uint64 `json:"acc_time_ns"`
   244  	Bytes   uint64 `json:"bytes,omitempty"`
   245  }
   246  
   247  // Avg returns the average time spent on the action.
   248  func (t TimedAction) Avg() time.Duration {
   249  	if t.Count == 0 {
   250  		return 0
   251  	}
   252  	return time.Duration(t.AccTime / t.Count)
   253  }
   254  
   255  // AvgBytes returns the average time spent on the action.
   256  func (t TimedAction) AvgBytes() uint64 {
   257  	if t.Count == 0 {
   258  		return 0
   259  	}
   260  	return t.Bytes / t.Count
   261  }
   262  
   263  // Merge other into t.
   264  func (t *TimedAction) Merge(other TimedAction) {
   265  	t.Count += other.Count
   266  	t.AccTime += other.AccTime
   267  	t.Bytes += other.Bytes
   268  }
   269  
   270  // Merge other into 's'.
   271  func (s *ScannerMetrics) Merge(other *ScannerMetrics) {
   272  	if other == nil {
   273  		return
   274  	}
   275  	if s.CollectedAt.Before(other.CollectedAt) {
   276  		// Use latest timestamp
   277  		s.CollectedAt = other.CollectedAt
   278  	}
   279  	if s.CurrentCycle < other.CurrentCycle {
   280  		s.CurrentCycle = other.CurrentCycle
   281  		s.CyclesCompletedAt = other.CyclesCompletedAt
   282  		s.CurrentStarted = other.CurrentStarted
   283  	}
   284  	if len(other.CyclesCompletedAt) > len(s.CyclesCompletedAt) {
   285  		s.CyclesCompletedAt = other.CyclesCompletedAt
   286  	}
   287  
   288  	// Regular ops
   289  	if len(other.LifeTimeOps) > 0 && s.LifeTimeOps == nil {
   290  		s.LifeTimeOps = make(map[string]uint64, len(other.LifeTimeOps))
   291  	}
   292  	for k, v := range other.LifeTimeOps {
   293  		total := s.LifeTimeOps[k] + v
   294  		s.LifeTimeOps[k] = total
   295  	}
   296  	if s.LastMinute.Actions == nil && len(other.LastMinute.Actions) > 0 {
   297  		s.LastMinute.Actions = make(map[string]TimedAction, len(other.LastMinute.Actions))
   298  	}
   299  	for k, v := range other.LastMinute.Actions {
   300  		total := s.LastMinute.Actions[k]
   301  		total.Merge(v)
   302  		s.LastMinute.Actions[k] = total
   303  	}
   304  
   305  	// ILM
   306  	if len(other.LifeTimeILM) > 0 && s.LifeTimeILM == nil {
   307  		s.LifeTimeILM = make(map[string]uint64, len(other.LifeTimeILM))
   308  	}
   309  	for k, v := range other.LifeTimeILM {
   310  		total := s.LifeTimeILM[k] + v
   311  		s.LifeTimeILM[k] = total
   312  	}
   313  	if s.LastMinute.ILM == nil && len(other.LastMinute.ILM) > 0 {
   314  		s.LastMinute.ILM = make(map[string]TimedAction, len(other.LastMinute.ILM))
   315  	}
   316  	for k, v := range other.LastMinute.ILM {
   317  		total := s.LastMinute.ILM[k]
   318  		total.Merge(v)
   319  		s.LastMinute.ILM[k] = total
   320  	}
   321  	s.ActivePaths = append(s.ActivePaths, other.ActivePaths...)
   322  	sort.Strings(s.ActivePaths)
   323  }
   324  
   325  // DiskIOStats contains IO stats of a single drive
   326  type DiskIOStats struct {
   327  	ReadIOs        uint64 `json:"read_ios"`
   328  	ReadMerges     uint64 `json:"read_merges"`
   329  	ReadSectors    uint64 `json:"read_sectors"`
   330  	ReadTicks      uint64 `json:"read_ticks"`
   331  	WriteIOs       uint64 `json:"write_ios"`
   332  	WriteMerges    uint64 `json:"write_merges"`
   333  	WriteSectors   uint64 `json:"wrte_sectors"`
   334  	WriteTicks     uint64 `json:"write_ticks"`
   335  	CurrentIOs     uint64 `json:"current_ios"`
   336  	TotalTicks     uint64 `json:"total_ticks"`
   337  	ReqTicks       uint64 `json:"req_ticks"`
   338  	DiscardIOs     uint64 `json:"discard_ios"`
   339  	DiscardMerges  uint64 `json:"discard_merges"`
   340  	DiscardSectors uint64 `json:"discard_secotrs"`
   341  	DiscardTicks   uint64 `json:"discard_ticks"`
   342  	FlushIOs       uint64 `json:"flush_ios"`
   343  	FlushTicks     uint64 `json:"flush_ticks"`
   344  }
   345  
   346  // DiskMetric contains metrics for one or more disks.
   347  type DiskMetric struct {
   348  	// Time these metrics were collected
   349  	CollectedAt time.Time `json:"collected"`
   350  
   351  	// Number of disks
   352  	NDisks int `json:"n_disks"`
   353  
   354  	// Offline disks
   355  	Offline int `json:"offline,omitempty"`
   356  
   357  	// Healing disks
   358  	Healing int `json:"healing,omitempty"`
   359  
   360  	// Number of accumulated operations by type since server restart.
   361  	LifeTimeOps map[string]uint64 `json:"life_time_ops,omitempty"`
   362  
   363  	// Last minute statistics.
   364  	LastMinute struct {
   365  		Operations map[string]TimedAction `json:"operations,omitempty"`
   366  	} `json:"last_minute"`
   367  
   368  	IOStats DiskIOStats `json:"iostats,omitempty"`
   369  }
   370  
   371  // Merge other into 's'.
   372  func (d *DiskMetric) Merge(other *DiskMetric) {
   373  	if other == nil {
   374  		return
   375  	}
   376  	if d.CollectedAt.Before(other.CollectedAt) {
   377  		// Use latest timestamp
   378  		d.CollectedAt = other.CollectedAt
   379  	}
   380  	d.NDisks += other.NDisks
   381  	d.Offline += other.Offline
   382  	d.Healing += other.Healing
   383  
   384  	if len(other.LifeTimeOps) > 0 && d.LifeTimeOps == nil {
   385  		d.LifeTimeOps = make(map[string]uint64, len(other.LifeTimeOps))
   386  	}
   387  	for k, v := range other.LifeTimeOps {
   388  		total := d.LifeTimeOps[k] + v
   389  		d.LifeTimeOps[k] = total
   390  	}
   391  
   392  	if d.LastMinute.Operations == nil && len(other.LastMinute.Operations) > 0 {
   393  		d.LastMinute.Operations = make(map[string]TimedAction, len(other.LastMinute.Operations))
   394  	}
   395  	for k, v := range other.LastMinute.Operations {
   396  		total := d.LastMinute.Operations[k]
   397  		total.Merge(v)
   398  		d.LastMinute.Operations[k] = total
   399  	}
   400  }
   401  
   402  // OSMetrics contains metrics for OS operations.
   403  type OSMetrics struct {
   404  	// Time these metrics were collected
   405  	CollectedAt time.Time `json:"collected"`
   406  
   407  	// Number of accumulated operations by type since server restart.
   408  	LifeTimeOps map[string]uint64 `json:"life_time_ops,omitempty"`
   409  
   410  	// Last minute statistics.
   411  	LastMinute struct {
   412  		Operations map[string]TimedAction `json:"operations,omitempty"`
   413  	} `json:"last_minute"`
   414  }
   415  
   416  // Merge other into 'o'.
   417  func (o *OSMetrics) Merge(other *OSMetrics) {
   418  	if other == nil {
   419  		return
   420  	}
   421  	if o.CollectedAt.Before(other.CollectedAt) {
   422  		// Use latest timestamp
   423  		o.CollectedAt = other.CollectedAt
   424  	}
   425  
   426  	if len(other.LifeTimeOps) > 0 && o.LifeTimeOps == nil {
   427  		o.LifeTimeOps = make(map[string]uint64, len(other.LifeTimeOps))
   428  	}
   429  	for k, v := range other.LifeTimeOps {
   430  		total := o.LifeTimeOps[k] + v
   431  		o.LifeTimeOps[k] = total
   432  	}
   433  
   434  	if o.LastMinute.Operations == nil && len(other.LastMinute.Operations) > 0 {
   435  		o.LastMinute.Operations = make(map[string]TimedAction, len(other.LastMinute.Operations))
   436  	}
   437  	for k, v := range other.LastMinute.Operations {
   438  		total := o.LastMinute.Operations[k]
   439  		total.Merge(v)
   440  		o.LastMinute.Operations[k] = total
   441  	}
   442  }
   443  
   444  // BatchJobMetrics contains metrics for batch operations
   445  type BatchJobMetrics struct {
   446  	// Time these metrics were collected
   447  	CollectedAt time.Time `json:"collected"`
   448  
   449  	// Jobs by ID.
   450  	Jobs map[string]JobMetric
   451  }
   452  
   453  type JobMetric struct {
   454  	JobID         string    `json:"jobID"`
   455  	JobType       string    `json:"jobType"`
   456  	StartTime     time.Time `json:"startTime"`
   457  	LastUpdate    time.Time `json:"lastUpdate"`
   458  	RetryAttempts int       `json:"retryAttempts"`
   459  
   460  	Complete bool `json:"complete"`
   461  	Failed   bool `json:"failed"`
   462  
   463  	// Specific job type data:
   464  	Replicate *ReplicateInfo `json:"replicate,omitempty"`
   465  }
   466  
   467  type ReplicateInfo struct {
   468  	// Last bucket/object batch replicated
   469  	Bucket string `json:"lastBucket"`
   470  	Object string `json:"lastObject"`
   471  
   472  	// Verbose information
   473  	Objects          int64 `json:"objects"`
   474  	ObjectsFailed    int64 `json:"objectsFailed"`
   475  	BytesTransferred int64 `json:"bytesTransferred"`
   476  	BytesFailed      int64 `json:"bytesFailed"`
   477  }
   478  
   479  // Merge other into 'o'.
   480  func (o *BatchJobMetrics) Merge(other *BatchJobMetrics) {
   481  	if other == nil || len(other.Jobs) == 0 {
   482  		return
   483  	}
   484  	if o.CollectedAt.Before(other.CollectedAt) {
   485  		// Use latest timestamp
   486  		o.CollectedAt = other.CollectedAt
   487  	}
   488  	if o.Jobs == nil {
   489  		o.Jobs = make(map[string]JobMetric, len(other.Jobs))
   490  	}
   491  	// Job
   492  	for k, v := range other.Jobs {
   493  		o.Jobs[k] = v
   494  	}
   495  }
   496  
   497  // SiteResyncMetrics contains metrics for site resync operation
   498  type SiteResyncMetrics struct {
   499  	// Time these metrics were collected
   500  	CollectedAt time.Time `json:"collected"`
   501  	// Status of resync operation
   502  	ResyncStatus string    `json:"resyncStatus,omitempty"`
   503  	StartTime    time.Time `json:"startTime"`
   504  	LastUpdate   time.Time `json:"lastUpdate"`
   505  	NumBuckets   int64     `json:"numBuckets"`
   506  	ResyncID     string    `json:"resyncID"`
   507  	DeplID       string    `json:"deplID"`
   508  
   509  	// Completed size in bytes
   510  	ReplicatedSize int64 `json:"completedReplicationSize"`
   511  	// Total number of objects replicated
   512  	ReplicatedCount int64 `json:"replicationCount"`
   513  	// Failed size in bytes
   514  	FailedSize int64 `json:"failedReplicationSize"`
   515  	// Total number of failed operations
   516  	FailedCount int64 `json:"failedReplicationCount"`
   517  	// Buckets that could not be synced
   518  	FailedBuckets []string `json:"failedBuckets"`
   519  	// Last bucket/object replicated.
   520  	Bucket string `json:"bucket,omitempty"`
   521  	Object string `json:"object,omitempty"`
   522  }
   523  
   524  func (o SiteResyncMetrics) Complete() bool {
   525  	return strings.ToLower(o.ResyncStatus) == "completed"
   526  }
   527  
   528  // Merge other into 'o'.
   529  func (o *SiteResyncMetrics) Merge(other *SiteResyncMetrics) {
   530  	if other == nil {
   531  		return
   532  	}
   533  	if o.CollectedAt.Before(other.CollectedAt) {
   534  		// Use latest
   535  		*o = *other
   536  	}
   537  }