github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/site-replication-metrics.go

// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/minio/madmin-go/v3"
	"github.com/minio/minio-go/v7"
)

//go:generate msgp -file $GOFILE

// RStat has replication error stats
type RStat struct {
	Count int64 `json:"count"`
	Bytes int64 `json:"bytes"`
}

// RTimedMetrics has replication error stats for various time windows
type RTimedMetrics struct {
	LastHour    ReplicationLastHour `json:"lastHour"`
	SinceUptime RStat               `json:"sinceUptime"`
	LastMinute  ReplicationLastMinute
	// ErrCounts tracks the number of replication errors by error code;
	// currently only "AccessDenied" (credential) errors are counted.
	ErrCounts map[string]int `json:"errCounts"`
}

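// String returns a human-readable summary of the replication errors seen in
// the last minute, last hour and since uptime.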
func (rt *RTimedMetrics) String() string {
	s := rt.toMetric()
	return fmt.Sprintf("Errors in LastMinute: %v, LastHour: %v, SinceUptime: %v", s.LastMinute.Count, s.LastHour.Count, s.Totals.Count)
}

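// toMetric converts the in-memory timed error stats into their
// madmin.TimedErrStats representation. A nil receiver returns a zero value.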
func (rt *RTimedMetrics) toMetric() madmin.TimedErrStats {
	if rt == nil {
		return madmin.TimedErrStats{}
	}
	errCounts := make(map[string]int)
	for k, v := range rt.ErrCounts {
		errCounts[k] = v
	}
	minuteTotals := rt.LastMinute.getTotal()
	hourTotals := rt.LastHour.getTotal()
	return madmin.TimedErrStats{
		LastMinute: madmin.RStat{
			Count: float64(minuteTotals.N),
			Bytes: minuteTotals.Size,
		},
		LastHour: madmin.RStat{
			Count: float64(hourTotals.N),
			Bytes: hourTotals.Size,
		},
		Totals: madmin.RStat{
			Count: float64(rt.SinceUptime.Count),
			Bytes: rt.SinceUptime.Bytes,
		},
		ErrCounts: errCounts,
	}
}

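// addsize records a failed replication of size bytes in the last-minute,
// last-hour and since-uptime windows; AccessDenied errors are additionally
// counted in ErrCounts.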
func (rt *RTimedMetrics) addsize(size int64, err error) {
	// failures seen since uptime
	atomic.AddInt64(&rt.SinceUptime.Bytes, size)
	atomic.AddInt64(&rt.SinceUptime.Count, 1)
	rt.LastMinute.addsize(size)
	rt.LastHour.addsize(size)
	if err != nil && minio.ToErrorResponse(err).Code == "AccessDenied" {
		if rt.ErrCounts == nil {
			rt.ErrCounts = make(map[string]int)
		}
		rt.ErrCounts["AccessDenied"]++
	}
}

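// merge returns the sum of the error stats tracked by rt and o.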
func (rt *RTimedMetrics) merge(o RTimedMetrics) (n RTimedMetrics) {
	n.SinceUptime.Bytes = atomic.LoadInt64(&rt.SinceUptime.Bytes) + atomic.LoadInt64(&o.SinceUptime.Bytes)
	n.SinceUptime.Count = atomic.LoadInt64(&rt.SinceUptime.Count) + atomic.LoadInt64(&o.SinceUptime.Count)

	n.LastMinute = n.LastMinute.merge(rt.LastMinute)
	n.LastMinute = n.LastMinute.merge(o.LastMinute)
	n.LastHour = n.LastHour.merge(rt.LastHour)
	n.LastHour = n.LastHour.merge(o.LastHour)
	n.ErrCounts = make(map[string]int)
	for k, v := range rt.ErrCounts {
		n.ErrCounts[k] = v
	}
	for k, v := range o.ErrCounts {
		n.ErrCounts[k] += v
	}
	return n
}

// SRStats has replication stats at site level
type SRStats struct {
	// Total Replica size in bytes
	ReplicaSize int64 `json:"replicaSize"`
	// Total number of replicas received
	ReplicaCount int64                `json:"replicaCount"`
	M            map[string]*SRStatus `json:"srStatusMap"`

	movingAvgTicker *time.Ticker // Ticker for calculating moving averages
	lock            sync.RWMutex // mutex for srStats
}

// SRStatus has replication stats at deployment level
type SRStatus struct {
	// Total size in bytes of completed replication operations
	ReplicatedSize int64 `json:"completedReplicationSize"`
	// Failed operations, including metadata updates, tracked across time windows
	Failed RTimedMetrics `json:"failedReplication"`
	// Total number of completed operations
	ReplicatedCount int64 `json:"replicationCount"`
	// Replication latency information
	Latency ReplicationLatency `json:"replicationLatency"`
	// transfer rate for large uploads
	XferRateLrg *XferStats `json:"largeTransferRate" msg:"lt"`
	// transfer rate for small uploads
	XferRateSml *XferStats `json:"smallTransferRate" msg:"st"`
	// Endpoint is the replication target endpoint
	Endpoint string `json:"-"`
	// Secure is true if the replication target endpoint is secure
	Secure bool `json:"-"`
}

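// update records a single replication event for peer deployment dID, creating
// its SRStatus entry on first use. Completed transfers update size, count,
// latency and transfer-rate stats; failed transfers update the error windows;
// pending events only refresh the endpoint details.
//
// Illustrative usage; the endpoint and deployment ID below are hypothetical:
//
//	srStats := newSRStats()
//	srStats.update(replStat{
//		Endpoint:         "peer.example.com:9000",
//		Secure:           true,
//		Completed:        true,
//		TransferSize:     64 << 20, // 64 MiB
//		TransferDuration: 2 * time.Second,
//	}, "peer-deployment-id")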
func (sr *SRStats) update(st replStat, dID string) {
	sr.lock.Lock()
	defer sr.lock.Unlock()
	srs, ok := sr.M[dID]
	if !ok {
		srs = &SRStatus{
			XferRateLrg: newXferStats(),
			XferRateSml: newXferStats(),
		}
	}
	srs.Endpoint = st.Endpoint
	srs.Secure = st.Secure
	switch {
	case st.Completed:
		srs.ReplicatedSize += st.TransferSize
		srs.ReplicatedCount++
		if st.TransferDuration > 0 {
			srs.Latency.update(st.TransferSize, st.TransferDuration)
			srs.updateXferRate(st.TransferSize, st.TransferDuration)
		}
	case st.Failed:
		srs.Failed.addsize(st.TransferSize, st.Err)
	case st.Pending:
	}
	sr.M[dID] = srs
}

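// get returns a snapshot of per-deployment replication metrics, combining the
// accumulated transfer stats with endpoint health from globalBucketTargetSys.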
func (sr *SRStats) get() map[string]SRMetric {
	epMap := globalBucketTargetSys.healthStats()

	sr.lock.RLock()
	defer sr.lock.RUnlock()
	m := make(map[string]SRMetric, len(sr.M))
	for dID, v := range sr.M {
		t := newXferStats()
		mx := make(map[RMetricName]XferStats)

		if v.XferRateLrg != nil {
			mx[Large] = *v.XferRateLrg.Clone()
			m := t.merge(*v.XferRateLrg)
			t = &m
		}
		if v.XferRateSml != nil {
			mx[Small] = *v.XferRateSml.Clone()
			m := t.merge(*v.XferRateSml)
			t = &m
		}

		mx[Total] = *t
		metric := SRMetric{
			ReplicatedSize:  v.ReplicatedSize,
			ReplicatedCount: v.ReplicatedCount,
			DeploymentID:    dID,
			Failed:          v.Failed.toMetric(),
			XferStats:       mx,
		}
		epHealth, ok := epMap[v.Endpoint]
		if ok {
			metric.Endpoint = epHealth.Endpoint
			metric.TotalDowntime = epHealth.offlineDuration
			metric.LastOnline = epHealth.lastOnline
			metric.Online = epHealth.Online
			metric.Latency = madmin.LatencyStat{
				Curr: epHealth.latency.curr,
				Avg:  epHealth.latency.avg,
				Max:  epHealth.latency.peak,
			}
		}
		m[dID] = metric
	}
	return m
}

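// updateXferRate feeds a completed transfer into the large- or small-object
// transfer rate tracker depending on the object size.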
func (srs *SRStatus) updateXferRate(sz int64, duration time.Duration) {
	if sz > minLargeObjSize {
		srs.XferRateLrg.addSize(sz, duration)
	} else {
		srs.XferRateSml.addSize(sz, duration)
	}
}

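// newSRStats initializes site replication stats and starts a background
// goroutine that refreshes the transfer-rate moving averages every two seconds.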
func newSRStats() *SRStats {
	s := SRStats{
		M:               make(map[string]*SRStatus),
		movingAvgTicker: time.NewTicker(time.Second * 2),
	}
	go s.trackEWMA()
	return &s
}

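// trackEWMA updates the exponential moving averages on every ticker fire and
// exits when the global context is canceled.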
func (sr *SRStats) trackEWMA() {
	for {
		select {
		case <-sr.movingAvgTicker.C:
			sr.updateMovingAvg()
		case <-GlobalContext.Done():
			return
		}
	}
}

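// updateMovingAvg recomputes the exponential moving averages of the transfer
// rates for all peers while holding the stats lock.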
func (sr *SRStats) updateMovingAvg() {
	sr.lock.Lock()
	defer sr.lock.Unlock()
	for _, s := range sr.M {
		s.XferRateLrg.measure.updateExponentialMovingAverage(time.Now())
		s.XferRateSml.measure.updateExponentialMovingAverage(time.Now())
	}
}

// SRMetric captures replication metrics for a deployment
type SRMetric struct {
	DeploymentID  string             `json:"deploymentID"`
	Endpoint      string             `json:"endpoint"`
	TotalDowntime time.Duration      `json:"totalDowntime"`
	LastOnline    time.Time          `json:"lastOnline"`
	Online        bool               `json:"isOnline"`
	Latency       madmin.LatencyStat `json:"latency"`

	// Rolled-up replication metrics across buckets
	ReplicatedSize int64 `json:"replicatedSize"`
	// Total number of completed operations
	ReplicatedCount int64 `json:"replicatedCount"`
	// Failed captures replication errors in various time windows
	Failed madmin.TimedErrStats `json:"failed,omitempty"`

	XferStats map[RMetricName]XferStats `json:"transferSummary"`
}

// SRMetricsSummary captures a summary of replication counts across buckets on a site
// along with an op metrics rollup.
type SRMetricsSummary struct {
	// Op metrics rollup
	ActiveWorkers ActiveWorkerStat `json:"activeWorkers"`

	// Total Replica size in bytes
	ReplicaSize int64 `json:"replicaSize"`

	// Total number of replicas received
	ReplicaCount int64 `json:"replicaCount"`
	// Queued operations
	Queued InQueueMetric `json:"queued"`
	// Proxy stats
	Proxied ProxyMetric `json:"proxied"`
	// Replication metrics summary for each site replication peer
	Metrics map[string]SRMetric `json:"replMetrics"`
	// Uptime of the node being queried for site replication metrics
	Uptime int64 `json:"uptime"`
}