github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/bucket-replication-stats.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"sync"
	"sync/atomic"
	"time"

	"github.com/minio/minio/internal/bucket/replication"
	"github.com/rcrowley/go-metrics"
)

func (b *BucketReplicationStats) hasReplicationUsage() bool {
	for _, s := range b.Stats {
		if s.hasReplicationUsage() {
			return true
		}
	}
	return false
}

// ReplicationStats holds the global in-memory replication stats
type ReplicationStats struct {
	// map of site deployment ID to site replication status
	// for site replication - maintain stats at global level
	srStats *SRStats
	// active worker stats
	workers *ActiveWorkerStat
	// queue stats cache
	qCache queueCache

	pCache proxyStatsCache
	// mrf backlog stats
	mrfStats ReplicationMRFStats
	// for bucket replication, continue to use existing cache
	Cache             map[string]*BucketReplicationStats
	mostRecentStats   BucketStatsMap
	registry          metrics.Registry
	sync.RWMutex                 // mutex for Cache
	mostRecentStatsMu sync.Mutex // mutex for mostRecentStats

	wlock sync.RWMutex // mutex for active workers

	movingAvgTicker *time.Ticker // Ticker for calculating moving averages
	wTimer          *time.Ticker // ticker for calculating active workers
	qTimer          *time.Ticker // ticker for calculating queue stats
}
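
// A minimal read-side sketch, assuming r is a non-nil *ReplicationStats and
// "mybucket" is a hypothetical bucket: Get (defined below) returns a copy of
// the cached stats, keyed by target ARN, that is safe to read without holding
// any of the locks above.
//
//	for arn, tgt := range r.Get("mybucket").Stats {
//		fmt.Println(arn, tgt.ReplicatedCount, tgt.ReplicatedSize)
//	}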

func (r *ReplicationStats) trackEWMA() {
	for {
		select {
		case <-r.movingAvgTicker.C:
			r.updateMovingAvg()
		case <-GlobalContext.Done():
			return
		}
	}
}

func (r *ReplicationStats) updateMovingAvg() {
	r.RLock()
	for _, s := range r.Cache {
		for _, st := range s.Stats {
			st.XferRateLrg.measure.updateExponentialMovingAverage(time.Now())
			st.XferRateSml.measure.updateExponentialMovingAverage(time.Now())
		}
	}
	r.RUnlock()
}

// ActiveWorkers returns worker stats
func (r *ReplicationStats) ActiveWorkers() ActiveWorkerStat {
	r.wlock.RLock()
	defer r.wlock.RUnlock()
	w := r.workers.get()
	return ActiveWorkerStat{
		Curr: w.Curr,
		Max:  w.Max,
		Avg:  w.Avg,
	}
}
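
// A usage sketch, assuming r is non-nil (unlike the collectors below, this
// method does not guard against a nil receiver). The value returned is a
// copy, so it can be read without further locking:
//
//	w := r.ActiveWorkers()
//	fmt.Printf("replication workers: curr=%v max=%v avg=%v\n", w.Curr, w.Max, w.Avg)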

func (r *ReplicationStats) collectWorkerMetrics(ctx context.Context) {
	if r == nil {
		return
	}
	for {
		select {
		case <-ctx.Done():
			return
		case <-r.wTimer.C:
			r.wlock.Lock()
			r.workers.update()
			r.wlock.Unlock()
		}
	}
}

func (r *ReplicationStats) collectQueueMetrics(ctx context.Context) {
	if r == nil {
		return
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-r.qTimer.C:
			r.qCache.update()
		}
	}
}

// Delete deletes in-memory replication statistics for a bucket.
func (r *ReplicationStats) Delete(bucket string) {
	if r == nil {
		return
	}

	r.Lock()
	defer r.Unlock()
	delete(r.Cache, bucket)
}

// UpdateReplicaStat updates in-memory replica statistics with new values.
func (r *ReplicationStats) UpdateReplicaStat(bucket string, n int64) {
	if r == nil {
		return
	}

	r.Lock()
	defer r.Unlock()
	bs, ok := r.Cache[bucket]
	if !ok {
		bs = newBucketReplicationStats()
	}
	bs.ReplicaSize += n
	bs.ReplicaCount++
	r.Cache[bucket] = bs
	r.srUpdateReplicaStat(n)
}
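
// For instance, when this node stores an incoming replica, the data path is
// expected to record it roughly like this (bucket name and size variable are
// hypothetical):
//
//	r.UpdateReplicaStat("mybucket", size)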

func (r *ReplicationStats) srUpdateReplicaStat(sz int64) {
	if r == nil {
		return
	}
	atomic.AddInt64(&r.srStats.ReplicaSize, sz)
	atomic.AddInt64(&r.srStats.ReplicaCount, 1)
}

func (r *ReplicationStats) srUpdate(sr replStat) {
	dID, err := globalSiteReplicationSys.getDeplIDForEndpoint(sr.endpoint())
	if err == nil {
		r.srStats.update(sr, dID)
	}
}

// Update updates in-memory replication statistics with new values.
func (r *ReplicationStats) Update(bucket string, ri replicatedTargetInfo, status, prevStatus replication.StatusType) {
	if r == nil {
		return
	}
	var rs replStat
	switch status {
	case replication.Pending:
		if ri.OpType.IsDataReplication() && prevStatus != status {
			rs.set(ri.Arn, ri.Size, 0, status, ri.OpType, ri.endpoint, ri.secure, ri.Err)
		}
	case replication.Completed:
		if ri.OpType.IsDataReplication() {
			rs.set(ri.Arn, ri.Size, ri.Duration, status, ri.OpType, ri.endpoint, ri.secure, ri.Err)
		}
	case replication.Failed:
		if ri.OpType.IsDataReplication() && prevStatus == replication.Pending {
			rs.set(ri.Arn, ri.Size, ri.Duration, status, ri.OpType, ri.endpoint, ri.secure, ri.Err)
		}
	case replication.Replica:
		if ri.OpType == replication.ObjectReplicationType {
			rs.set(ri.Arn, ri.Size, 0, status, ri.OpType, "", false, ri.Err)
		}
	}

	// update site-replication in-memory stats
	if rs.Completed || rs.Failed {
		r.srUpdate(rs)
	}

	r.Lock()
	defer r.Unlock()

	// update bucket replication in-memory stats
	bs, ok := r.Cache[bucket]
	if !ok {
		bs = newBucketReplicationStats()
		r.Cache[bucket] = bs
	}
	b, ok := bs.Stats[ri.Arn]
	if !ok {
		b = &BucketReplicationStat{
			XferRateLrg: newXferStats(),
			XferRateSml: newXferStats(),
		}
		bs.Stats[ri.Arn] = b
	}

	switch {
	case rs.Completed:
		b.ReplicatedSize += rs.TransferSize
		b.ReplicatedCount++
		if rs.TransferDuration > 0 {
			b.Latency.update(rs.TransferSize, rs.TransferDuration)
			b.updateXferRate(rs.TransferSize, rs.TransferDuration)
		}
	case rs.Failed:
		b.FailStats.addsize(rs.TransferSize, rs.Err)
	case rs.Pending:
	}
}
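
// An illustrative call, with hypothetical values, recording a successful
// object replication to one target ARN (the field names match how
// replicatedTargetInfo is consumed above):
//
//	ri := replicatedTargetInfo{
//		Arn:      "arn:minio:replication::target1:mybucket",
//		Size:     1 << 20,
//		Duration: 250 * time.Millisecond,
//		OpType:   replication.ObjectReplicationType,
//	}
//	r.Update("mybucket", ri, replication.Completed, replication.Pending)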

type replStat struct {
	Arn       string
	Completed bool
	Pending   bool
	Failed    bool
	opType    replication.Type
	// transfer size
	TransferSize int64
	// transfer duration
	TransferDuration time.Duration
	Endpoint         string
	Secure           bool
	Err              error
}

func (rs *replStat) endpoint() string {
	scheme := "http"
	if rs.Secure {
		scheme = "https"
	}
	return scheme + "://" + rs.Endpoint
}

func (rs *replStat) set(arn string, n int64, duration time.Duration, status replication.StatusType, opType replication.Type, endpoint string, secure bool, err error) {
	rs.Endpoint = endpoint
	rs.Secure = secure
	rs.TransferSize = n
	rs.Arn = arn
	rs.TransferDuration = duration
	rs.opType = opType
	switch status {
	case replication.Completed:
		rs.Completed = true
	case replication.Pending:
		rs.Pending = true
	case replication.Failed:
		rs.Failed = true
		rs.Err = err
	}
}
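
// For example, a failed transfer recorded via set ends up with only the
// Failed flag raised and the error retained (all values here are
// hypothetical):
//
//	var rs replStat
//	rs.set("arn:minio:replication::target1:mybucket", 1<<20, 0,
//		replication.Failed, replication.ObjectReplicationType,
//		"replica.example.com:9000", true, errors.New("connection reset"))
//	// rs.Failed == true, rs.Err != nil
//	// rs.endpoint() == "https://replica.example.com:9000"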

// GetAll returns replication metrics for all buckets at once.
func (r *ReplicationStats) GetAll() map[string]BucketReplicationStats {
	if r == nil {
		return map[string]BucketReplicationStats{}
	}

	r.RLock()

	bucketReplicationStats := make(map[string]BucketReplicationStats, len(r.Cache))
	for k, v := range r.Cache {
		bucketReplicationStats[k] = v.Clone()
	}
	r.RUnlock()
	for k, v := range bucketReplicationStats {
		v.QStat = r.qCache.getBucketStats(k)
		bucketReplicationStats[k] = v
	}

	return bucketReplicationStats
}
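
// GetAll additionally attaches the queued-operation snapshot (QStat) per
// bucket, so a caller can report everything in one pass; a sketch, assuming
// r is non-nil:
//
//	for bucket, st := range r.GetAll() {
//		fmt.Println(bucket, "replica objects:", st.ReplicaCount, "replica bytes:", st.ReplicaSize)
//	}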

func (r *ReplicationStats) getSRMetricsForNode() SRMetricsSummary {
	if r == nil {
		return SRMetricsSummary{}
	}

	m := SRMetricsSummary{
		Uptime:        UTCNow().Unix() - globalBootTime.Unix(),
		Queued:        r.qCache.getSiteStats(),
		ActiveWorkers: r.ActiveWorkers(),
		Metrics:       r.srStats.get(),
		Proxied:       r.pCache.getSiteStats(),
		ReplicaSize:   atomic.LoadInt64(&r.srStats.ReplicaSize),
		ReplicaCount:  atomic.LoadInt64(&r.srStats.ReplicaCount),
	}
	return m
}
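
// A node-level site-replication summary can then be read in one call
// (sketch; only fields populated above are referenced):
//
//	sm := r.getSRMetricsForNode()
//	fmt.Println("uptime(s):", sm.Uptime, "replica count:", sm.ReplicaCount, "workers:", sm.ActiveWorkers.Curr)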

// Get returns replication metrics for a bucket from this node since this node came up.
func (r *ReplicationStats) Get(bucket string) BucketReplicationStats {
	if r == nil {
		return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
	}

	r.RLock()
	defer r.RUnlock()

	st, ok := r.Cache[bucket]
	if !ok {
		return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
	}
	return st.Clone()
}

// NewReplicationStats initializes in-memory replication statistics
func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *ReplicationStats {
	r := metrics.NewRegistry()
	rs := ReplicationStats{
		Cache:           make(map[string]*BucketReplicationStats),
		qCache:          newQueueCache(r),
		pCache:          newProxyStatsCache(),
		srStats:         newSRStats(),
		movingAvgTicker: time.NewTicker(2 * time.Second),
		wTimer:          time.NewTicker(2 * time.Second),
		qTimer:          time.NewTicker(2 * time.Second),

		workers:  newActiveWorkerStat(r),
		registry: r,
	}
	go rs.collectWorkerMetrics(ctx)
	go rs.collectQueueMetrics(ctx)
	return &rs
}
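
// A wiring sketch: the worker and queue collectors are started here, while
// trackEWMA is assumed to be launched by the caller (its call site is outside
// this file); objAPI stands for whatever ObjectLayer the caller holds:
//
//	rs := NewReplicationStats(GlobalContext, objAPI)
//	go rs.trackEWMA()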

func (r *ReplicationStats) getAllLatest(bucketsUsage map[string]BucketUsageInfo) (bucketsReplicationStats map[string]BucketStats) {
	peerBucketStatsList := globalNotificationSys.GetClusterAllBucketStats(GlobalContext)
	bucketsReplicationStats = make(map[string]BucketStats, len(bucketsUsage))

	for bucket := range bucketsUsage {
		bucketStats := make([]BucketStats, len(peerBucketStatsList))
		for i, peerBucketStats := range peerBucketStatsList {
			bucketStat, ok := peerBucketStats.Stats[bucket]
			if !ok {
				continue
			}
			bucketStats[i] = bucketStat
		}
		bucketsReplicationStats[bucket] = r.calculateBucketReplicationStats(bucket, bucketStats)
	}
	return bucketsReplicationStats
}

func (r *ReplicationStats) calculateBucketReplicationStats(bucket string, bucketStats []BucketStats) (bs BucketStats) {
	if r == nil {
		bs = BucketStats{
			ReplicationStats: BucketReplicationStats{
				Stats: make(map[string]*BucketReplicationStat),
			},
			QueueStats: ReplicationQueueStats{},
			ProxyStats: ProxyMetric{},
		}
		return bs
	}
	var s BucketReplicationStats
	// accumulate cluster bucket stats
	stats := make(map[string]*BucketReplicationStat)
	var (
		totReplicaSize, totReplicatedSize   int64
		totReplicaCount, totReplicatedCount int64
		totFailed                           RTimedMetrics
		tq                                  InQueueMetric
	)
	for _, bucketStat := range bucketStats {
		totReplicaSize += bucketStat.ReplicationStats.ReplicaSize
		totReplicaCount += bucketStat.ReplicationStats.ReplicaCount
		for _, q := range bucketStat.QueueStats.Nodes {
			tq = tq.merge(q.QStats)
		}

		for arn, stat := range bucketStat.ReplicationStats.Stats {
			oldst := stats[arn]
			if oldst == nil {
				oldst = &BucketReplicationStat{
					XferRateLrg: newXferStats(),
					XferRateSml: newXferStats(),
				}
			}
			fstats := stat.FailStats.merge(oldst.FailStats)
			lrg := oldst.XferRateLrg.merge(*stat.XferRateLrg)
			sml := oldst.XferRateSml.merge(*stat.XferRateSml)
			stats[arn] = &BucketReplicationStat{
				Failed:          fstats.toMetric(),
				FailStats:       fstats,
				ReplicatedSize:  stat.ReplicatedSize + oldst.ReplicatedSize,
				ReplicatedCount: stat.ReplicatedCount + oldst.ReplicatedCount,
				Latency:         stat.Latency.merge(oldst.Latency),
				XferRateLrg:     &lrg,
				XferRateSml:     &sml,
			}
			totReplicatedSize += stat.ReplicatedSize
			totReplicatedCount += stat.ReplicatedCount
			totFailed = totFailed.merge(stat.FailStats)
		}
	}

	s = BucketReplicationStats{
		Stats:           stats,
		QStat:           tq,
		ReplicaSize:     totReplicaSize,
		ReplicaCount:    totReplicaCount,
		ReplicatedSize:  totReplicatedSize,
		ReplicatedCount: totReplicatedCount,
		Failed:          totFailed.toMetric(),
	}

	var qs ReplicationQueueStats
	for _, bs := range bucketStats {
		qs.Nodes = append(qs.Nodes, bs.QueueStats.Nodes...)
	}
	qs.Uptime = UTCNow().Unix() - globalBootTime.Unix()

	var ps ProxyMetric
	for _, bs := range bucketStats {
		ps.add(bs.ProxyStats)
	}
	bs = BucketStats{
		ReplicationStats: s,
		QueueStats:       qs,
		ProxyStats:       ps,
	}
	r.mostRecentStatsMu.Lock()
	if len(r.mostRecentStats.Stats) == 0 {
		r.mostRecentStats = BucketStatsMap{Stats: make(map[string]BucketStats, 1), Timestamp: UTCNow()}
	}
	if len(bs.ReplicationStats.Stats) > 0 {
		r.mostRecentStats.Stats[bucket] = bs
	}
	r.mostRecentStats.Timestamp = UTCNow()
	r.mostRecentStatsMu.Unlock()
	return bs
}
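
// In other words, per-ARN counters from every node are summed, transfer-rate
// and failure histograms are merged, and queue stats are concatenated. A tiny
// worked example with hypothetical numbers: if node A reports
// ReplicatedSize=10 MiB and node B reports ReplicatedSize=5 MiB for the same
// ARN, the cluster view returned here carries ReplicatedSize=15 MiB for that
// ARN and the same 15 MiB in the bucket-level ReplicatedSize total.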

// getLatestReplicationStats returns the most current in-memory replication stats and data usage info from the crawler.
func (r *ReplicationStats) getLatestReplicationStats(bucket string) (s BucketStats) {
	bucketStats := globalNotificationSys.GetClusterBucketStats(GlobalContext, bucket)
	return r.calculateBucketReplicationStats(bucket, bucketStats)
}

func (r *ReplicationStats) incQ(bucket string, sz int64, isDeleteRepl bool, opType replication.Type) {
	r.qCache.Lock()
	defer r.qCache.Unlock()
	v, ok := r.qCache.bucketStats[bucket]
	if !ok {
		v = newInQueueStats(r.registry, bucket)
	}
	atomic.AddInt64(&v.nowBytes, sz)
	atomic.AddInt64(&v.nowCount, 1)
	r.qCache.bucketStats[bucket] = v
	atomic.AddInt64(&r.qCache.srQueueStats.nowBytes, sz)
	atomic.AddInt64(&r.qCache.srQueueStats.nowCount, 1)
}

func (r *ReplicationStats) decQ(bucket string, sz int64, isDelMarker bool, opType replication.Type) {
	r.qCache.Lock()
	defer r.qCache.Unlock()
	v, ok := r.qCache.bucketStats[bucket]
	if !ok {
		v = newInQueueStats(r.registry, bucket)
	}
	atomic.AddInt64(&v.nowBytes, -1*sz)
	atomic.AddInt64(&v.nowCount, -1)
	r.qCache.bucketStats[bucket] = v

	atomic.AddInt64(&r.qCache.srQueueStats.nowBytes, -1*sz)
	atomic.AddInt64(&r.qCache.srQueueStats.nowCount, -1)
}
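
// incQ and decQ are expected to be called in pairs around a queued operation,
// e.g. (sketch with hypothetical values):
//
//	r.incQ("mybucket", sz, false, replication.ObjectReplicationType)
//	// ... replication attempt runs ...
//	r.decQ("mybucket", sz, false, replication.ObjectReplicationType)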

// incProxy increments proxy metrics for proxied calls
func (r *ReplicationStats) incProxy(bucket string, api replProxyAPI, isErr bool) {
	r.pCache.inc(bucket, api, isErr)
}

func (r *ReplicationStats) getProxyStats(bucket string) ProxyMetric {
	return r.pCache.getBucketStats(bucket)
}