github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/site-replication-metrics.go (about) 1 // Copyright (c) 2015-2023 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "fmt" 22 "sync" 23 "sync/atomic" 24 "time" 25 26 "github.com/minio/madmin-go/v3" 27 "github.com/minio/minio-go/v7" 28 ) 29 30 //go:generate msgp -file $GOFILE 31 32 // RStat has replication error stats 33 type RStat struct { 34 Count int64 `json:"count"` 35 Bytes int64 `json:"bytes"` 36 } 37 38 // RTimedMetrics has replication error stats for various time windows 39 type RTimedMetrics struct { 40 LastHour ReplicationLastHour `json:"lastHour"` 41 SinceUptime RStat `json:"sinceUptime"` 42 LastMinute ReplicationLastMinute 43 // Error counts 44 ErrCounts map[string]int `json:"errCounts"` // Count of credential errors 45 } 46 47 func (rt *RTimedMetrics) String() string { 48 s := rt.toMetric() 49 return fmt.Sprintf("Errors in LastMinute: %v, LastHour: %v, SinceUptime: %v", s.LastMinute.Count, s.LastHour.Count, s.Totals.Count) 50 } 51 52 func (rt *RTimedMetrics) toMetric() madmin.TimedErrStats { 53 if rt == nil { 54 return madmin.TimedErrStats{} 55 } 56 errCounts := make(map[string]int) 57 for k, v := range rt.ErrCounts { 58 errCounts[k] = v 59 } 60 minuteTotals := rt.LastMinute.getTotal() 61 hourTotals := rt.LastHour.getTotal() 62 return madmin.TimedErrStats{ 63 LastMinute: madmin.RStat{ 64 Count: float64(minuteTotals.N), 65 Bytes: minuteTotals.Size, 66 }, 67 LastHour: madmin.RStat{ 68 Count: float64(hourTotals.N), 69 Bytes: hourTotals.Size, 70 }, 71 Totals: madmin.RStat{ 72 Count: float64(rt.SinceUptime.Count), 73 Bytes: rt.SinceUptime.Bytes, 74 }, 75 ErrCounts: errCounts, 76 } 77 } 78 79 func (rt *RTimedMetrics) addsize(size int64, err error) { 80 // failures seen since uptime 81 atomic.AddInt64(&rt.SinceUptime.Bytes, size) 82 atomic.AddInt64(&rt.SinceUptime.Count, 1) 83 rt.LastMinute.addsize(size) 84 rt.LastHour.addsize(size) 85 if err != nil && minio.ToErrorResponse(err).Code == "AccessDenied" { 86 if rt.ErrCounts == nil { 87 rt.ErrCounts = make(map[string]int) 88 } 89 rt.ErrCounts["AccessDenied"]++ 90 } 91 } 92 93 func (rt *RTimedMetrics) merge(o RTimedMetrics) (n RTimedMetrics) { 94 n.SinceUptime.Bytes = atomic.LoadInt64(&rt.SinceUptime.Bytes) + atomic.LoadInt64(&o.SinceUptime.Bytes) 95 n.SinceUptime.Count = atomic.LoadInt64(&rt.SinceUptime.Count) + atomic.LoadInt64(&o.SinceUptime.Count) 96 97 n.LastMinute = n.LastMinute.merge(rt.LastMinute) 98 n.LastMinute = n.LastMinute.merge(o.LastMinute) 99 n.LastHour = n.LastHour.merge(rt.LastHour) 100 n.LastHour = n.LastHour.merge(o.LastHour) 101 n.ErrCounts = make(map[string]int) 102 for k, v := range rt.ErrCounts { 103 n.ErrCounts[k] = v 104 } 105 for k, v := range o.ErrCounts { 106 n.ErrCounts[k] += v 107 } 108 return n 109 } 110 111 // SRStats has replication stats at site level 112 type SRStats struct { 113 // Total Replica size in bytes 114 ReplicaSize int64 `json:"replicaSize"` 115 // Total Replica received 116 ReplicaCount int64 `json:"replicaCount"` 117 M map[string]*SRStatus `json:"srStatusMap"` 118 119 movingAvgTicker *time.Ticker // Ticker for calculating moving averages 120 lock sync.RWMutex // mutex for srStats 121 } 122 123 // SRStatus has replication stats at deployment level 124 type SRStatus struct { 125 ReplicatedSize int64 `json:"completedReplicationSize"` 126 // Total number of failed operations including metadata updates in the last minute 127 Failed RTimedMetrics `json:"failedReplication"` 128 // Total number of completed operations 129 ReplicatedCount int64 `json:"replicationCount"` 130 // Replication latency information 131 Latency ReplicationLatency `json:"replicationLatency"` 132 // transfer rate for large uploads 133 XferRateLrg *XferStats `json:"largeTransferRate" msg:"lt"` 134 // transfer rate for small uploads 135 XferRateSml *XferStats `json:"smallTransferRate" msg:"st"` 136 // Endpoint is the replication target endpoint 137 Endpoint string `json:"-"` 138 // Secure is true if the replication target endpoint is secure 139 Secure bool `json:"-"` 140 } 141 142 func (sr *SRStats) update(st replStat, dID string) { 143 sr.lock.Lock() 144 defer sr.lock.Unlock() 145 srs, ok := sr.M[dID] 146 if !ok { 147 srs = &SRStatus{ 148 XferRateLrg: newXferStats(), 149 XferRateSml: newXferStats(), 150 } 151 } 152 srs.Endpoint = st.Endpoint 153 srs.Secure = st.Secure 154 switch { 155 case st.Completed: 156 srs.ReplicatedSize += st.TransferSize 157 srs.ReplicatedCount++ 158 if st.TransferDuration > 0 { 159 srs.Latency.update(st.TransferSize, st.TransferDuration) 160 srs.updateXferRate(st.TransferSize, st.TransferDuration) 161 } 162 case st.Failed: 163 srs.Failed.addsize(st.TransferSize, st.Err) 164 case st.Pending: 165 } 166 sr.M[dID] = srs 167 } 168 169 func (sr *SRStats) get() map[string]SRMetric { 170 epMap := globalBucketTargetSys.healthStats() 171 172 sr.lock.RLock() 173 defer sr.lock.RUnlock() 174 m := make(map[string]SRMetric, len(sr.M)) 175 for dID, v := range sr.M { 176 t := newXferStats() 177 mx := make(map[RMetricName]XferStats) 178 179 if v.XferRateLrg != nil { 180 mx[Large] = *v.XferRateLrg.Clone() 181 m := t.merge(*v.XferRateLrg) 182 t = &m 183 } 184 if v.XferRateSml != nil { 185 mx[Small] = *v.XferRateSml.Clone() 186 m := t.merge(*v.XferRateSml) 187 t = &m 188 } 189 190 mx[Total] = *t 191 metric := SRMetric{ 192 ReplicatedSize: v.ReplicatedSize, 193 ReplicatedCount: v.ReplicatedCount, 194 DeploymentID: dID, 195 Failed: v.Failed.toMetric(), 196 XferStats: mx, 197 } 198 epHealth, ok := epMap[v.Endpoint] 199 if ok { 200 metric.Endpoint = epHealth.Endpoint 201 metric.TotalDowntime = epHealth.offlineDuration 202 metric.LastOnline = epHealth.lastOnline 203 metric.Online = epHealth.Online 204 metric.Latency = madmin.LatencyStat{ 205 Curr: epHealth.latency.curr, 206 Avg: epHealth.latency.avg, 207 Max: epHealth.latency.peak, 208 } 209 } 210 m[dID] = metric 211 } 212 return m 213 } 214 215 func (srs *SRStatus) updateXferRate(sz int64, duration time.Duration) { 216 if sz > minLargeObjSize { 217 srs.XferRateLrg.addSize(sz, duration) 218 } else { 219 srs.XferRateSml.addSize(sz, duration) 220 } 221 } 222 223 func newSRStats() *SRStats { 224 s := SRStats{ 225 M: make(map[string]*SRStatus), 226 movingAvgTicker: time.NewTicker(time.Second * 2), 227 } 228 go s.trackEWMA() 229 return &s 230 } 231 232 func (sr *SRStats) trackEWMA() { 233 for { 234 select { 235 case <-sr.movingAvgTicker.C: 236 sr.updateMovingAvg() 237 case <-GlobalContext.Done(): 238 return 239 } 240 } 241 } 242 243 func (sr *SRStats) updateMovingAvg() { 244 sr.lock.Lock() 245 defer sr.lock.Unlock() 246 for _, s := range sr.M { 247 s.XferRateLrg.measure.updateExponentialMovingAverage(time.Now()) 248 s.XferRateSml.measure.updateExponentialMovingAverage(time.Now()) 249 } 250 } 251 252 // SRMetric captures replication metrics for a deployment 253 type SRMetric struct { 254 DeploymentID string `json:"deploymentID"` 255 Endpoint string `json:"endpoint"` 256 TotalDowntime time.Duration `json:"totalDowntime"` 257 LastOnline time.Time `json:"lastOnline"` 258 Online bool `json:"isOnline"` 259 Latency madmin.LatencyStat `json:"latency"` 260 261 // replication metrics across buckets roll up 262 ReplicatedSize int64 `json:"replicatedSize"` 263 // Total number of completed operations 264 ReplicatedCount int64 `json:"replicatedCount"` 265 // Failed captures replication errors in various time windows 266 267 Failed madmin.TimedErrStats `json:"failed,omitempty"` 268 269 XferStats map[RMetricName]XferStats `json:"transferSummary"` 270 } 271 272 // SRMetricsSummary captures summary of replication counts across buckets on site 273 // along with op metrics rollup. 274 type SRMetricsSummary struct { 275 // op metrics roll up 276 ActiveWorkers ActiveWorkerStat `json:"activeWorkers"` 277 278 // Total Replica size in bytes 279 ReplicaSize int64 `json:"replicaSize"` 280 281 // Total number of replica received 282 ReplicaCount int64 `json:"replicaCount"` 283 // Queued operations 284 Queued InQueueMetric `json:"queued"` 285 // Proxy stats 286 Proxied ProxyMetric `json:"proxied"` 287 // replication metrics summary for each site replication peer 288 Metrics map[string]SRMetric `json:"replMetrics"` 289 // uptime of node being queried for site replication metrics 290 Uptime int64 `json:"uptime"` 291 }