github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/bucket-replication-stats.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "context" 22 "sync" 23 "sync/atomic" 24 "time" 25 26 "github.com/minio/minio/internal/bucket/replication" 27 "github.com/rcrowley/go-metrics" 28 ) 29 30 func (b *BucketReplicationStats) hasReplicationUsage() bool { 31 for _, s := range b.Stats { 32 if s.hasReplicationUsage() { 33 return true 34 } 35 } 36 return false 37 } 38 39 // ReplicationStats holds the global in-memory replication stats 40 type ReplicationStats struct { 41 // map of site deployment ID to site replication status 42 // for site replication - maintain stats at global level 43 srStats *SRStats 44 // active worker stats 45 workers *ActiveWorkerStat 46 // queue stats cache 47 qCache queueCache 48 49 pCache proxyStatsCache 50 // mrf backlog stats 51 mrfStats ReplicationMRFStats 52 // for bucket replication, continue to use existing cache 53 Cache map[string]*BucketReplicationStats 54 mostRecentStats BucketStatsMap 55 registry metrics.Registry 56 sync.RWMutex // mutex for Cache 57 mostRecentStatsMu sync.Mutex // mutex for mostRecentStats 58 59 wlock sync.RWMutex // mutex for active workers 60 61 movingAvgTicker *time.Ticker // Ticker for calculating moving averages 62 wTimer *time.Ticker // ticker for calculating active workers 63 qTimer *time.Ticker // ticker for calculating queue stats 64 } 65 66 func (r *ReplicationStats) trackEWMA() { 67 for { 68 select { 69 case <-r.movingAvgTicker.C: 70 r.updateMovingAvg() 71 case <-GlobalContext.Done(): 72 return 73 } 74 } 75 } 76 77 func (r *ReplicationStats) updateMovingAvg() { 78 r.RLock() 79 for _, s := range r.Cache { 80 for _, st := range s.Stats { 81 st.XferRateLrg.measure.updateExponentialMovingAverage(time.Now()) 82 st.XferRateSml.measure.updateExponentialMovingAverage(time.Now()) 83 } 84 } 85 r.RUnlock() 86 } 87 88 // ActiveWorkers returns worker stats 89 func (r *ReplicationStats) ActiveWorkers() ActiveWorkerStat { 90 r.wlock.RLock() 91 defer r.wlock.RUnlock() 92 w := r.workers.get() 93 return ActiveWorkerStat{ 94 Curr: w.Curr, 95 Max: w.Max, 96 Avg: w.Avg, 97 } 98 } 99 100 func (r *ReplicationStats) collectWorkerMetrics(ctx context.Context) { 101 if r == nil { 102 return 103 } 104 for { 105 select { 106 case <-ctx.Done(): 107 return 108 case <-r.wTimer.C: 109 r.wlock.Lock() 110 r.workers.update() 111 r.wlock.Unlock() 112 113 } 114 } 115 } 116 117 func (r *ReplicationStats) collectQueueMetrics(ctx context.Context) { 118 if r == nil { 119 return 120 } 121 122 for { 123 select { 124 case <-ctx.Done(): 125 return 126 case <-r.qTimer.C: 127 r.qCache.update() 128 } 129 } 130 } 131 132 // Delete deletes in-memory replication statistics for a bucket. 133 func (r *ReplicationStats) Delete(bucket string) { 134 if r == nil { 135 return 136 } 137 138 r.Lock() 139 defer r.Unlock() 140 delete(r.Cache, bucket) 141 } 142 143 // UpdateReplicaStat updates in-memory replica statistics with new values. 144 func (r *ReplicationStats) UpdateReplicaStat(bucket string, n int64) { 145 if r == nil { 146 return 147 } 148 149 r.Lock() 150 defer r.Unlock() 151 bs, ok := r.Cache[bucket] 152 if !ok { 153 bs = newBucketReplicationStats() 154 } 155 bs.ReplicaSize += n 156 bs.ReplicaCount++ 157 r.Cache[bucket] = bs 158 r.srUpdateReplicaStat(n) 159 } 160 161 func (r *ReplicationStats) srUpdateReplicaStat(sz int64) { 162 if r == nil { 163 return 164 } 165 atomic.AddInt64(&r.srStats.ReplicaSize, sz) 166 atomic.AddInt64(&r.srStats.ReplicaCount, 1) 167 } 168 169 func (r *ReplicationStats) srUpdate(sr replStat) { 170 dID, err := globalSiteReplicationSys.getDeplIDForEndpoint(sr.endpoint()) 171 if err == nil { 172 r.srStats.update(sr, dID) 173 } 174 } 175 176 // Update updates in-memory replication statistics with new values. 177 func (r *ReplicationStats) Update(bucket string, ri replicatedTargetInfo, status, prevStatus replication.StatusType) { 178 if r == nil { 179 return 180 } 181 var rs replStat 182 switch status { 183 case replication.Pending: 184 if ri.OpType.IsDataReplication() && prevStatus != status { 185 rs.set(ri.Arn, ri.Size, 0, status, ri.OpType, ri.endpoint, ri.secure, ri.Err) 186 } 187 case replication.Completed: 188 if ri.OpType.IsDataReplication() { 189 rs.set(ri.Arn, ri.Size, ri.Duration, status, ri.OpType, ri.endpoint, ri.secure, ri.Err) 190 } 191 case replication.Failed: 192 if ri.OpType.IsDataReplication() && prevStatus == replication.Pending { 193 rs.set(ri.Arn, ri.Size, ri.Duration, status, ri.OpType, ri.endpoint, ri.secure, ri.Err) 194 } 195 case replication.Replica: 196 if ri.OpType == replication.ObjectReplicationType { 197 rs.set(ri.Arn, ri.Size, 0, status, ri.OpType, "", false, ri.Err) 198 } 199 } 200 201 // update site-replication in-memory stats 202 if rs.Completed || rs.Failed { 203 r.srUpdate(rs) 204 } 205 206 r.Lock() 207 defer r.Unlock() 208 209 // update bucket replication in-memory stats 210 bs, ok := r.Cache[bucket] 211 if !ok { 212 bs = newBucketReplicationStats() 213 r.Cache[bucket] = bs 214 } 215 b, ok := bs.Stats[ri.Arn] 216 if !ok { 217 b = &BucketReplicationStat{ 218 XferRateLrg: newXferStats(), 219 XferRateSml: newXferStats(), 220 } 221 bs.Stats[ri.Arn] = b 222 } 223 224 switch { 225 case rs.Completed: 226 b.ReplicatedSize += rs.TransferSize 227 b.ReplicatedCount++ 228 if rs.TransferDuration > 0 { 229 b.Latency.update(rs.TransferSize, rs.TransferDuration) 230 b.updateXferRate(rs.TransferSize, rs.TransferDuration) 231 } 232 case rs.Failed: 233 b.FailStats.addsize(rs.TransferSize, rs.Err) 234 case rs.Pending: 235 } 236 } 237 238 type replStat struct { 239 Arn string 240 Completed bool 241 Pending bool 242 Failed bool 243 opType replication.Type 244 // transfer size 245 TransferSize int64 246 // transfer duration 247 TransferDuration time.Duration 248 Endpoint string 249 Secure bool 250 Err error 251 } 252 253 func (rs *replStat) endpoint() string { 254 scheme := "http" 255 if rs.Secure { 256 scheme = "https" 257 } 258 return scheme + "://" + rs.Endpoint 259 } 260 261 func (rs *replStat) set(arn string, n int64, duration time.Duration, status replication.StatusType, opType replication.Type, endpoint string, secure bool, err error) { 262 rs.Endpoint = endpoint 263 rs.Secure = secure 264 rs.TransferSize = n 265 rs.Arn = arn 266 rs.TransferDuration = duration 267 rs.opType = opType 268 switch status { 269 case replication.Completed: 270 rs.Completed = true 271 case replication.Pending: 272 rs.Pending = true 273 case replication.Failed: 274 rs.Failed = true 275 rs.Err = err 276 } 277 } 278 279 // GetAll returns replication metrics for all buckets at once. 280 func (r *ReplicationStats) GetAll() map[string]BucketReplicationStats { 281 if r == nil { 282 return map[string]BucketReplicationStats{} 283 } 284 285 r.RLock() 286 287 bucketReplicationStats := make(map[string]BucketReplicationStats, len(r.Cache)) 288 for k, v := range r.Cache { 289 bucketReplicationStats[k] = v.Clone() 290 } 291 r.RUnlock() 292 for k, v := range bucketReplicationStats { 293 v.QStat = r.qCache.getBucketStats(k) 294 bucketReplicationStats[k] = v 295 } 296 297 return bucketReplicationStats 298 } 299 300 func (r *ReplicationStats) getSRMetricsForNode() SRMetricsSummary { 301 if r == nil { 302 return SRMetricsSummary{} 303 } 304 305 m := SRMetricsSummary{ 306 Uptime: UTCNow().Unix() - globalBootTime.Unix(), 307 Queued: r.qCache.getSiteStats(), 308 ActiveWorkers: r.ActiveWorkers(), 309 Metrics: r.srStats.get(), 310 Proxied: r.pCache.getSiteStats(), 311 ReplicaSize: atomic.LoadInt64(&r.srStats.ReplicaSize), 312 ReplicaCount: atomic.LoadInt64(&r.srStats.ReplicaCount), 313 } 314 return m 315 } 316 317 // Get replication metrics for a bucket from this node since this node came up. 318 func (r *ReplicationStats) Get(bucket string) BucketReplicationStats { 319 if r == nil { 320 return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)} 321 } 322 323 r.RLock() 324 defer r.RUnlock() 325 326 st, ok := r.Cache[bucket] 327 if !ok { 328 return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)} 329 } 330 return st.Clone() 331 } 332 333 // NewReplicationStats initialize in-memory replication statistics 334 func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *ReplicationStats { 335 r := metrics.NewRegistry() 336 rs := ReplicationStats{ 337 Cache: make(map[string]*BucketReplicationStats), 338 qCache: newQueueCache(r), 339 pCache: newProxyStatsCache(), 340 srStats: newSRStats(), 341 movingAvgTicker: time.NewTicker(2 * time.Second), 342 wTimer: time.NewTicker(2 * time.Second), 343 qTimer: time.NewTicker(2 * time.Second), 344 345 workers: newActiveWorkerStat(r), 346 registry: r, 347 } 348 go rs.collectWorkerMetrics(ctx) 349 go rs.collectQueueMetrics(ctx) 350 return &rs 351 } 352 353 func (r *ReplicationStats) getAllLatest(bucketsUsage map[string]BucketUsageInfo) (bucketsReplicationStats map[string]BucketStats) { 354 peerBucketStatsList := globalNotificationSys.GetClusterAllBucketStats(GlobalContext) 355 bucketsReplicationStats = make(map[string]BucketStats, len(bucketsUsage)) 356 357 for bucket := range bucketsUsage { 358 bucketStats := make([]BucketStats, len(peerBucketStatsList)) 359 for i, peerBucketStats := range peerBucketStatsList { 360 bucketStat, ok := peerBucketStats.Stats[bucket] 361 if !ok { 362 continue 363 } 364 bucketStats[i] = bucketStat 365 } 366 bucketsReplicationStats[bucket] = r.calculateBucketReplicationStats(bucket, bucketStats) 367 } 368 return bucketsReplicationStats 369 } 370 371 func (r *ReplicationStats) calculateBucketReplicationStats(bucket string, bucketStats []BucketStats) (bs BucketStats) { 372 if r == nil { 373 bs = BucketStats{ 374 ReplicationStats: BucketReplicationStats{ 375 Stats: make(map[string]*BucketReplicationStat), 376 }, 377 QueueStats: ReplicationQueueStats{}, 378 ProxyStats: ProxyMetric{}, 379 } 380 return bs 381 } 382 var s BucketReplicationStats 383 // accumulate cluster bucket stats 384 stats := make(map[string]*BucketReplicationStat) 385 var ( 386 totReplicaSize, totReplicatedSize int64 387 totReplicaCount, totReplicatedCount int64 388 totFailed RTimedMetrics 389 tq InQueueMetric 390 ) 391 for _, bucketStat := range bucketStats { 392 totReplicaSize += bucketStat.ReplicationStats.ReplicaSize 393 totReplicaCount += bucketStat.ReplicationStats.ReplicaCount 394 for _, q := range bucketStat.QueueStats.Nodes { 395 tq = tq.merge(q.QStats) 396 } 397 398 for arn, stat := range bucketStat.ReplicationStats.Stats { 399 oldst := stats[arn] 400 if oldst == nil { 401 oldst = &BucketReplicationStat{ 402 XferRateLrg: newXferStats(), 403 XferRateSml: newXferStats(), 404 } 405 } 406 fstats := stat.FailStats.merge(oldst.FailStats) 407 lrg := oldst.XferRateLrg.merge(*stat.XferRateLrg) 408 sml := oldst.XferRateSml.merge(*stat.XferRateSml) 409 stats[arn] = &BucketReplicationStat{ 410 Failed: fstats.toMetric(), 411 FailStats: fstats, 412 ReplicatedSize: stat.ReplicatedSize + oldst.ReplicatedSize, 413 ReplicatedCount: stat.ReplicatedCount + oldst.ReplicatedCount, 414 Latency: stat.Latency.merge(oldst.Latency), 415 XferRateLrg: &lrg, 416 XferRateSml: &sml, 417 } 418 totReplicatedSize += stat.ReplicatedSize 419 totReplicatedCount += stat.ReplicatedCount 420 totFailed = totFailed.merge(stat.FailStats) 421 } 422 } 423 424 s = BucketReplicationStats{ 425 Stats: stats, 426 QStat: tq, 427 ReplicaSize: totReplicaSize, 428 ReplicaCount: totReplicaCount, 429 ReplicatedSize: totReplicatedSize, 430 ReplicatedCount: totReplicatedCount, 431 Failed: totFailed.toMetric(), 432 } 433 434 var qs ReplicationQueueStats 435 for _, bs := range bucketStats { 436 qs.Nodes = append(qs.Nodes, bs.QueueStats.Nodes...) 437 } 438 qs.Uptime = UTCNow().Unix() - globalBootTime.Unix() 439 440 var ps ProxyMetric 441 for _, bs := range bucketStats { 442 ps.add(bs.ProxyStats) 443 } 444 bs = BucketStats{ 445 ReplicationStats: s, 446 QueueStats: qs, 447 ProxyStats: ps, 448 } 449 r.mostRecentStatsMu.Lock() 450 if len(r.mostRecentStats.Stats) == 0 { 451 r.mostRecentStats = BucketStatsMap{Stats: make(map[string]BucketStats, 1), Timestamp: UTCNow()} 452 } 453 if len(bs.ReplicationStats.Stats) > 0 { 454 r.mostRecentStats.Stats[bucket] = bs 455 } 456 r.mostRecentStats.Timestamp = UTCNow() 457 r.mostRecentStatsMu.Unlock() 458 return bs 459 } 460 461 // get the most current of in-memory replication stats and data usage info from crawler. 462 func (r *ReplicationStats) getLatestReplicationStats(bucket string) (s BucketStats) { 463 bucketStats := globalNotificationSys.GetClusterBucketStats(GlobalContext, bucket) 464 return r.calculateBucketReplicationStats(bucket, bucketStats) 465 } 466 467 func (r *ReplicationStats) incQ(bucket string, sz int64, isDeleteRepl bool, opType replication.Type) { 468 r.qCache.Lock() 469 defer r.qCache.Unlock() 470 v, ok := r.qCache.bucketStats[bucket] 471 if !ok { 472 v = newInQueueStats(r.registry, bucket) 473 } 474 atomic.AddInt64(&v.nowBytes, sz) 475 atomic.AddInt64(&v.nowCount, 1) 476 r.qCache.bucketStats[bucket] = v 477 atomic.AddInt64(&r.qCache.srQueueStats.nowBytes, sz) 478 atomic.AddInt64(&r.qCache.srQueueStats.nowCount, 1) 479 } 480 481 func (r *ReplicationStats) decQ(bucket string, sz int64, isDelMarker bool, opType replication.Type) { 482 r.qCache.Lock() 483 defer r.qCache.Unlock() 484 v, ok := r.qCache.bucketStats[bucket] 485 if !ok { 486 v = newInQueueStats(r.registry, bucket) 487 } 488 atomic.AddInt64(&v.nowBytes, -1*sz) 489 atomic.AddInt64(&v.nowCount, -1) 490 r.qCache.bucketStats[bucket] = v 491 492 atomic.AddInt64(&r.qCache.srQueueStats.nowBytes, -1*sz) 493 atomic.AddInt64(&r.qCache.srQueueStats.nowCount, -1) 494 } 495 496 // incProxy increments proxy metrics for proxied calls 497 func (r *ReplicationStats) incProxy(bucket string, api replProxyAPI, isErr bool) { 498 r.pCache.inc(bucket, api, isErr) 499 } 500 501 func (r *ReplicationStats) getProxyStats(bucket string) ProxyMetric { 502 return r.pCache.getBucketStats(bucket) 503 }