github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/site-replication-utils.go

// Copyright (c) 2015-2022 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"math/rand"
	"sync"
	"time"

	"github.com/minio/madmin-go/v3"
)

//go:generate msgp -file=$GOFILE

// SiteResyncStatus captures current replication resync status for a target site
type SiteResyncStatus struct {
	Version int `json:"version" msg:"v"`
	// Overall site status
	Status                        ResyncStatusType            `json:"st" msg:"ss"`
	DeplID                        string                      `json:"dId" msg:"did"`
	BucketStatuses                map[string]ResyncStatusType `json:"buckets" msg:"bkts"`
	TotBuckets                    int                         `json:"totbuckets" msg:"tb"`
	TargetReplicationResyncStatus `json:"currSt" msg:"cst"`
}

func (s *SiteResyncStatus) clone() SiteResyncStatus {
	if s == nil {
		return SiteResyncStatus{}
	}
	o := *s
	o.BucketStatuses = make(map[string]ResyncStatusType, len(s.BucketStatuses))
	for b, st := range s.BucketStatuses {
		o.BucketStatuses[b] = st
	}
	return o
}

const (
	siteResyncPrefix = bucketMetaPrefix + "/site-replication/resync"
)

type resyncState struct {
	resyncID  string
	LastSaved time.Time
}

//msgp:ignore siteResyncMetrics
type siteResyncMetrics struct {
	sync.RWMutex
	// resyncStatus maps resync ID to resync status for peer
	resyncStatus map[string]SiteResyncStatus
	// map peer deployment ID to resync ID
	peerResyncMap map[string]resyncState
}

func newSiteResyncMetrics(ctx context.Context) *siteResyncMetrics {
	s := siteResyncMetrics{
		resyncStatus:  make(map[string]SiteResyncStatus),
		peerResyncMap: make(map[string]resyncState),
	}
	go s.save(ctx)
	go s.init(ctx)
	return &s
}

// init site resync metrics
func (sm *siteResyncMetrics) init(ctx context.Context) {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	// Run the site resync metrics load in a loop
	for {
		if err := sm.load(ctx, newObjectLayerFn()); err == nil {
			<-ctx.Done()
			return
		}
		duration := time.Duration(r.Float64() * float64(time.Second*10))
		if duration < time.Second {
			// Make sure to sleep at least a second to avoid high CPU ticks.
			duration = time.Second
		}
		time.Sleep(duration)
	}
}

// load resync metrics saved on disk into memory
func (sm *siteResyncMetrics) load(ctx context.Context, objAPI ObjectLayer) error {
	if objAPI == nil {
		return errServerNotInitialized
	}
	info, err := globalSiteReplicationSys.GetClusterInfo(ctx)
	if err != nil {
		return err
	}
	if !info.Enabled {
		return nil
	}
	for _, peer := range info.Sites {
		if peer.DeploymentID == globalDeploymentID() {
			continue
		}
		rs, err := loadSiteResyncMetadata(ctx, objAPI, peer.DeploymentID)
		if err != nil {
			return err
		}
		sm.Lock()
		if _, ok := sm.peerResyncMap[peer.DeploymentID]; !ok {
			sm.peerResyncMap[peer.DeploymentID] = resyncState{resyncID: rs.ResyncID, LastSaved: time.Time{}}
			sm.resyncStatus[rs.ResyncID] = rs
		}
		sm.Unlock()
	}
	return nil
}

// report returns resync metrics for the peer deployment dID in madmin format
func (sm *siteResyncMetrics) report(dID string) *madmin.SiteResyncMetrics {
	sm.RLock()
	defer sm.RUnlock()
	rst, ok := sm.peerResyncMap[dID]
	if !ok {
		return nil
	}
	rs, ok := sm.resyncStatus[rst.resyncID]
	if !ok {
		return nil
	}
	m := madmin.SiteResyncMetrics{
		CollectedAt:     rs.LastUpdate,
		StartTime:       rs.StartTime,
		LastUpdate:      rs.LastUpdate,
		ResyncStatus:    rs.Status.String(),
		ResyncID:        rst.resyncID,
		DeplID:          rs.DeplID,
		ReplicatedSize:  rs.ReplicatedSize,
		ReplicatedCount: rs.ReplicatedCount,
		FailedSize:      rs.FailedSize,
		FailedCount:     rs.FailedCount,
		Bucket:          rs.Bucket,
		Object:          rs.Object,
		NumBuckets:      int64(rs.TotBuckets),
	}
	for b, st := range rs.BucketStatuses {
		if st == ResyncFailed {
			m.FailedBuckets = append(m.FailedBuckets, b)
		}
	}
	return &m
}

// save in-memory stats to disk
func (sm *siteResyncMetrics) save(ctx context.Context) {
	sTimer := time.NewTimer(siteResyncSaveInterval)
	defer sTimer.Stop()
	for {
		select {
		case <-sTimer.C:
			if globalSiteReplicationSys.isEnabled() {
				sm.Lock()
				wg := sync.WaitGroup{}
				for dID, rs := range sm.peerResyncMap {
					st, ok := sm.resyncStatus[rs.resyncID]
					if ok {
						updt := st.Status.isValid() && st.LastUpdate.After(rs.LastSaved)
						if !updt {
							continue
						}
						rs.LastSaved = UTCNow()
						sm.peerResyncMap[dID] = rs
						wg.Add(1)
						go func() {
							defer wg.Done()
							saveSiteResyncMetadata(ctx, st, newObjectLayerFn())
						}()
					}
				}
				wg.Wait()
				sm.Unlock()
			}
			sTimer.Reset(siteResyncSaveInterval)
		case <-ctx.Done():
			return
		}
	}
}

// update overall site resync state
func (sm *siteResyncMetrics) updateState(s SiteResyncStatus) error {
	if !globalSiteReplicationSys.isEnabled() {
		return nil
	}
	sm.Lock()
	defer sm.Unlock()
	switch s.Status {
	case ResyncStarted:
		sm.peerResyncMap[s.DeplID] = resyncState{resyncID: s.ResyncID, LastSaved: time.Time{}}
		sm.resyncStatus[s.ResyncID] = s
	case ResyncCompleted, ResyncCanceled, ResyncFailed:
		st, ok := sm.resyncStatus[s.ResyncID]
		if !ok {
			return nil
		}
		st.LastUpdate = s.LastUpdate
		st.Status = s.Status
		sm.resyncStatus[s.ResyncID] = st
		return saveSiteResyncMetadata(GlobalContext, st, newObjectLayerFn())
	}
	return nil
}

// incBucket records the final resync status of a bucket for this resync operation
func (sm *siteResyncMetrics) incBucket(o resyncOpts, bktStatus ResyncStatusType) {
	if !globalSiteReplicationSys.isEnabled() {
		return
	}
	sm.Lock()
	defer sm.Unlock()
	st, ok := sm.resyncStatus[o.resyncID]
	if ok {
		if st.BucketStatuses == nil {
			st.BucketStatuses = map[string]ResyncStatusType{}
		}
		switch bktStatus {
		case ResyncCompleted:
			st.BucketStatuses[o.bucket] = ResyncCompleted
			st.Status = siteResyncStatus(st.Status, st.BucketStatuses)
			st.LastUpdate = UTCNow()
			sm.resyncStatus[o.resyncID] = st
		case ResyncFailed:
			st.BucketStatuses[o.bucket] = ResyncFailed
			st.Status = siteResyncStatus(st.Status, st.BucketStatuses)
			st.LastUpdate = UTCNow()
			sm.resyncStatus[o.resyncID] = st
		}
	}
}

// remove deleted bucket from active resync tracking
func (sm *siteResyncMetrics) deleteBucket(b string) {
	if !globalSiteReplicationSys.isEnabled() {
		return
	}
	sm.Lock()
	defer sm.Unlock()
	for _, rs := range sm.peerResyncMap {
		st, ok := sm.resyncStatus[rs.resyncID]
		if !ok {
			return
		}
		switch st.Status {
		case ResyncCompleted, ResyncFailed:
			return
		default:
			delete(st.BucketStatuses, b)
		}
	}
}

// returns overall resync status from individual bucket resync status map
func siteResyncStatus(currSt ResyncStatusType, m map[string]ResyncStatusType) ResyncStatusType {
	// avoid overwriting canceled resync status
	if currSt != ResyncStarted {
		return currSt
	}
	totBuckets := len(m)
	var cmpCount, failCount int
	for _, st := range m {
		switch st {
		case ResyncCompleted:
			cmpCount++
		case ResyncFailed:
			failCount++
		}
	}
	if cmpCount == totBuckets {
		return ResyncCompleted
	}
	if cmpCount+failCount == totBuckets {
		return ResyncFailed
	}
	return ResyncStarted
}

// update resync metrics per object
func (sm *siteResyncMetrics) updateMetric(r TargetReplicationResyncStatus, resyncID string) {
	if !globalSiteReplicationSys.isEnabled() {
		return
	}
	sm.Lock()
	defer sm.Unlock()
	s := sm.resyncStatus[resyncID]
	if r.ReplicatedCount > 0 {
		s.ReplicatedCount++
		s.ReplicatedSize += r.ReplicatedSize
	} else {
		s.FailedCount++
		s.FailedSize += r.FailedSize
	}
	s.Bucket = r.Bucket
	s.Object = r.Object
	s.LastUpdate = UTCNow()
	sm.resyncStatus[resyncID] = s
}

// status returns the current in-memory resync status for this deployment
func (sm *siteResyncMetrics) status(dID string) (rs SiteResyncStatus, err error) {
	sm.RLock()
	defer sm.RUnlock()
	if rst, ok1 := sm.peerResyncMap[dID]; ok1 {
		if st, ok2 := sm.resyncStatus[rst.resyncID]; ok2 {
			return st.clone(), nil
		}
	}
	return rs, errSRNoResync
}

// siteStatus returns the latest resync status for this deployment, checking in-memory state first and falling back to the status saved on disk
func (sm *siteResyncMetrics) siteStatus(ctx context.Context, objAPI ObjectLayer, dID string) (rs SiteResyncStatus, err error) {
	if !globalSiteReplicationSys.isEnabled() {
		return rs, errSRNotEnabled
	}
	// check in-memory status
	rs, err = sm.status(dID)
	if err == nil {
		return rs, nil
	}
	// check disk resync status
	rs, err = loadSiteResyncMetadata(ctx, objAPI, dID)
	if err != nil && err == errConfigNotFound {
		return rs, nil
	}
	return rs, err
}
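
For reference, a minimal sketch of how the per-bucket rollup in siteResyncStatus behaves, assuming it compiles inside package cmd alongside the ResyncStatusType constants this file already references (ResyncStarted, ResyncCompleted, ResyncFailed, ResyncCanceled); the helper name below is hypothetical. The overall status stays ResyncStarted until every bucket reaches a terminal state, becomes ResyncCompleted only when all buckets completed, becomes ResyncFailed when all buckets are terminal but at least one failed, and a canceled status is never overwritten.

// exampleSiteResyncRollup is a hypothetical sketch exercising the rollup
// rules implemented by siteResyncStatus above.
func exampleSiteResyncRollup() {
	buckets := map[string]ResyncStatusType{
		"bucket-a": ResyncCompleted,
		"bucket-b": ResyncStarted,
	}
	_ = siteResyncStatus(ResyncStarted, buckets) // ResyncStarted: bucket-b is not terminal yet

	buckets["bucket-b"] = ResyncFailed
	_ = siteResyncStatus(ResyncStarted, buckets) // ResyncFailed: all buckets terminal, one failed

	buckets["bucket-b"] = ResyncCompleted
	_ = siteResyncStatus(ResyncStarted, buckets) // ResyncCompleted: every bucket completed

	_ = siteResyncStatus(ResyncCanceled, buckets) // ResyncCanceled: a canceled status is never overwritten
}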