github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/site-replication-utils.go (about)

     1  // Copyright (c) 2015-2022 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"context"
    22  	"math/rand"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/minio/madmin-go/v3"
    27  )
    28  
    29  //go:generate msgp -file=$GOFILE
    30  
// SiteResyncStatus captures current replication resync status for a target site.
//
// DeplID is the deployment ID of the peer site being resynced; it is the key
// used for siteResyncMetrics.peerResyncMap. BucketStatuses records the resync
// status per bucket name, and TotBuckets is surfaced as the bucket count in
// reported metrics. The embedded TargetReplicationResyncStatus contributes the
// resync ID, timestamps, replicated/failed counters and the last bucket/object
// seen, all of which are read through promoted fields elsewhere in this file.
type SiteResyncStatus struct {
	Version int `json:"version" msg:"v"`
	// Overall site status
	Status                        ResyncStatusType            `json:"st" msg:"ss"`
	DeplID                        string                      `json:"dId" msg:"did"`
	BucketStatuses                map[string]ResyncStatusType `json:"buckets" msg:"bkts"`
	TotBuckets                    int                         `json:"totbuckets" msg:"tb"`
	TargetReplicationResyncStatus `json:"currSt" msg:"cst"`
}
    41  
    42  func (s *SiteResyncStatus) clone() SiteResyncStatus {
    43  	if s == nil {
    44  		return SiteResyncStatus{}
    45  	}
    46  	o := *s
    47  	o.BucketStatuses = make(map[string]ResyncStatusType, len(s.BucketStatuses))
    48  	for b, st := range s.BucketStatuses {
    49  		o.BucketStatuses[b] = st
    50  	}
    51  	return o
    52  }
    53  
const (
	// siteResyncPrefix is bucketMetaPrefix joined with "/site-replication/resync".
	// NOTE(review): presumably the location under which per-peer resync
	// metadata is persisted; the helpers that use it are not in this file.
	siteResyncPrefix = bucketMetaPrefix + "/site-replication/resync"
)
    57  
// resyncState is the per-peer bookkeeping entry kept in
// siteResyncMetrics.peerResyncMap.
type resyncState struct {
	// resyncID identifies the peer's resync; it is the key into
	// siteResyncMetrics.resyncStatus.
	resyncID  string
	// LastSaved is the time the status was last scheduled for persistence by
	// save; a status is written again only when its LastUpdate is after it.
	LastSaved time.Time
}
    62  
// siteResyncMetrics holds the in-memory site resync statuses for peer sites.
// The embedded RWMutex guards both maps; the methods in this file take it
// before reading or writing them.
//
//msgp:ignore siteResyncMetrics
type siteResyncMetrics struct {
	sync.RWMutex
	// resyncStatus maps resync ID to resync status for peer
	resyncStatus map[string]SiteResyncStatus
	// map peer deployment ID to resync ID
	peerResyncMap map[string]resyncState
}
    71  
    72  func newSiteResyncMetrics(ctx context.Context) *siteResyncMetrics {
    73  	s := siteResyncMetrics{
    74  		resyncStatus:  make(map[string]SiteResyncStatus),
    75  		peerResyncMap: make(map[string]resyncState),
    76  	}
    77  	go s.save(ctx)
    78  	go s.init(ctx)
    79  	return &s
    80  }
    81  
    82  // init site resync metrics
    83  func (sm *siteResyncMetrics) init(ctx context.Context) {
    84  	r := rand.New(rand.NewSource(time.Now().UnixNano()))
    85  	// Run the site resync metrics load in a loop
    86  	for {
    87  		if err := sm.load(ctx, newObjectLayerFn()); err == nil {
    88  			<-ctx.Done()
    89  			return
    90  		}
    91  		duration := time.Duration(r.Float64() * float64(time.Second*10))
    92  		if duration < time.Second {
    93  			// Make sure to sleep at least a second to avoid high CPU ticks.
    94  			duration = time.Second
    95  		}
    96  		time.Sleep(duration)
    97  	}
    98  }
    99  
   100  // load resync metrics saved on disk into memory
   101  func (sm *siteResyncMetrics) load(ctx context.Context, objAPI ObjectLayer) error {
   102  	if objAPI == nil {
   103  		return errServerNotInitialized
   104  	}
   105  	info, err := globalSiteReplicationSys.GetClusterInfo(ctx)
   106  	if err != nil {
   107  		return err
   108  	}
   109  	if !info.Enabled {
   110  		return nil
   111  	}
   112  	for _, peer := range info.Sites {
   113  		if peer.DeploymentID == globalDeploymentID() {
   114  			continue
   115  		}
   116  		rs, err := loadSiteResyncMetadata(ctx, objAPI, peer.DeploymentID)
   117  		if err != nil {
   118  			return err
   119  		}
   120  		sm.Lock()
   121  		if _, ok := sm.peerResyncMap[peer.DeploymentID]; !ok {
   122  			sm.peerResyncMap[peer.DeploymentID] = resyncState{resyncID: rs.ResyncID, LastSaved: time.Time{}}
   123  			sm.resyncStatus[rs.ResyncID] = rs
   124  		}
   125  		sm.Unlock()
   126  	}
   127  	return nil
   128  }
   129  
   130  func (sm *siteResyncMetrics) report(dID string) *madmin.SiteResyncMetrics {
   131  	sm.RLock()
   132  	defer sm.RUnlock()
   133  	rst, ok := sm.peerResyncMap[dID]
   134  	if !ok {
   135  		return nil
   136  	}
   137  	rs, ok := sm.resyncStatus[rst.resyncID]
   138  	if !ok {
   139  		return nil
   140  	}
   141  	m := madmin.SiteResyncMetrics{
   142  		CollectedAt:     rs.LastUpdate,
   143  		StartTime:       rs.StartTime,
   144  		LastUpdate:      rs.LastUpdate,
   145  		ResyncStatus:    rs.Status.String(),
   146  		ResyncID:        rst.resyncID,
   147  		DeplID:          rs.DeplID,
   148  		ReplicatedSize:  rs.ReplicatedSize,
   149  		ReplicatedCount: rs.ReplicatedCount,
   150  		FailedSize:      rs.FailedSize,
   151  		FailedCount:     rs.FailedCount,
   152  		Bucket:          rs.Bucket,
   153  		Object:          rs.Object,
   154  		NumBuckets:      int64(rs.TotBuckets),
   155  	}
   156  	for b, st := range rs.BucketStatuses {
   157  		if st == ResyncFailed {
   158  			m.FailedBuckets = append(m.FailedBuckets, b)
   159  		}
   160  	}
   161  	return &m
   162  }
   163  
   164  // save in-memory stats to disk
   165  func (sm *siteResyncMetrics) save(ctx context.Context) {
   166  	sTimer := time.NewTimer(siteResyncSaveInterval)
   167  	defer sTimer.Stop()
   168  	for {
   169  		select {
   170  		case <-sTimer.C:
   171  			if globalSiteReplicationSys.isEnabled() {
   172  				sm.Lock()
   173  				wg := sync.WaitGroup{}
   174  				for dID, rs := range sm.peerResyncMap {
   175  					st, ok := sm.resyncStatus[rs.resyncID]
   176  					if ok {
   177  						updt := st.Status.isValid() && st.LastUpdate.After(rs.LastSaved)
   178  						if !updt {
   179  							continue
   180  						}
   181  						rs.LastSaved = UTCNow()
   182  						sm.peerResyncMap[dID] = rs
   183  						wg.Add(1)
   184  						go func() {
   185  							defer wg.Done()
   186  							saveSiteResyncMetadata(ctx, st, newObjectLayerFn())
   187  						}()
   188  					}
   189  				}
   190  				wg.Wait()
   191  				sm.Unlock()
   192  			}
   193  			sTimer.Reset(siteResyncSaveInterval)
   194  		case <-ctx.Done():
   195  			return
   196  		}
   197  	}
   198  }
   199  
   200  // update overall site resync state
   201  func (sm *siteResyncMetrics) updateState(s SiteResyncStatus) error {
   202  	if !globalSiteReplicationSys.isEnabled() {
   203  		return nil
   204  	}
   205  	sm.Lock()
   206  	defer sm.Unlock()
   207  	switch s.Status {
   208  	case ResyncStarted:
   209  		sm.peerResyncMap[s.DeplID] = resyncState{resyncID: s.ResyncID, LastSaved: time.Time{}}
   210  		sm.resyncStatus[s.ResyncID] = s
   211  	case ResyncCompleted, ResyncCanceled, ResyncFailed:
   212  		st, ok := sm.resyncStatus[s.ResyncID]
   213  		if ok {
   214  			st.LastUpdate = s.LastUpdate
   215  			st.Status = s.Status
   216  			return nil
   217  		}
   218  		sm.resyncStatus[s.ResyncID] = st
   219  		return saveSiteResyncMetadata(GlobalContext, st, newObjectLayerFn())
   220  	}
   221  	return nil
   222  }
   223  
   224  // increment SyncedBuckets count
   225  func (sm *siteResyncMetrics) incBucket(o resyncOpts, bktStatus ResyncStatusType) {
   226  	if !globalSiteReplicationSys.isEnabled() {
   227  		return
   228  	}
   229  	sm.Lock()
   230  	defer sm.Unlock()
   231  	st, ok := sm.resyncStatus[o.resyncID]
   232  	if ok {
   233  		if st.BucketStatuses == nil {
   234  			st.BucketStatuses = map[string]ResyncStatusType{}
   235  		}
   236  		switch bktStatus {
   237  		case ResyncCompleted:
   238  			st.BucketStatuses[o.bucket] = ResyncCompleted
   239  			st.Status = siteResyncStatus(st.Status, st.BucketStatuses)
   240  			st.LastUpdate = UTCNow()
   241  			sm.resyncStatus[o.resyncID] = st
   242  		case ResyncFailed:
   243  			st.BucketStatuses[o.bucket] = ResyncFailed
   244  			st.Status = siteResyncStatus(st.Status, st.BucketStatuses)
   245  			st.LastUpdate = UTCNow()
   246  			sm.resyncStatus[o.resyncID] = st
   247  		}
   248  	}
   249  }
   250  
   251  // remove deleted bucket from active resync tracking
   252  func (sm *siteResyncMetrics) deleteBucket(b string) {
   253  	if !globalSiteReplicationSys.isEnabled() {
   254  		return
   255  	}
   256  	sm.Lock()
   257  	defer sm.Unlock()
   258  	for _, rs := range sm.peerResyncMap {
   259  		st, ok := sm.resyncStatus[rs.resyncID]
   260  		if !ok {
   261  			return
   262  		}
   263  		switch st.Status {
   264  		case ResyncCompleted, ResyncFailed:
   265  			return
   266  		default:
   267  			delete(st.BucketStatuses, b)
   268  		}
   269  	}
   270  }
   271  
   272  // returns overall resync status from individual bucket resync status map
   273  func siteResyncStatus(currSt ResyncStatusType, m map[string]ResyncStatusType) ResyncStatusType {
   274  	// avoid overwriting canceled resync status
   275  	if currSt != ResyncStarted {
   276  		return currSt
   277  	}
   278  	totBuckets := len(m)
   279  	var cmpCount, failCount int
   280  	for _, st := range m {
   281  		switch st {
   282  		case ResyncCompleted:
   283  			cmpCount++
   284  		case ResyncFailed:
   285  			failCount++
   286  		}
   287  	}
   288  	if cmpCount == totBuckets {
   289  		return ResyncCompleted
   290  	}
   291  	if cmpCount+failCount == totBuckets {
   292  		return ResyncFailed
   293  	}
   294  	return ResyncStarted
   295  }
   296  
   297  // update resync metrics per object
   298  func (sm *siteResyncMetrics) updateMetric(r TargetReplicationResyncStatus, resyncID string) {
   299  	if !globalSiteReplicationSys.isEnabled() {
   300  		return
   301  	}
   302  	sm.Lock()
   303  	defer sm.Unlock()
   304  	s := sm.resyncStatus[resyncID]
   305  	if r.ReplicatedCount > 0 {
   306  		s.ReplicatedCount++
   307  		s.ReplicatedSize += r.ReplicatedSize
   308  	} else {
   309  		s.FailedCount++
   310  		s.FailedSize += r.FailedSize
   311  	}
   312  	s.Bucket = r.Bucket
   313  	s.Object = r.Object
   314  	s.LastUpdate = UTCNow()
   315  	sm.resyncStatus[resyncID] = s
   316  }
   317  
   318  // Status returns current in-memory resync status for this deployment
   319  func (sm *siteResyncMetrics) status(dID string) (rs SiteResyncStatus, err error) {
   320  	sm.RLock()
   321  	defer sm.RUnlock()
   322  	if rst, ok1 := sm.peerResyncMap[dID]; ok1 {
   323  		if st, ok2 := sm.resyncStatus[rst.resyncID]; ok2 {
   324  			return st.clone(), nil
   325  		}
   326  	}
   327  	return rs, errSRNoResync
   328  }
   329  
   330  // Status returns latest resync status for this deployment
   331  func (sm *siteResyncMetrics) siteStatus(ctx context.Context, objAPI ObjectLayer, dID string) (rs SiteResyncStatus, err error) {
   332  	if !globalSiteReplicationSys.isEnabled() {
   333  		return rs, errSRNotEnabled
   334  	}
   335  	// check in-memory status
   336  	rs, err = sm.status(dID)
   337  	if err == nil {
   338  		return rs, nil
   339  	}
   340  	// check disk resync status
   341  	rs, err = loadSiteResyncMetadata(ctx, objAPI, dID)
   342  	if err != nil && err == errConfigNotFound {
   343  		return rs, nil
   344  	}
   345  	return rs, err
   346  }