github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/status.go (about)

     1  // Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
     2  /*
     3   * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package reb
     6  
     7  import (
     8  	"time"
     9  
    10  	"github.com/NVIDIA/aistore/cmn"
    11  	"github.com/NVIDIA/aistore/cmn/debug"
    12  	"github.com/NVIDIA/aistore/cmn/mono"
    13  	"github.com/NVIDIA/aistore/cmn/nlog"
    14  	"github.com/NVIDIA/aistore/core"
    15  	"github.com/NVIDIA/aistore/core/meta"
    16  	"github.com/NVIDIA/aistore/xact"
    17  	"github.com/NVIDIA/aistore/xact/xreg"
    18  )
    19  
    20  // via GET /v1/health (apc.Health)
    21  func (reb *Reb) RebStatus(status *Status) {
    22  	var (
    23  		tsmap  = core.T.Sowner().Get()
    24  		marked = xreg.GetRebMarked()
    25  	)
    26  	status.Aborted = marked.Interrupted
    27  	status.Running = marked.Xact != nil && marked.Xact.Running()
    28  
    29  	// rlock
    30  	reb.mu.RLock()
    31  	status.Stage = reb.stages.stage.Load()
    32  	status.RebID = reb.rebID.Load()
    33  	status.Quiescent = reb.isQuiescent()
    34  	status.SmapVersion = tsmap.Version
    35  	smap := reb.smap.Load()
    36  	if smap != nil {
    37  		status.RebVersion = smap.Version
    38  	}
    39  	reb.mu.RUnlock()
    40  
    41  	// xreb, ?running
    42  	xreb := reb.xctn()
    43  	if xreb != nil {
    44  		status.Aborted = xreb.IsAborted()
    45  		status.Running = xreb.Running()
    46  		xreb.ToStats(&status.Stats)
    47  		if status.Running {
    48  			if marked.Xact != nil && marked.Xact.ID() != xreb.ID() {
    49  				id, _ := xact.S2RebID(marked.Xact.ID())
    50  				debug.Assert(id > xreb.RebID(), marked.Xact.String()+" vs "+xreb.String())
    51  				nlog.Warningf("%s: must be transitioning (renewing) from %s (stage %s) to %s",
    52  					core.T, xreb, stages[status.Stage], marked.Xact)
    53  				status.Running = false // not yet
    54  			} else {
    55  				debug.Assertf(reb.RebID() == xreb.RebID(), "rebID[%d] vs %s", reb.RebID(), xreb)
    56  			}
    57  		}
    58  	} else if status.Running {
    59  		nlog.Warningln(core.T.String()+": transitioning (renewing) to", marked.Xact.String())
    60  		status.Running = false
    61  	}
    62  
    63  	// wack status
    64  	if smap == nil || status.Stage != rebStageWaitAck {
    65  		return
    66  	}
    67  	if status.SmapVersion != status.RebVersion {
    68  		nlog.Warningf("%s: Smap v%d != %d", core.T, status.SmapVersion, status.RebVersion)
    69  		return
    70  	}
    71  	reb.awaiting.mtx.Lock()
    72  	reb.wackStatus(status, smap)
    73  	reb.awaiting.mtx.Unlock()
    74  }
    75  
    76  // extended info when stage is <wack>
    77  func (reb *Reb) wackStatus(status *Status, rsmap *meta.Smap) {
    78  	var (
    79  		config     = cmn.GCO.Get()
    80  		sleepRetry = cmn.KeepaliveRetryDuration(config)
    81  	)
    82  	now := mono.NanoTime()
    83  	if time.Duration(now-reb.awaiting.ts) < sleepRetry {
    84  		status.Targets = reb.awaiting.targets
    85  		return
    86  	}
    87  	reb.awaiting.ts = now
    88  	reb.awaiting.targets = reb.awaiting.targets[:0]
    89  	for _, lomAcks := range reb.lomAcks() {
    90  		lomAcks.mu.Lock()
    91  		reb.awaiting.targets = _wackStatusLom(lomAcks, reb.awaiting.targets, rsmap)
    92  		lomAcks.mu.Unlock()
    93  	}
    94  	status.Targets = reb.awaiting.targets
    95  }
    96  
    97  func _wackStatusLom(lomAcks *lomAcks, targets meta.Nodes, rsmap *meta.Smap) meta.Nodes {
    98  outer:
    99  	for _, lom := range lomAcks.q {
   100  		tsi, err := rsmap.HrwHash2T(lom.Digest())
   101  		if err != nil {
   102  			continue
   103  		}
   104  		for _, si := range targets {
   105  			if si.ID() == tsi.ID() {
   106  				continue outer
   107  			}
   108  		}
   109  		targets = append(targets, tsi)
   110  		if len(targets) >= maxWackTargets { // limit reporting
   111  			break
   112  		}
   113  	}
   114  	return targets
   115  }