github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/status.go

// Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
/*
 * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 */
package reb

import (
	"time"

	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/mono"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/xact"
	"github.com/NVIDIA/aistore/xact/xreg"
)

// via GET /v1/health (apc.Health)
func (reb *Reb) RebStatus(status *Status) {
	var (
		tsmap  = core.T.Sowner().Get()
		marked = xreg.GetRebMarked()
	)
	status.Aborted = marked.Interrupted
	status.Running = marked.Xact != nil && marked.Xact.Running()

	// rlock
	reb.mu.RLock()
	status.Stage = reb.stages.stage.Load()
	status.RebID = reb.rebID.Load()
	status.Quiescent = reb.isQuiescent()
	status.SmapVersion = tsmap.Version
	smap := reb.smap.Load()
	if smap != nil {
		status.RebVersion = smap.Version
	}
	reb.mu.RUnlock()

	// xreb, ?running
	xreb := reb.xctn()
	if xreb != nil {
		status.Aborted = xreb.IsAborted()
		status.Running = xreb.Running()
		xreb.ToStats(&status.Stats)
		if status.Running {
			if marked.Xact != nil && marked.Xact.ID() != xreb.ID() {
				id, _ := xact.S2RebID(marked.Xact.ID())
				debug.Assert(id > xreb.RebID(), marked.Xact.String()+" vs "+xreb.String())
				nlog.Warningf("%s: must be transitioning (renewing) from %s (stage %s) to %s",
					core.T, xreb, stages[status.Stage], marked.Xact)
				status.Running = false // not yet
			} else {
				debug.Assertf(reb.RebID() == xreb.RebID(), "rebID[%d] vs %s", reb.RebID(), xreb)
			}
		}
	} else if status.Running {
		nlog.Warningln(core.T.String()+": transitioning (renewing) to", marked.Xact.String())
		status.Running = false
	}

	// wack status
	if smap == nil || status.Stage != rebStageWaitAck {
		return
	}
	if status.SmapVersion != status.RebVersion {
		nlog.Warningf("%s: Smap v%d != %d", core.T, status.SmapVersion, status.RebVersion)
		return
	}
	reb.awaiting.mtx.Lock()
	reb.wackStatus(status, smap)
	reb.awaiting.mtx.Unlock()
}

// extended info when stage is <wack>
func (reb *Reb) wackStatus(status *Status, rsmap *meta.Smap) {
	var (
		config     = cmn.GCO.Get()
		sleepRetry = cmn.KeepaliveRetryDuration(config)
	)
	now := mono.NanoTime()
	if time.Duration(now-reb.awaiting.ts) < sleepRetry {
		status.Targets = reb.awaiting.targets
		return
	}
	reb.awaiting.ts = now
	reb.awaiting.targets = reb.awaiting.targets[:0]
	for _, lomAcks := range reb.lomAcks() {
		lomAcks.mu.Lock()
		reb.awaiting.targets = _wackStatusLom(lomAcks, reb.awaiting.targets, rsmap)
		lomAcks.mu.Unlock()
	}
	status.Targets = reb.awaiting.targets
}

func _wackStatusLom(lomAcks *lomAcks, targets meta.Nodes, rsmap *meta.Smap) meta.Nodes {
outer:
	for _, lom := range lomAcks.q {
		tsi, err := rsmap.HrwHash2T(lom.Digest())
		if err != nil {
			continue
		}
		for _, si := range targets {
			if si.ID() == tsi.ID() {
				continue outer
			}
		}
		targets = append(targets, tsi)
		if len(targets) >= maxWackTargets { // limit reporting
			break
		}
	}
	return targets
}
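
// The sketch below is illustrative and not part of the original file: it shows
// how code that already holds this node's *Reb instance (for example, the
// /v1/health handler mentioned above RebStatus) might fill in a Status and
// read the fields that RebStatus populates. The function name rebHealthSketch
// is hypothetical.
func rebHealthSketch(reb *Reb) (running, quiescent bool) {
	var status Status
	reb.RebStatus(&status)
	// While the stage is <wack>, status.Targets lists the peer targets this
	// node is still awaiting ACKs from (capped at maxWackTargets).
	return status.Running, status.Quiescent
}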