github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/reb/bcast.go

// Package reb provides global cluster-wide rebalance upon adding/removing storage nodes.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package reb

import (
	"fmt"
	"net/url"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	jsoniter "github.com/json-iterator/go"
)

type (
	syncCallback func(tsi *meta.Snode, rargs *rebArgs) (ok bool)

	Status struct {
		Targets     meta.Nodes `json:"targets"`             // targets I'm waiting for ACKs from
		SmapVersion int64      `json:"smap_version,string"` // current Smap version (via smapOwner)
		RebVersion  int64      `json:"reb_version,string"`  // Smap version of *this* rebalancing op
		RebID       int64      `json:"reb_id,string"`       // rebalance ID
		Stats       core.Stats `json:"stats"`               // transmitted/received totals
		Stage       uint32     `json:"stage"`               // the current stage - see enum above
		Aborted     bool       `json:"aborted"`             // aborted?
		Running     bool       `json:"running"`             // running?
		Quiescent   bool       `json:"quiescent"`           // true when queue is empty
	}
)

////////////////////////////////////////////
// rebalance manager: node <=> node comm. //
////////////////////////////////////////////

// main method: runs `cb` against all other targets in parallel; returns the number of failed callbacks
func bcast(rargs *rebArgs, cb syncCallback) (errCnt int) {
	var (
		cnt atomic.Int32
		wg  = cos.NewLimitedWaitGroup(cmn.MaxParallelism(), len(rargs.smap.Tmap))
	)
	for _, tsi := range rargs.smap.Tmap {
		if tsi.ID() == core.T.SID() {
			continue
		}
		wg.Add(1)
		go func(tsi *meta.Snode) {
			if !cb(tsi, rargs) {
				cnt.Inc()
			}
			wg.Done()
		}(tsi)
	}
	wg.Wait()
	errCnt = int(cnt.Load())
	return
}

// pingTarget checks if target is running (type syncCallback)
// TODO: reuse keepalive
func (reb *Reb) pingTarget(tsi *meta.Snode, rargs *rebArgs) (ok bool) {
	var (
		ver    = rargs.smap.Version
		sleep  = cmn.Rom.CplaneOperation()
		logHdr = reb.logHdr(rargs.id, rargs.smap)
		tname  = tsi.StringEx()
	)
	for i := range 4 {
		_, code, err := core.T.Health(tsi, cmn.Rom.MaxKeepalive(), nil)
		if err == nil {
			if i > 0 {
				nlog.Infof("%s: %s is online", logHdr, tname)
			}
			return true
		}
		if !cos.IsUnreachable(err, code) {
			nlog.Errorf("%s: health(%s) returned %v(%d) - aborting", logHdr, tname, err, code)
			return
		}
		nlog.Warningf("%s: waiting for %s, err %v(%d)", logHdr, tname, err, code)
		time.Sleep(sleep)
		nver := core.T.Sowner().Get().Version
		if nver > ver {
			return
		}
	}
	nlog.Errorf("%s: timed out waiting for %s", logHdr, tname)
	return
}

// wait for target to get ready to receive objects (type syncCallback)
func (reb *Reb) rxReady(tsi *meta.Snode, rargs *rebArgs) (ok bool) {
	var (
		curwt time.Duration
		sleep = cmn.Rom.CplaneOperation() * 2
		maxwt = rargs.config.Rebalance.DestRetryTime.D() + rargs.config.Rebalance.DestRetryTime.D()/2
		xreb  = reb.xctn()
	)
	for curwt < maxwt {
		if reb.stages.isInStage(tsi, rebStageTraverse) {
			// do not request the node stage if it has sent stage notification
			return true
		}
		if _, ok = reb.checkStage(tsi, rargs, rebStageTraverse); ok {
			return
		}
		if err := xreb.AbortedAfter(sleep); err != nil {
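			// the rebalance xaction was aborted during the wait - stop polling this target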
			return
		}
		curwt += sleep
	}
	logHdr, tname := reb.logHdr(rargs.id, rargs.smap), tsi.StringEx()
	nlog.Errorf("%s: timed out waiting for %s to reach %s state", logHdr, tname, stages[rebStageTraverse])
	return
}

// wait for the target to reach stage = rebStageFin (i.e., to finish traversing and sending);
// if the target has reached rebStageWaitAck but is not yet in rebStageFin,
// separately check whether it is waiting for my ACKs
func (reb *Reb) waitFinExtended(tsi *meta.Snode, rargs *rebArgs) (ok bool) {
	var (
		curwt      time.Duration
		status     *Status
		sleep      = rargs.config.Timeout.CplaneOperation.D()
		maxwt      = rargs.config.Rebalance.DestRetryTime.D()
		sleepRetry = cmn.KeepaliveRetryDuration(rargs.config)
		logHdr     = reb.logHdr(rargs.id, rargs.smap)
		xreb       = reb.xctn()
	)
	debug.Assertf(reb.RebID() == xreb.RebID(), "%s (rebID=%d) vs %s", logHdr, reb.RebID(), xreb)
	for curwt < maxwt {
		if err := xreb.AbortedAfter(sleep); err != nil {
			nlog.Infof("%s: abort wack (%v)", logHdr, err)
			return
		}
		if reb.stages.isInStage(tsi, rebStageFin) {
			return true // tsi stage=<fin>
		}
		// otherwise, inquire status and check the stage
		curwt += sleep
		if status, ok = reb.checkStage(tsi, rargs, rebStageFin); ok || status == nil {
			return
		}
		if err := xreb.AbortErr(); err != nil {
			nlog.Infof("%s: abort wack (%v)", logHdr, err)
			return
		}
		//
		// tsi in rebStageWaitAck
		//
		var w4me bool // true: this target is waiting for ACKs from me
		for _, si := range status.Targets {
			if si.ID() == core.T.SID() {
				nlog.Infof("%s: keep wack <= %s[%s]", logHdr, tsi.StringEx(), stages[status.Stage])
				w4me = true
				break
			}
		}
		if !w4me {
			nlog.Infof("%s: %s[%s] ok (not waiting for me)", logHdr, tsi.StringEx(), stages[status.Stage])
			ok = true
			return
		}
		time.Sleep(sleepRetry)
		curwt += sleepRetry
	}
	nlog.Errorf("%s: timed out waiting for %s to reach %s", logHdr, tsi.StringEx(), stages[rebStageFin])
	return
}

// calls tsi.reb.RebStatus() and handles conditions; may abort the current xreb
// returns:
//   - `Status` or nil
//   - OK iff the desiredStage has been reached
func (reb *Reb) checkStage(tsi *meta.Snode, rargs *rebArgs, desiredStage uint32) (status *Status, ok bool) {
	var (
		sleepRetry = cmn.KeepaliveRetryDuration(rargs.config)
		logHdr     = reb.logHdr(rargs.id, rargs.smap)
		query      = url.Values{apc.QparamRebStatus: []string{"true"}}
		xreb       = reb.xctn()
		tname      = tsi.StringEx()
	)
	if xreb == nil || xreb.IsAborted() {
		return
	}
	debug.Assertf(reb.RebID() == xreb.RebID(), "%s (rebID=%d) vs %s", logHdr, reb.RebID(), xreb)
	body, code, err := core.T.Health(tsi, apc.DefaultTimeout, query)
	if err != nil {
		if errAborted := xreb.AbortedAfter(sleepRetry); errAborted != nil {
			nlog.Infoln(logHdr, "abort check status", errAborted)
			return
		}
		body, code, err = core.T.Health(tsi, apc.DefaultTimeout, query) // retry once
	}
	if err != nil {
		ctx := fmt.Sprintf("health(%s) failure: %v(%d)", tname, err, code)
		err = cmn.NewErrAborted(xreb.Name(), ctx, err)
		reb.abortAndBroadcast(err)
		return
	}

	status = &Status{}
	err = jsoniter.Unmarshal(body, status)
	if err != nil {
		err = fmt.Errorf(cmn.FmtErrUnmarshal, logHdr, "reb status from "+tname, cos.BHead(body), err)
		reb.abortAndBroadcast(err)
		return
	}
	// enforce global transaction ID
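	// (a remote target running a newer rebalance means this one is obsolete and must abort)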
	if status.RebID > reb.rebID.Load() {
		err := cmn.NewErrAborted(xreb.Name(), logHdr, fmt.Errorf("%s runs newer g%d", tname, status.RebID))
		reb.abortAndBroadcast(err)
		return
	}
	if xreb.IsAborted() {
		return
	}
	// let the target catch up
	if status.RebID < reb.RebID() {
		nlog.Warningf("%s: %s runs older (g%d) global rebalance - keep waiting...", logHdr, tname, status.RebID)
		return
	}
	// Remote target has aborted its running rebalance with the same ID.
	// Do not call `reb.abortAndBroadcast()` - no need.
	if status.RebID == reb.RebID() && status.Aborted {
		err := cmn.NewErrAborted(xreb.Name(), logHdr, fmt.Errorf("status 'aborted' from %s", tname))
		xreb.Abort(err)
		return
	}
	if status.Stage >= desiredStage {
		ok = true
		return
	}
	nlog.Infof("%s: %s[%s] not yet at the right stage %s", logHdr, tname, stages[status.Stage], stages[desiredStage])
	return
}
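// Usage sketch (illustrative only, not part of the original file): the rebalance
// manager drives each synchronization phase by broadcasting one of the syncCallback
// methods above to every other target and checking the aggregate error count, e.g.:
//
//	if errCnt := bcast(rargs, reb.pingTarget); errCnt > 0 {
//		// one or more targets failed the check - handle (retry or abort) accordingly
//	}
//
// the same pattern would apply to reb.rxReady and reb.waitFinExtended at later stages.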