github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/rebmeta.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"os"
	"path/filepath"
	"sync"
	ratomic "sync/atomic"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/fname"
	"github.com/NVIDIA/aistore/cmn/jsp"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/memsys"
	"github.com/NVIDIA/aistore/nl"
	"github.com/NVIDIA/aistore/xact"
)

// Rebalance metadata is distributed to trigger global (a.k.a. cluster) rebalance.
// This (distribution) happens:
//   - when a new target node joins the cluster;
//   - at startup, when the cluster detects an unfinished (aborted) rebalance;
//   - when we remove a target while some bucket(s) are erasure-coded
//     (we must then redistribute the slices);
//   - upon bucket rename:
//     1. the bucket is renamed (and the paths of its objects change);
//     2. rebalance must be started to redistribute the objects to the targets
//        according to HRW;
//   - when requested by the user (`ais start rebalance` or via REST API);
//   - upon target node power-cycle (and more).

type (
	// rebMD is revs (see metasync) distributed by the primary proxy to the
	// targets when some kind of rebalance is required.
	rebMD struct {
		meta.RMD
	}

	// rmdOwner keeps information about rebalances. Currently, it stores the
	// Version of the latest rebalance.
	rmdOwner struct {
		cluID string
		fpath string
		sync.Mutex
		rmd         ratomic.Pointer[rebMD]
		interrupted atomic.Bool // when joining target reports interrupted rebalance
		starting    atomic.Bool // when starting up
	}

	rmdModifier struct {
		pre   func(ctx *rmdModifier, clone *rebMD)
		final func(ctx *rmdModifier, clone *rebMD)

		prev  *rebMD // pre-modification rmd
		cur   *rebMD // CoW clone
		rebID string // cluster-wide rebalance ID, "g[uuid]" in the logs

		cluID   string // cluster ID (== smap.UUID) - never changes
		p       *proxy
		smapCtx *smapModifier
		wait    bool
	}
)
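// NOTE: illustrative sketch, not part of the original file. It shows one way a
// caller might drive the copy-on-write flow declared above: `pre` mutates the
// clone (here, rmdInc bumps the version) and `final` distributes the result
// (here, rmdSync metasyncs it) - see both callbacks further below. The helper
// name is hypothetical, and the sketch assumes the caller has already prepared
// the *smapModifier (including its msg).
func startRebalanceSketch(p *proxy, smapCtx *smapModifier, wait bool) (rebID string, err error) {
	ctx := &rmdModifier{
		pre:     rmdInc,  // bump RMD version on the clone
		final:   rmdSync, // metasync the new RMD to the targets
		p:       p,
		smapCtx: smapCtx,
		wait:    wait,
	}
	if _, err = p.owner.rmd.modify(ctx); err != nil {
		return "", err
	}
	return ctx.rebID, nil // new cluster-wide rebalance ID ("g[uuid]" in the logs)
}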
// interface guard
var _ revs = (*rebMD)(nil)

// as revs
func (*rebMD) tag() string       { return revsRMDTag }
func (r *rebMD) version() int64  { return r.Version }
func (r *rebMD) marshal() []byte { return cos.MustMarshal(r) }
func (*rebMD) jit(p *proxy) revs { return p.owner.rmd.get() }
func (*rebMD) sgl() *memsys.SGL  { return nil }

func (r *rebMD) inc() { r.Version++ }

func (r *rebMD) clone() *rebMD {
	dst := &rebMD{}
	cos.CopyStruct(dst, r)
	return dst
}

func (r *rebMD) String() string {
	if r == nil {
		return "RMD <nil>"
	}
	if len(r.TargetIDs) == 0 && r.Resilver == "" {
		return fmt.Sprintf("RMD v%d[%s]", r.Version, r.CluID)
	}
	var s string
	if r.Resilver != "" {
		s = ", " + r.Resilver
	}
	return fmt.Sprintf("RMD v%d[%s, %v%s]", r.Version, r.CluID, r.TargetIDs, s)
}

//////////////
// rmdOwner //
//////////////

func newRMDOwner(config *cmn.Config) *rmdOwner {
	rmdo := &rmdOwner{fpath: filepath.Join(config.ConfigDir, fname.Rmd)}
	rmdo.put(&rebMD{})
	return rmdo
}

func (r *rmdOwner) persist(rmd *rebMD) error {
	return jsp.SaveMeta(r.fpath, rmd, nil /*wto*/)
}

func (r *rmdOwner) load() {
	rmd := &rebMD{}
	_, err := jsp.LoadMeta(r.fpath, rmd)
	if err == nil {
		r.put(rmd)
		return
	}
	if !os.IsNotExist(err) {
		nlog.Errorln("failed to load RMD:", err)
		nlog.Infoln("Warning: make sure to properly decommission previously deployed clusters, proceeding anyway...")
	}
}

func (r *rmdOwner) put(rmd *rebMD) { r.rmd.Store(rmd) }
func (r *rmdOwner) get() *rebMD    { return r.rmd.Load() }

func (r *rmdOwner) modify(ctx *rmdModifier) (clone *rebMD, err error) {
	r.Lock()
	clone, err = r.do(ctx)
	r.Unlock()

	if err == nil && ctx.final != nil {
		ctx.final(ctx, clone)
	}
	return
}

const rmdFromAnother = `
%s: RMD v%d (cluster ID %q) belongs to a different cluster %q

-----------------
To troubleshoot:
1. first, make sure you are not trying to run two different clusters that utilize (or include) the same machine
2. remove possibly misplaced RMD from the %s (located at %s)
3. restart %s
-----------------`

func (r *rmdOwner) newClusterIntegrityErr(node, otherCID, haveCID string, version int64) (err error) {
	return fmt.Errorf(rmdFromAnother, node, version, haveCID, otherCID, node, r.fpath, node)
}

func (r *rmdOwner) do(ctx *rmdModifier) (clone *rebMD, err error) {
	ctx.prev = r.get()

	if r.cluID == "" {
		r.cluID = ctx.cluID
	}
	if ctx.smapCtx == nil {
		return
	}
	if r.cluID == "" {
		r.cluID = ctx.smapCtx.smap.UUID
	} else if r.cluID != ctx.smapCtx.smap.UUID {
		err := r.newClusterIntegrityErr("primary", ctx.smapCtx.smap.UUID, r.cluID, ctx.prev.Version)
		cos.ExitLog(err) // FATAL
	}

	clone = ctx.prev.clone()
	clone.TargetIDs = nil
	clone.Resilver = ""
	clone.CluID = r.cluID
	debug.Assert(cos.IsValidUUID(clone.CluID), clone.CluID)
	ctx.pre(ctx, clone) // `pre` callback

	if err = r.persist(clone); err == nil {
		r.put(clone)
	}
	ctx.cur = clone
	ctx.rebID = xact.RebID2S(clone.Version) // new rebID
	return
}
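// Note on the flow above: rmdOwner.do never updates the live RMD in place -
// it clones the previous RMD, lets the `pre` callback adjust the clone
// (typically rmdInc), persists the clone, and only then swaps the atomic
// pointer. The clone's Version also determines the new cluster-wide rebalance
// ID (the "g[uuid]" form mentioned in rmdModifier) via xact.RebID2S.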
/////////////////
// rmdModifier //
/////////////////

func rmdInc(_ *rmdModifier, clone *rebMD) { clone.inc() }

// via `rmdModifier.final`
func rmdSync(m *rmdModifier, clone *rebMD) {
	debug.Assert(m.cur == clone)
	m.listen(nil)
	msg := &aisMsg{ActMsg: apc.ActMsg{Action: apc.ActRebalance}, UUID: m.rebID} // user-requested rebalance
	wg := m.p.metasyncer.sync(revsPair{m.cur, msg})
	if m.wait {
		wg.Wait()
	}
}

// see `receiveRMD` (upon termination, notify IC)
func (m *rmdModifier) listen(cb func(nl nl.Listener)) {
	nl := xact.NewXactNL(m.rebID, apc.ActRebalance, &m.smapCtx.smap.Smap, nil)
	nl = nl.WithCause(m.smapCtx.msg.Action)
	nl.SetOwner(equalIC)

	nl.F = m.log
	if cb != nil {
		nl.F = cb
	}
	err := m.p.notifs.add(nl)
	debug.AssertNoErr(err)
}
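// NOTE: illustrative sketch, not part of the original file. Node-removal flows
// (maintenance/decommission) need the cleanup in postRm (below) to run once
// rebalance terminates; a `final` callback for that case might register it via
// listen, along these lines. The function name is hypothetical, and the actual
// callers may wire the message action and notification differently.
func rmdSyncAndRunPostRmSketch(m *rmdModifier, clone *rebMD) {
	debug.Assert(m.cur == clone)
	m.listen(m.postRm) // run postRm when the rebalance xaction terminates
	msg := &aisMsg{ActMsg: apc.ActMsg{Action: m.smapCtx.msg.Action}, UUID: m.rebID}
	wg := m.p.metasyncer.sync(revsPair{m.cur, msg})
	if m.wait {
		wg.Wait()
	}
}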
// deactivate or remove node from the cluster (as per msg.Action)
// called when rebalance is done
func (m *rmdModifier) postRm(nl nl.Listener) {
	var (
		p     = m.p
		tsi   = m.smapCtx.smap.GetNode(m.smapCtx.sid)
		sname = tsi.StringEx()
		xname = "rebalance[" + nl.UUID() + "]"
		smap  = p.owner.smap.get()
		ntsi  = smap.GetNode(m.smapCtx.sid) // with updated flags
		warn  = "remove " + sname + " from the current "
	)
	if ntsi != nil && (ntsi.Flags.IsSet(meta.SnodeMaint) || ntsi.Flags.IsSet(meta.SnodeMaintPostReb)) {
		warn = "mark " + sname + " for maintenance mode in the current "
	}
	warn += smap.StringEx()
	debug.Assert(nl.UUID() == m.rebID && tsi.IsTarget())

	if nl.ErrCnt() == 0 {
		nlog.Infoln("post-rebalance commit:", warn)
		if _, err := p.rmNodeFinal(m.smapCtx.msg, tsi, m.smapCtx); err != nil {
			nlog.Errorln(err)
		}
		return
	}

	m.log(nl)
	nlerr := nl.Err()

	rmd := p.owner.rmd.get()
	if nlerr == cmn.ErrXactRenewAbort || nlerr.Error() == cmn.ErrXactRenewAbort.Error() || m.cur.Version < rmd.Version {
		nlog.Errorf("Warning: %s (%s) got renewed (interrupted) - will not %s (%s)", xname, m.smapCtx.smap, warn, rmd)
		return
	}
	if m.smapCtx.msg.Action != apc.ActRmNodeUnsafe && m.smapCtx.msg.Action != apc.ActDecommissionNode {
		nlog.Errorf("operation %q => %s (%s) failed - will not %s", m.smapCtx.msg.Action, xname, m.smapCtx.smap, warn)
		return
	}

	// go ahead to decommission anyway
	nlog.Errorf("given %q operation and despite [%v] - proceeding to %s", m.smapCtx.msg.Action, nlerr, warn)
	if _, err := p.rmNodeFinal(m.smapCtx.msg, tsi, m.smapCtx); err != nil {
		nlog.Errorln(err)
	}

	//
	// TODO: bcast targets to re-rebalance for the same `m.rebID` iff there isn't a new one that's running or about to run
	//
}

func (m *rmdModifier) log(nl nl.Listener) {
	debug.Assert(nl.UUID() == m.rebID)
	var (
		err  = nl.Err()
		abrt = nl.Aborted()
		name = "rebalance[" + nl.UUID() + "]"
	)
	switch {
	case err == nil && !abrt:
		nlog.InfoDepth(1, name, "done")
	case abrt:
		debug.Assert(err != nil, nl.String()+" - aborted w/ no errors")
		nlog.ErrorDepth(1, name, err)
	default:
		nlog.ErrorDepth(1, name, "failed:", err)
	}
}