github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/clustermap.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "errors" 9 "fmt" 10 "os" 11 "path/filepath" 12 "strconv" 13 "sync" 14 ratomic "sync/atomic" 15 16 "github.com/NVIDIA/aistore/api/apc" 17 "github.com/NVIDIA/aistore/cmn" 18 "github.com/NVIDIA/aistore/cmn/atomic" 19 "github.com/NVIDIA/aistore/cmn/cos" 20 "github.com/NVIDIA/aistore/cmn/debug" 21 "github.com/NVIDIA/aistore/cmn/fname" 22 "github.com/NVIDIA/aistore/cmn/jsp" 23 "github.com/NVIDIA/aistore/cmn/nlog" 24 "github.com/NVIDIA/aistore/core/meta" 25 "github.com/NVIDIA/aistore/memsys" 26 jsoniter "github.com/json-iterator/go" 27 ) 28 29 const clusterMap = "Smap" 30 31 // NOTE: to access Snode, Smap and related structures, external 32 // packages and HTTP clients must import aistore/cluster (and not ais) 33 34 //===================================================================== 35 // 36 // - smapX is a server-side extension of the meta.Smap 37 // - smapX represents AIStore cluster in terms of its member nodes and their properties 38 // - smapX (instance) can be obtained via smapOwner.get() 39 // - smapX is immutable and versioned 40 // - smapX versioning is monotonic and incremental 41 // - smapX uniquely and solely defines the current primary proxy in the AIStore cluster 42 // 43 // smapX typical update transaction: 44 // lock -- clone() -- modify the clone -- smapOwner.put(clone) -- unlock 45 // 46 // (*) for merges and conflict resolution, check smapX version prior to put() 47 // (version check must be protected by the same critical section) 48 // 49 //===================================================================== 50 51 type ( 52 smapX struct { 53 _sgl *memsys.SGL // jsp-formatted 54 vstr string // itoa(Version) 55 meta.Smap 56 } 57 smapOwner struct { 58 smap ratomic.Pointer[smapX] 59 sls *sls 60 fpath string 61 immSize int64 62 mu sync.Mutex 63 } 64 sls struct { 65 listeners map[string]meta.Slistener 66 postCh chan int64 67 wg sync.WaitGroup 68 mu sync.RWMutex 69 running atomic.Bool 70 } 71 smapModifier struct { 72 pre func(ctx *smapModifier, clone *smapX) error 73 post func(ctx *smapModifier, clone *smapX) 74 final func(ctx *smapModifier, clone *smapX) 75 76 smap *smapX // pre-modification smap that the modifier clones (and modifies) 77 rmdCtx *rmdModifier // in particular, rmd prev and cur (below) 78 79 msg *apc.ActMsg // action modifying smap (apc.Act*) 80 nsi *meta.Snode // new node to be added 81 nid string // node ID of the candidate primary 82 sid string // ID of the node to modify 83 flags cos.BitFlags // enum cmn.Snode* to set or clear 84 nver int64 // new Smap version (cloned and modified `smap` - see above) 85 status int // resulting http.Status* 86 interrupted bool // target reports interrupted rebalance 87 restarted bool // target reports cold restart (powercycle) 88 skipReb bool // skip rebalance when target added/removed 89 gfn bool // sent start-gfn notification 90 } 91 92 smapUpdatedCB func(newSmap, oldSmap *smapX, nfl, ofl cos.BitFlags) 93 ) 94 95 // interface guard 96 var ( 97 _ revs = (*smapX)(nil) 98 _ meta.Sowner = (*smapOwner)(nil) 99 _ meta.SmapListeners = (*sls)(nil) 100 ) 101 102 // as revs 103 func (*smapX) tag() string { return revsSmapTag } 104 func (m *smapX) version() int64 { return m.Version } 105 func (*smapX) jit(p *proxy) revs { return p.owner.smap.get() } 106 107 func (m *smapX) sgl() *memsys.SGL { 108 if m._sgl.IsNil() { 109 return nil 110 } 111 return m._sgl 112 } 113 114 func (m *smapX) marshal() []byte { 115 m._sgl = m._encode(0) 116 return m._sgl.Bytes() 117 } 118 119 func (m *smapX) _encode(immSize int64) (sgl *memsys.SGL) { 120 sgl = memsys.PageMM().NewSGL(immSize) 121 err := jsp.Encode(sgl, m, m.JspOpts()) 122 debug.AssertNoErr(err) 123 return 124 } 125 126 func (m *smapX) _free() { 127 m._sgl.Free() 128 m._sgl = nil 129 } 130 131 /////////// 132 // smapX // 133 /////////// 134 135 func newSmap() (smap *smapX) { 136 smap = &smapX{} 137 smap.init(8, 8) 138 return 139 } 140 141 func (m *smapX) init(tsize, psize int) { 142 m.Tmap = make(meta.NodeMap, tsize) 143 m.Pmap = make(meta.NodeMap, psize) 144 } 145 146 // 147 // begin IC ----------- 148 // 149 150 // executed only by primary 151 func (m *smapX) staffIC() (count int) { 152 _ = m._setIC(m.Primary) 153 m.Primary = m.GetNode(m.Primary.ID()) 154 155 count = m.ICCount() 156 if count < meta.DfltCountIC { 157 // assign additional IC members, if available 158 for _, psi := range m.Pmap { 159 if psi.Flags.IsSet(meta.SnodeNonElectable) || !m._setIC(psi) { 160 continue 161 } 162 count++ 163 if count >= meta.DfltCountIC { 164 return count 165 } 166 } 167 } else if count > meta.DfltCountIC { 168 if m.unstaffIC() { 169 count-- 170 } 171 } 172 return count 173 } 174 175 // remove one 176 func (m *smapX) unstaffIC() bool { 177 for pid, psi := range m.Pmap { 178 if pid == m.Primary.ID() || !m.IsIC(psi) { 179 continue 180 } 181 m.clearNodeFlags(pid, meta.SnodeIC) 182 return true 183 } 184 return false 185 } 186 187 func (m *smapX) _setIC(psi *meta.Snode) (ok bool) { 188 if !m.IsIC(psi) { 189 m.setNodeFlags(psi.ID(), meta.SnodeIC) 190 ok = true 191 } 192 return ok 193 } 194 195 // check configured "original" and "discovery" URLs vs IC members' control, 196 // or pick IC members to provide alternative ones 197 func (m *smapX) configURLsIC(original, discovery string) (orig, disc string) { 198 // extra effort to avoid changing existing URLs if they work 199 for _, psi := range m.Pmap { 200 if !m.IsIC(psi) { 201 continue 202 } 203 if orig == "" && original != "" && psi.URL(cmn.NetIntraControl) == original { 204 orig = original 205 } else if disc == "" && discovery != "" && psi.URL(cmn.NetIntraControl) == discovery { 206 disc = discovery 207 } 208 if orig != "" && disc != "" { 209 return orig, disc 210 } 211 } 212 // pick alternatives 213 for _, psi := range m.Pmap { 214 if !m.IsIC(psi) { 215 continue 216 } 217 if orig == "" { 218 orig = psi.URL(cmn.NetIntraControl) 219 } else if disc == "" { 220 disc = psi.URL(cmn.NetIntraControl) 221 } else { 222 break 223 } 224 } 225 return orig, disc 226 } 227 228 // 229 // end IC ----------- 230 // 231 232 // to be used exclusively at startup - compare with validate() below 233 func (m *smapX) isValid() bool { 234 if m == nil { 235 return false 236 } 237 if m.Primary == nil { 238 return false 239 } 240 if m.isPresent(m.Primary) { 241 cos.Assert(m.Primary.ID() != "") 242 return true 243 } 244 return false 245 } 246 247 // a stronger version of the above 248 func (m *smapX) validate() error { 249 if m == nil { 250 return errors.New(clusterMap + " is <nil>") 251 } 252 if m.version() == 0 { 253 return errors.New(clusterMap + " v0") 254 } 255 if m.Primary == nil { 256 return errors.New(clusterMap + ": primary <nil>") 257 } 258 if !m.isPresent(m.Primary) { 259 return errors.New(clusterMap + ": primary not present") 260 } 261 cos.Assert(m.Primary.ID() != "") 262 if !cos.IsValidUUID(m.UUID) { 263 return fmt.Errorf(clusterMap+": invalid UUID %q", m.UUID) 264 } 265 return nil 266 } 267 268 func (m *smapX) isPrimary(self *meta.Snode) bool { 269 if !m.isValid() { 270 return false 271 } 272 return m.Primary.ID() == self.ID() 273 } 274 275 func (m *smapX) isPresent(si *meta.Snode) bool { 276 if si.IsProxy() { 277 psi := m.GetProxy(si.ID()) 278 return psi != nil 279 } 280 tsi := m.GetTarget(si.ID()) 281 return tsi != nil 282 } 283 284 func (m *smapX) addTarget(tsi *meta.Snode) { 285 if si := m.GetNode(tsi.ID()); si != nil { 286 cos.Assertf(false, "FATAL: duplicate SID: new %s vs %s", tsi.StringEx(), si.StringEx()) 287 } 288 tsi.SetName() 289 m.Tmap[tsi.ID()] = tsi 290 m.Version++ 291 } 292 293 func (m *smapX) addProxy(psi *meta.Snode) { 294 if si := m.GetNode(psi.ID()); si != nil { 295 cos.Assertf(false, "FATAL: duplicate SID: new %s vs %s", psi.StringEx(), si.StringEx()) 296 } 297 psi.SetName() 298 m.Pmap[psi.ID()] = psi 299 m.Version++ 300 } 301 302 func (m *smapX) delTarget(sid string) { 303 if m.GetTarget(sid) == nil { 304 cos.Assertf(false, "FATAL: target %q is not in: %s", sid, m.pp()) 305 } 306 delete(m.Tmap, sid) 307 m.Version++ 308 } 309 310 func (m *smapX) delProxy(pid string) { 311 if m.GetProxy(pid) == nil { 312 cos.Assertf(false, "FATAL: proxy %q is not in: %s", pid, m.pp()) 313 } 314 delete(m.Pmap, pid) 315 m.Version++ 316 } 317 318 func (m *smapX) putNode(nsi *meta.Snode, flags cos.BitFlags, silent bool) { 319 var ( 320 id = nsi.ID() 321 old *meta.Snode 322 ) 323 nsi.Flags = flags 324 if nsi.IsProxy() { 325 if old = m.GetProxy(id); old != nil { 326 m.delProxy(id) 327 } 328 m.addProxy(nsi) 329 if flags.IsSet(meta.SnodeNonElectable) { 330 nlog.Warningln(nsi.String(), "won't be electable") 331 } 332 } else { 333 debug.Assert(nsi.IsTarget()) 334 if old = m.GetTarget(id); old != nil { // ditto 335 m.delTarget(id) 336 } 337 m.addTarget(nsi) 338 } 339 if old != nil { 340 nlog.Warningln("same ID", old.StringEx(), "vs (joining)", nsi.StringEx(), "->", m.StringEx()) 341 } else if !silent { 342 nlog.Infoln("joined", nsi.String(), "->", m.StringEx()) 343 } 344 } 345 346 func (m *smapX) clone() *smapX { 347 dst := &smapX{} 348 cos.CopyStruct(dst, m) 349 debug.Assert(dst.vstr == m.vstr) 350 dst.init(m.CountTargets(), m.CountProxies()) 351 for id, v := range m.Tmap { 352 dst.Tmap[id] = v.Clone() 353 } 354 for id, v := range m.Pmap { 355 dst.Pmap[id] = v.Clone() 356 } 357 dst.Primary = dst.GetProxy(m.Primary.ID()) 358 dst._sgl = nil 359 return dst 360 } 361 362 func (m *smapX) merge(dst *smapX, override bool) (added int, err error) { 363 for id, si := range m.Tmap { 364 err = dst.handleDuplicateNode(si, override) 365 if err != nil { 366 return 367 } 368 if _, ok := dst.Tmap[id]; !ok { 369 if _, ok = dst.Pmap[id]; !ok { 370 dst.Tmap[id] = si 371 added++ 372 } 373 } 374 } 375 for id, si := range m.Pmap { 376 err = dst.handleDuplicateNode(si, override) 377 if err != nil { 378 return 379 } 380 if _, ok := dst.Pmap[id]; !ok { 381 if _, ok = dst.Tmap[id]; !ok { 382 dst.Pmap[id] = si 383 added++ 384 } 385 } 386 } 387 if m.UUID != "" && dst.UUID == "" { 388 dst.UUID = m.UUID 389 dst.CreationTime = m.CreationTime 390 } 391 return 392 } 393 394 // detect duplicate URL and/or IP 395 // if `del` is true delete the old one so that the caller can update Snode 396 func (m *smapX) handleDuplicateNode(nsi *meta.Snode, del bool) (err error) { 397 var osi *meta.Snode 398 if osi, err = m.IsDupNet(nsi); err == nil { 399 return 400 } 401 nlog.Errorln(err) 402 if !del { 403 return 404 } 405 // TODO: more diligence in determining old-ness 406 nlog.Errorf("%v: removing old (?) %s from the current %s and future Smaps", err, osi, m) 407 err = nil 408 if osi.IsProxy() { 409 m.delProxy(osi.ID()) 410 } else { 411 m.delTarget(osi.ID()) 412 } 413 return 414 } 415 416 func (m *smapX) validateUUID(si *meta.Snode, newSmap *smapX, caller string, cieNum int) (err error) { 417 if m == nil || newSmap == nil || newSmap.Version == 0 { 418 return 419 } 420 if !cos.IsValidUUID(m.UUID) || !cos.IsValidUUID(newSmap.UUID) { 421 return 422 } 423 if m.UUID == newSmap.UUID { 424 return 425 } 426 // cluster integrity error (cie) 427 if caller == "" { 428 caller = "???" 429 } 430 s := fmt.Sprintf("%s: Smaps have different UUIDs: local [%s, %s] vs from [%s, %s]", 431 ciError(cieNum), si, m.StringEx(), caller, newSmap.StringEx()) 432 err = &errSmapUUIDDiffer{s} 433 return 434 } 435 436 func (m *smapX) pp() string { 437 s, _ := jsoniter.MarshalIndent(m, "", " ") 438 return string(s) 439 } 440 441 func (m *smapX) _applyFlags(si *meta.Snode, newFlags cos.BitFlags) { 442 si.Flags = newFlags 443 if si.IsTarget() { 444 m.Tmap[si.ID()] = si 445 } else { 446 m.Pmap[si.ID()] = si 447 } 448 m.Version++ 449 } 450 451 // Must be called under lock 452 func (m *smapX) setNodeFlags(sid string, flags cos.BitFlags) { 453 si := m.GetNode(sid) 454 newFlags := si.Flags.Set(flags) 455 if flags.IsAnySet(meta.SnodeMaintDecomm) { 456 newFlags = newFlags.Clear(meta.SnodeIC) 457 } 458 m._applyFlags(si, newFlags) 459 } 460 461 // Must be called under lock 462 func (m *smapX) clearNodeFlags(id string, flags cos.BitFlags) { 463 si := m.GetNode(id) 464 m._applyFlags(si, si.Flags.Clear(flags)) 465 } 466 467 func (m *smapX) mergeFlags(from *smapX) (clone *smapX) { 468 all := []meta.NodeMap{from.Tmap, from.Pmap} 469 for _, mm := range all { 470 for _, osi := range mm { 471 nsi := m.GetNode(osi.ID()) 472 if nsi == nil { 473 continue 474 } 475 if osi.Flags == nsi.Flags { 476 continue 477 } 478 if clone == nil { 479 clone = m.clone() 480 } 481 nsi = clone.GetNode(osi.ID()) 482 nsi.Flags = osi.Flags 483 if nsi.IsTarget() { 484 clone.Tmap[nsi.ID()] = nsi 485 } else { 486 clone.Pmap[nsi.ID()] = nsi 487 } 488 } 489 } 490 return clone 491 } 492 493 /////////////// 494 // smapOwner // 495 /////////////// 496 497 func newSmapOwner(config *cmn.Config) *smapOwner { 498 return &smapOwner{ 499 sls: newSmapListeners(), 500 fpath: filepath.Join(config.ConfigDir, fname.Smap), 501 } 502 } 503 504 func (r *smapOwner) load(smap *smapX) (loaded bool, err error) { 505 _, err = jsp.LoadMeta(r.fpath, smap) 506 if err != nil { 507 if os.IsNotExist(err) { 508 return false, nil 509 } 510 return false, err 511 } 512 if smap.version() == 0 || !smap.isValid() { 513 return false, fmt.Errorf("unexpected: persistent %s is invalid", smap) 514 } 515 return true, nil 516 } 517 518 func (r *smapOwner) Get() *meta.Smap { return &r.get().Smap } 519 func (r *smapOwner) Listeners() meta.SmapListeners { return r.sls } 520 521 // 522 // private 523 // 524 525 func (r *smapOwner) put(smap *smapX) { 526 smap.InitDigests() 527 smap.vstr = strconv.FormatInt(smap.Version, 10) 528 r.smap.Store(smap) 529 r.sls.notify(smap.version()) 530 } 531 532 func (r *smapOwner) get() (smap *smapX) { return r.smap.Load() } 533 534 func (r *smapOwner) synchronize(si *meta.Snode, newSmap *smapX, payload msPayload, cb smapUpdatedCB) (err error) { 535 if err = newSmap.validate(); err != nil { 536 debug.Assertf(false, "%s: %s is invalid: %v", si, newSmap, err) 537 return 538 } 539 540 var ( 541 ofl, nfl cos.BitFlags 542 ofs, nfs string 543 ) 544 r.mu.Lock() 545 smap := r.get() 546 if nsi := newSmap.GetNode(si.ID()); nsi != nil && si.Flags != nsi.Flags { 547 ofl, nfl = si.Flags, nsi.Flags 548 ofs, nfs = si.Fl2S(), nsi.Fl2S() 549 si.Flags = nsi.Flags 550 } 551 if smap != nil { 552 curVer, newVer := smap.Version, newSmap.version() 553 if newVer <= curVer { 554 if newVer < curVer { 555 // NOTE: considered benign in most cases 556 err = newErrDowngrade(si, smap.String(), newSmap.String()) 557 } 558 r.mu.Unlock() 559 return 560 } 561 } 562 if !r.persistBytes(payload) { 563 err = r.persist(newSmap) 564 } 565 if err == nil { 566 r.put(newSmap) 567 } 568 r.mu.Unlock() 569 570 if err == nil { 571 if ofl != nfl { 572 nlog.Infof("%s flags: from %s to %s", si, ofs, nfs) 573 } 574 cb(newSmap, smap, nfl, ofl) 575 } 576 return 577 } 578 579 // write metasync-sent bytes directly (no json) 580 func (r *smapOwner) persistBytes(payload msPayload) (done bool) { 581 if payload == nil { 582 return 583 } 584 smapValue := payload[revsSmapTag] 585 if smapValue == nil { 586 return 587 } 588 var ( 589 smap *meta.Smap 590 wto = cos.NewBuffer(smapValue) 591 err = jsp.SaveMeta(r.fpath, smap, wto) 592 ) 593 done = err == nil 594 return 595 } 596 597 // Must be called under lock 598 func (r *smapOwner) persist(newSmap *smapX) error { 599 sgl := newSmap._sgl 600 if sgl == nil { 601 sgl = newSmap._encode(r.immSize) 602 r.immSize = max(r.immSize, sgl.Len()) 603 defer sgl.Free() 604 } 605 return jsp.SaveMeta(r.fpath, newSmap, sgl) 606 } 607 608 // executes under lock 609 func (r *smapOwner) prepost(ctx *smapModifier) (clone *smapX, err error) { 610 ctx.smap = r.get() 611 clone = ctx.smap.clone() 612 if err = ctx.pre(ctx, clone); err != nil { 613 return 614 } 615 clone._sgl = clone._encode(r.immSize) 616 r.immSize = max(r.immSize, clone._sgl.Len()) 617 if err := r.persist(clone); err != nil { 618 clone._free() 619 return nil, cmn.NewErrFailedTo(nil, "persist", clone, err) 620 } 621 ctx.nver = clone.version() 622 if ctx.final == nil { 623 clone._free() 624 } 625 r.put(clone) 626 if ctx.post != nil { 627 ctx.post(ctx, clone) 628 } 629 return 630 } 631 632 func (r *smapOwner) modify(ctx *smapModifier) error { 633 r.mu.Lock() 634 clone, err := r.prepost(ctx) 635 r.mu.Unlock() 636 if err != nil { 637 return err 638 } 639 if ctx.final != nil { 640 ctx.final(ctx, clone) 641 } 642 return nil 643 } 644 645 ///////// 646 // sls // 647 ///////// 648 649 func newSmapListeners() *sls { 650 sls := &sls{ 651 listeners: make(map[string]meta.Slistener, 16), 652 postCh: make(chan int64, 32), 653 } 654 return sls 655 } 656 657 func (sls *sls) run() { 658 // drain 659 for len(sls.postCh) > 0 { 660 <-sls.postCh 661 } 662 sls.wg.Done() 663 sls.running.Store(true) 664 for ver := range sls.postCh { 665 if ver == -1 { 666 break 667 } 668 sls.mu.RLock() 669 for _, l := range sls.listeners { 670 // NOTE: Reg() or Unreg() from inside ListenSmapChanged() callback 671 // may cause a trivial deadlock 672 l.ListenSmapChanged() 673 } 674 sls.mu.RUnlock() 675 } 676 // drain 677 for len(sls.postCh) > 0 { 678 <-sls.postCh 679 } 680 } 681 682 func (sls *sls) Reg(sl meta.Slistener) { 683 cos.Assert(sl.String() != "") 684 sls.mu.Lock() 685 _, ok := sls.listeners[sl.String()] 686 debug.Assert(!ok) 687 sls.listeners[sl.String()] = sl 688 if len(sls.listeners) == 1 { 689 sls.wg.Add(1) 690 go sls.run() 691 sls.wg.Wait() 692 } 693 sls.mu.Unlock() 694 } 695 696 func (sls *sls) Unreg(sl meta.Slistener) { 697 sls.mu.Lock() 698 _, ok := sls.listeners[sl.String()] 699 cos.Assert(ok) 700 delete(sls.listeners, sl.String()) 701 if len(sls.listeners) == 0 { 702 sls.running.Store(false) 703 sls.postCh <- -1 704 } 705 sls.mu.Unlock() 706 } 707 708 func (sls *sls) notify(ver int64) { 709 debug.Assert(ver >= 0) 710 if !sls.running.Load() { 711 return 712 } 713 sls.postCh <- ver 714 if len(sls.postCh) == cap(sls.postCh) { 715 nlog.ErrorDepth(1, "sls channel full: Smap v", ver) // unlikely 716 } 717 }