github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/earlystart.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"net/url"
	"runtime"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/api/env"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cifl"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
)

const maxVerConfirmations = 3 // NOTE: minimum number of max-ver confirmations required to make the decision

const (
	metaction1 = "early-start-have-registrations"
	metaction2 = "primary-started-up"
	metaction3 = "primary-startup-resume-rebalance"
)

const (
	fmtErrNetInfoChanged = "%s: net-info changed upon restart (on K8s?) - excluding self from the broadcast (%q, %q)"
)

type (
	bmds  map[*meta.Snode]*bucketMD
	smaps map[*meta.Snode]*smapX

	// sourced from: (env, config, smap)
	prim struct {
		url    string
		isSmap bool // <-- loaded Smap
		isCfg  bool // <-- config.proxy.primary_url
		isEP   bool // <-- env AIS_PRIMARY_EP
	}
)

// Background:
// - Each proxy/gateway stores a local copy of the cluster map (Smap)
// - Each Smap instance is versioned; the versioning is monotonic (increasing)
// - Only the primary (leader) proxy distributes Smap updates to all other clustered nodes
// - Bootstrap sequence includes /steps/ intended to resolve all the usual conflicts that may arise.
func (p *proxy) bootstrap() {
	// 1: load a local copy and try to utilize it for discovery
	var (
		smap, reliable = p.loadSmap()
		isSelf         string
	)
	if !reliable {
		smap = nil
		nlog.Infoln(p.String() + ": starting without Smap")
	} else {
		if smap.Primary.ID() == p.SID() {
			isSelf = ", where primary is self"
		}
		nlog.Infoln(p.String()+": loaded", smap.StringEx()+isSelf)
	}

	// 2: make preliminary _primary_ decision
	config := cmn.GCO.Get()
	prim := p.determineRole(smap, config)

	// 3: start as primary
	forcePrimaryChange := prim.isCfg || prim.isEP
	if prim.isSmap || forcePrimaryChange {
		if prim.isSmap {
			nlog.Infof("%s: assuming primary role _for now_ %+v", p, prim)
		} else if prim.isEP && isSelf != "" {
			nlog.Infof("%s: assuming primary role (and note that env %s=%s is redundant)", p, env.AIS.PrimaryEP, daemon.EP)
		} else {
			nlog.Infof("%s: assuming primary role as per: %+v", p, prim)
		}
		go p.primaryStartup(smap, config, daemon.cli.primary.ntargets, prim)
		return
	}

	// 4: otherwise, join as non-primary
	nlog.Infoln(p.String() + ": starting up as non-primary")
	err := p.secondaryStartup(smap, prim.url)
	if err != nil {
		if reliable {
			cm := p.uncoverMeta(smap)
			if cm.Smap != nil && cm.Smap.Primary != nil {
				nlog.Infoln(p.String()+": second attempt - joining via", cm.Smap.String())
				err = p.secondaryStartup(cm.Smap)
			}
		}
	}
	if err != nil {
		cos.ExitLog(p.String(), "(non-primary) failed to join:", err)
	}
}
// make the *primary* decision taking into account both the environment and the loaded Smap, if it exists
// (cases 1 through 3 below):
// 1: environment "AIS_PRIMARY_EP" takes precedence unconditionally (in that exact sequence);
// 2: next, loaded Smap (but it can be overridden by newer versions from other nodes);
// 3: finally, if none of the above applies, take into account cluster config (its "proxy" section).
// See also: "change-of-mind"
func (p *proxy) determineRole(smap *smapX /*loaded*/, config *cmn.Config) (prim prim) {
	switch {
	case daemon.EP != "":
		// 1. user overrides the local Smap (if exists) via env-set primary URL
		prim.isEP = daemon.EP == p.si.URL(cmn.NetIntraControl) || daemon.EP == p.si.URL(cmn.NetPublic)
		if !prim.isEP {
			prim.isEP = p.si.HasURL(daemon.EP)
		}
		if prim.isEP {
			daemon.EP = ""
		} else {
			prim.url = daemon.EP
		}
	case smap != nil:
		// 2. relying on the local copy of Smap (double-checking its version though)
		prim.isSmap = smap.isPrimary(p.si)
		if prim.isSmap {
			cii, cnt := p.bcastHealth(smap, true /*checkAll*/)
			if cii != nil && cii.Smap.Version > smap.version() {
				if cii.Smap.Primary.ID != p.SID() || cnt < maxVerConfirmations {
					nlog.Warningf("%s: cannot assume the primary role: local %s < v%d(%s, cnt=%d)",
						p.si, smap, cii.Smap.Version, cii.Smap.Primary.ID, cnt)
					prim.isSmap = false
					prim.url = cii.Smap.Primary.PubURL
				} else {
					nlog.Warningf("%s: proceeding as primary even though local %s < v%d(%s, cnt=%d)",
						p.si, smap, cii.Smap.Version, cii.Smap.Primary.ID, cnt)
				}
			}
		}
	default:
		// 3. initial deployment
		prim.isCfg = config.Proxy.PrimaryURL == p.si.URL(cmn.NetIntraControl) ||
			config.Proxy.PrimaryURL == p.si.URL(cmn.NetPublic)
		if !prim.isCfg {
			prim.isCfg = p.si.HasURL(config.Proxy.PrimaryURL)
		}
	}

	return
}
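// Illustration (a hypothetical, deployment-side invocation - not part of this package): given
// the precedence above, an operator can pin the primary by exporting AIS_PRIMARY_EP before
// starting the proxy, e.g.:
//
//	AIS_PRIMARY_EP="http://10.0.1.10:51080" aisnode -config=... -local_config=... -role=proxy
//
// For this node to assume the primary role the endpoint must match one of its own URLs
// (intra-control, public, or any other URL known via si.HasURL); otherwise the value becomes
// prim.url, i.e., the address at which to join the already-running cluster.
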
// join cluster
// (point of no return: starting up as non-primary; see also: "change-of-mind")
func (p *proxy) secondaryStartup(smap *smapX, primaryURLs ...string) error {
	if smap == nil {
		smap = newSmap()
	} else if smap.Primary.ID() == p.SID() {
		nlog.Infof("%s: zeroing-out primary=self in %s", p, smap)
		smap.Primary = nil
	}
	p.owner.smap.put(smap)
	if status, err := p.joinCluster(apc.ActSelfJoinProxy, primaryURLs...); err != nil {
		nlog.Errorf("%s failed to join cluster: %v(%d)", p, err, status)
		return err
	}

	p.markNodeStarted()
	go p.gojoin(cmn.GCO.Get())

	return nil
}

// Proxy/gateway that is, potentially, the leader of the cluster.
// It waits a configured time for other nodes to join,
// discovers cluster-wide metadata, and resolves remaining conflicts.
func (p *proxy) primaryStartup(loadedSmap *smapX, config *cmn.Config, ntargets int, prim prim) {
	var (
		smap          = newSmap()
		uuid, created string
		haveJoins     bool
	)
	// 1: init Smap to accept reg-s
	p.owner.smap.mu.Lock()
	si := p.si.Clone()
	smap.Primary = si
	smap.addProxy(si)
	if loadedSmap != nil {
		smap.UUID = loadedSmap.UUID
		smap.Version = loadedSmap.Version
	}
	p.owner.smap.put(smap)
	p.owner.smap.mu.Unlock()

	p.markNodeStarted()

	if !daemon.cli.primary.skipStartup {
		maxVerSmap := p.acceptRegistrations(smap, loadedSmap, config, ntargets)
		if maxVerSmap != nil {
			if _, err := maxVerSmap.IsDupNet(p.si); err != nil {
				cos.ExitLogf("%s: %v", cmn.BadSmapPrefix, err)
			}
			maxVerSmap.Pmap[p.SID()] = p.si
			p.owner.smap.put(maxVerSmap)
			nlog.Infof("%s: change-of-mind #1: joining via %s[P]", p.si, maxVerSmap.Primary.StringEx())
			if err := p.secondaryStartup(maxVerSmap); err != nil {
				cos.ExitLogf("%s: %v", cmn.BadSmapPrefix, err)
			}
			return
		}
	}

	smap = p.owner.smap.get()
	haveJoins = smap.CountTargets() > 0 || smap.CountProxies() > 1

	if loadedSmap != nil {
		smap = smap.mergeFlags(loadedSmap)
	}

	// 2: merging local => boot
	if haveJoins {
		var (
			before, after cluMeta
			added         int
		)
		p.owner.smap.mu.Lock()

		// NOTE: use regpool to try to upgrade all the four revs: Smap, BMD, RMD, and global Config
		before.Smap, before.BMD, before.RMD, before.EtlMD = smap, p.owner.bmd.get(), p.owner.rmd.get(), p.owner.etl.get()
		before.Config, _ = p.owner.config.get()

		forcePrimaryChange := prim.isCfg || prim.isEP
		smap = p.regpoolMaxVer(&before, &after, forcePrimaryChange)

		uuid, created = smap.UUID, smap.CreationTime

		p.owner.smap.put(smap)
		p.owner.smap.mu.Unlock()

		msg := p.newAmsgStr(metaction1, after.BMD)
		wg := p.metasyncer.sync(revsPair{smap, msg}, revsPair{after.BMD, msg})

		// before and after
		if loadedSmap != nil {
			nlog.Infoln(p.String(), "loaded", loadedSmap.StringEx(), "merged", before.Smap.StringEx(), "added", added)
		}

		nlog.Infof("before: %s, %s, %s, %s", before.BMD.StringEx(), before.RMD, before.Config, before.EtlMD)
		nlog.Infof("after: %s, %s, %s, %s", after.BMD.StringEx(), after.RMD, after.Config, after.EtlMD)
		nlog.Infoln("after: ", smap.StringEx())

		wg.Wait()
	} else {
		nlog.Infoln(p.String() + ": no registrations yet")
		if loadedSmap != nil {
			nlog.Infoln(p.String()+": keep going w/ local", loadedSmap.StringEx())
			p.owner.smap.mu.Lock()
			smap = loadedSmap
			p.owner.smap.put(smap)
			p.owner.smap.mu.Unlock()
		}
	}
	// 3: discover cluster meta and resolve remaining conflicts, if any
	p.discoverMeta(smap)

	// 4: still primary?
	p.owner.smap.mu.Lock()
	smap = p.owner.smap.get()
	if !smap.isPrimary(p.si) {
		p.owner.smap.mu.Unlock()
		nlog.Infoln(p.String()+": registering with primary", smap.Primary.StringEx())
		if err := p.secondaryStartup(smap); err != nil {
			cos.ExitLog(err)
		}
		return
	}

	// 5: persist and finalize w/ sync + BMD
	if smap.UUID == "" {
		if !daemon.cli.primary.skipStartup && smap.CountTargets() == 0 {
			cos.ExitLog(p.String(), "cannot create cluster with no targets,", smap.StringEx())
		}
		clone := smap.clone()
		if uuid == "" {
			clone.UUID, clone.CreationTime = newClusterUUID()
		} else {
			clone.UUID, clone.CreationTime = uuid, created
		}
		clone.Version++
		p.owner.smap.put(clone)
		smap = clone
	}

	// 5.5: try to start with a fully staffed IC
	if count := smap.ICCount(); count < meta.DfltCountIC {
		clone := smap.clone()
		nc := clone.staffIC()
		if count != nc {
			clone.Version++
			smap = clone
			p.owner.smap.put(smap)
		}
	}
	if err := p.owner.smap.persist(smap); err != nil {
		cos.ExitLog(p.String(), "(primary):", err)
	}
	p.owner.smap.mu.Unlock()

	// 6. initialize BMD
	bmd := p.owner.bmd.get().clone()
	if bmd.Version == 0 {
		bmd.Version = 1 // init BMD
		bmd.UUID = smap.UUID
		if err := p.owner.bmd.putPersist(bmd, nil); err != nil {
			cos.ExitLog(err)
		}
	}

	// 7. mark RMD as starting up to prevent joining targets from triggering rebalance
	ok := p.owner.rmd.starting.CAS(false, true)
	debug.Assert(ok)

	// 8. initialize etl
	etlMD := p.owner.etl.get().clone()
	if etlMD.Version > 0 {
		if err := p.owner.etl.putPersist(etlMD, nil); err != nil {
			nlog.Errorf("%s: failed to persist etl metadata, err %v - proceeding anyway...", p, err)
		}
	}

	// 9. cluster config: load existing _or_ initialize brand new v1
	cluConfig, err := p._cluConfig(smap)
	if err != nil {
		cos.ExitLog(err)
	}

	// 10. metasync (smap, config, etl & bmd) and startup as primary
	smap = p.owner.smap.get()
	var (
		aisMsg = p.newAmsgStr(metaction2, bmd)
		pairs  = []revsPair{{smap, aisMsg}, {bmd, aisMsg}, {cluConfig, aisMsg}}
	)
	wg := p.metasyncer.sync(pairs...)
	wg.Wait()
	p.markClusterStarted()
	nlog.Infoln(p.String(), "primary: cluster started up")
	nlog.Infoln(smap.StringEx()+",", bmd.StringEx())

	if etlMD.Version > 0 {
		_ = p.metasyncer.sync(revsPair{etlMD, aisMsg})
	}

	// 11. clear regpool
	p.reg.mu.Lock()
	p.reg.pool = p.reg.pool[:0]
	p.reg.pool = nil
	p.reg.mu.Unlock()

	// 12. resume rebalance if needed
	if config.Rebalance.Enabled {
		p.resumeReb(smap, config)
	}
	p.owner.rmd.starting.Store(false)
}
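// A note on the version arithmetic used at startup (an observation from the code below, not
// an authoritative spec): regpoolMaxVer bumps the merged Smap version by 50, and resumeReb
// bumps the RMD version by 100 - large, deliberate jumps that presumably guarantee any stale
// copies still cached by (re)joining nodes compare strictly older than the primary's.
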
func (p *proxy) _cluConfig(smap *smapX) (config *globalConfig, err error) {
	var orig, disc string
	if config, err = p.owner.config.get(); err != nil {
		return nil, err
	}
	if config != nil && config.version() > 0 {
		orig, disc = smap.configURLsIC(config.Proxy.OriginalURL, config.Proxy.DiscoveryURL)
		if orig == config.Proxy.OriginalURL && disc == config.Proxy.DiscoveryURL {
			// no changes, good to go
			return config, nil
		}
		if orig == "" && disc == "" {
			// likely no IC members yet - nothing to do
			return config, nil
		}
	}

	// update _or_ create version 1; set config (primary, original, discovery) URLs
	// NOTE: using cmn.NetIntraControl network for all three
	config, err = p.owner.config.modify(&configModifier{
		pre: func(_ *configModifier, clone *globalConfig) (bool /*updated*/, error) {
			clone.Proxy.PrimaryURL = p.si.URL(cmn.NetIntraControl)
			if orig != "" {
				clone.Proxy.OriginalURL = orig
			}
			if disc != "" {
				clone.Proxy.DiscoveryURL = disc
			}
			clone.UUID = smap.UUID
			return true, nil
		},
	})

	return config, err
}

// [cluster startup]: resume rebalance if `interrupted`
func (p *proxy) resumeReb(smap *smapX, config *cmn.Config) {
	debug.AssertNoErr(smap.validate())
	ver := smap.version()

	// initial quiet time
	nojoins := config.Timeout.MaxKeepalive.D()
	if p.owner.rmd.interrupted.Load() {
		nojoins = config.Timeout.MaxHostBusy.D()
	}
	sleep := cos.ProbingFrequency(nojoins)
until:
	// until (last-Smap-update + nojoins)
	for elapsed := time.Duration(0); elapsed < nojoins; {
		time.Sleep(sleep)
		elapsed += sleep
		smap = p.owner.smap.get()
		if !smap.IsPrimary(p.si) {
			debug.AssertNoErr(newErrNotPrimary(p.si, smap))
			return
		}
		if smap.version() != ver {
			debug.Assert(smap.version() > ver)
			elapsed = 0
			nojoins = min(nojoins+sleep, config.Timeout.Startup.D())
			if p.owner.rmd.interrupted.Load() {
				nojoins = max(nojoins+sleep, config.Timeout.MaxHostBusy.D())
			}
			ver = smap.version()
		}
	}

	if smap.CountTargets() < 2 && p.owner.smap.get().CountTargets() < 2 {
		// nothing to do even if interrupted
		return
	}

	// NOTE: continue under lock to serialize concurrent node joins (`httpclupost`), if any

	p.owner.smap.mu.Lock()
	if !p.owner.rmd.interrupted.CAS(true, false) {
		p.owner.smap.mu.Unlock() // nothing to do
		return
	}
	smap = p.owner.smap.get()
	if smap.version() != ver {
		p.owner.smap.mu.Unlock()
		goto until // repeat
	}

	// do
	var (
		msg    = &apc.ActMsg{Action: apc.ActRebalance, Value: metaction3}
		aisMsg = p.newAmsg(msg, nil)
		ctx    = &rmdModifier{
			pre:     func(_ *rmdModifier, clone *rebMD) { clone.Version += 100 },
			smapCtx: &smapModifier{smap: smap},
			cluID:   smap.UUID,
		}
	)
	rmd, err := p.owner.rmd.modify(ctx)
	if err != nil {
		cos.ExitLog(err)
	}
	wg := p.metasyncer.sync(revsPair{rmd, aisMsg})

	p.owner.rmd.starting.Store(false) // done
	p.owner.smap.mu.Unlock()

	wg.Wait()
	nlog.Errorln("Warning: resumed global rebalance", ctx.rebID, smap.StringEx(), rmd.String())
}
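// The quiet period implemented above (a reading of the code, not an authoritative spec):
// resumeReb starts with nojoins = Timeout.MaxKeepalive (or Timeout.MaxHostBusy when the
// previous rebalance was interrupted), restarts the countdown whenever the Smap version
// changes, and gradually widens the window - capped by Timeout.Startup in the common case.
// Net effect: the new primary lets node joins settle before deciding whether to resume a
// previously interrupted rebalance.
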
// maxVerSmap != nil iff there's a primary change _and_ the cluster has moved on
func (p *proxy) acceptRegistrations(smap, loadedSmap *smapX, config *cmn.Config, ntargets int) (maxVerSmap *smapX) {
	const quiescentIter = 4 // Number of iterations to consider the cluster quiescent.
	var (
		deadlineTime         = config.Timeout.Startup.D()
		checkClusterInterval = deadlineTime / quiescentIter
		sleepDuration        = checkClusterInterval / 5

		definedTargetCnt = ntargets > 0
		doClusterCheck   = loadedSmap != nil && loadedSmap.CountTargets() != 0
	)
	for wait, iter := time.Duration(0), 0; wait < deadlineTime && iter < quiescentIter; wait += sleepDuration {
		time.Sleep(sleepDuration)
		// Check the cluster Smap at most once.
		if doClusterCheck && wait >= checkClusterInterval {
			if bcastSmap := p.bcastMaxVerBestEffort(loadedSmap); bcastSmap != nil {
				maxVerSmap = bcastSmap
				return
			}
			doClusterCheck = false
		}

		prevTargetCnt := smap.CountTargets()
		smap = p.owner.smap.get()
		if !smap.isPrimary(p.si) {
			break
		}
		targetCnt := smap.CountTargets()
		if targetCnt > prevTargetCnt || (definedTargetCnt && targetCnt < ntargets) {
			// Reset the counter if there are new targets, or if we are still waiting
			// for the expected number of targets to join.
			iter = 0
		} else {
			iter++
		}
	}

	targetCnt := p.owner.smap.get().CountTargets()

	// log
	s1 := "target" + cos.Plural(targetCnt)
	if definedTargetCnt {
		switch {
		case targetCnt == ntargets:
			nlog.Infoln(p.String(), "reached the expected membership of", ntargets, s1)
		case targetCnt > ntargets:
			nlog.Infoln(p.String(), "joined", targetCnt, s1, "( greater than expected", ntargets, ")")
		default:
			s2 := fmt.Sprintf("%s timed out waiting for %d target%s:", p, ntargets, cos.Plural(ntargets))
			if targetCnt > 0 {
				nlog.Warningln(s2, "joined", targetCnt, "so far")
			} else {
				nlog.Warningln(s2, "joined none so far")
			}
		}
	} else {
		nlog.Infoln(p.String(), "joined", targetCnt, s1)
	}
	return
}
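// Timing illustration (derived from the constants above, assuming a hypothetical
// config.Timeout.Startup = 60s):
//
//	deadlineTime         = 60s
//	checkClusterInterval = 15s   // deadlineTime / quiescentIter
//	sleepDuration        = 3s    // checkClusterInterval / 5
//
// i.e., the loop wakes up every 3s, consults the previously known cluster (if any) at most
// once after ~15s, and declares quiescence after 4 consecutive iterations with no new targets.
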
// the final major step in the primary startup sequence:
// discover cluster-wide metadata and resolve remaining conflicts
func (p *proxy) discoverMeta(smap *smapX) {
	// NOTE [ref0417]:
	// in addition, consider returning meta.NodeMap (all responded snodes)
	// and using them
	cm := p.uncoverMeta(smap)

	if cm.BMD != nil {
		p.owner.bmd.Lock()
		bmd := p.owner.bmd.get()
		if bmd == nil || bmd.version() < cm.BMD.version() {
			nlog.Infoln(p.String()+": override local", bmd.String(), "with", cm.BMD.String())
			if err := p.owner.bmd.putPersist(cm.BMD, nil); err != nil {
				cos.ExitLog(err)
			}
		}
		p.owner.bmd.Unlock()
	}
	if cm.RMD != nil {
		p.owner.rmd.Lock()
		rmd := p.owner.rmd.get()
		if rmd == nil || rmd.version() < cm.RMD.version() {
			nlog.Infoln(p.String()+": override local", rmd.String(), "with", cm.RMD.String())
			p.owner.rmd.put(cm.RMD)
		}
		p.owner.rmd.Unlock()
	}

	if cm.Config != nil && cm.Config.UUID != "" {
		p.owner.config.Lock()
		config := cmn.GCO.Get()
		if config.Version < cm.Config.version() {
			if !cos.IsValidUUID(cm.Config.UUID) {
				debug.Assert(false, cm.Config.String())
				cos.ExitLogf("%s: invalid config UUID: %s", p, cm.Config)
			}
			if cos.IsValidUUID(config.UUID) && config.UUID != cm.Config.UUID {
				nlog.Errorf("Warning: configs have different UUIDs: (%s, %s) vs %s - proceeding anyway",
					p, config, cm.Config)
			} else {
				nlog.Infoln(p.String(), "override local", config.String(), "with", cm.Config.String())
			}
			cmn.GCO.Update(&cm.Config.ClusterConfig)
		}
		p.owner.config.Unlock()
	}

	if cm.Smap == nil || cm.Smap.version() == 0 {
		nlog.Infoln(p.String() + ": no max-ver Smaps")
		return
	}
	nlog.Infoln(p.String(), "local", smap.StringEx(), "max-ver", cm.Smap.StringEx())
	smapUUID, sameUUID, sameVersion, eq := smap.Compare(&cm.Smap.Smap)
	if !sameUUID {
		// FATAL: cluster integrity error (cie)
		cos.ExitLogf("%s: split-brain uuid [%s %s] vs %s", ciError(10), p, smap.StringEx(), cm.Smap.StringEx())
	}
	if eq && sameVersion {
		return
	}
	if cm.Smap.Primary != nil && cm.Smap.Primary.ID() != p.SID() {
		if cm.Smap.version() > smap.version() {
			if dupNode, err := cm.Smap.IsDupNet(p.si); err != nil {
				if !cm.Smap.IsPrimary(dupNode) {
					cos.ExitLog(err)
				}
				// If the primary in the max-ver Smap and the current node differ only by `DaemonID`,
				// overwrite the proxy entry with the current `Snode` and proceed to merging Smap.
				// TODO: add validation to ensure `dupNode` and `p.si` differ only in `DaemonID`.
				cm.Smap.Primary = p.si
				cm.Smap.delProxy(dupNode.ID())
				cm.Smap.Pmap[p.SID()] = p.si
				goto merge
			}
			nlog.Infof("%s: change-of-mind #2 %s <= max-ver %s", p, smap.StringEx(), cm.Smap.StringEx())
			cm.Smap.Pmap[p.SID()] = p.si
			p.owner.smap.put(cm.Smap)
			return
		}
		// FATAL: cluster integrity error (cie)
		cos.ExitLogf("%s: split-brain local [%s %s] vs %s", ciError(20), p, smap.StringEx(), cm.Smap.StringEx())
	}
merge:
	p.owner.smap.mu.Lock()
	clone := p.owner.smap.get().clone()
	if !eq {
		nlog.Infof("%s: merge local %s <== %s", p, clone, cm.Smap)
		_, err := cm.Smap.merge(clone, false /*err if detected (IP, port) duplicates*/)
		if err != nil {
			cos.ExitLogf("%s: %v vs %s", p, err, cm.Smap.StringEx())
		}
	} else {
		clone.UUID = smapUUID
	}
	clone.Version = max(clone.version(), cm.Smap.version()) + 1
	p.owner.smap.put(clone)
	p.owner.smap.mu.Unlock()
	nlog.Infof("%s: merged %s", p, clone.pp())
}
func (p *proxy) uncoverMeta(bcastSmap *smapX) (cm cluMeta) {
	var (
		err         error
		suuid       string
		config      = cmn.GCO.Get()
		now         = time.Now()
		deadline    = now.Add(config.Timeout.Startup.D())
		l           = bcastSmap.Count()
		bmds        = make(bmds, l)
		smaps       = make(smaps, l)
		done, slowp bool
	)
	for {
		if nlog.Stopping() {
			cm.Smap = nil
			return
		}
		last := time.Now().After(deadline)
		cm, done, slowp = p.bcastMaxVer(bcastSmap, bmds, smaps)
		if done || last {
			break
		}
		time.Sleep(config.Timeout.CplaneOperation.D())
	}
	if !slowp {
		return
	}
	nlog.Infoln(p.String(), "(primary) slow path...")
	if cm.BMD, err = resolveUUIDBMD(bmds); err != nil {
		if _, split := err.(*errBmdUUIDSplit); split {
			cos.ExitLog(p.String(), "(primary), err:", err) // cluster integrity error
		}
		nlog.Errorln(err)
	}
	for si, smap := range smaps {
		if !si.IsTarget() {
			continue
		}
		if !cos.IsValidUUID(smap.UUID) {
			continue
		}
		if suuid == "" {
			suuid = smap.UUID
			if suuid != "" {
				nlog.Infof("%s: set Smap UUID = %s(%s)", p, si, suuid)
			}
		} else if suuid != smap.UUID {
			// FATAL: cluster integrity error (cie)
			cos.ExitLogf("%s: split-brain [%s %s] vs [%s %s]", ciError(30), p, suuid, si, smap.UUID)
		}
	}
	for _, smap := range smaps {
		if smap.UUID != suuid {
			continue
		}
		if cm.Smap == nil {
			cm.Smap = smap
		} else if cm.Smap.version() < smap.version() {
			cm.Smap = smap
		}
	}
	return
}
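// In brief: bcastMaxVer (below) queries all known nodes for their cluMeta and keeps, per
// metadata type, the max-version copy (the "fast path"). It sets `slowp` when responders
// report different BMD or Smap UUIDs, in which case uncoverMeta (above) falls back to
// resolving the BMD by UUID (resolveUUIDBMD) and to picking the max-version Smap only among
// copies that share the UUID reported by targets, exiting on a split-brain.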
func (p *proxy) bcastMaxVer(bcastSmap *smapX, bmds bmds, smaps smaps) (out cluMeta, done, slowp bool) {
	var (
		borigin, sorigin string
		args             = allocBcArgs()
	)
	args.req = cmn.HreqArgs{
		Path:  apc.URLPathDae.S,
		Query: url.Values{apc.QparamWhat: []string{apc.WhatSmapVote}},
	}
	args.smap = bcastSmap
	args.to = core.SelectedNodes

	args.nodes = make([]meta.NodeMap, 0, 2)
	if len(bcastSmap.Tmap) > 0 {
		args.nodes = append(args.nodes, bcastSmap.Tmap)
	}
	pmap := make(meta.NodeMap, len(bcastSmap.Pmap))
	ctrl := p.si.URL(cmn.NetIntraControl)
	for pid, si := range bcastSmap.Pmap {
		if pid == p.SID() {
			continue
		}
		if si.URL(cmn.NetIntraControl) == ctrl {
			nlog.Warningf(fmtErrNetInfoChanged, p, si.StringEx(), ctrl)
			continue
		}
		pmap[pid] = si
	}
	args.nodes = append(args.nodes, pmap)

	args.cresv = cresCM{} // -> cluMeta
	results := p.bcastGroup(args)
	freeBcArgs(args)
	done = true

	clear(bmds)
	clear(smaps)

	for _, res := range results {
		if res.err != nil {
			done = false
			continue
		}
		cm, ok := res.v.(*cluMeta)
		debug.Assert(ok)
		if cm.BMD != nil && cm.BMD.version() > 0 {
			if out.BMD == nil { // 1. init
				borigin, out.BMD = cm.BMD.UUID, cm.BMD
			} else if borigin != "" && borigin != cm.BMD.UUID { // 2. slow path
				slowp = true
			} else if !slowp && out.BMD.Version < cm.BMD.Version { // 3. fast path max(version)
				out.BMD = cm.BMD
				borigin = cm.BMD.UUID
			}
		}
		if cm.RMD != nil && cm.RMD.version() > 0 {
			if out.RMD == nil { // 1. init
				out.RMD = cm.RMD
			} else if !slowp && out.RMD.Version < cm.RMD.Version { // 3. fast path max(version)
				out.RMD = cm.RMD
			}
		}
		if cm.Config != nil && cm.Config.version() > 0 {
			if out.Config == nil { // 1. init
				out.Config = cm.Config
			} else if !slowp && out.Config.version() < cm.Config.version() { // 3. fast path max(version)
				out.Config = cm.Config
			}
		}

		// TODO: maxver of EtlMD

		if cm.Smap != nil && cm.Flags.IsSet(cifl.VoteInProgress) {
			var s string
			if cm.Smap.Primary != nil {
				s = " of the current one " + cm.Smap.Primary.ID()
			}
			nlog.Warningln(p.String(), "starting up as primary(?) during reelection"+s)
			out.Smap, out.BMD, out.RMD = nil, nil, nil // zero-out as unusable
			done = false
			break
		}
		if cm.Smap != nil && cm.Smap.version() > 0 {
			if out.Smap == nil { // 1. init
				sorigin, out.Smap = cm.Smap.UUID, cm.Smap
			} else if sorigin != "" && sorigin != cm.Smap.UUID { // 2. slow path
				slowp = true
			} else if !slowp && out.Smap.Version < cm.Smap.Version { // 3. fast path max(version)
				out.Smap = cm.Smap
				sorigin = cm.Smap.UUID
			}
		}
		if bmds != nil && cm.BMD != nil && cm.BMD.version() > 0 {
			bmds[res.si] = cm.BMD
		}
		if smaps != nil && cm.Smap != nil && cm.Smap.version() > 0 {
			smaps[res.si] = cm.Smap
		}
	}
	freeBcastRes(results)
	return
}

func (p *proxy) bcastMaxVerBestEffort(smap *smapX) *smapX {
	cm, _, slowp := p.bcastMaxVer(smap, nil, nil)
	if cm.Smap != nil && !slowp {
		if cm.Smap.UUID == smap.UUID && cm.Smap.version() > smap.version() && cm.Smap.validate() == nil {
			if cm.Smap.Primary.ID() != p.SID() {
				nlog.Warningln(p.String(), "detected primary change, whereby local", smap.StringEx(),
					"is older than max-ver", cm.Smap.StringEx())
				return cm.Smap
			}
		}
	}
	return nil
}
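// In brief: regpoolMaxVer (below) starts from `before`, folds in the cluster metadata carried
// by the join requests queued in p.reg.pool, and returns the resulting max-version Smap (the
// remaining maxed-out revs land in `after`); mismatched UUIDs are fatal cluster-integrity
// errors. Unless a vote is in progress - and unless nothing changed and no primary change is
// being forced - it also reasserts this proxy as the primary and bumps the Smap version.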
func (p *proxy) regpoolMaxVer(before, after *cluMeta, forcePrimaryChange bool) (smap *smapX) {
	var (
		voteInProgress bool
		cloned         bool
	)
	*after = *before

	p.reg.mu.RLock()

	if len(p.reg.pool) == 0 {
		goto ret
	}
	for _, regReq := range p.reg.pool {
		nsi := regReq.SI
		if err := nsi.Validate(); err != nil {
			nlog.Errorln("Warning:", err)
			continue
		}
		voteInProgress = voteInProgress || regReq.Flags.IsSet(cifl.VoteInProgress)
		if regReq.Smap != nil && regReq.Smap.version() > 0 && cos.IsValidUUID(regReq.Smap.UUID) {
			if after.Smap != nil && after.Smap.version() > 0 {
				if cos.IsValidUUID(after.Smap.UUID) && after.Smap.UUID != regReq.Smap.UUID {
					cos.ExitLogf("%s: Smap UUIDs don't match: [%s %s] vs %s", ciError(10),
						p, after.Smap.StringEx(), regReq.Smap.StringEx())
				}
			}
			if after.Smap == nil || after.Smap.version() < regReq.Smap.version() {
				after.Smap = regReq.Smap
			}
		}
		if regReq.BMD != nil && regReq.BMD.version() > 0 && cos.IsValidUUID(regReq.BMD.UUID) {
			if after.BMD != nil && after.BMD.version() > 0 {
				if cos.IsValidUUID(after.BMD.UUID) && after.BMD.UUID != regReq.BMD.UUID {
					cos.ExitLogf("%s: BMD UUIDs don't match: [%s %s] vs %s", ciError(10),
						p.si, after.BMD.StringEx(), regReq.BMD.StringEx())
				}
			}
			if after.BMD == nil || after.BMD.version() < regReq.BMD.version() {
				after.BMD = regReq.BMD
			}
		}
		if regReq.RMD != nil && regReq.RMD.version() > 0 {
			if after.RMD == nil || after.RMD.version() < regReq.RMD.version() {
				after.RMD = regReq.RMD
			}
		}
		if regReq.Config != nil && regReq.Config.version() > 0 && cos.IsValidUUID(regReq.Config.UUID) {
			if after.Config != nil && after.Config.version() > 0 {
				if cos.IsValidUUID(after.Config.UUID) && after.Config.UUID != regReq.Config.UUID {
					cos.ExitLogf("%s: Global Config UUIDs don't match: [%s %s] vs %s", ciError(10),
						p.si, after.Config, regReq.Config)
				}
			}
			if after.Config == nil || after.Config.version() < regReq.Config.version() {
				after.Config = regReq.Config
			}
		}
	}
	if after.BMD != before.BMD {
		if err := p.owner.bmd.putPersist(after.BMD, nil); err != nil {
			cos.ExitLog(err)
		}
	}
	if after.RMD != before.RMD {
		p.owner.rmd.put(after.RMD)
	}
	if after.Config != before.Config {
		var err error
		after.Config, err = p.owner.config.modify(&configModifier{
			pre: func(_ *configModifier, clone *globalConfig) (bool, error) {
				*clone = *after.Config
				return true, nil
			},
		})
		if err != nil {
			cos.ExitLog(err)
		}
	}

ret:
	p.reg.mu.RUnlock()

	// not interfering with elections
	if voteInProgress {
		before.Smap.UUID, before.Smap.CreationTime = after.Smap.UUID, after.Smap.CreationTime
		nlog.Errorln("voting is in progress, continuing with potentially older", before.Smap.StringEx())
		return before.Smap
	}

	runtime.Gosched()

	// NOTE [ref0417]:
	// - always update joining nodes' net-infos;
	// - alternatively, narrow it down to only proxies (as targets always restart on the same K8s nodes)

	p.reg.mu.RLock()
	for _, regReq := range p.reg.pool {
		after.Smap, cloned = _updNetInfo(after.Smap, regReq.SI, cloned)
	}
	p.reg.mu.RUnlock()

	if after.Smap.version() == 0 || !cos.IsValidUUID(after.Smap.UUID) {
		after.Smap.UUID, after.Smap.CreationTime = newClusterUUID()
		nlog.Infoln(p.String(), "new cluster UUID:", after.Smap.UUID)
		return after.Smap
	}
	if before.Smap == after.Smap {
		if !forcePrimaryChange {
			return after.Smap
		}
	} else {
		debug.Assert(before.Smap.version() < after.Smap.version())
		nlog.Warningln("before:", before.Smap.StringEx(), "after:", after.Smap.StringEx())
	}

	if after.Smap.Primary.ID() != p.SID() {
		nlog.Warningln(p.String() + ": taking over as primary")
	}
	if !cloned {
		after.Smap = after.Smap.clone()
	}
	after.Smap.Primary = p.si
	after.Smap.Pmap[p.SID()] = p.si

	after.Smap.Version += 50

	config, errN := p.owner.config.modify(&configModifier{
		pre: func(_ *configModifier, clone *globalConfig) (bool, error) {
			clone.Proxy.PrimaryURL = p.si.URL(cmn.NetIntraControl)
			clone.Version++
			return true, nil
		},
	})
	if errN != nil {
		cos.ExitLog(errN)
	}
	after.Config = config
	return after.Smap
}

func _updNetInfo(smap *smapX, nsi *meta.Snode, cloned bool) (*smapX, bool) {
	if nsi.Validate() != nil {
		return smap, cloned
	}
	osi := smap.GetNode(nsi.ID())
	if osi == nil || osi.Type() != nsi.Type() {
		return smap, cloned
	}
	if err := osi.NetEq(nsi); err != nil {
		nlog.Warningln("Warning: renewing", err)
		if !cloned {
			smap = smap.clone()
			cloned = true
		}
		smap.putNode(nsi, osi.Flags, true /*silent*/)
	}
	return smap, cloned
}