github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/earlystart.go

// Package ais provides core functionality for the AIStore object storage.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package ais

import (
	"fmt"
	"net/url"
	"runtime"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/api/env"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cifl"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
)

const maxVerConfirmations = 3 // NOTE: minimum number of max-ver confirmations required to make the decision

const (
	metaction1 = "early-start-have-registrations"
	metaction2 = "primary-started-up"
	metaction3 = "primary-startup-resume-rebalance"
)

const (
	fmtErrNetInfoChanged = "%s: net-info changed upon restart (on K8s?) - excluding self from the broadcast (%q, %q)"
)

type (
	bmds  map[*meta.Snode]*bucketMD
	smaps map[*meta.Snode]*smapX

	// sourced from: (env, config, smap)
	prim struct {
		url    string
		isSmap bool // <-- loaded Smap
		isCfg  bool // <-- config.proxy.primary_url
		isEP   bool // <-- env AIS_PRIMARY_EP
	}
)
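
// A minimal sketch of how the three `prim` flags relate (assumed reading of
// determineRole below; not an exhaustive truth table):
//
//	AIS_PRIMARY_EP set and points at self  => isEP  = true (env wins unconditionally)
//	else, loaded Smap says primary == self => isSmap = true (subject to max-ver double-check)
//	else, config.proxy.primary_url == self => isCfg  = true (initial deployment)
//
// In the non-self cases `prim.url` carries the best-known primary URL to join through.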

// Background:
//   - Each proxy/gateway stores a local copy of the cluster map (Smap)
//   - Each Smap instance is versioned; the versioning is monotonic (increasing)
//   - Only the primary (leader) proxy distributes Smap updates to all other clustered nodes
//   - Bootstrap sequence includes /steps/ intended to resolve all the usual conflicts that may arise.
func (p *proxy) bootstrap() {
	// 1: load a local copy and try to utilize it for discovery
	var (
		smap, reliable = p.loadSmap()
		isSelf         string
	)
	if !reliable {
		smap = nil
		nlog.Infoln(p.String() + ": starting without Smap")
	} else {
		if smap.Primary.ID() == p.SID() {
			isSelf = ", where primary is self"
		}
		nlog.Infoln(p.String()+": loaded", smap.StringEx()+isSelf)
	}

	// 2: make preliminary _primary_ decision
	config := cmn.GCO.Get()
	prim := p.determineRole(smap, config)

	// 3: start as primary
	forcePrimaryChange := prim.isCfg || prim.isEP
	if prim.isSmap || forcePrimaryChange {
		if prim.isSmap {
			nlog.Infof("%s: assuming primary role _for now_ %+v", p, prim)
		} else if prim.isEP && isSelf != "" {
			nlog.Infof("%s: assuming primary role (and note that env %s=%s is redundant)", p, env.AIS.PrimaryEP, daemon.EP)
		} else {
			nlog.Infof("%s: assuming primary role as per: %+v", p, prim)
		}
		go p.primaryStartup(smap, config, daemon.cli.primary.ntargets, prim)
		return
	}

	// 4: otherwise, join as non-primary
	nlog.Infoln(p.String() + ": starting up as non-primary")
	err := p.secondaryStartup(smap, prim.url)
	if err != nil {
		if reliable {
			cm := p.uncoverMeta(smap)
			if cm.Smap != nil && cm.Smap.Primary != nil {
				nlog.Infoln(p.String()+": second attempt - joining via", cm.Smap.String())
				err = p.secondaryStartup(cm.Smap)
			}
		}
	}
	if err != nil {
		cos.ExitLog(p.String(), "(non-primary) failed to join:", err)
	}
}
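
// Usage sketch (hypothetical values): to pin the primary at deployment time,
// point AIS_PRIMARY_EP at the designated proxy's endpoint before starting it,
// e.g. (other command-line flags elided):
//
//	AIS_PRIMARY_EP=http://10.0.0.1:51080 aisnode -role=proxy ...
//
// Per determineRole below, the env setting takes precedence unconditionally:
// a proxy that matches the URL assumes the primary role; all others treat it
// as the primary to join through.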

// make the *primary* decision taking into account both the environment and the loaded Smap, if it exists
// (cases 1 through 3 below, in that exact sequence):
// 1: environment "AIS_PRIMARY_EP" takes precedence unconditionally;
// 2: next, loaded Smap (but it can be overridden by newer versions from other nodes);
// 3: finally, if none of the above applies, take into account cluster config (its "proxy" section).
// See also: "change-of-mind"
func (p *proxy) determineRole(smap *smapX /*loaded*/, config *cmn.Config) (prim prim) {
	switch {
	case daemon.EP != "":
		// 1. user overrides the local Smap (if any) via env-set primary URL
		prim.isEP = daemon.EP == p.si.URL(cmn.NetIntraControl) || daemon.EP == p.si.URL(cmn.NetPublic)
		if !prim.isEP {
			prim.isEP = p.si.HasURL(daemon.EP)
		}
		if prim.isEP {
			daemon.EP = ""
		} else {
			prim.url = daemon.EP
		}
	case smap != nil:
		// 2. relying on local copy of Smap (double-checking its version though)
		prim.isSmap = smap.isPrimary(p.si)
		if prim.isSmap {
			cii, cnt := p.bcastHealth(smap, true /*checkAll*/)
			if cii != nil && cii.Smap.Version > smap.version() {
				if cii.Smap.Primary.ID != p.SID() || cnt < maxVerConfirmations {
					nlog.Warningf("%s: cannot assume the primary role: local %s < v%d(%s, cnt=%d)",
						p.si, smap, cii.Smap.Version, cii.Smap.Primary.ID, cnt)
					prim.isSmap = false
					prim.url = cii.Smap.Primary.PubURL
				} else {
					nlog.Warningf("%s: proceeding as primary even though local %s < v%d(%s, cnt=%d)",
						p.si, smap, cii.Smap.Version, cii.Smap.Primary.ID, cnt)
				}
			}
		}
	default:
		// 3. initial deployment
		prim.isCfg = config.Proxy.PrimaryURL == p.si.URL(cmn.NetIntraControl) ||
			config.Proxy.PrimaryURL == p.si.URL(cmn.NetPublic)
		if !prim.isCfg {
			prim.isCfg = p.si.HasURL(config.Proxy.PrimaryURL)
		}
	}

	return
}

// join cluster
// (point of no return: starting up as non-primary; see also: "change-of-mind")
func (p *proxy) secondaryStartup(smap *smapX, primaryURLs ...string) error {
	if smap == nil {
		smap = newSmap()
	} else if smap.Primary.ID() == p.SID() {
		nlog.Infof("%s: zeroing-out primary=self in %s", p, smap)
		smap.Primary = nil
	}
	p.owner.smap.put(smap)
	if status, err := p.joinCluster(apc.ActSelfJoinProxy, primaryURLs...); err != nil {
		nlog.Errorf("%s failed to join cluster: %v(%d)", p, err, status)
		return err
	}

	p.markNodeStarted()
	go p.gojoin(cmn.GCO.Get())

	return nil
}
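
// Note (assumed from the `go` statement above): the initial joinCluster
// handshake is synchronous, while gojoin (defined elsewhere) completes the
// join sequence asynchronously - i.e., the node is marked "started" before
// that background step finishes.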

// Proxy/gateway that is, potentially, the leader of the cluster.
// It waits a configured time for other nodes to join,
// discovers cluster-wide metadata, and resolves remaining conflicts.
func (p *proxy) primaryStartup(loadedSmap *smapX, config *cmn.Config, ntargets int, prim prim) {
	var (
		smap          = newSmap()
		uuid, created string
		haveJoins     bool
	)
	// 1: init Smap to accept registrations
	p.owner.smap.mu.Lock()
	si := p.si.Clone()
	smap.Primary = si
	smap.addProxy(si)
	if loadedSmap != nil {
		smap.UUID = loadedSmap.UUID
		smap.Version = loadedSmap.Version
	}
	p.owner.smap.put(smap)
	p.owner.smap.mu.Unlock()

	p.markNodeStarted()

	if !daemon.cli.primary.skipStartup {
		maxVerSmap := p.acceptRegistrations(smap, loadedSmap, config, ntargets)
		if maxVerSmap != nil {
			if _, err := maxVerSmap.IsDupNet(p.si); err != nil {
				cos.ExitLogf("%s: %v", cmn.BadSmapPrefix, err)
			}
			maxVerSmap.Pmap[p.SID()] = p.si
			p.owner.smap.put(maxVerSmap)
			nlog.Infof("%s: change-of-mind #1: joining via %s[P]", p.si, maxVerSmap.Primary.StringEx())
			if err := p.secondaryStartup(maxVerSmap); err != nil {
				cos.ExitLogf("%s: %v", cmn.BadSmapPrefix, err)
			}
			return
		}
	}

	smap = p.owner.smap.get()
	haveJoins = smap.CountTargets() > 0 || smap.CountProxies() > 1

	if loadedSmap != nil {
		smap = smap.mergeFlags(loadedSmap)
	}

	// 2: merge local copies with the registration pool (boot)
	if haveJoins {
		var (
			before, after cluMeta
			added         int
		)
		p.owner.smap.mu.Lock()

		// NOTE: use regpool to try to upgrade all four revs: Smap, BMD, RMD, and global Config
		before.Smap, before.BMD, before.RMD, before.EtlMD = smap, p.owner.bmd.get(), p.owner.rmd.get(), p.owner.etl.get()
		before.Config, _ = p.owner.config.get()

		forcePrimaryChange := prim.isCfg || prim.isEP
		smap = p.regpoolMaxVer(&before, &after, forcePrimaryChange)

		uuid, created = smap.UUID, smap.CreationTime

		p.owner.smap.put(smap)
		p.owner.smap.mu.Unlock()

		msg := p.newAmsgStr(metaction1, after.BMD)
		wg := p.metasyncer.sync(revsPair{smap, msg}, revsPair{after.BMD, msg})

		// before and after
		if loadedSmap != nil {
			nlog.Infoln(p.String(), "loaded", loadedSmap.StringEx(), "merged", before.Smap.StringEx(), "added", added)
		}

		nlog.Infof("before: %s, %s, %s, %s", before.BMD.StringEx(), before.RMD, before.Config, before.EtlMD)
		nlog.Infof("after:  %s, %s, %s, %s", after.BMD.StringEx(), after.RMD, after.Config, after.EtlMD)
		nlog.Infoln("after: ", smap.StringEx())
		wg.Wait()
	} else {
		nlog.Infoln(p.String() + ": no registrations yet")
		if loadedSmap != nil {
			nlog.Infoln(p.String()+": keep going w/ local", loadedSmap.StringEx())
			p.owner.smap.mu.Lock()
			smap = loadedSmap
			p.owner.smap.put(smap)
			p.owner.smap.mu.Unlock()
		}
	}

	// 3: discover cluster meta and resolve remaining conflicts, if any
	p.discoverMeta(smap)

	// 4: still primary?
	p.owner.smap.mu.Lock()
	smap = p.owner.smap.get()
	if !smap.isPrimary(p.si) {
		p.owner.smap.mu.Unlock()
		nlog.Infoln(p.String()+": registering with primary", smap.Primary.StringEx())
		if err := p.secondaryStartup(smap); err != nil {
			cos.ExitLog(err)
		}
		return
	}

	// 5: persist and finalize w/ sync + BMD
	if smap.UUID == "" {
		if !daemon.cli.primary.skipStartup && smap.CountTargets() == 0 {
			cos.ExitLog(p.String(), "cannot create cluster with no targets,", smap.StringEx())
		}
		clone := smap.clone()
		if uuid == "" {
			clone.UUID, clone.CreationTime = newClusterUUID()
		} else {
			clone.UUID, clone.CreationTime = uuid, created
		}
		clone.Version++
		p.owner.smap.put(clone)
		smap = clone
	}

	// 5.5: try to start with a fully staffed IC
	if count := smap.ICCount(); count < meta.DfltCountIC {
		clone := smap.clone()
		nc := clone.staffIC()
		if count != nc {
			clone.Version++
			smap = clone
			p.owner.smap.put(smap)
		}
	}
	if err := p.owner.smap.persist(smap); err != nil {
		cos.ExitLog(p.String(), "(primary):", err)
	}
	p.owner.smap.mu.Unlock()

	// 6: initialize BMD
	bmd := p.owner.bmd.get().clone()
	if bmd.Version == 0 {
		bmd.Version = 1 // init BMD
		bmd.UUID = smap.UUID
		if err := p.owner.bmd.putPersist(bmd, nil); err != nil {
			cos.ExitLog(err)
		}
	}

	// 7: mark RMD as starting up to prevent joining targets from triggering rebalance
	ok := p.owner.rmd.starting.CAS(false, true)
	debug.Assert(ok)

	// 8: initialize etl
	etlMD := p.owner.etl.get().clone()
	if etlMD.Version > 0 {
		if err := p.owner.etl.putPersist(etlMD, nil); err != nil {
			nlog.Errorf("%s: failed to persist etl metadata, err %v - proceeding anyway...", p, err)
		}
	}

	// 9: cluster config: load existing _or_ initialize brand new v1
	cluConfig, err := p._cluConfig(smap)
	if err != nil {
		cos.ExitLog(err)
	}

	// 10: metasync (smap, config, etl & bmd) and startup as primary
	smap = p.owner.smap.get()
	var (
		aisMsg = p.newAmsgStr(metaction2, bmd)
		pairs  = []revsPair{{smap, aisMsg}, {bmd, aisMsg}, {cluConfig, aisMsg}}
	)
	wg := p.metasyncer.sync(pairs...)
	wg.Wait()
	p.markClusterStarted()
	nlog.Infoln(p.String(), "primary: cluster started up")
	nlog.Infoln(smap.StringEx()+",", bmd.StringEx())

	if etlMD.Version > 0 {
		_ = p.metasyncer.sync(revsPair{etlMD, aisMsg})
	}

	// 11: clear regpool
	p.reg.mu.Lock()
	p.reg.pool = p.reg.pool[:0]
	p.reg.pool = nil
	p.reg.mu.Unlock()

	// 12: resume rebalance if needed
	if config.Rebalance.Enabled {
		p.resumeReb(smap, config)
	}
	p.owner.rmd.starting.Store(false)
}

func (p *proxy) _cluConfig(smap *smapX) (config *globalConfig, err error) {
	var orig, disc string
	if config, err = p.owner.config.get(); err != nil {
		return nil, err
	}
	if config != nil && config.version() > 0 {
		orig, disc = smap.configURLsIC(config.Proxy.OriginalURL, config.Proxy.DiscoveryURL)
		if orig == config.Proxy.OriginalURL && disc == config.Proxy.DiscoveryURL {
			// no changes, good to go
			return config, nil
		}
		if orig == "" && disc == "" {
			// likely no IC members yet - nothing we can do
			return config, nil
		}
	}

	// update _or_ create version 1; set config (primary, original, discovery) URLs
	// NOTE: using cmn.NetIntraControl network for all three
	config, err = p.owner.config.modify(&configModifier{
		pre: func(_ *configModifier, clone *globalConfig) (bool /*updated*/, error) {
			clone.Proxy.PrimaryURL = p.si.URL(cmn.NetIntraControl)
			if orig != "" {
				clone.Proxy.OriginalURL = orig
			}
			if disc != "" {
				clone.Proxy.DiscoveryURL = disc
			}
			clone.UUID = smap.UUID
			return true, nil
		},
	})

	return config, err
}

// [cluster startup]: resume rebalance if `interrupted`
func (p *proxy) resumeReb(smap *smapX, config *cmn.Config) {
	debug.AssertNoErr(smap.validate())
	ver := smap.version()

	// initial quiet time
	nojoins := config.Timeout.MaxKeepalive.D()
	if p.owner.rmd.interrupted.Load() {
		nojoins = config.Timeout.MaxHostBusy.D()
	}
	sleep := cos.ProbingFrequency(nojoins)
until:
	// until (last-Smap-update + nojoins)
	for elapsed := time.Duration(0); elapsed < nojoins; {
		time.Sleep(sleep)
		elapsed += sleep
		smap = p.owner.smap.get()
		if !smap.IsPrimary(p.si) {
			debug.AssertNoErr(newErrNotPrimary(p.si, smap))
			return
		}
		if smap.version() != ver {
			debug.Assert(smap.version() > ver)
			elapsed = 0
			nojoins = min(nojoins+sleep, config.Timeout.Startup.D())
			if p.owner.rmd.interrupted.Load() {
				nojoins = max(nojoins+sleep, config.Timeout.MaxHostBusy.D())
			}
			ver = smap.version()
		}
	}

	if smap.CountTargets() < 2 && p.owner.smap.get().CountTargets() < 2 {
		// nothing to do even if interrupted
		return
	}

	// NOTE: continue under lock to serialize concurrent node joins (`httpclupost`), if any

	p.owner.smap.mu.Lock()
	if !p.owner.rmd.interrupted.CAS(true, false) {
		p.owner.smap.mu.Unlock() // nothing to do
		return
	}
	smap = p.owner.smap.get()
	if smap.version() != ver {
		p.owner.smap.mu.Unlock()
		goto until // repeat
	}

	// do
	var (
		msg    = &apc.ActMsg{Action: apc.ActRebalance, Value: metaction3}
		aisMsg = p.newAmsg(msg, nil)
		ctx    = &rmdModifier{
			pre:     func(_ *rmdModifier, clone *rebMD) { clone.Version += 100 },
			smapCtx: &smapModifier{smap: smap},
			cluID:   smap.UUID,
		}
	)
	rmd, err := p.owner.rmd.modify(ctx)
	if err != nil {
		cos.ExitLog(err)
	}
	wg := p.metasyncer.sync(revsPair{rmd, aisMsg})

	p.owner.rmd.starting.Store(false) // done
	p.owner.smap.mu.Unlock()

	wg.Wait()
	nlog.Errorln("Warning: resumed global rebalance", ctx.rebID, smap.StringEx(), rmd.String())
}
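
// Quiet-time sketch (assumed numbers for illustration): with MaxKeepalive = 4s
// and no interrupted rebalance, resumeReb polls the Smap roughly every
// ProbingFrequency(4s) and proceeds once 4s elapse with no version change.
// Every observed Smap-version bump resets the elapsed time and may stretch the
// quiet window - capped by Timeout.Startup, or floored by MaxHostBusy when a
// rebalance was in fact interrupted.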

// maxVerSmap != nil iff there's a primary change _and_ the cluster has moved on
func (p *proxy) acceptRegistrations(smap, loadedSmap *smapX, config *cmn.Config, ntargets int) (maxVerSmap *smapX) {
	const quiescentIter = 4 // Number of iterations to consider the cluster quiescent.
	var (
		deadlineTime         = config.Timeout.Startup.D()
		checkClusterInterval = deadlineTime / quiescentIter
		sleepDuration        = checkClusterInterval / 5

		definedTargetCnt = ntargets > 0
		doClusterCheck   = loadedSmap != nil && loadedSmap.CountTargets() != 0
	)
	for wait, iter := time.Duration(0), 0; wait < deadlineTime && iter < quiescentIter; wait += sleepDuration {
		time.Sleep(sleepDuration)
		// Check the cluster Smap at most once.
		if doClusterCheck && wait >= checkClusterInterval {
			if bcastSmap := p.bcastMaxVerBestEffort(loadedSmap); bcastSmap != nil {
				maxVerSmap = bcastSmap
				return
			}
			doClusterCheck = false
		}

		prevTargetCnt := smap.CountTargets()
		smap = p.owner.smap.get()
		if !smap.isPrimary(p.si) {
			break
		}
		targetCnt := smap.CountTargets()
		if targetCnt > prevTargetCnt || (definedTargetCnt && targetCnt < ntargets) {
			// Reset the counter if new targets have joined, or if we expect
			// a given number of targets and still don't have enough of them.
			iter = 0
		} else {
			iter++
		}
	}

	targetCnt := p.owner.smap.get().CountTargets()

	// log
	s1 := "target" + cos.Plural(targetCnt)
	if definedTargetCnt {
		switch {
		case targetCnt == ntargets:
			nlog.Infoln(p.String(), "reached the expected membership of", ntargets, s1)
		case targetCnt > ntargets:
			nlog.Infoln(p.String(), "joined", targetCnt, s1, "(greater than expected", ntargets, ")")
		default:
			s2 := fmt.Sprintf("%s timed out waiting for %d target%s:", p, ntargets, cos.Plural(ntargets))
			if targetCnt > 0 {
				nlog.Warningln(s2, "joined", targetCnt, "so far")
			} else {
				nlog.Warningln(s2, "joined none so far")
			}
		}
	} else {
		nlog.Infoln(p.String(), "joined", targetCnt, s1)
	}
	return
}
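
// Cadence example (derived from the constants above, assuming
// Timeout.Startup = 60s): checkClusterInterval = 60s/4 = 15s and
// sleepDuration = 15s/5 = 3s. That is, the loop polls every 3s, consults the
// previous cluster at most once (after the first 15s), and declares
// quiescence after 4 consecutive polls with no new targets joining.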

// the final major step in the primary startup sequence:
// discover cluster-wide metadata and resolve remaining conflicts
func (p *proxy) discoverMeta(smap *smapX) {
	// NOTE [ref0417]:
	// in addition, consider to return meta.NodeMap(all responded snodes)
	// and use them
	cm := p.uncoverMeta(smap)

	if cm.BMD != nil {
		p.owner.bmd.Lock()
		bmd := p.owner.bmd.get()
		if bmd == nil || bmd.version() < cm.BMD.version() {
			nlog.Infoln(p.String(), "override local", bmd.String(), "with", cm.BMD.String())
			if err := p.owner.bmd.putPersist(cm.BMD, nil); err != nil {
				cos.ExitLog(err)
			}
		}
		p.owner.bmd.Unlock()
	}
	if cm.RMD != nil {
		p.owner.rmd.Lock()
		rmd := p.owner.rmd.get()
		if rmd == nil || rmd.version() < cm.RMD.version() {
			nlog.Infoln(p.String(), "override local", rmd.String(), "with", cm.RMD.String())
			p.owner.rmd.put(cm.RMD)
		}
		p.owner.rmd.Unlock()
	}

	if cm.Config != nil && cm.Config.UUID != "" {
		p.owner.config.Lock()
		config := cmn.GCO.Get()
		if config.Version < cm.Config.version() {
			if !cos.IsValidUUID(cm.Config.UUID) {
				debug.Assert(false, cm.Config.String())
				cos.ExitLogf("%s: invalid config UUID: %s", p, cm.Config)
			}
			if cos.IsValidUUID(config.UUID) && config.UUID != cm.Config.UUID {
				nlog.Errorf("Warning: configs have different UUIDs: (%s, %s) vs %s - proceeding anyway",
					p, config, cm.Config)
			} else {
				nlog.Infoln(p.String(), "override local", config.String(), "with", cm.Config.String())
			}
			cmn.GCO.Update(&cm.Config.ClusterConfig)
		}
		p.owner.config.Unlock()
	}

	if cm.Smap == nil || cm.Smap.version() == 0 {
		nlog.Infoln(p.String() + ": no max-ver Smaps")
		return
	}
	nlog.Infoln(p.String(), "local", smap.StringEx(), "max-ver", cm.Smap.StringEx())
	smapUUID, sameUUID, sameVersion, eq := smap.Compare(&cm.Smap.Smap)
	if !sameUUID {
		// FATAL: cluster integrity error (cie)
		cos.ExitLogf("%s: split-brain uuid [%s %s] vs %s", ciError(10), p, smap.StringEx(), cm.Smap.StringEx())
	}
	if eq && sameVersion {
		return
	}
	if cm.Smap.Primary != nil && cm.Smap.Primary.ID() != p.SID() {
		if cm.Smap.version() > smap.version() {
			if dupNode, err := cm.Smap.IsDupNet(p.si); err != nil {
				if !cm.Smap.IsPrimary(dupNode) {
					cos.ExitLog(err)
				}
				// If the primary in the max-ver Smap and the current node differ only in `DaemonID`,
				// overwrite the proxy entry with the current `Snode` and proceed to merging Smaps.
				// TODO: add validation to ensure `dupNode` and `p.si` differ only in `DaemonID`.
				cm.Smap.Primary = p.si
				cm.Smap.delProxy(dupNode.ID())
				cm.Smap.Pmap[p.SID()] = p.si
				goto merge
			}
			nlog.Infof("%s: change-of-mind #2 %s <= max-ver %s", p, smap.StringEx(), cm.Smap.StringEx())
			cm.Smap.Pmap[p.SID()] = p.si
			p.owner.smap.put(cm.Smap)
			return
		}
		// FATAL: cluster integrity error (cie)
		cos.ExitLogf("%s: split-brain local [%s %s] vs %s", ciError(20), p, smap.StringEx(), cm.Smap.StringEx())
	}
merge:
	p.owner.smap.mu.Lock()
	clone := p.owner.smap.get().clone()
	if !eq {
		nlog.Infof("%s: merge local %s <== %s", p, clone, cm.Smap)
		_, err := cm.Smap.merge(clone, false /*err if detected (IP, port) duplicates*/)
		if err != nil {
			cos.ExitLogf("%s: %v vs %s", p, err, cm.Smap.StringEx())
		}
	} else {
		clone.UUID = smapUUID
	}
	clone.Version = max(clone.version(), cm.Smap.version()) + 1
	p.owner.smap.put(clone)
	p.owner.smap.mu.Unlock()
	nlog.Infof("%s: merged %s", p, clone.pp())
}
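
// To summarize the Smap outcomes above: (a) same UUID and same version - keep
// local; (b) another primary with a strictly newer Smap - either adopt it
// ("change-of-mind #2") or, if the two nodes are network duplicates, take its
// place and merge; (c) another primary with an older Smap - fatal split-brain;
// in the remaining cases the two maps are merged with a version bump.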

func (p *proxy) uncoverMeta(bcastSmap *smapX) (cm cluMeta) {
	var (
		err         error
		suuid       string
		config      = cmn.GCO.Get()
		now         = time.Now()
		deadline    = now.Add(config.Timeout.Startup.D())
		l           = bcastSmap.Count()
		bmds        = make(bmds, l)
		smaps       = make(smaps, l)
		done, slowp bool
	)
	for {
		if nlog.Stopping() {
			cm.Smap = nil
			return
		}
		last := time.Now().After(deadline)
		cm, done, slowp = p.bcastMaxVer(bcastSmap, bmds, smaps)
		if done || last {
			break
		}
		time.Sleep(config.Timeout.CplaneOperation.D())
	}
	if !slowp {
		return
	}
	nlog.Infoln(p.String(), "(primary) slow path...")
	if cm.BMD, err = resolveUUIDBMD(bmds); err != nil {
		if _, split := err.(*errBmdUUIDSplit); split {
			cos.ExitLog(p.String(), "(primary), err:", err) // cluster integrity error
		}
		nlog.Errorln(err)
	}
	for si, smap := range smaps {
		if !si.IsTarget() {
			continue
		}
		if !cos.IsValidUUID(smap.UUID) {
			continue
		}
		if suuid == "" {
			suuid = smap.UUID
			if suuid != "" {
				nlog.Infof("%s: set Smap UUID = %s(%s)", p, si, suuid)
			}
		} else if suuid != smap.UUID {
			// FATAL: cluster integrity error (cie)
			cos.ExitLogf("%s: split-brain [%s %s] vs [%s %s]", ciError(30), p, suuid, si, smap.UUID)
		}
	}
	for _, smap := range smaps {
		if smap.UUID != suuid {
			continue
		}
		if cm.Smap == nil {
			cm.Smap = smap
		} else if cm.Smap.version() < smap.version() {
			cm.Smap = smap
		}
	}
	return
}

func (p *proxy) bcastMaxVer(bcastSmap *smapX, bmds bmds, smaps smaps) (out cluMeta, done, slowp bool) {
	var (
		borigin, sorigin string
		args             = allocBcArgs()
	)
	args.req = cmn.HreqArgs{
		Path:  apc.URLPathDae.S,
		Query: url.Values{apc.QparamWhat: []string{apc.WhatSmapVote}},
	}
	args.smap = bcastSmap
	args.to = core.SelectedNodes

	args.nodes = make([]meta.NodeMap, 0, 2)
	if len(bcastSmap.Tmap) > 0 {
		args.nodes = append(args.nodes, bcastSmap.Tmap)
	}
	pmap := make(meta.NodeMap, len(bcastSmap.Pmap))
	ctrl := p.si.URL(cmn.NetIntraControl)
	for pid, si := range bcastSmap.Pmap {
		if pid == p.SID() {
			continue
		}
		if si.URL(cmn.NetIntraControl) == ctrl {
			nlog.Warningf(fmtErrNetInfoChanged, p, si.StringEx(), ctrl)
			continue
		}
		pmap[pid] = si
	}
	args.nodes = append(args.nodes, pmap)

	args.cresv = cresCM{} // -> cluMeta
	results := p.bcastGroup(args)
	freeBcArgs(args)
	done = true

	clear(bmds)
	clear(smaps)

	for _, res := range results {
		if res.err != nil {
			done = false
			continue
		}
		cm, ok := res.v.(*cluMeta)
		debug.Assert(ok)
		if cm.BMD != nil && cm.BMD.version() > 0 {
			if out.BMD == nil { // 1. init
				borigin, out.BMD = cm.BMD.UUID, cm.BMD
			} else if borigin != "" && borigin != cm.BMD.UUID { // 2. slow path
				slowp = true
			} else if !slowp && out.BMD.Version < cm.BMD.Version { // 3. fast path max(version)
				out.BMD = cm.BMD
				borigin = cm.BMD.UUID
			}
		}
		if cm.RMD != nil && cm.RMD.version() > 0 {
			if out.RMD == nil { // 1. init
				out.RMD = cm.RMD
			} else if !slowp && out.RMD.Version < cm.RMD.Version { // 3. fast path max(version)
				out.RMD = cm.RMD
			}
		}
		if cm.Config != nil && cm.Config.version() > 0 {
			if out.Config == nil { // 1. init
				out.Config = cm.Config
			} else if !slowp && out.Config.version() < cm.Config.version() { // 3. fast path max(version)
				out.Config = cm.Config
			}
		}

		// TODO: maxver of EtlMD

		if cm.Smap != nil && cm.Flags.IsSet(cifl.VoteInProgress) {
			var s string
			if cm.Smap.Primary != nil {
				s = " of the current one " + cm.Smap.Primary.ID()
			}
			nlog.Warningln(p.String(), "starting up as primary(?) during reelection"+s)
			out.Smap, out.BMD, out.RMD = nil, nil, nil // zero-out as unusable
			done = false
			break
		}
		if cm.Smap != nil && cm.Smap.version() > 0 {
			if out.Smap == nil { // 1. init
				sorigin, out.Smap = cm.Smap.UUID, cm.Smap
			} else if sorigin != "" && sorigin != cm.Smap.UUID { // 2. slow path
				slowp = true
			} else if !slowp && out.Smap.Version < cm.Smap.Version { // 3. fast path max(version)
				out.Smap = cm.Smap
				sorigin = cm.Smap.UUID
			}
		}
		if bmds != nil && cm.BMD != nil && cm.BMD.version() > 0 {
			bmds[res.si] = cm.BMD
		}
		if smaps != nil && cm.Smap != nil && cm.Smap.version() > 0 {
			smaps[res.si] = cm.Smap
		}
	}
	freeBcastRes(results)
	return
}
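
// Fast path vs slow path, in short: as long as every responder reports the
// same (BMD, Smap) UUIDs, the max-version instance wins outright; the first
// UUID mismatch flips `slowp`, after which bcastMaxVer stops taking the max
// and leaves the per-node `bmds`/`smaps` snapshots for uncoverMeta to
// arbitrate (see resolveUUIDBMD and the split-brain checks above).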

func (p *proxy) bcastMaxVerBestEffort(smap *smapX) *smapX {
	cm, _, slowp := p.bcastMaxVer(smap, nil, nil)
	if cm.Smap != nil && !slowp {
		if cm.Smap.UUID == smap.UUID && cm.Smap.version() > smap.version() && cm.Smap.validate() == nil {
			if cm.Smap.Primary.ID() != p.SID() {
				nlog.Warningln(p.String(), "detected primary change, whereby local", smap.StringEx(),
					"is older than max-ver", cm.Smap.StringEx())
				return cm.Smap
			}
		}
	}
	return nil
}

func (p *proxy) regpoolMaxVer(before, after *cluMeta, forcePrimaryChange bool) (smap *smapX) {
	var (
		voteInProgress bool
		cloned         bool
	)
	*after = *before

	p.reg.mu.RLock()

	if len(p.reg.pool) == 0 {
		goto ret
	}
	for _, regReq := range p.reg.pool {
		nsi := regReq.SI
		if err := nsi.Validate(); err != nil {
			nlog.Errorln("Warning:", err)
			continue
		}
		voteInProgress = voteInProgress || regReq.Flags.IsSet(cifl.VoteInProgress)
		if regReq.Smap != nil && regReq.Smap.version() > 0 && cos.IsValidUUID(regReq.Smap.UUID) {
			if after.Smap != nil && after.Smap.version() > 0 {
				if cos.IsValidUUID(after.Smap.UUID) && after.Smap.UUID != regReq.Smap.UUID {
					cos.ExitLogf("%s: Smap UUIDs don't match: [%s %s] vs %s", ciError(10),
						p, after.Smap.StringEx(), regReq.Smap.StringEx())
				}
			}
			if after.Smap == nil || after.Smap.version() < regReq.Smap.version() {
				after.Smap = regReq.Smap
			}
		}
		if regReq.BMD != nil && regReq.BMD.version() > 0 && cos.IsValidUUID(regReq.BMD.UUID) {
			if after.BMD != nil && after.BMD.version() > 0 {
				if cos.IsValidUUID(after.BMD.UUID) && after.BMD.UUID != regReq.BMD.UUID {
					cos.ExitLogf("%s: BMD UUIDs don't match: [%s %s] vs %s", ciError(10),
						p.si, after.BMD.StringEx(), regReq.BMD.StringEx())
				}
			}
			if after.BMD == nil || after.BMD.version() < regReq.BMD.version() {
				after.BMD = regReq.BMD
			}
		}
		if regReq.RMD != nil && regReq.RMD.version() > 0 {
			if after.RMD == nil || after.RMD.version() < regReq.RMD.version() {
				after.RMD = regReq.RMD
			}
		}
		if regReq.Config != nil && regReq.Config.version() > 0 && cos.IsValidUUID(regReq.Config.UUID) {
			if after.Config != nil && after.Config.version() > 0 {
				if cos.IsValidUUID(after.Config.UUID) && after.Config.UUID != regReq.Config.UUID {
					cos.ExitLogf("%s: Global Config UUIDs don't match: [%s %s] vs %s", ciError(10),
						p.si, after.Config, regReq.Config)
				}
			}
			if after.Config == nil || after.Config.version() < regReq.Config.version() {
				after.Config = regReq.Config
			}
		}
	}
	if after.BMD != before.BMD {
		if err := p.owner.bmd.putPersist(after.BMD, nil); err != nil {
			cos.ExitLog(err)
		}
	}
	if after.RMD != before.RMD {
		p.owner.rmd.put(after.RMD)
	}
	if after.Config != before.Config {
		var err error
		after.Config, err = p.owner.config.modify(&configModifier{
			pre: func(_ *configModifier, clone *globalConfig) (bool, error) {
				*clone = *after.Config
				return true, nil
			},
		})
		if err != nil {
			cos.ExitLog(err)
		}
	}

ret:
	p.reg.mu.RUnlock()

	// not interfering with elections
	if voteInProgress {
		before.Smap.UUID, before.Smap.CreationTime = after.Smap.UUID, after.Smap.CreationTime
		nlog.Errorln("voting is in progress, continuing with potentially older", before.Smap.StringEx())
		return before.Smap
	}

	runtime.Gosched()

	// NOTE [ref0417]:
	// - always update joining nodes' net-infos;
	// - alternatively, narrow it down to only proxies (as targets always restart on the same K8s nodes)

	p.reg.mu.RLock()
	for _, regReq := range p.reg.pool {
		after.Smap, cloned = _updNetInfo(after.Smap, regReq.SI, cloned)
	}
	p.reg.mu.RUnlock()

	if after.Smap.version() == 0 || !cos.IsValidUUID(after.Smap.UUID) {
		after.Smap.UUID, after.Smap.CreationTime = newClusterUUID()
		nlog.Infoln(p.String(), "new cluster UUID:", after.Smap.UUID)
		return after.Smap
	}
	if before.Smap == after.Smap {
		if !forcePrimaryChange {
			return after.Smap
		}
	} else {
		debug.Assert(before.Smap.version() < after.Smap.version())
		nlog.Warningln("before:", before.Smap.StringEx(), "after:", after.Smap.StringEx())
	}

	if after.Smap.Primary.ID() != p.SID() {
		nlog.Warningln(p.String() + ": taking over as primary")
	}
	if !cloned {
		after.Smap = after.Smap.clone()
	}
	after.Smap.Primary = p.si
	after.Smap.Pmap[p.SID()] = p.si

	after.Smap.Version += 50

	config, errN := p.owner.config.modify(&configModifier{
		pre: func(_ *configModifier, clone *globalConfig) (bool, error) {
			clone.Proxy.PrimaryURL = p.si.URL(cmn.NetIntraControl)
			clone.Version++
			return true, nil
		},
	})
	if errN != nil {
		cos.ExitLog(errN)
	}
	after.Config = config
	return after.Smap
}
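
// On the `Version += 50` above (an assumption - the rationale is not spelled
// out here): when the primary changes by force, the new Smap version is bumped
// well past any version other nodes may have seen in flight, so that the
// metasync'ed result unambiguously supersedes them; compare with the `+100`
// bump applied to the RMD in resumeReb.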

func _updNetInfo(smap *smapX, nsi *meta.Snode, cloned bool) (*smapX, bool) {
	if nsi.Validate() != nil {
		return smap, cloned
	}
	osi := smap.GetNode(nsi.ID())
	if osi == nil || osi.Type() != nsi.Type() {
		return smap, cloned
	}
	if err := osi.NetEq(nsi); err != nil {
		nlog.Warningln("Warning: renewing", err)
		if !cloned {
			smap = smap.clone()
			cloned = true
		}
		smap.putNode(nsi, osi.Flags, true /*silent*/)
	}
	return smap, cloned
}