github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/prxclu.go

     1  // Package ais provides core functionality for the AIStore object storage.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package ais
     6  
     7  import (
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"net/http"
    12  	"net/url"
    13  	"path"
    14  	"strconv"
    15  	"strings"
    16  	"time"
    17  
    18  	"github.com/NVIDIA/aistore/api/apc"
    19  	"github.com/NVIDIA/aistore/cmn"
    20  	"github.com/NVIDIA/aistore/cmn/cifl"
    21  	"github.com/NVIDIA/aistore/cmn/cos"
    22  	"github.com/NVIDIA/aistore/cmn/debug"
    23  	"github.com/NVIDIA/aistore/cmn/mono"
    24  	"github.com/NVIDIA/aistore/cmn/nlog"
    25  	"github.com/NVIDIA/aistore/core"
    26  	"github.com/NVIDIA/aistore/core/meta"
    27  	"github.com/NVIDIA/aistore/stats"
    28  	"github.com/NVIDIA/aistore/xact"
    29  	jsoniter "github.com/json-iterator/go"
    30  )
    31  
    32  //
    33  // v1/cluster handlers
    34  //
    35  
    36  func (p *proxy) clusterHandler(w http.ResponseWriter, r *http.Request) {
    37  	switch r.Method {
    38  	case http.MethodGet:
    39  		p.httpcluget(w, r)
    40  	case http.MethodPost:
    41  		p.httpclupost(w, r)
    42  	case http.MethodPut:
    43  		p.httpcluput(w, r)
    44  	case http.MethodDelete:
    45  		p.httpcludel(w, r)
    46  	default:
    47  		cmn.WriteErr405(w, r, http.MethodDelete, http.MethodGet, http.MethodPost, http.MethodPut)
    48  	}
    49  }
    50  
    51  //
    52  // GET /v1/cluster - query cluster states and stats
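        // e.g., assuming a local deployment with the default proxy endpoint (adjust as needed):
        //   curl -X GET 'http://localhost:8080/v1/cluster?what=smap'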
    53  //
    54  
    55  func (p *proxy) httpcluget(w http.ResponseWriter, r *http.Request) {
    56  	var (
    57  		query = r.URL.Query()
    58  		what  = query.Get(apc.QparamWhat)
    59  	)
    60  	// always allow as the flow involves intra-cluster redirect
    61  	// (ref 1377 for more context)
    62  	if what == apc.WhatOneXactStatus {
    63  		p.ic.xstatusOne(w, r)
    64  		return
    65  	}
    66  
    67  	if err := p.checkAccess(w, r, nil, apc.AceShowCluster); err != nil {
    68  		return
    69  	}
    70  
    71  	switch what {
    72  	case apc.WhatAllXactStatus:
    73  		p.ic.xstatusAll(w, r, query)
    74  	case apc.WhatQueryXactStats:
    75  		p.xquery(w, r, what, query)
    76  	case apc.WhatAllRunningXacts:
    77  		p.xgetRunning(w, r, what, query)
    78  	case apc.WhatNodeStats, apc.WhatNodeStatsV322:
    79  		p.qcluStats(w, r, what, query)
    80  	case apc.WhatSysInfo:
    81  		p.qcluSysinfo(w, r, what, query)
    82  	case apc.WhatMountpaths:
    83  		p.qcluMountpaths(w, r, what, query)
    84  	case apc.WhatRemoteAIS:
    85  		all, err := p.getRemAisVec(true /*refresh*/)
    86  		if err != nil {
    87  			p.writeErr(w, r, err)
    88  			return
    89  		}
    90  		p.writeJSON(w, r, all, what)
    91  	case apc.WhatTargetIPs:
    92  		// Return comma-separated IPs of the targets.
    93  		// It can be used to easily fill the `--noproxy` parameter in cURL.
    94  		var (
    95  			smap = p.owner.smap.Get()
    96  			buf  = bytes.NewBuffer(nil)
    97  		)
    98  		for _, si := range smap.Tmap {
    99  			if buf.Len() > 0 {
   100  				buf.WriteByte(',')
   101  			}
   102  			buf.WriteString(si.PubNet.Hostname)
   103  			buf.WriteByte(',')
   104  			buf.WriteString(si.ControlNet.Hostname)
   105  			buf.WriteByte(',')
   106  			buf.WriteString(si.DataNet.Hostname)
   107  		}
   108  		w.Header().Set(cos.HdrContentLength, strconv.Itoa(buf.Len()))
   109  		w.Write(buf.Bytes())
   110  
   111  	case apc.WhatClusterConfig:
   112  		config := cmn.GCO.Get()
   113  		// hide secret
   114  		c := config.ClusterConfig
   115  		c.Auth.Secret = "**********"
   116  		p.writeJSON(w, r, &c, what)
   117  	case apc.WhatBMD, apc.WhatSmapVote, apc.WhatSnode, apc.WhatSmap:
   118  		p.htrun.httpdaeget(w, r, query, nil /*htext*/)
   119  	default:
   120  		p.writeErrf(w, r, fmtUnknownQue, what)
   121  	}
   122  }
   123  
   124  // apc.WhatQueryXactStats (NOTE: may poll for quiescence)
   125  func (p *proxy) xquery(w http.ResponseWriter, r *http.Request, what string, query url.Values) {
   126  	var xactMsg xact.QueryMsg
   127  	if err := cmn.ReadJSON(w, r, &xactMsg); err != nil {
   128  		return
   129  	}
   130  	xactMsg.Kind, _ = xact.GetKindName(xactMsg.Kind) // convert display name => kind
   131  	body := cos.MustMarshal(xactMsg)
   132  
   133  	args := allocBcArgs()
   134  	args.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathXactions.S, Body: body, Query: query}
   135  	args.to = core.Targets
   136  
   137  	var (
   138  		config      = cmn.GCO.Get()
   139  		onlyRunning = xactMsg.OnlyRunning != nil && *xactMsg.OnlyRunning
   140  	)
   141  	args.timeout = config.Client.Timeout.D() // quiescence
   142  	if !onlyRunning {
   143  		args.timeout = config.Client.TimeoutLong.D()
   144  	}
   145  
   146  	results := p.bcastGroup(args)
   147  	freeBcArgs(args)
   148  	resRaw, erred := p._tresRaw(w, r, results)
   149  	if erred {
   150  		return
   151  	}
   152  	if len(resRaw) == 0 {
   153  		smap := p.owner.smap.get()
   154  		if smap.CountActiveTs() > 0 {
   155  			p.writeErrStatusf(w, r, http.StatusNotFound, "%q not found", xactMsg.String())
   156  			return
   157  		}
   158  		err := cmn.NewErrNoNodes(apc.Target, smap.CountTargets())
   159  		nlog.Warningf("%s: %v, %s", p, err, smap)
   160  	}
   161  
   162  	// TODO: if voteInProgress snap and append xele, or else
   163  
   164  	p.writeJSON(w, r, resRaw, what)
   165  }
   166  
   167  // apc.WhatAllRunningXacts
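        // broadcast to all targets and merge the returned kind[/ID] pairs into a single
        // de-duplicated list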
   168  func (p *proxy) xgetRunning(w http.ResponseWriter, r *http.Request, what string, query url.Values) {
   169  	var xactMsg xact.QueryMsg
   170  	if err := cmn.ReadJSON(w, r, &xactMsg); err != nil {
   171  		return
   172  	}
   173  	xactMsg.Kind, _ = xact.GetKindName(xactMsg.Kind) // convert display name => kind
   174  	body := cos.MustMarshal(xactMsg)
   175  
   176  	args := allocBcArgs()
   177  	args.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathXactions.S, Body: body, Query: query}
   178  	args.to = core.Targets
   179  	results := p.bcastGroup(args)
   180  	freeBcArgs(args)
   181  
   182  	uniqueKindIDs := cos.StrSet{}
   183  	for _, res := range results {
   184  		if res.err != nil {
   185  			p.writeErr(w, r, res.toErr())
   186  			freeBcastRes(results)
   187  			return
   188  		}
   189  		if len(res.bytes) == 0 {
   190  			continue
   191  		}
   192  		var (
   193  			kindIDs []string
   194  			err     = jsoniter.Unmarshal(res.bytes, &kindIDs)
   195  		)
   196  		debug.AssertNoErr(err)
   197  		for _, ki := range kindIDs {
   198  			uniqueKindIDs.Set(ki)
   199  		}
   200  	}
   201  	freeBcastRes(results)
   202  	p.writeJSON(w, r, uniqueKindIDs.ToSlice(), what)
   203  }
   204  
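        // apc.WhatSysInfo: collect system info from all proxies and all targets (raw JSON, per node)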
   205  func (p *proxy) qcluSysinfo(w http.ResponseWriter, r *http.Request, what string, query url.Values) {
   206  	var (
   207  		config  = cmn.GCO.Get()
   208  		timeout = config.Client.Timeout.D()
   209  	)
   210  	proxyResults, err := p._sysinfo(r, timeout, core.Proxies, query)
   211  	if err != nil {
   212  		p.writeErr(w, r, err)
   213  		return
   214  	}
   215  	out := &apc.ClusterSysInfoRaw{}
   216  	out.Proxy = proxyResults
   217  
   218  	targetResults, err := p._sysinfo(r, timeout, core.Targets, query)
   219  	if err != nil {
   220  		p.writeErr(w, r, err)
   221  		return
   222  	}
   223  	out.Target = targetResults
   224  	p.writeJSON(w, r, out, what)
   225  }
   226  
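        // apc.WhatRemoteAIS: query a single (randomly selected) target for the vector of
        // attached remote AIS clusters; when refresh is true the target also handshakes the
        // remotes to check connectivity and obtain their Smaps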
   227  func (p *proxy) getRemAisVec(refresh bool) (*meta.RemAisVec, error) {
   228  	smap := p.owner.smap.get()
   229  	si, errT := smap.GetRandTarget()
   230  	if errT != nil {
   231  		return nil, errT
   232  	}
   233  	q := url.Values{apc.QparamWhat: []string{apc.WhatRemoteAIS}}
   234  	if refresh {
   235  		q[apc.QparamClusterInfo] = []string{"true"} // handshake to check connectivity and get remote Smap
   236  	}
   237  	cargs := allocCargs()
   238  	{
   239  		cargs.si = si
   240  		cargs.req = cmn.HreqArgs{
   241  			Method: http.MethodGet,
   242  			Path:   apc.URLPathDae.S,
   243  			Query:  q,
   244  		}
   245  		cargs.timeout = cmn.Rom.MaxKeepalive()
   246  		cargs.cresv = cresBA{} // -> cmn.BackendInfoAIS
   247  	}
   248  	var (
   249  		v   *meta.RemAisVec
   250  		res = p.call(cargs, smap)
   251  		err = res.toErr()
   252  	)
   253  	if err == nil {
   254  		v = res.v.(*meta.RemAisVec)
   255  	}
   256  	freeCargs(cargs)
   257  	freeCR(res)
   258  	return v, err
   259  }
   260  
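        // broadcast the 'get sysinfo' request to the given group (proxies or targets) and
        // return raw per-node responses keyed by node ID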
   261  func (p *proxy) _sysinfo(r *http.Request, timeout time.Duration, to int, query url.Values) (cos.JSONRawMsgs, error) {
   262  	args := allocBcArgs()
   263  	args.req = cmn.HreqArgs{Method: r.Method, Path: apc.URLPathDae.S, Query: query}
   264  	args.timeout = timeout
   265  	args.to = to
   266  	results := p.bcastGroup(args)
   267  	freeBcArgs(args)
   268  	sysInfoMap := make(cos.JSONRawMsgs, len(results))
   269  	for _, res := range results {
   270  		if res.err != nil {
   271  			err := res.toErr()
   272  			freeBcastRes(results)
   273  			return nil, err
   274  		}
   275  		sysInfoMap[res.si.ID()] = res.bytes
   276  	}
   277  	freeBcastRes(results)
   278  	return sysInfoMap, nil
   279  }
   280  
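        // apc.WhatNodeStats: query all targets for stats and add this proxy's own stats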
   281  func (p *proxy) qcluStats(w http.ResponseWriter, r *http.Request, what string, query url.Values) {
   282  	targetStats, erred := p._queryTs(w, r, query)
   283  	if targetStats == nil || erred {
   284  		return
   285  	}
   286  	out := &stats.ClusterRaw{}
   287  	out.Target = targetStats
   288  	out.Proxy = p.statsT.GetStats()
   289  	out.Proxy.Snode = p.si
   290  	p.writeJSON(w, r, out, what)
   291  }
   292  
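        // apc.WhatMountpaths: collect mountpath info from all targets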
   293  func (p *proxy) qcluMountpaths(w http.ResponseWriter, r *http.Request, what string, query url.Values) {
   294  	targetMountpaths, erred := p._queryTs(w, r, query)
   295  	if targetMountpaths == nil || erred {
   296  		return
   297  	}
   298  	out := &ClusterMountpathsRaw{}
   299  	out.Targets = targetMountpaths
   300  	p.writeJSON(w, r, out, what)
   301  }
   302  
   303  // helper methods for querying targets
   304  
   305  func (p *proxy) _queryTs(w http.ResponseWriter, r *http.Request, query url.Values) (cos.JSONRawMsgs, bool) {
   306  	var (
   307  		err  error
   308  		body []byte
   309  	)
   310  	if r.Body != nil {
   311  		body, err = cmn.ReadBytes(r)
   312  		if err != nil {
   313  			p.writeErr(w, r, err)
   314  			return nil, true
   315  		}
   316  	}
   317  	args := allocBcArgs()
   318  	args.req = cmn.HreqArgs{Method: r.Method, Path: apc.URLPathDae.S, Query: query, Body: body}
   319  	args.timeout = cmn.Rom.MaxKeepalive()
   320  	results := p.bcastGroup(args)
   321  	freeBcArgs(args)
   322  	return p._tresRaw(w, r, results)
   323  }
   324  
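        // gather raw per-target responses; individual 404s are skipped, any other error
        // fails the entire request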
   325  func (p *proxy) _tresRaw(w http.ResponseWriter, r *http.Request, results sliceResults) (tres cos.JSONRawMsgs, erred bool) {
   326  	tres = make(cos.JSONRawMsgs, len(results))
   327  	for _, res := range results {
   328  		if res.status == http.StatusNotFound {
   329  			continue
   330  		}
   331  		if res.err != nil {
   332  			p.writeErr(w, r, res.toErr())
   333  			freeBcastRes(results)
   334  			tres, erred = nil, true
   335  			return
   336  		}
   337  		tres[res.si.ID()] = res.bytes
   338  	}
   339  	freeBcastRes(results)
   340  	return
   341  }
   342  
   343  // POST /v1/cluster - handles joins and keepalives
   344  func (p *proxy) httpclupost(w http.ResponseWriter, r *http.Request) {
   345  	apiItems, err := p.parseURL(w, r, apc.URLPathClu.L, 1, true)
   346  	if err != nil {
   347  		return
   348  	}
   349  	if p.forwardCP(w, r, nil, "httpclupost") {
   350  		return
   351  	}
   352  
   353  	var (
   354  		nsi    *meta.Snode
   355  		action string
   356  		regReq cluMeta
   357  		smap   = p.owner.smap.get()
   358  		config = cmn.GCO.Get()
   359  		apiOp  = apiItems[0]
   360  	)
   361  	if len(apiItems) > 1 && apiOp != apc.Keepalive {
   362  		p.writeErrURL(w, r)
   363  		return
   364  	}
   365  	if p.settingNewPrimary.Load() {
   366  		// ignore or fail
   367  		if apiOp != apc.Keepalive {
   368  			var s string
   369  			if apiOp == apc.AdminJoin {
   370  				s = " (retry in a few seconds)"
   371  			}
   372  			p.writeErr(w, r, errors.New("setting new primary - transitioning"+s), http.StatusServiceUnavailable)
   373  		}
   374  		return
   375  	}
   376  
   377  	switch apiOp {
   378  	case apc.Keepalive:
   379  		// fast path(?)
   380  		if len(apiItems) > 1 {
   381  			p.fastKalive(w, r, smap, config, apiItems[1])
   382  			return
   383  		}
   384  
   385  		// slow path
   386  		if cmn.ReadJSON(w, r, &regReq) != nil {
   387  			return
   388  		}
   389  		nsi = regReq.SI
   390  	case apc.AdminJoin: // administrative join
   391  		if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil {
   392  			return
   393  		}
   394  		if cmn.ReadJSON(w, r, &regReq.SI) != nil {
   395  			return
   396  		}
   397  		nsi = regReq.SI
   398  		// must be reachable and must respond
   399  		si, err := p._getSI(nsi)
   400  		if err != nil {
   401  			p.writeErrf(w, r, "%s: failed to obtain node info from %s: %v", p.si, nsi.StringEx(), err)
   402  			return
   403  		}
   404  		// NOTE: node ID and 3-networks configuration are obtained from the node itself
   405  		*nsi = *si
   406  	case apc.SelfJoin: // auto-join at node startup
   407  		if cmn.ReadJSON(w, r, &regReq) != nil {
   408  			return
   409  		}
   410  		// NOTE: ditto
   411  		nsi = regReq.SI
   412  		if !p.ClusterStarted() {
   413  			p.reg.mu.Lock()
   414  			p.reg.pool = append(p.reg.pool, regReq)
   415  			p.reg.mu.Unlock()
   416  		}
   417  	default:
   418  		p.writeErrURL(w, r)
   419  		return
   420  	}
   421  
   422  	if err := nsi.Validate(); err != nil {
   423  		p.writeErr(w, r, err)
   424  		return
   425  	}
   426  	// given node and operation, set msg.Action
   427  	switch apiOp {
   428  	case apc.AdminJoin:
   429  		if nsi.IsProxy() {
   430  			action = apc.ActAdminJoinProxy
   431  		} else {
   432  			action = apc.ActAdminJoinTarget
   433  		}
   434  	case apc.SelfJoin:
   435  		if nsi.IsProxy() {
   436  			action = apc.ActSelfJoinProxy
   437  		} else {
   438  			action = apc.ActSelfJoinTarget
   439  		}
   440  	case apc.Keepalive:
   441  		action = apc.ActKeepaliveUpdate // (must be an extremely rare case)
   442  	}
   443  
   444  	// more validation && non-electability
   445  	if p.NodeStarted() {
   446  		bmd := p.owner.bmd.get()
   447  		if err := bmd.validateUUID(regReq.BMD, p.si, nsi, ""); err != nil {
   448  			p.writeErr(w, r, err)
   449  			return
   450  		}
   451  	}
   452  	var (
   453  		nonElectable bool
   454  	)
   455  	if nsi.IsProxy() {
   456  		s := r.URL.Query().Get(apc.QparamNonElectable)
   457  		if nonElectable, err = cos.ParseBool(s); err != nil {
   458  			nlog.Errorf("%s: failed to parse %s for non-electability: %v", p, s, err)
   459  		}
   460  	}
   461  	if _, err := cmn.ParseHost2IP(nsi.PubNet.Hostname); err != nil {
   462  		p.writeErrf(w, r, "%s: failed to %s %s: invalid hostname: %v", p.si, apiOp, nsi.StringEx(), err)
   463  		return
   464  	}
   465  
   466  	// node flags
   467  	if osi := smap.GetNode(nsi.ID()); osi != nil {
   468  		nsi.Flags = osi.Flags
   469  	}
   470  	if nonElectable {
   471  		nsi.Flags = nsi.Flags.Set(meta.SnodeNonElectable)
   472  	}
   473  
   474  	// handshake | check dup
   475  	if apiOp == apc.AdminJoin {
   476  		// call the node with cluster-metadata included
   477  		if ecode, err := p.adminJoinHandshake(smap, nsi, apiOp); err != nil {
   478  			p.writeErr(w, r, err, ecode)
   479  			return
   480  		}
   481  	} else if apiOp == apc.SelfJoin {
   482  		//
   483  		// check for: a) different node, duplicate node ID, or b) same node, net-info change
   484  		//
   485  		if osi := smap.GetNode(nsi.ID()); osi != nil && !osi.Eq(nsi) {
   486  			ok, err := p._confirmSnode(osi, nsi) // handshake (expecting nsi in response)
   487  			if err != nil {
   488  				if !cos.IsRetriableConnErr(err) {
   489  					p.writeErrf(w, r, "failed to obtain node info: %v", err)
   490  					return
   491  				}
   492  				// starting up, not listening yet
   493  				// NOTE [ref0417]
   494  				// TODO: try to confirm asynchronously
   495  			} else if !ok {
   496  				p.writeErrf(w, r, "duplicate node ID %q (%s, %s)", nsi.ID(), osi.StringEx(), nsi.StringEx())
   497  				return
   498  			}
   499  			nlog.Warningf("%s: self-joining %s [err %v, confirmed %t]", p, nsi.StringEx(), err, ok)
   500  		}
   501  	}
   502  
   503  	if !config.Rebalance.Enabled {
   504  		regReq.Flags = regReq.Flags.Clear(cifl.RebalanceInterrupted)
   505  		regReq.Flags = regReq.Flags.Clear(cifl.Restarted)
   506  	}
   507  	interrupted, restarted := regReq.Flags.IsSet(cifl.RebalanceInterrupted), regReq.Flags.IsSet(cifl.Restarted)
   508  	if nsi.IsTarget() && (interrupted || restarted) {
   509  		if a, b := p.ClusterStarted(), p.owner.rmd.starting.Load(); !a || b {
   510  			// handle via rmd.starting + resumeReb
   511  			if p.owner.rmd.interrupted.CAS(false, true) {
   512  				nlog.Warningf("%s: will resume rebalance %s(%t, %t)", p, nsi.StringEx(), interrupted, restarted)
   513  			}
   514  		}
   515  	}
   516  	// when keepalive becomes a new join
   517  	if restarted && apiOp == apc.Keepalive {
   518  		apiOp = apc.SelfJoin
   519  	}
   520  
   521  	msg := &apc.ActMsg{Action: action, Name: nsi.ID()}
   522  
   523  	p.owner.smap.mu.Lock()
   524  	upd, err := p._joinKalive(nsi, regReq.Smap, apiOp, nsi.Flags, &regReq, msg)
   525  	p.owner.smap.mu.Unlock()
   526  	if err != nil {
   527  		p.writeErr(w, r, err)
   528  		return
   529  	}
   530  	if !upd {
   531  		if apiOp == apc.AdminJoin {
   532  			// TODO: respond !updated (NOP)
   533  			p.writeJSON(w, r, apc.JoinNodeResult{DaemonID: nsi.ID()}, "")
   534  		}
   535  		return
   536  	}
   537  
   538  	nlog.Infof("%s: %s(%q) %s (%s)", p, apiOp, action, nsi.StringEx(), regReq.Smap)
   539  
   540  	if apiOp == apc.AdminJoin {
   541  		rebID, err := p.mcastJoined(nsi, msg, nsi.Flags, &regReq)
   542  		if err != nil {
   543  			p.writeErr(w, r, err)
   544  			return
   545  		}
   546  		p.writeJSON(w, r, apc.JoinNodeResult{DaemonID: nsi.ID(), RebalanceID: rebID}, "")
   547  		return
   548  	}
   549  
   550  	if apiOp == apc.SelfJoin {
   551  		// respond to the self-joining node with cluster-meta that does not include Smap
   552  		meta, err := p.cluMeta(cmetaFillOpt{skipSmap: true})
   553  		if err != nil {
   554  			p.writeErr(w, r, err)
   555  			return
   556  		}
   557  		p.writeJSON(w, r, meta, path.Join(msg.Action, nsi.ID()))
   558  	}
   559  
   560  	go p.mcastJoined(nsi, msg, nsi.Flags, &regReq)
   561  }
   562  
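        // fast keepalive: once the cluster has been up long enough, accept a keepalive from a
        // known node based on the caller-ID and Smap-version headers alone (no request body);
        // otherwise, fail with errFastKalive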
   563  func (p *proxy) fastKalive(w http.ResponseWriter, r *http.Request, smap *smapX, config *cmn.Config, sid string) {
   564  	fast := p.readyToFastKalive.Load()
   565  	if !fast {
   566  		var (
   567  			now       = mono.NanoTime()
   568  			cfg       = config.Keepalive
   569  			minUptime = max(cfg.Target.Interval.D(), cfg.Proxy.Interval.D()) << 1
   570  		)
   571  		if fast = p.keepalive.cluUptime(now) > minUptime; fast {
   572  			p.readyToFastKalive.Store(true) // not resetting upon a change of primary
   573  		}
   574  	}
   575  	if fast {
   576  		var (
   577  			callerID   = r.Header.Get(apc.HdrCallerID)
   578  			callerSver = r.Header.Get(apc.HdrCallerSmapVer)
   579  		)
   580  		if callerID == sid && callerSver != "" && callerSver == smap.vstr {
   581  			if si := smap.GetNode(sid); si != nil {
   582  				p.keepalive.heardFrom(sid)
   583  				return
   584  			}
   585  		}
   586  	}
   587  	p.writeErr(w, r, errFastKalive, 0, Silent)
   588  }
   589  
   590  // when joining manually: update the node with cluster meta that does not include Smap
   591  // (the latter gets finalized and metasync-ed upon success)
   592  func (p *proxy) adminJoinHandshake(smap *smapX, nsi *meta.Snode, apiOp string) (int, error) {
   593  	cm, err := p.cluMeta(cmetaFillOpt{skipSmap: true})
   594  	if err != nil {
   595  		return http.StatusInternalServerError, err
   596  	}
   597  	nlog.Infof("%s: %s %s => (%s)", p, apiOp, nsi.StringEx(), p.owner.smap.get().StringEx())
   598  
   599  	cargs := allocCargs()
   600  	{
   601  		cargs.si = nsi
   602  		cargs.req = cmn.HreqArgs{Method: http.MethodPost, Path: apc.URLPathDaeAdminJoin.S, Body: cos.MustMarshal(cm)}
   603  		cargs.timeout = cmn.Rom.CplaneOperation()
   604  	}
   605  	res := p.call(cargs, smap)
   606  	err = res.err
   607  	status := res.status
   608  	if err != nil {
   609  		if cos.IsRetriableConnErr(res.err) {
   610  			err = fmt.Errorf("%s: failed to reach %s at %s:%s: %w",
   611  				p.si, nsi.StringEx(), nsi.PubNet.Hostname, nsi.PubNet.Port, res.err)
   612  		} else {
   613  			err = res.errorf("%s: failed to %s %s: %v", p.si, apiOp, nsi.StringEx(), res.err)
   614  		}
   615  	}
   616  	freeCargs(cargs)
   617  	freeCR(res)
   618  	return status, err
   619  }
   620  
   621  // executes under lock
   622  func (p *proxy) _joinKalive(nsi *meta.Snode, regSmap *smapX, apiOp string, flags cos.BitFlags, regReq *cluMeta, msg *apc.ActMsg) (upd bool, err error) {
   623  	smap := p.owner.smap.get()
   624  	if !smap.isPrimary(p.si) {
   625  		err = newErrNotPrimary(p.si, smap, "cannot "+apiOp+" "+nsi.StringEx())
   626  		return
   627  	}
   628  
   629  	keepalive := apiOp == apc.Keepalive
   630  	osi := smap.GetNode(nsi.ID())
   631  	if osi == nil {
   632  		if keepalive {
   633  			nlog.Warningln(p.String(), "keepalive", nsi.StringEx(), "- adding back to the", smap.StringEx())
   634  		}
   635  	} else {
   636  		if osi.Type() != nsi.Type() {
   637  			err = fmt.Errorf("unexpected node type: osi=%s, nsi=%s, %s (%t)", osi.StringEx(), nsi.StringEx(), smap.StringEx(), keepalive)
   638  			return
   639  		}
   640  		if keepalive {
   641  			upd = p.kalive(nsi, osi)
   642  		} else if regReq.Flags.IsSet(cifl.Restarted) {
   643  			upd = true
   644  		} else {
   645  			upd = p.rereg(nsi, osi)
   646  		}
   647  		if !upd {
   648  			return
   649  		}
   650  	}
   651  	// check for cluster integrity errors (cie)
   652  	if err = smap.validateUUID(p.si, regSmap, nsi.StringEx(), 80 /* ciError */); err != nil {
   653  		return
   654  	}
   655  	if apiOp == apc.Keepalive {
   656  		// whether IP is in use by a different node
   657  		// (but only for keep-alive - the other two opcodes have already been checked via handshake)
   658  		if _, err = smap.IsDupNet(nsi); err != nil {
   659  			err = errors.New(p.String() + ": " + err.Error())
   660  		}
   661  	}
   662  
   663  	// when cluster's starting up
   664  	if a, b := p.ClusterStarted(), p.owner.rmd.starting.Load(); err == nil && (!a || b) {
   665  		clone := smap.clone()
   666  		// TODO [feature]: updated *nsi contents (e.g., different network) may not "survive" earlystart merge
   667  		clone.putNode(nsi, flags, false /*silent*/)
   668  		p.owner.smap.put(clone)
   669  		upd = false
   670  		if a {
   671  			aisMsg := p.newAmsg(msg, nil)
   672  			_ = p.metasyncer.sync(revsPair{clone, aisMsg})
   673  		}
   674  		return
   675  	}
   676  
   677  	upd = err == nil
   678  	return
   679  }
   680  
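        // contact the node currently registered as osi and check whether the Snode it reports
        // back equals nsi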
   681  func (p *proxy) _confirmSnode(osi, nsi *meta.Snode) (bool, error) {
   682  	si, err := p._getSI(osi)
   683  	if err != nil {
   684  		return false, err
   685  	}
   686  	return nsi.Eq(si), nil
   687  }
   688  
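        // keepalive from a node that is already present in the Smap; returns true iff the
        // node's net-info has changed (and was confirmed) and the cluster map must be updated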
   689  func (p *proxy) kalive(nsi, osi *meta.Snode) bool {
   690  	if !osi.Eq(nsi) {
   691  		ok, err := p._confirmSnode(osi, nsi)
   692  		if err != nil {
   693  			nlog.Errorf("%s: %s(%s) failed to obtain node info: %v", p, nsi.StringEx(), nsi.PubNet.URL, err)
   694  			return false
   695  		}
   696  		if !ok {
   697  			nlog.Errorf("%s: %s(%s) is trying to keepalive with duplicate ID", p, nsi.StringEx(), nsi.PubNet.URL)
   698  			return false
   699  		}
   700  		nlog.Warningf("%s: renewing registration %s (info changed!)", p, nsi.StringEx())
   701  		return true // NOTE: update cluster map
   702  	}
   703  
   704  	p.keepalive.heardFrom(nsi.ID())
   705  	return false
   706  }
   707  
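        // re-registration of an already present node; returns true iff the cluster map must be updated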
   708  func (p *proxy) rereg(nsi, osi *meta.Snode) bool {
   709  	if !p.NodeStarted() {
   710  		return true
   711  	}
   712  	if osi.Eq(nsi) {
   713  		nlog.Infoln(p.String()+":", nsi.StringEx(), "is already _in_")
   714  		return false
   715  	}
   716  
   717  	// NOTE: see also ref0417 (ais/earlystart)
   718  	nlog.Warningln(p.String()+":", "renewing", nsi.StringEx(), "=>", nsi.StrURLs())
   719  	return true
   720  }
   721  
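        // finalize the join: update and metasync the cluster map, and trigger global rebalance
        // when warranted (returning its ID); otherwise, clean up restart/interruption markers
        // and stop early-GFN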
   722  func (p *proxy) mcastJoined(nsi *meta.Snode, msg *apc.ActMsg, flags cos.BitFlags, regReq *cluMeta) (xid string, err error) {
   723  	ctx := &smapModifier{
   724  		pre:         p._joinedPre,
   725  		post:        p._joinedPost,
   726  		final:       p._joinedFinal,
   727  		nsi:         nsi,
   728  		msg:         msg,
   729  		flags:       flags,
   730  		interrupted: regReq.Flags.IsSet(cifl.RebalanceInterrupted),
   731  		restarted:   regReq.Flags.IsSet(cifl.Restarted),
   732  	}
   733  	if err = p._earlyGFN(ctx, ctx.nsi); err != nil {
   734  		return
   735  	}
   736  	if err = p.owner.smap.modify(ctx); err != nil {
   737  		debug.AssertNoErr(err)
   738  		return
   739  	}
   740  	// with rebalance
   741  	if ctx.rmdCtx != nil && ctx.rmdCtx.cur != nil {
   742  		debug.Assert(ctx.rmdCtx.rebID != "")
   743  		xid = ctx.rmdCtx.rebID
   744  		return
   745  	}
   746  	// cleanup target state
   747  	if ctx.restarted || ctx.interrupted {
   748  		go p.cleanupMark(ctx)
   749  	}
   750  	if ctx.gfn {
   751  		aisMsg := p.newAmsgActVal(apc.ActStopGFN, nil) // "stop-gfn" timed
   752  		aisMsg.UUID = ctx.nsi.ID()
   753  		revs := revsPair{&smapX{Smap: meta.Smap{Version: ctx.nver}}, aisMsg}
   754  		_ = p.metasyncer.notify(false /*wait*/, revs) // async, failed-cnt always zero
   755  	}
   756  	return
   757  }
   758  
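        // a target is being added (or removed): notify existing targets to enter the early-GFN
        // state; the notification carries a version-only Smap and the target's ID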
   759  func (p *proxy) _earlyGFN(ctx *smapModifier, si *meta.Snode /*being added or removed*/) error {
   760  	smap := p.owner.smap.get()
   761  	if !smap.isPrimary(p.si) {
   762  		return newErrNotPrimary(p.si, smap, fmt.Sprintf("cannot add %s", si))
   763  	}
   764  	if si.IsProxy() {
   765  		return nil
   766  	}
   767  	if err := p.canRebalance(); err != nil {
   768  		if err == errRebalanceDisabled {
   769  			err = nil
   770  		}
   771  		return err
   772  	}
   773  
   774  	// early-GFN notification with an empty (version-only and not yet updated) Smap and
   775  	// a message carrying the new target's ID
   776  	msg := p.newAmsgActVal(apc.ActStartGFN, nil)
   777  	msg.UUID = si.ID()
   778  	revs := revsPair{&smapX{Smap: meta.Smap{Version: smap.Version}}, msg}
   779  	if fcnt := p.metasyncer.notify(true /*wait*/, revs); fcnt > 0 {
   780  		return fmt.Errorf("failed to notify early-gfn (%d)", fcnt)
   781  	}
   782  	ctx.gfn = true // to undo if need be
   783  	return nil
   784  }
   785  
   786  // calls t.cleanupMark
   787  func (p *proxy) cleanupMark(ctx *smapModifier) {
   788  	var (
   789  		val = cleanmark{OldVer: ctx.smap.version(), NewVer: ctx.nver,
   790  			Interrupted: ctx.interrupted, Restarted: ctx.restarted,
   791  		}
   792  		msg     = apc.ActMsg{Action: apc.ActCleanupMarkers, Value: &val}
   793  		cargs   = allocCargs()
   794  		smap    = p.owner.smap.get()
   795  		timeout = cmn.Rom.CplaneOperation()
   796  		sleep   = timeout >> 1
   797  	)
   798  	{
   799  		cargs.si = ctx.nsi
   800  		cargs.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)}
   801  		cargs.timeout = timeout
   802  	}
   803  	time.Sleep(sleep)
   804  	for i := range 4 { // retry
   805  		res := p.call(cargs, smap)
   806  		err := res.err
   807  		freeCR(res)
   808  		if err == nil {
   809  			break
   810  		}
   811  		if cos.IsRetriableConnErr(err) {
   812  			time.Sleep(sleep)
   813  			smap = p.owner.smap.get()
   814  			nlog.Warningf("%s: %v (cleanmark #%d)", p, err, i+1)
   815  			continue
   816  		}
   817  		nlog.Errorln(err)
   818  		break
   819  	}
   820  	freeCargs(cargs)
   821  }
   822  
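        // smapModifier pre-hook: add the new node to the cloned Smap (primary only) and,
        // for proxies, re-staff the IC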
   823  func (p *proxy) _joinedPre(ctx *smapModifier, clone *smapX) error {
   824  	if !clone.isPrimary(p.si) {
   825  		return newErrNotPrimary(p.si, clone, fmt.Sprintf("cannot add %s", ctx.nsi))
   826  	}
   827  	clone.putNode(ctx.nsi, ctx.flags, true /*silent*/)
   828  	if ctx.nsi.IsProxy() {
   829  		clone.staffIC()
   830  	}
   831  	return nil
   832  }
   833  
   834  // RMD is always transmitted to provide for its (RMD's) replication -
   835  // done under Smap lock to serialize with respect to new joins.
   836  func (p *proxy) _joinedPost(ctx *smapModifier, clone *smapX) {
   837  	if ctx.nsi.IsProxy() {
   838  		return
   839  	}
   840  	if err := p.canRebalance(); err != nil {
   841  		return
   842  	}
   843  	if !mustRebalance(ctx, clone) {
   844  		return
   845  	}
   846  	// new RMD
   847  	rmdCtx := &rmdModifier{
   848  		pre: func(_ *rmdModifier, clone *rebMD) {
   849  			clone.TargetIDs = []string{ctx.nsi.ID()}
   850  			clone.inc()
   851  		},
   852  		smapCtx: ctx,
   853  		p:       p,
   854  		wait:    true,
   855  	}
   856  	if _, err := p.owner.rmd.modify(rmdCtx); err != nil {
   857  		debug.AssertNoErr(err)
   858  		return
   859  	}
   860  	rmdCtx.listen(nil)
   861  	ctx.rmdCtx = rmdCtx // smap modifier to reference the rmd one directly
   862  }
   863  
   864  func (p *proxy) _joinedFinal(ctx *smapModifier, clone *smapX) {
   865  	var (
   866  		tokens = p.authn.revokedTokenList()
   867  		bmd    = p.owner.bmd.get()
   868  		etlMD  = p.owner.etl.get()
   869  		aisMsg = p.newAmsg(ctx.msg, bmd)
   870  		pairs  = make([]revsPair, 0, 5)
   871  	)
   872  	// when targets join as well (redundant?, minor)
   873  	config, err := p.ensureConfigURLs()
   874  	if config == nil /*not updated*/ && err == nil {
   875  		config, err = p.owner.config.get()
   876  	}
   877  	if err != nil {
   878  		nlog.Errorln(err)
   879  		// proceed anyway
   880  	} else if config != nil {
   881  		pairs = append(pairs, revsPair{config, aisMsg})
   882  	}
   883  
   884  	pairs = append(pairs, revsPair{clone, aisMsg}, revsPair{bmd, aisMsg})
   885  	if etlMD != nil && etlMD.version() > 0 {
   886  		pairs = append(pairs, revsPair{etlMD, aisMsg})
   887  	}
   888  
   889  	reb := ctx.rmdCtx != nil && ctx.rmdCtx.rebID != ""
   890  	if !reb {
   891  		// replicate RMD across (existing nodes will drop it upon version comparison)
   892  		rmd := p.owner.rmd.get()
   893  		pairs = append(pairs, revsPair{rmd, aisMsg})
   894  	} else {
   895  		debug.Assert(ctx.rmdCtx.prev.version() < ctx.rmdCtx.cur.version())
   896  		aisMsg.UUID = ctx.rmdCtx.rebID
   897  		pairs = append(pairs, revsPair{ctx.rmdCtx.cur, aisMsg})
   898  	}
   899  
   900  	if tokens != nil {
   901  		pairs = append(pairs, revsPair{tokens, aisMsg})
   902  	}
   903  	_ = p.metasyncer.sync(pairs...)
   904  	p.syncNewICOwners(ctx.smap, clone)
   905  }
   906  
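        // smapModifier final-hook: metasync the updated Smap together with the new RMD
        // and/or config, if any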
   907  func (p *proxy) _syncFinal(ctx *smapModifier, clone *smapX) {
   908  	var (
   909  		aisMsg = p.newAmsg(ctx.msg, nil)
   910  		pairs  = make([]revsPair, 0, 2)
   911  		reb    = ctx.rmdCtx != nil && ctx.rmdCtx.rebID != ""
   912  	)
   913  	pairs = append(pairs, revsPair{clone, aisMsg})
   914  	if reb {
   915  		debug.Assert(ctx.rmdCtx.prev.version() < ctx.rmdCtx.cur.version())
   916  		aisMsg.UUID = ctx.rmdCtx.rebID
   917  		pairs = append(pairs, revsPair{ctx.rmdCtx.cur, aisMsg})
   918  	}
   919  	debug.Assert(clone._sgl != nil)
   920  
   921  	config, err := p.ensureConfigURLs()
   922  	if config != nil /*updated*/ {
   923  		debug.AssertNoErr(err)
   924  		pairs = append(pairs, revsPair{config, aisMsg})
   925  	}
   926  
   927  	wg := p.metasyncer.sync(pairs...)
   928  	if ctx.rmdCtx != nil && ctx.rmdCtx.wait {
   929  		wg.Wait()
   930  	}
   931  }
   932  
   933  /////////////////////
   934  // PUT /v1/cluster //
   935  /////////////////////
   936  
   937  // - cluster membership, including maintenance and decommission
   938  // - start/stop xactions
   939  // - rebalance
   940  // - cluster-wide configuration
   942  func (p *proxy) httpcluput(w http.ResponseWriter, r *http.Request) {
   943  	apiItems, err := p.parseURL(w, r, apc.URLPathClu.L, 0, true)
   944  	if err != nil {
   945  		return
   946  	}
   947  	if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil {
   948  		return
   949  	}
   950  	if nlog.Stopping() {
   951  		p.writeErr(w, r, fmt.Errorf("%s is stopping", p), http.StatusServiceUnavailable)
   952  		return
   953  	}
   954  	if !p.NodeStarted() {
   955  		p.writeErrStatusf(w, r, http.StatusServiceUnavailable, "%s is not ready yet (starting up)", p)
   956  		return
   957  	}
   958  	if len(apiItems) == 0 {
   959  		p.cluputJSON(w, r)
   960  	} else {
   961  		p.cluputQuery(w, r, apiItems[0])
   962  	}
   963  }
   964  
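        // PUT /v1/cluster with an action message (apc.ActMsg) in the request body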
   965  func (p *proxy) cluputJSON(w http.ResponseWriter, r *http.Request) {
   966  	msg, err := p.readActionMsg(w, r)
   967  	if err != nil {
   968  		return
   969  	}
   970  	if msg.Action != apc.ActSendOwnershipTbl {
   971  		// must be primary to execute all the rest actions
   972  		if p.forwardCP(w, r, msg, "") {
   973  			return
   974  		}
   975  
   976  		// not just 'cluster-started' - must be ready to rebalance as well
   977  		// with two distinct exceptions
   978  		withRR := (msg.Action != apc.ActShutdownCluster && msg.Action != apc.ActXactStop)
   979  		if err := p.pready(nil, withRR); err != nil {
   980  			p.writeErr(w, r, err, http.StatusServiceUnavailable)
   981  			return
   982  		}
   983  	}
   984  
   985  	switch msg.Action {
   986  	case apc.ActSetConfig:
   987  		toUpdate := &cmn.ConfigToSet{}
   988  		if err := cos.MorphMarshal(msg.Value, toUpdate); err != nil {
   989  			p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err)
   990  			return
   991  		}
   992  		query := r.URL.Query()
   993  		if transient := cos.IsParseBool(query.Get(apc.ActTransient)); transient {
   994  			p.setCluCfgTransient(w, r, toUpdate, msg)
   995  		} else {
   996  			p.setCluCfgPersistent(w, r, toUpdate, msg)
   997  		}
   998  	case apc.ActResetConfig:
   999  		p.resetCluCfgPersistent(w, r, msg)
  1000  	case apc.ActRotateLogs:
  1001  		p.rotateLogs(w, r, msg)
  1002  
  1003  	case apc.ActShutdownCluster:
  1004  		args := allocBcArgs()
  1005  		args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)}
  1006  		args.to = core.AllNodes
  1007  		_ = p.bcastGroup(args)
  1008  		freeBcArgs(args)
  1009  		// self
  1010  		p.termKalive(msg.Action)
  1011  		p.shutdown(msg.Action)
  1012  	case apc.ActDecommissionCluster:
  1013  		var (
  1014  			opts apc.ActValRmNode
  1015  			args = allocBcArgs()
  1016  		)
  1017  		if err := cos.MorphMarshal(msg.Value, &opts); err != nil {
  1018  			p.writeErr(w, r, err)
  1019  			return
  1020  		}
  1021  		args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)}
  1022  		args.to = core.AllNodes
  1023  		_ = p.bcastGroup(args)
  1024  		freeBcArgs(args)
  1025  		// self
  1026  		p.termKalive(msg.Action)
  1027  		p.decommission(msg.Action, &opts)
  1028  	case apc.ActStartMaintenance, apc.ActDecommissionNode, apc.ActShutdownNode, apc.ActRmNodeUnsafe:
  1029  		p.rmNode(w, r, msg)
  1030  	case apc.ActStopMaintenance:
  1031  		p.stopMaintenance(w, r, msg)
  1032  
  1033  	case apc.ActResetStats:
  1034  		errorsOnly := msg.Value.(bool)
  1035  		p.statsT.ResetStats(errorsOnly)
  1036  		args := allocBcArgs()
  1037  		args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)}
  1038  		p.bcastAllNodes(w, r, args)
  1039  		freeBcArgs(args)
  1040  	case apc.ActXactStart:
  1041  		p.xstart(w, r, msg)
  1042  	case apc.ActXactStop:
  1043  		p.xstop(w, r, msg)
  1044  	case apc.ActSendOwnershipTbl:
  1045  		p.sendOwnTbl(w, r, msg)
  1046  	default:
  1047  		p.writeErrAct(w, r, msg.Action)
  1048  	}
  1049  }
  1050  
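        // persistently update cluster-wide configuration and metasync the result; warn loudly
        // about changes (net.http, auth) that may require cluster restart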
  1051  func (p *proxy) setCluCfgPersistent(w http.ResponseWriter, r *http.Request, toUpdate *cmn.ConfigToSet, msg *apc.ActMsg) {
  1052  	ctx := &configModifier{
  1053  		pre:      _setConfPre,
  1054  		final:    p._syncConfFinal,
  1055  		msg:      msg,
  1056  		toUpdate: toUpdate,
  1057  		wait:     true,
  1058  	}
  1059  	// NOTE: critical cluster-wide config updates requiring restart (of the cluster)
  1060  	if toUpdate.Net != nil && toUpdate.Net.HTTP != nil {
  1061  		config := cmn.GCO.Get()
  1062  		from, _ := jsoniter.Marshal(config.Net.HTTP)
  1063  		to, _ := jsoniter.Marshal(toUpdate.Net.HTTP)
  1064  		whingeToUpdate("net.http", string(from), string(to))
  1065  
  1066  		// complementary
  1067  		if toUpdate.Net.HTTP.UseHTTPS != nil {
  1068  			use := *toUpdate.Net.HTTP.UseHTTPS
  1069  			if config.Net.HTTP.UseHTTPS != use {
  1070  				if toUpdate.Proxy == nil {
  1071  					toUpdate.Proxy = &cmn.ProxyConfToSet{}
  1072  				}
  1073  				switchHTTPS(toUpdate.Proxy, &config.Proxy, use)
  1074  			}
  1075  		}
  1076  	}
  1077  	if toUpdate.Auth != nil {
  1078  		from, _ := jsoniter.Marshal(cmn.GCO.Get().Auth)
  1079  		to, _ := jsoniter.Marshal(toUpdate.Auth)
  1080  		whingeToUpdate("config.auth", string(from), string(to))
  1081  	}
  1082  
  1083  	// do
  1084  	if _, err := p.owner.config.modify(ctx); err != nil {
  1085  		p.writeErr(w, r, err)
  1086  	}
  1087  }
  1088  
  1089  // switch http => https, or vice versa
  1090  func switchHTTPS(toCfg *cmn.ProxyConfToSet, fromCfg *cmn.ProxyConf, use bool) {
  1091  	toScheme, fromScheme := "http", "https"
  1092  	if use {
  1093  		toScheme, fromScheme = "https", "http"
  1094  	}
  1095  	f := func(to *string, from string) *string {
  1096  		if to == nil && strings.HasPrefix(from, fromScheme) {
  1097  			s := strings.Replace(from, fromScheme, toScheme, 1)
  1098  			to = apc.Ptr(s)
  1099  		}
  1100  		return to
  1101  	}
  1102  	toCfg.PrimaryURL = f(toCfg.PrimaryURL, fromCfg.PrimaryURL)
  1103  	toCfg.OriginalURL = f(toCfg.OriginalURL, fromCfg.OriginalURL)
  1104  	toCfg.DiscoveryURL = f(toCfg.DiscoveryURL, fromCfg.DiscoveryURL)
  1105  
  1106  	nlog.Errorln("Warning: _prior_ to restart make sure to remove all copies of cluster maps")
  1107  }
  1108  
  1109  func whingeToUpdate(what, from, to string) {
  1110  	nlog.Warningf("Updating cluster %s configuration: setting %s", what, to)
  1111  	nlog.Warningf("Prior-to-update %s values: %s", what, from)
  1112  	nlog.Errorln("Warning: this update MAY require cluster restart")
  1113  }
  1114  
  1115  func (p *proxy) resetCluCfgPersistent(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1116  	if err := p.owner.config.resetDaemonConfig(); err != nil {
  1117  		p.writeErr(w, r, err)
  1118  		return
  1119  	}
  1120  	body := cos.MustMarshal(msg)
  1121  
  1122  	args := allocBcArgs()
  1123  	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: body}
  1124  	p.bcastAllNodes(w, r, args)
  1125  	freeBcArgs(args)
  1126  }
  1127  
  1128  func (p *proxy) rotateLogs(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1129  	nlog.Flush(nlog.ActRotate)
  1130  	body := cos.MustMarshal(msg)
  1131  	args := allocBcArgs()
  1132  	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: body}
  1133  	p.bcastAllNodes(w, r, args)
  1134  	freeBcArgs(args)
  1135  }
  1136  
  1137  func (p *proxy) setCluCfgTransient(w http.ResponseWriter, r *http.Request, toUpdate *cmn.ConfigToSet, msg *apc.ActMsg) {
  1138  	co := p.owner.config
  1139  	co.Lock()
  1140  	err := setConfig(toUpdate, true /* transient */)
  1141  	co.Unlock()
  1142  	if err != nil {
  1143  		p.writeErr(w, r, err)
  1144  		return
  1145  	}
  1146  
  1147  	msg.Value = toUpdate
  1148  	args := allocBcArgs()
  1149  	args.req = cmn.HreqArgs{
  1150  		Method: http.MethodPut,
  1151  		Path:   apc.URLPathDae.S,
  1152  		Body:   cos.MustMarshal(msg),
  1153  		Query:  url.Values{apc.ActTransient: []string{"true"}},
  1154  	}
  1155  	p.bcastAllNodes(w, r, args)
  1156  	freeBcArgs(args)
  1157  }
  1158  
  1159  func _setConfPre(ctx *configModifier, clone *globalConfig) (updated bool, err error) {
  1160  	if err = clone.Apply(ctx.toUpdate, apc.Cluster); err != nil {
  1161  		return
  1162  	}
  1163  	updated = true
  1164  	return
  1165  }
  1166  
  1167  func (p *proxy) _syncConfFinal(ctx *configModifier, clone *globalConfig) {
  1168  	wg := p.metasyncer.sync(revsPair{clone, p.newAmsg(ctx.msg, nil)})
  1169  	if ctx.wait {
  1170  		wg.Wait()
  1171  	}
  1172  }
  1173  
  1174  // xstart: rebalance, resilver, other "startables" (see xaction/api.go)
  1175  func (p *proxy) xstart(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1176  	var xargs xact.ArgsMsg
  1177  	if err := cos.MorphMarshal(msg.Value, &xargs); err != nil {
  1178  		p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err)
  1179  		return
  1180  	}
  1181  	xargs.Kind, _ = xact.GetKindName(xargs.Kind) // display name => kind
  1182  
  1183  	// rebalance
  1184  	if xargs.Kind == apc.ActRebalance {
  1185  		p.rebalanceCluster(w, r, msg)
  1186  		return
  1187  	}
  1188  
  1189  	args := allocBcArgs()
  1190  	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathXactions.S}
  1191  
  1192  	switch {
  1193  	case xargs.Kind == apc.ActBlobDl:
  1194  		// validate; select one target
  1195  		args.smap = p.owner.smap.get()
  1196  		tsi, err := p.blobdl(args.smap, &xargs, msg)
  1197  		if err != nil {
  1198  			freeBcArgs(args)
  1199  			p.writeErr(w, r, err)
  1200  			return
  1201  		}
  1202  		args._selected(tsi)
  1203  		args.req.Body = cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs, Name: msg.Name})
  1204  	case xargs.Kind == apc.ActResilver && xargs.DaemonID != "":
  1205  		args.smap = p.owner.smap.get()
  1206  		tsi := args.smap.GetTarget(xargs.DaemonID)
  1207  		if tsi == nil {
  1208  			err := &errNodeNotFound{"cannot resilver", xargs.DaemonID, p.si, args.smap}
  1209  			p.writeErr(w, r, err)
  1210  			return
  1211  		}
  1212  		args._selected(tsi)
  1213  		args.req.Body = cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs})
  1214  	default:
  1215  		// all targets, one common UUID for all
  1216  		args.to = core.Targets
  1217  		xargs.ID = cos.GenUUID()
  1218  		args.req.Body = cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs})
  1219  	}
  1220  
  1221  	results := p.bcastGroup(args)
  1222  	freeBcArgs(args)
  1223  
  1224  	for _, res := range results {
  1225  		if res.err == nil {
  1226  			if xargs.Kind == apc.ActResilver && xargs.DaemonID != "" {
  1227  				// - UUID assigned by the selected target (see above)
  1228  				// - not running notif listener for blob downloads - may reconsider
  1229  				xargs.ID = string(res.bytes)
  1230  			}
  1231  			continue
  1232  		}
  1233  		p.writeErr(w, r, res.toErr())
  1234  		freeBcastRes(results)
  1235  		return
  1236  	}
  1237  	freeBcastRes(results)
  1238  
  1239  	if xargs.ID != "" {
  1240  		smap := p.owner.smap.get()
  1241  		nl := xact.NewXactNL(xargs.ID, xargs.Kind, &smap.Smap, nil)
  1242  		p.ic.registerEqual(regIC{smap: smap, nl: nl})
  1243  
  1244  		w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(xargs.ID)))
  1245  		w.Write([]byte(xargs.ID))
  1246  	}
  1247  }
  1248  
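        // restrict the broadcast to a single selected node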
  1249  func (a *bcastArgs) _selected(tsi *meta.Snode) {
  1250  	nmap := make(meta.NodeMap, 1)
  1251  	nmap[tsi.ID()] = tsi
  1252  	a.nodes = []meta.NodeMap{nmap}
  1253  	a.to = core.SelectedNodes
  1254  }
  1255  
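        // validate the (remote) bucket and select, via HRW, the single target to run blob-download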
  1256  func (p *proxy) blobdl(smap *smapX, xargs *xact.ArgsMsg, msg *apc.ActMsg) (tsi *meta.Snode, err error) {
  1257  	bck := meta.CloneBck(&xargs.Bck)
  1258  	if err := bck.Init(p.owner.bmd); err != nil {
  1259  		return nil, err
  1260  	}
  1261  	if err := cmn.ValidateRemoteBck(apc.ActBlobDl, &xargs.Bck); err != nil {
  1262  		return nil, err
  1263  	}
  1264  	objName := msg.Name
  1265  	tsi, _, err = smap.HrwMultiHome(xargs.Bck.MakeUname(objName))
  1266  	return tsi, err
  1267  }
  1268  
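        // stop xaction(s) cluster-wide; aborting rebalance is not permitted while target nodes
        // are transitioning into maintenance or being decommissioned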
  1269  func (p *proxy) xstop(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1270  	var (
  1271  		xargs = xact.ArgsMsg{}
  1272  	)
  1273  	if err := cos.MorphMarshal(msg.Value, &xargs); err != nil {
  1274  		p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err)
  1275  		return
  1276  	}
  1277  	xargs.Kind, _ = xact.GetKindName(xargs.Kind) // display name => kind
  1278  
  1279  	// (lso + tco) special
  1280  	p.lstca.abort(&xargs)
  1281  
  1282  	if xargs.Kind == apc.ActRebalance {
  1283  		// disallow aborting rebalance during
  1284  		// critical (meta.SnodeMaint => meta.SnodeMaintPostReb) and (meta.SnodeDecomm => removed) transitions
  1285  		smap := p.owner.smap.get()
  1286  		for _, tsi := range smap.Tmap {
  1287  			if tsi.Flags.IsAnySet(meta.SnodeMaint) && !tsi.Flags.IsAnySet(meta.SnodeMaintPostReb) {
  1288  				p.writeErrf(w, r, "cannot abort %s: putting %s in maintenance mode - rebalancing...",
  1289  					xargs.String(), tsi.StringEx())
  1290  				return
  1291  			}
  1292  			if tsi.Flags.IsAnySet(meta.SnodeDecomm) {
  1293  				p.writeErrf(w, r, "cannot abort %s: decommissioning %s - rebalancing...",
  1294  					xargs.String(), tsi.StringEx())
  1295  				return
  1296  			}
  1297  		}
  1298  	}
  1299  
  1300  	body := cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs})
  1301  	args := allocBcArgs()
  1302  	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathXactions.S, Body: body}
  1303  	args.to = core.Targets
  1304  	results := p.bcastGroup(args)
  1305  	freeBcArgs(args)
  1306  
  1307  	for _, res := range results {
  1308  		if res.err != nil {
  1309  			p.writeErr(w, r, res.toErr())
  1310  			break
  1311  		}
  1312  	}
  1313  	freeBcastRes(results)
  1314  }
  1315  
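        // administratively start global rebalance: increment and metasync RMD, and respond
        // with the resulting rebalance ID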
  1316  func (p *proxy) rebalanceCluster(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1317  	// NOTE: an explicit admin request takes precedence over config-disabled rebalance (errRebalanceDisabled)
  1318  	if err := p.canRebalance(); err != nil && err != errRebalanceDisabled {
  1319  		p.writeErr(w, r, err)
  1320  		return
  1321  	}
  1322  	smap := p.owner.smap.get()
  1323  	if smap.CountTargets() < 2 {
  1324  		p.writeErr(w, r, &errNotEnoughTargets{p.si, smap, 2})
  1325  		return
  1326  	}
  1327  	if na := smap.CountActiveTs(); na < 2 {
  1328  		nlog.Warningf("%s: not enough active targets (%d) - proceeding to rebalance anyway", p, na)
  1329  	}
  1330  	rmdCtx := &rmdModifier{
  1331  		pre:     rmdInc,
  1332  		final:   rmdSync, // metasync new rmd instance
  1333  		p:       p,
  1334  		smapCtx: &smapModifier{smap: smap, msg: msg},
  1335  	}
  1336  	_, err := p.owner.rmd.modify(rmdCtx)
  1337  	if err != nil {
  1338  		p.writeErr(w, r, err)
  1339  		return
  1340  	}
  1341  	debug.Assert(rmdCtx.rebID != "")
  1342  	w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(rmdCtx.rebID)))
  1343  	w.Write([]byte(rmdCtx.rebID))
  1344  }
  1345  
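        // apc.ActSendOwnershipTbl: send the IC ownership table to the destination proxy -
        // directly if this node is an IC member, otherwise by forwarding the request to the
        // IC member proxies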
  1346  func (p *proxy) sendOwnTbl(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1347  	var (
  1348  		smap  = p.owner.smap.get()
  1349  		dstID string
  1350  	)
  1351  	if err := cos.MorphMarshal(msg.Value, &dstID); err != nil {
  1352  		p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err)
  1353  		return
  1354  	}
  1355  	dst := smap.GetProxy(dstID)
  1356  	if dst == nil {
  1357  		p.writeErrf(w, r, "%s: unknown proxy node p[%s]", p.si, dstID)
  1358  		return
  1359  	}
  1360  	if !smap.IsIC(dst) {
  1361  		p.writeErrf(w, r, "%s: not an IC member", dst)
  1362  		return
  1363  	}
  1364  	if smap.IsIC(p.si) && !p.si.Eq(dst) {
  1365  		// this proxy is an IC member (and not dst) - send the ownership table directly
  1366  		if err := p.ic.sendOwnershipTbl(dst, smap); err != nil {
  1367  			p.writeErr(w, r, err)
  1368  		}
  1369  		return
  1370  	}
  1371  	// forward
  1372  	var (
  1373  		err   error
  1374  		cargs = allocCargs()
  1375  	)
  1376  	{
  1377  		cargs.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathClu.S, Body: cos.MustMarshal(msg)}
  1378  		cargs.timeout = apc.DefaultTimeout
  1379  	}
  1380  	for pid, psi := range smap.Pmap {
  1381  		if !smap.IsIC(psi) || pid == dstID {
  1382  			continue
  1383  		}
  1384  		cargs.si = psi
  1385  		res := p.call(cargs, smap)
  1386  		if res.err != nil {
  1387  			err = res.toErr()
  1388  		}
  1389  		freeCR(res)
  1390  	}
  1391  	if err != nil {
  1392  		p.writeErr(w, r, err)
  1393  	}
  1394  	freeCargs(cargs)
  1395  }
  1396  
  1397  // gracefully remove a node via apc.ActStartMaintenance, apc.ActDecommissionNode, apc.ActShutdownNode (and apc.ActRmNodeUnsafe)
  1398  func (p *proxy) rmNode(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1399  	var (
  1400  		opts apc.ActValRmNode
  1401  		smap = p.owner.smap.get()
  1402  	)
  1403  	if err := cos.MorphMarshal(msg.Value, &opts); err != nil {
  1404  		p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err)
  1405  		return
  1406  	}
  1407  	si := smap.GetNode(opts.DaemonID)
  1408  	if si == nil {
  1409  		err := cos.NewErrNotFound(p, "node "+opts.DaemonID)
  1410  		p.writeErr(w, r, err, http.StatusNotFound)
  1411  		return
  1412  	}
  1413  	var inMaint bool
  1414  	if smap.InMaintOrDecomm(si) {
  1415  		// only (maintenance => decommission|shutdown) permitted
  1416  		sname := si.StringEx()
  1417  		switch msg.Action {
  1418  		case apc.ActDecommissionNode, apc.ActDecommissionCluster,
  1419  			apc.ActShutdownNode, apc.ActShutdownCluster, apc.ActRmNodeUnsafe:
  1420  			onl := true
  1421  			flt := nlFilter{Kind: apc.ActRebalance, OnlyRunning: &onl}
  1422  			if nl := p.notifs.find(flt); nl != nil {
  1423  				p.writeErrf(w, r, "rebalance[%s] is currently running, please try (%s %s) later",
  1424  					nl.UUID(), msg.Action, si.StringEx())
  1425  				return
  1426  			}
  1427  			if !smap.InMaint(si) {
  1428  				nlog.Errorln("Warning: " + sname + " is currently being decommissioned")
  1429  			}
  1430  			inMaint = true
  1431  			// proceeding anyway
  1432  		default:
  1433  			if smap.InMaint(si) {
  1434  				p.writeErrMsg(w, r, sname+" is already in maintenance mode")
  1435  			} else {
  1436  				p.writeErrMsg(w, r, sname+" is currently being decommissioned")
  1437  			}
  1438  			return
  1439  		}
  1440  	}
  1441  	if p.SID() == opts.DaemonID {
  1442  		p.writeErrf(w, r, "%s is the current primary, cannot perform action %q on itself", p, msg.Action)
  1443  		return
  1444  	}
  1445  
  1446  	nlog.Infof("%s: %s(%s) opts=%v", p, msg.Action, si.StringEx(), opts)
  1447  
  1448  	switch {
  1449  	case si.IsProxy():
  1450  		if _, err := p.mcastMaint(msg, si, false /*reb*/, false /*maintPostReb*/); err != nil {
  1451  			p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err))
  1452  			return
  1453  		}
  1454  		ecode, err := p.rmNodeFinal(msg, si, nil)
  1455  		if err != nil {
  1456  			p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err), ecode)
  1457  		}
  1458  	case msg.Action == apc.ActRmNodeUnsafe: // target unsafe
  1459  		if !opts.SkipRebalance {
  1460  			err := errors.New("unsafe must be unsafe")
  1461  			debug.AssertNoErr(err)
  1462  			p.writeErr(w, r, err)
  1463  			return
  1464  		}
  1465  		ecode, err := p.rmNodeFinal(msg, si, nil)
  1466  		if err != nil {
  1467  			p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err), ecode)
  1468  		}
  1469  	default: // target
  1470  		reb := !opts.SkipRebalance && cmn.GCO.Get().Rebalance.Enabled && !inMaint
  1471  		nlog.Infof("%s: %s reb=%t", p, msg.Action, reb)
  1472  		if reb {
  1473  			if err := p.canRebalance(); err != nil {
  1474  				p.writeErr(w, r, err)
  1475  				return
  1476  			}
  1477  			if err := p.beginRmTarget(si, msg); err != nil {
  1478  				p.writeErr(w, r, err)
  1479  				return
  1480  			}
  1481  		}
  1482  		rebID, err := p.rmTarget(si, msg, reb)
  1483  		if err != nil {
  1484  			p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err))
  1485  			return
  1486  		}
  1487  		if rebID != "" {
  1488  			w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(rebID)))
  1489  			w.Write(cos.UnsafeB(rebID))
  1490  		}
  1491  	}
  1492  }
  1493  
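        // put the target in maintenance (or decommission) mode; either remove it right away
        // (no rebalance) or let the post-rebalance callback finalize the removal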
  1494  func (p *proxy) rmTarget(si *meta.Snode, msg *apc.ActMsg, reb bool) (rebID string, err error) {
  1495  	var ctx *smapModifier
  1496  	if ctx, err = p.mcastMaint(msg, si, reb, false /*maintPostReb*/); err != nil {
  1497  		return
  1498  	}
  1499  	if !reb {
  1500  		_, err = p.rmNodeFinal(msg, si, ctx)
  1501  	} else if ctx.rmdCtx != nil {
  1502  		rebID = ctx.rmdCtx.rebID
  1503  		if rebID == "" && ctx.gfn { // stop early gfn
  1504  			aisMsg := p.newAmsgActVal(apc.ActStopGFN, nil)
  1505  			aisMsg.UUID = si.ID()
  1506  			revs := revsPair{&smapX{Smap: meta.Smap{Version: ctx.nver}}, aisMsg}
  1507  			_ = p.metasyncer.notify(false /*wait*/, revs) // async, failed-cnt always zero
  1508  		}
  1509  	}
  1510  	return
  1511  }
  1512  
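        // set maintenance/decommission flags on the node, update and metasync the Smap;
        // the post-hook (_rebPostRm) may trigger rebalance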
  1513  func (p *proxy) mcastMaint(msg *apc.ActMsg, si *meta.Snode, reb, maintPostReb bool) (ctx *smapModifier, err error) {
  1514  	var flags cos.BitFlags
  1515  	switch msg.Action {
  1516  	case apc.ActDecommissionNode:
  1517  		flags = meta.SnodeDecomm
  1518  	case apc.ActShutdownNode, apc.ActStartMaintenance:
  1519  		flags = meta.SnodeMaint
  1520  		if maintPostReb {
  1521  			debug.Assert(si.IsTarget())
  1522  			flags |= meta.SnodeMaintPostReb
  1523  		}
  1524  	default:
  1525  		err = fmt.Errorf(fmtErrInvaldAction, msg.Action,
  1526  			[]string{apc.ActDecommissionNode, apc.ActStartMaintenance, apc.ActShutdownNode})
  1527  		return
  1528  	}
  1529  	var dummy = meta.Snode{Flags: flags}
  1530  	nlog.Infof("%s mcast-maint: %s, %s reb=(%t, %t), nflags=%s", p, msg, si.StringEx(), reb, maintPostReb, dummy.Fl2S())
  1531  	ctx = &smapModifier{
  1532  		pre:     p._markMaint,
  1533  		post:    p._rebPostRm, // (rmdCtx.rmNode => p.rmNodeFinal when all done)
  1534  		final:   p._syncFinal,
  1535  		sid:     si.ID(),
  1536  		flags:   flags,
  1537  		msg:     msg,
  1538  		skipReb: !reb,
  1539  	}
  1540  	if err = p._earlyGFN(ctx, si); err != nil {
  1541  		return
  1542  	}
  1543  	if err = p.owner.smap.modify(ctx); err != nil {
  1544  		debug.AssertNoErr(err)
  1545  		return
  1546  	}
  1547  	return
  1548  }
  1549  
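        // smapModifier pre-hook: set the maintenance/decommission flags (primary only)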
  1550  func (p *proxy) _markMaint(ctx *smapModifier, clone *smapX) error {
  1551  	if !clone.isPrimary(p.si) {
  1552  		return newErrNotPrimary(p.si, clone, fmt.Sprintf("cannot put %s in maintenance", ctx.sid))
  1553  	}
  1554  	clone.setNodeFlags(ctx.sid, ctx.flags)
  1555  	clone.staffIC()
  1556  	return nil
  1557  }
  1558  
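        // smapModifier post-hook: trigger rebalance, if and when required, after the node
        // has been marked for maintenance/decommission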
  1559  func (p *proxy) _rebPostRm(ctx *smapModifier, clone *smapX) {
  1560  	if ctx.skipReb {
  1561  		return
  1562  	}
  1563  	if !mustRebalance(ctx, clone) {
  1564  		return
  1565  	}
  1566  	rmdCtx := &rmdModifier{
  1567  		pre:     rmdInc,
  1568  		p:       p,
  1569  		smapCtx: ctx,
  1570  		wait:    true,
  1571  	}
  1572  	if _, err := p.owner.rmd.modify(rmdCtx); err != nil {
  1573  		debug.AssertNoErr(err)
  1574  		return
  1575  	}
  1576  	rmdCtx.listen(rmdCtx.postRm)
  1577  	ctx.rmdCtx = rmdCtx
  1578  }
  1579  
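        // apc.ActStopMaintenance: health-check the node (with retries), clear its maintenance
        // flags, and return the new rebalance ID, if any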
  1580  func (p *proxy) stopMaintenance(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
  1581  	var (
  1582  		opts apc.ActValRmNode
  1583  		smap = p.owner.smap.get()
  1584  	)
  1585  	if err := cos.MorphMarshal(msg.Value, &opts); err != nil {
  1586  		p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err)
  1587  		return
  1588  	}
  1589  	si := smap.GetNode(opts.DaemonID)
  1590  	if si == nil {
  1591  		err := cos.NewErrNotFound(p, "node "+opts.DaemonID)
  1592  		p.writeErr(w, r, err, http.StatusNotFound)
  1593  		return
  1594  	}
  1595  
  1596  	nlog.Infof("%s: %s(%s) opts=%v", p, msg.Action, si.StringEx(), opts)
  1597  
  1598  	if !smap.InMaint(si) {
  1599  		p.writeErrf(w, r, "node %s is not in maintenance mode - nothing to do", si.StringEx())
  1600  		return
  1601  	}
  1602  	timeout := cmn.GCO.Get().Timeout.CplaneOperation.D()
  1603  	if _, status, err := p.reqHealth(si, timeout, nil, smap); err != nil {
  1604  		sleep, retries := timeout/2, 5
  1605  		time.Sleep(sleep)
  1606  		for range retries { // retry
  1607  			time.Sleep(sleep)
  1608  			_, status, err = p.reqHealth(si, timeout, nil, smap)
  1609  			if err == nil {
  1610  				break
  1611  			}
  1612  			if status != http.StatusServiceUnavailable {
  1613  				p.writeErrf(w, r, "%s is unreachable: %v(%d)", si, err, status)
  1614  				return
  1615  			}
  1616  		}
  1617  		if err != nil {
  1618  			debug.Assert(status == http.StatusServiceUnavailable)
  1619  			nlog.Errorf("%s: node %s is taking an unusually long time to start: %v(%d) - proceeding anyway",
  1620  				p.si, si, err, status)
  1621  		}
  1622  	}
  1623  
  1624  	rebID, err := p.mcastStopMaint(msg, &opts)
  1625  	if err != nil {
  1626  		p.writeErr(w, r, err)
  1627  		return
  1628  	}
  1629  	if rebID != "" {
  1630  		w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(rebID)))
  1631  		w.Write(cos.UnsafeB(rebID))
  1632  	}
  1633  }
  1634  
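        // handle PUT actions parameterized via URL query: designate new primary,
        // set cluster config, attach/detach remote AIS cluster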
  1635  func (p *proxy) cluputQuery(w http.ResponseWriter, r *http.Request, action string) {
  1636  	if p.forwardCP(w, r, &apc.ActMsg{Action: action}, "") {
  1637  		return
  1638  	}
  1639  	switch action {
  1640  	case apc.Proxy:
  1641  		if err := p.pready(nil, true); err != nil {
  1642  			p.writeErr(w, r, err, http.StatusServiceUnavailable)
  1643  			return
  1644  		}
  1645  		// cluster-wide: designate a new primary proxy administratively
  1646  		p.cluSetPrimary(w, r)
  1647  	case apc.ActSetConfig: // set-config via query parameters and "?n1=v1&n2=v2..."
  1648  		if err := p.pready(nil, true); err != nil {
  1649  			p.writeErr(w, r, err, http.StatusServiceUnavailable)
  1650  			return
  1651  		}
  1652  		var (
  1653  			query    = r.URL.Query()
  1654  			toUpdate = &cmn.ConfigToSet{}
  1655  			msg      = &apc.ActMsg{Action: action}
  1656  		)
  1657  		if err := toUpdate.FillFromQuery(query); err != nil {
  1658  			p.writeErrf(w, r, err.Error())
  1659  			return
  1660  		}
  1661  		if transient := cos.IsParseBool(query.Get(apc.ActTransient)); transient {
  1662  			p.setCluCfgTransient(w, r, toUpdate, msg)
  1663  		} else {
  1664  			p.setCluCfgPersistent(w, r, toUpdate, msg)
  1665  		}
  1666  	case apc.ActAttachRemAis, apc.ActDetachRemAis:
  1667  		p.attachDetachRemAis(w, r, action, r.URL.Query())
  1668  	}
  1669  }
  1670  
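        // attach or detach remote AIS cluster: if this cluster hasn't started up yet, wait
        // (up to half the startup timeout); then run the config transaction (_remaisConf) and,
        // if the config did change, asynchronously refresh remote cluster info (_remais)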
  1671  func (p *proxy) attachDetachRemAis(w http.ResponseWriter, r *http.Request, action string, query url.Values) {
  1672  	what := query.Get(apc.QparamWhat)
  1673  	if what != apc.WhatRemoteAIS {
  1674  		p.writeErr(w, r, fmt.Errorf(fmtUnknownQue, what))
  1675  		return
  1676  	}
  1677  	if !p.ClusterStarted() {
  1678  		const fmerr = "(config-backends modifying) remote cluster: (%t, %s)"
  1679  		var timeout time.Duration
  1680  		for {
  1681  			time.Sleep(cmn.Rom.MaxKeepalive())
  1682  			timeout += cmn.Rom.MaxKeepalive()
  1683  			config := cmn.GCO.Get()
  1684  			if p.ClusterStarted() {
  1685  				break
  1686  			}
  1687  			if timeout > config.Timeout.Startup.D()/2 {
  1688  				p.writeErr(w, r, fmt.Errorf("%s: failed to attach "+fmerr, p, p.ClusterStarted(), config))
  1689  				return
  1690  			}
  1691  			nlog.Errorf("%s: waiting to attach "+fmerr, p, p.ClusterStarted(), config)
  1692  		}
  1693  	}
  1694  	ctx := &configModifier{
  1695  		pre:   p._remaisConf,
  1696  		final: p._syncConfFinal,
  1697  		msg:   &apc.ActMsg{Action: action},
  1698  		query: query,
  1699  		hdr:   r.Header,
  1700  		wait:  true,
  1701  	}
  1702  	newConfig, err := p.owner.config.modify(ctx)
  1703  	if err != nil {
  1704  		p.writeErr(w, r, err)
  1705  	} else if newConfig != nil {
  1706  		go p._remais(&newConfig.ClusterConfig, false)
  1707  	}
  1708  }
  1709  
  1710  // the flow: attach/detach remais => modify cluster config => _remaisConf as the pre phase
  1711  // of the transaction
  1712  func (p *proxy) _remaisConf(ctx *configModifier, config *globalConfig) (bool, error) {
  1713  	var (
  1714  		aisConf cmn.BackendConfAIS
  1715  		action  = ctx.msg.Action
  1716  		v       = config.Backend.Get(apc.AIS)
  1717  	)
  1718  	if v == nil {
  1719  		if action == apc.ActDetachRemAis {
  1720  			return false, fmt.Errorf("%s: remote cluster config is empty", p.si)
  1721  		}
  1722  		aisConf = make(cmn.BackendConfAIS)
  1723  	} else {
  1724  		aisConf = cmn.BackendConfAIS{}
  1725  		cos.MustMorphMarshal(v, &aisConf)
  1726  	}
  1727  
  1728  	alias := ctx.hdr.Get(apc.HdrRemAisAlias)
  1729  	if action == apc.ActDetachRemAis {
  1730  		if _, ok := aisConf[alias]; !ok {
  1731  			return false,
  1732  				cmn.NewErrFailedTo(p, action, "remote cluster", errors.New("not found"), http.StatusNotFound)
  1733  		}
  1734  		delete(aisConf, alias)
  1735  		if len(aisConf) == 0 {
  1736  			aisConf = nil // unconfigure
  1737  		}
  1738  	} else {
  1739  		debug.Assert(action == apc.ActAttachRemAis)
  1740  		u := ctx.hdr.Get(apc.HdrRemAisURL)
  1741  		detail := fmt.Sprintf("remote cluster [alias %s => %v]", alias, u)
  1742  
  1743  		// validation rules:
  1744  		// rule #1: no two remote ais clusters can share the same alias (TODO: allow configuring multiple URLs per alias)
  1745  		for a, urls := range aisConf {
  1746  			if a != alias {
  1747  				continue
  1748  			}
  1749  			errmsg := fmt.Sprintf("%s: %s is already attached", p.si, detail)
  1750  			if !cos.StringInSlice(u, urls) {
  1751  				return false, errors.New(errmsg)
  1752  			}
  1753  			nlog.Warningln(errmsg + " - proceeding anyway")
  1754  		}
  1755  		// rule #2: aliases and UUIDs are two distinct non-overlapping sets
  1756  		p.remais.mu.RLock()
  1757  		for _, remais := range p.remais.A {
  1758  			debug.Assert(remais.Alias != alias)
  1759  			if alias == remais.UUID {
  1760  				p.remais.mu.RUnlock()
  1761  				return false, fmt.Errorf("%s: alias %q cannot be equal to the UUID of an already attached cluster [%s => %s]",
  1762  					p.si, alias, remais.Alias, remais.UUID)
  1763  			}
  1764  		}
  1765  		p.remais.mu.RUnlock()
  1766  
  1767  		parsed, err := url.ParseRequestURI(u)
  1768  		if err != nil {
  1769  			return false, cmn.NewErrFailedTo(p, action, detail, err)
  1770  		}
  1771  		if parsed.Scheme != "http" && parsed.Scheme != "https" {
  1772  			return false, cmn.NewErrFailedTo(p, action, detail, errors.New("invalid URL scheme"))
  1773  		}
  1774  		nlog.Infof("%s: %s %s", p, action, detail)
  1775  		aisConf[alias] = []string{u}
  1776  	}
  1777  	config.Backend.Set(apc.AIS, aisConf)
  1778  
  1779  	return true, nil
  1780  }
  1781  
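        // Smap transaction to clear the node's maintenance flags; the post step (_stopMaintRMD)
        // may trigger rebalance, in which case the resulting rebalance ID is returned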
  1782  func (p *proxy) mcastStopMaint(msg *apc.ActMsg, opts *apc.ActValRmNode) (rebID string, err error) {
  1783  	nlog.Infof("%s mcast-stopm: %s, %s, skip-reb=%t", p, msg, opts.DaemonID, opts.SkipRebalance)
  1784  	ctx := &smapModifier{
  1785  		pre:     p._stopMaintPre,
  1786  		post:    p._stopMaintRMD,
  1787  		final:   p._syncFinal,
  1788  		sid:     opts.DaemonID,
  1789  		skipReb: opts.SkipRebalance,
  1790  		msg:     msg,
  1791  		flags:   meta.SnodeMaint | meta.SnodeMaintPostReb, // to clear node flags
  1792  	}
  1793  	err = p.owner.smap.modify(ctx)
  1794  	if ctx.rmdCtx != nil && ctx.rmdCtx.cur != nil {
  1795  		debug.Assert(ctx.rmdCtx.cur.version() > ctx.rmdCtx.prev.version() && ctx.rmdCtx.rebID != "")
  1796  		rebID = ctx.rmdCtx.rebID
  1797  	}
  1798  	return
  1799  }
  1800  
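        // (Smap modifier pre) primary only: clear maintenance-related flags on the node
        // being reactivated (and re-staff the IC if it is a proxy)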
  1801  func (p *proxy) _stopMaintPre(ctx *smapModifier, clone *smapX) error {
  1802  	const efmt = "cannot take %s out of maintenance:"
  1803  	if !clone.isPrimary(p.si) {
  1804  		return newErrNotPrimary(p.si, clone, fmt.Sprintf(efmt, ctx.sid))
  1805  	}
  1806  	node := clone.GetNode(ctx.sid)
  1807  	if node == nil {
  1808  		ctx.status = http.StatusNotFound
  1809  		return &errNodeNotFound{fmt.Sprintf(efmt, ctx.sid), ctx.sid, p.si, clone}
  1810  	}
  1811  	clone.clearNodeFlags(ctx.sid, ctx.flags)
  1812  	if node.IsProxy() {
  1813  		clone.staffIC()
  1814  	}
  1815  	return nil
  1816  }
  1817  
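        // (Smap modifier post) trigger global rebalance upon the node's return from maintenance -
        // unless skipping rebalance, rebalance disabled, shutting down, or fewer than two active targets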
  1818  func (p *proxy) _stopMaintRMD(ctx *smapModifier, clone *smapX) {
  1819  	if ctx.skipReb {
  1820  		nlog.Infoln("ctx.skip-reb", ctx.skipReb)
  1821  		return
  1822  	}
  1823  	if !cmn.GCO.Get().Rebalance.Enabled {
  1824  		return
  1825  	}
  1826  	if nlog.Stopping() {
  1827  		return
  1828  	}
  1829  	if clone.CountActiveTs() < 2 {
  1830  		return
  1831  	}
  1832  	rmdCtx := &rmdModifier{
  1833  		pre:     rmdInc,
  1834  		smapCtx: ctx,
  1835  		p:       p,
  1836  		wait:    true,
  1837  	}
  1838  	if _, err := p.owner.rmd.modify(rmdCtx); err != nil {
  1839  		debug.AssertNoErr(err)
  1840  		return
  1841  	}
  1842  	rmdCtx.listen(nil)
  1843  	ctx.rmdCtx = rmdCtx
  1844  }
  1845  
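        // PUT /v1/cluster/proxy/<new-primary-ID>: administratively designate a new primary -
        // validate the candidate and, unless it is this node already, run two-phase _setPrimary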
  1846  func (p *proxy) cluSetPrimary(w http.ResponseWriter, r *http.Request) {
  1847  	apiItems, err := p.parseURL(w, r, apc.URLPathCluProxy.L, 1, false)
  1848  	if err != nil {
  1849  		return
  1850  	}
  1851  	npid := apiItems[0]
  1852  	if p.forwardCP(w, r, nil, "designate new primary proxy '"+npid+"'") {
  1853  		return
  1854  	}
  1855  
  1856  	// am current primary - validating
  1857  	smap := p.owner.smap.get()
  1858  	npsi := smap.GetProxy(npid)
  1859  	if npsi == nil {
  1860  		p.writeErrf(w, r, "new primary proxy %s is not present in the %s", npid, smap.StringEx())
  1861  		return
  1862  	}
  1863  	if npid == p.SID() {
  1864  		debug.Assert(p.SID() == smap.Primary.ID()) // must be forwardCP-ed
  1865  		nlog.Warningf("Request to set primary to %s(self) - nothing to do", npid)
  1866  		return
  1867  	}
  1868  	if smap.InMaintOrDecomm(npsi) {
  1869  		var err error
  1870  		if smap.InMaint(npsi) {
  1871  			err = fmt.Errorf("%s cannot become the new primary as it's currently under maintenance", npsi)
  1872  		} else {
  1873  			err = fmt.Errorf("%s cannot become the new primary as it's currently being decommissioned", npsi)
  1874  		}
  1875  		debug.AssertNoErr(err)
  1876  		p.writeErr(w, r, err, http.StatusServiceUnavailable)
  1877  		return
  1878  	}
  1879  
  1880  	// executing
  1881  	if p.settingNewPrimary.CAS(false, true) {
  1882  		p._setPrimary(w, r, npsi)
  1883  		p.settingNewPrimary.Store(false)
  1884  	}
  1885  }
  1886  
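        // two-phase primary change: (I) prepare - bcast cluster metadata to all nodes and
        // locally switch to non-primary; (II) commit - bcast again
        // (a commit-phase failure on the new primary is fatal)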
  1887  func (p *proxy) _setPrimary(w http.ResponseWriter, r *http.Request, npsi *meta.Snode) {
  1888  	//
  1889  	// (I.1) Prepare phase - inform other nodes.
  1890  	//
  1891  	urlPath := apc.URLPathDaeProxy.Join(npsi.ID())
  1892  	q := url.Values{}
  1893  	q.Set(apc.QparamPrepare, "true")
  1894  	args := allocBcArgs()
  1895  	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: q}
  1896  
  1897  	cluMeta, errM := p.cluMeta(cmetaFillOpt{skipSmap: true, skipPrimeTime: true})
  1898  	if errM != nil {
  1899  		p.writeErr(w, r, errM)
  1900  		return
  1901  	}
  1902  	args.req.Body = cos.MustMarshal(cluMeta)
  1903  
  1904  	args.to = core.AllNodes
  1905  	results := p.bcastGroup(args)
  1906  	freeBcArgs(args)
  1907  	for _, res := range results {
  1908  		if res.err == nil {
  1909  			continue
  1910  		}
  1911  		err := res.errorf("node %s failed to set primary %s in the prepare phase", res.si, npsi.StringEx())
  1912  		p.writeErr(w, r, err)
  1913  		freeBcastRes(results)
  1914  		return
  1915  	}
  1916  	freeBcastRes(results)
  1917  
  1918  	//
  1919  	// (I.2) Prepare phase - local changes.
  1920  	//
  1921  	err := p.owner.smap.modify(&smapModifier{pre: func(_ *smapModifier, clone *smapX) error {
  1922  		clone.Primary = npsi
  1923  		p.metasyncer.becomeNonPrimary()
  1924  		return nil
  1925  	}})
  1926  	debug.AssertNoErr(err)
  1927  
  1928  	//
  1929  	// (II) Commit phase.
  1930  	//
  1931  	q.Set(apc.QparamPrepare, "false")
  1932  	args = allocBcArgs()
  1933  	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: q}
  1934  	args.to = core.AllNodes
  1935  	results = p.bcastGroup(args)
  1936  	freeBcArgs(args)
  1937  	for _, res := range results {
  1938  		if res.err == nil {
  1939  			continue
  1940  		}
  1941  		if res.si.ID() == npsi.ID() {
  1942  			cos.ExitLogf("commit phase failure: new primary %s returned %v", npsi.StringEx(), res.err)
  1943  		} else {
  1944  			nlog.Errorf("Commit phase failure: %s returned %v when setting primary = %s",
  1945  				res.si.ID(), res.err, npsi.StringEx())
  1946  		}
  1947  	}
  1948  	freeBcastRes(results)
  1949  }
  1950  
  1951  //////////////////////////////////////////
  1952  // DELETE /v1/cluster - self-unregister //
  1953  //////////////////////////////////////////
  1954  
  1955  func (p *proxy) httpcludel(w http.ResponseWriter, r *http.Request) {
  1956  	apiItems, err := p.parseURL(w, r, apc.URLPathCluDaemon.L, 1, false)
  1957  	if err != nil {
  1958  		return
  1959  	}
  1960  	var (
  1961  		sid  = apiItems[0]
  1962  		smap = p.owner.smap.get()
  1963  		node = smap.GetNode(sid)
  1964  	)
  1965  	if node == nil {
  1966  		err = &errNodeNotFound{"cannot remove", sid, p.si, smap}
  1967  		p.writeErr(w, r, err, http.StatusNotFound)
  1968  		return
  1969  	}
  1970  	if smap.IsPrimary(node) {
  1971  		p.writeErrMsg(w, r, "cannot remove primary proxy", http.StatusBadRequest)
  1972  		return
  1973  	}
  1974  	if p.forwardCP(w, r, nil, sid) {
  1975  		return
  1976  	}
  1977  	if !p.NodeStarted() {
  1978  		p.writeErrStatusf(w, r, http.StatusServiceUnavailable, "%s is not ready yet (starting up)", p)
  1979  		return
  1980  	}
  1981  
  1982  	// wait for the primary (and the cluster) to start up and finalize rebalancing _prior_ to removing individual nodes
  1983  	if err := p.pready(smap, true); err != nil {
  1984  		p.writeErr(w, r, err, http.StatusServiceUnavailable)
  1985  		return
  1986  	}
  1987  
  1988  	if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil {
  1989  		return
  1990  	}
  1991  	if err := p.isIntraCall(r.Header, false /*from primary*/); err != nil {
  1992  		err = fmt.Errorf("expecting intra-cluster call for self-initiated removal, got %w", err)
  1993  		p.writeErr(w, r, err)
  1994  		return
  1995  	}
  1996  	cid := r.Header.Get(apc.HdrCallerID)
  1997  	if cid != sid {
  1998  		err = fmt.Errorf("expecting self-initiated removal (%s != %s)", cid, sid)
  1999  		p.writeErr(w, r, err)
  2000  		return
  2001  	}
  2002  	if ecode, err := p.mcastUnreg(&apc.ActMsg{Action: "self-initiated-removal"}, node); err != nil {
  2003  		p.writeErr(w, r, err, ecode)
  2004  	}
  2005  }
  2006  
  2007  // post-rebalance or post no-rebalance: the last step in removing a node
  2008  // (with msg.Action defining the exact semantics)
  2009  func (p *proxy) rmNodeFinal(msg *apc.ActMsg, si *meta.Snode, ctx *smapModifier) (int, error) {
  2010  	var (
  2011  		smap    = p.owner.smap.get()
  2012  		node    = smap.GetNode(si.ID())
  2013  		timeout = cmn.Rom.CplaneOperation()
  2014  	)
  2015  	if node == nil {
  2016  		txt := "cannot \"" + msg.Action + "\""
  2017  		return http.StatusNotFound, &errNodeNotFound{txt, si.ID(), p.si, smap}
  2018  	}
  2019  
  2020  	var (
  2021  		err   error
  2022  		ecode int
  2023  		cargs = allocCargs()
  2024  		body  = cos.MustMarshal(msg)
  2025  		sname = node.StringEx()
  2026  	)
  2027  	cargs.si, cargs.timeout = node, timeout
  2028  	switch msg.Action {
  2029  	case apc.ActShutdownNode, apc.ActRmNodeUnsafe, apc.ActStartMaintenance, apc.ActDecommissionNode:
  2030  		cargs.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: body}
  2031  	default:
  2032  		return 0, fmt.Errorf(fmtErrInvaldAction, msg.Action,
  2033  			[]string{apc.ActShutdownNode, apc.ActStartMaintenance, apc.ActDecommissionNode, apc.ActRmNodeUnsafe})
  2034  	}
  2035  
  2036  	nlog.InfoDepth(1, p.String()+":", msg.Action, sname)
  2037  	res := p.call(cargs, smap)
  2038  	err = res.unwrap()
  2039  	freeCargs(cargs)
  2040  	freeCR(res)
  2041  
  2042  	if err != nil {
  2043  		emsg := fmt.Sprintf("%s: (%s %s) final: %v - proceeding anyway...", p, msg, sname, err)
  2044  		switch msg.Action {
  2045  		case apc.ActShutdownNode, apc.ActDecommissionNode: // expecting EOF
  2046  			if !cos.IsEOF(err) {
  2047  				nlog.Errorln(emsg)
  2048  			}
  2049  		case apc.ActRmNodeUnsafe:
  2050  			if cmn.Rom.FastV(4, cos.SmoduleAIS) {
  2051  				nlog.Errorln(emsg)
  2052  			}
  2053  		default:
  2054  			nlog.Errorln(emsg)
  2055  		}
  2056  		err = nil // NOTE: proceeding anyway
  2057  	}
  2058  
  2059  	switch msg.Action {
  2060  	case apc.ActDecommissionNode, apc.ActRmNodeUnsafe:
  2061  		ecode, err = p.mcastUnreg(msg, node)
  2062  	case apc.ActStartMaintenance, apc.ActShutdownNode:
  2063  		if ctx != nil && ctx.rmdCtx != nil && ctx.rmdCtx.rebID != "" {
  2064  			// final step executing shutdown and start-maintenance transaction:
  2065  			// setting si.Flags |= meta.SnodeMaintPostReb
  2066  			// (compare w/ rmTarget --> p.mcastMaint above)
  2067  			_, err = p.mcastMaint(msg, node, false /*reb*/, true /*maintPostReb*/)
  2068  		}
  2069  	}
  2070  	if err != nil {
  2071  		nlog.Errorf("%s: (%s %s) FATAL: failed to update %s: %v", p, msg, sname, p.owner.smap.get(), err)
  2072  	}
  2073  	return ecode, err
  2074  }
  2075  
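        // Smap transaction to remove (unregister) the node from the cluster map (no rebalance)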
  2076  func (p *proxy) mcastUnreg(msg *apc.ActMsg, si *meta.Snode) (ecode int, err error) {
  2077  	nlog.Infof("%s mcast-unreg: %s, %s", p, msg, si.StringEx())
  2078  	ctx := &smapModifier{
  2079  		pre:     p._unregNodePre,
  2080  		final:   p._syncFinal,
  2081  		msg:     msg,
  2082  		sid:     si.ID(),
  2083  		skipReb: true,
  2084  	}
  2085  	err = p.owner.smap.modify(ctx)
  2086  	return ctx.status, err
  2087  }
  2088  
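        // (Smap modifier pre) primary only: delete the node from the cloned Smap
        // (re-staffing the IC if it is a proxy) and drop it from the reverse-proxy table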
  2089  func (p *proxy) _unregNodePre(ctx *smapModifier, clone *smapX) error {
  2090  	const verb = "remove"
  2091  	sid := ctx.sid
  2092  	if !clone.isPrimary(p.si) {
  2093  		return newErrNotPrimary(p.si, clone, fmt.Sprintf("cannot cancel %s %s", verb, sid))
  2094  	}
  2095  	node := clone.GetNode(sid)
  2096  	if node == nil {
  2097  		ctx.status = http.StatusNotFound
  2098  		return &errNodeNotFound{"failed to " + verb, sid, p.si, clone}
  2099  	}
  2100  	if node.IsProxy() {
  2101  		clone.delProxy(sid)
  2102  		nlog.Infof("%s %s (num proxies %d)", verb, node.StringEx(), clone.CountProxies())
  2103  		clone.staffIC()
  2104  	} else {
  2105  		clone.delTarget(sid)
  2106  		nlog.Infof("%s %s (num targets %d)", verb, node.StringEx(), clone.CountTargets())
  2107  	}
  2108  	p.rproxy.nodes.Delete(ctx.sid)
  2109  	return nil
  2110  }
  2111  
  2112  // rebalance's `can`: factors not including cluster map
  2113  func (p *proxy) canRebalance() (err error) {
  2114  	if nlog.Stopping() {
  2115  		return fmt.Errorf("%s is stopping", p)
  2116  	}
  2117  	smap := p.owner.smap.get()
  2118  	if err = smap.validate(); err != nil {
  2119  		return
  2120  	}
  2121  	if !smap.IsPrimary(p.si) {
  2122  		err = newErrNotPrimary(p.si, smap)
  2123  		debug.AssertNoErr(err)
  2124  		return
  2125  	}
  2126  	// NOTE: cluster startup handles rebalance elsewhere (see p.resumeReb), and so
  2127  	// all rebalance-triggering events (shutdown, decommission, maintenance, etc.)
  2128  	// are not permitted and will fail during startup.
  2129  	if err = p.pready(smap, true); err != nil {
  2130  		return
  2131  	}
  2132  	if !cmn.GCO.Get().Rebalance.Enabled {
  2133  		err = errRebalanceDisabled
  2134  	}
  2135  	return
  2136  }
  2137  
  2138  // rebalance's `must`: compares previous and current (cloned, updated) Smap
  2139  // TODO: bmd.num-buckets == 0 would be an easy one to check
  2140  func mustRebalance(ctx *smapModifier, cur *smapX) bool {
  2141  	if !cmn.GCO.Get().Rebalance.Enabled {
  2142  		return false
  2143  	}
  2144  	if nlog.Stopping() {
  2145  		return false
  2146  	}
  2147  	prev := ctx.smap
  2148  	if prev.CountActiveTs() == 0 {
  2149  		return false
  2150  	}
  2151  	if ctx.interrupted || ctx.restarted {
  2152  		return true
  2153  	}
  2154  
  2155  	// active <=> inactive transition
  2156  	debug.Assert(prev.version() < cur.version())
  2157  	for _, tsi := range cur.Tmap {
  2158  		// added an active one or activated previously inactive
  2159  		if !tsi.InMaintOrDecomm() && prev.GetActiveNode(tsi.ID()) == nil {
  2160  			return true
  2161  		}
  2162  	}
  2163  	for _, tsi := range prev.Tmap {
  2164  		// removed an active one or deactivated previously active
  2165  		if !tsi.InMaintOrDecomm() && cur.GetActiveNode(tsi.ID()) == nil {
  2166  			return true
  2167  		}
  2168  	}
  2169  	return false
  2170  }