github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/prxclu.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "bytes" 9 "errors" 10 "fmt" 11 "net/http" 12 "net/url" 13 "path" 14 "strconv" 15 "strings" 16 "time" 17 18 "github.com/NVIDIA/aistore/api/apc" 19 "github.com/NVIDIA/aistore/cmn" 20 "github.com/NVIDIA/aistore/cmn/cifl" 21 "github.com/NVIDIA/aistore/cmn/cos" 22 "github.com/NVIDIA/aistore/cmn/debug" 23 "github.com/NVIDIA/aistore/cmn/mono" 24 "github.com/NVIDIA/aistore/cmn/nlog" 25 "github.com/NVIDIA/aistore/core" 26 "github.com/NVIDIA/aistore/core/meta" 27 "github.com/NVIDIA/aistore/stats" 28 "github.com/NVIDIA/aistore/xact" 29 jsoniter "github.com/json-iterator/go" 30 ) 31 32 // 33 // v1/cluster handlers 34 // 35 36 func (p *proxy) clusterHandler(w http.ResponseWriter, r *http.Request) { 37 switch r.Method { 38 case http.MethodGet: 39 p.httpcluget(w, r) 40 case http.MethodPost: 41 p.httpclupost(w, r) 42 case http.MethodPut: 43 p.httpcluput(w, r) 44 case http.MethodDelete: 45 p.httpcludel(w, r) 46 default: 47 cmn.WriteErr405(w, r, http.MethodDelete, http.MethodGet, http.MethodPost, http.MethodPut) 48 } 49 } 50 51 // 52 // GET /v1/cluster - query cluster states and stats 53 // 54 55 func (p *proxy) httpcluget(w http.ResponseWriter, r *http.Request) { 56 var ( 57 query = r.URL.Query() 58 what = query.Get(apc.QparamWhat) 59 ) 60 // always allow as the flow involves intra-cluster redirect 61 // (ref 1377 for more context) 62 if what == apc.WhatOneXactStatus { 63 p.ic.xstatusOne(w, r) 64 return 65 } 66 67 if err := p.checkAccess(w, r, nil, apc.AceShowCluster); err != nil { 68 return 69 } 70 71 switch what { 72 case apc.WhatAllXactStatus: 73 p.ic.xstatusAll(w, r, query) 74 case apc.WhatQueryXactStats: 75 p.xquery(w, r, what, query) 76 case apc.WhatAllRunningXacts: 77 p.xgetRunning(w, r, what, query) 78 case apc.WhatNodeStats, apc.WhatNodeStatsV322: 79 p.qcluStats(w, r, what, query) 80 case apc.WhatSysInfo: 81 p.qcluSysinfo(w, r, what, query) 82 case apc.WhatMountpaths: 83 p.qcluMountpaths(w, r, what, query) 84 case apc.WhatRemoteAIS: 85 all, err := p.getRemAisVec(true /*refresh*/) 86 if err != nil { 87 p.writeErr(w, r, err) 88 return 89 } 90 p.writeJSON(w, r, all, what) 91 case apc.WhatTargetIPs: 92 // Return comma-separated IPs of the targets. 93 // It can be used to easily fill the `--noproxy` parameter in cURL. 
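// Usage sketch (illustrative only - substitute the actual proxy endpoint and the string value of apc.WhatTargetIPs for the placeholders):
//   ips=$(curl -s "http://<proxy-host>:<port>/v1/cluster?what=<apc.WhatTargetIPs>")
//   curl --noproxy "$ips" ...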
94 var ( 95 smap = p.owner.smap.Get() 96 buf = bytes.NewBuffer(nil) 97 ) 98 for _, si := range smap.Tmap { 99 if buf.Len() > 0 { 100 buf.WriteByte(',') 101 } 102 buf.WriteString(si.PubNet.Hostname) 103 buf.WriteByte(',') 104 buf.WriteString(si.ControlNet.Hostname) 105 buf.WriteByte(',') 106 buf.WriteString(si.DataNet.Hostname) 107 } 108 w.Header().Set(cos.HdrContentLength, strconv.Itoa(buf.Len())) 109 w.Write(buf.Bytes()) 110 111 case apc.WhatClusterConfig: 112 config := cmn.GCO.Get() 113 // hide secret 114 c := config.ClusterConfig 115 c.Auth.Secret = "**********" 116 p.writeJSON(w, r, &c, what) 117 case apc.WhatBMD, apc.WhatSmapVote, apc.WhatSnode, apc.WhatSmap: 118 p.htrun.httpdaeget(w, r, query, nil /*htext*/) 119 default: 120 p.writeErrf(w, r, fmtUnknownQue, what) 121 } 122 } 123 124 // apc.WhatQueryXactStats (NOTE: may poll for quiescence) 125 func (p *proxy) xquery(w http.ResponseWriter, r *http.Request, what string, query url.Values) { 126 var xactMsg xact.QueryMsg 127 if err := cmn.ReadJSON(w, r, &xactMsg); err != nil { 128 return 129 } 130 xactMsg.Kind, _ = xact.GetKindName(xactMsg.Kind) // convert display name => kind 131 body := cos.MustMarshal(xactMsg) 132 133 args := allocBcArgs() 134 args.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathXactions.S, Body: body, Query: query} 135 args.to = core.Targets 136 137 var ( 138 config = cmn.GCO.Get() 139 onlyRunning = xactMsg.OnlyRunning != nil && *xactMsg.OnlyRunning 140 ) 141 args.timeout = config.Client.Timeout.D() // quiescence 142 if !onlyRunning { 143 args.timeout = config.Client.TimeoutLong.D() 144 } 145 146 results := p.bcastGroup(args) 147 freeBcArgs(args) 148 resRaw, erred := p._tresRaw(w, r, results) 149 if erred { 150 return 151 } 152 if len(resRaw) == 0 { 153 smap := p.owner.smap.get() 154 if smap.CountActiveTs() > 0 { 155 p.writeErrStatusf(w, r, http.StatusNotFound, "%q not found", xactMsg.String()) 156 return 157 } 158 err := cmn.NewErrNoNodes(apc.Target, smap.CountTargets()) 159 nlog.Warningf("%s: %v, %s", p, err, smap) 160 } 161 162 // TODO: if voteInProgress snap and append xele, or else 163 164 p.writeJSON(w, r, resRaw, what) 165 } 166 167 // apc.WhatAllRunningXacts 168 func (p *proxy) xgetRunning(w http.ResponseWriter, r *http.Request, what string, query url.Values) { 169 var xactMsg xact.QueryMsg 170 if err := cmn.ReadJSON(w, r, &xactMsg); err != nil { 171 return 172 } 173 xactMsg.Kind, _ = xact.GetKindName(xactMsg.Kind) // convert display name => kind 174 body := cos.MustMarshal(xactMsg) 175 176 args := allocBcArgs() 177 args.req = cmn.HreqArgs{Method: http.MethodGet, Path: apc.URLPathXactions.S, Body: body, Query: query} 178 args.to = core.Targets 179 results := p.bcastGroup(args) 180 freeBcArgs(args) 181 182 uniqueKindIDs := cos.StrSet{} 183 for _, res := range results { 184 if res.err != nil { 185 p.writeErr(w, r, res.toErr()) 186 freeBcastRes(results) 187 return 188 } 189 if len(res.bytes) == 0 { 190 continue 191 } 192 var ( 193 kindIDs []string 194 err = jsoniter.Unmarshal(res.bytes, &kindIDs) 195 ) 196 debug.AssertNoErr(err) 197 for _, ki := range kindIDs { 198 uniqueKindIDs.Set(ki) 199 } 200 } 201 freeBcastRes(results) 202 p.writeJSON(w, r, uniqueKindIDs.ToSlice(), what) 203 } 204 205 func (p *proxy) qcluSysinfo(w http.ResponseWriter, r *http.Request, what string, query url.Values) { 206 var ( 207 config = cmn.GCO.Get() 208 timeout = config.Client.Timeout.D() 209 ) 210 proxyResults, err := p._sysinfo(r, timeout, core.Proxies, query) 211 if err != nil { 212 p.writeErr(w, r, err) 213 return 214 } 
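// proxies responded - now query the targets; both results are per-node maps (node ID => raw JSON), assembled into apc.ClusterSysInfoRaw{Proxy, Target} below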
215 out := &apc.ClusterSysInfoRaw{} 216 out.Proxy = proxyResults 217 218 targetResults, err := p._sysinfo(r, timeout, core.Targets, query) 219 if err != nil { 220 p.writeErr(w, r, err) 221 return 222 } 223 out.Target = targetResults 224 p.writeJSON(w, r, out, what) 225 } 226 227 func (p *proxy) getRemAisVec(refresh bool) (*meta.RemAisVec, error) { 228 smap := p.owner.smap.get() 229 si, errT := smap.GetRandTarget() 230 if errT != nil { 231 return nil, errT 232 } 233 q := url.Values{apc.QparamWhat: []string{apc.WhatRemoteAIS}} 234 if refresh { 235 q[apc.QparamClusterInfo] = []string{"true"} // handshake to check connectivity and get remote Smap 236 } 237 cargs := allocCargs() 238 { 239 cargs.si = si 240 cargs.req = cmn.HreqArgs{ 241 Method: http.MethodGet, 242 Path: apc.URLPathDae.S, 243 Query: q, 244 } 245 cargs.timeout = cmn.Rom.MaxKeepalive() 246 cargs.cresv = cresBA{} // -> cmn.BackendInfoAIS 247 } 248 var ( 249 v *meta.RemAisVec 250 res = p.call(cargs, smap) 251 err = res.toErr() 252 ) 253 if err == nil { 254 v = res.v.(*meta.RemAisVec) 255 } 256 freeCargs(cargs) 257 freeCR(res) 258 return v, err 259 } 260 261 func (p *proxy) _sysinfo(r *http.Request, timeout time.Duration, to int, query url.Values) (cos.JSONRawMsgs, error) { 262 args := allocBcArgs() 263 args.req = cmn.HreqArgs{Method: r.Method, Path: apc.URLPathDae.S, Query: query} 264 args.timeout = timeout 265 args.to = to 266 results := p.bcastGroup(args) 267 freeBcArgs(args) 268 sysInfoMap := make(cos.JSONRawMsgs, len(results)) 269 for _, res := range results { 270 if res.err != nil { 271 err := res.toErr() 272 freeBcastRes(results) 273 return nil, err 274 } 275 sysInfoMap[res.si.ID()] = res.bytes 276 } 277 freeBcastRes(results) 278 return sysInfoMap, nil 279 } 280 281 func (p *proxy) qcluStats(w http.ResponseWriter, r *http.Request, what string, query url.Values) { 282 targetStats, erred := p._queryTs(w, r, query) 283 if targetStats == nil || erred { 284 return 285 } 286 out := &stats.ClusterRaw{} 287 out.Target = targetStats 288 out.Proxy = p.statsT.GetStats() 289 out.Proxy.Snode = p.si 290 p.writeJSON(w, r, out, what) 291 } 292 293 func (p *proxy) qcluMountpaths(w http.ResponseWriter, r *http.Request, what string, query url.Values) { 294 targetMountpaths, erred := p._queryTs(w, r, query) 295 if targetMountpaths == nil || erred { 296 return 297 } 298 out := &ClusterMountpathsRaw{} 299 out.Targets = targetMountpaths 300 p.writeJSON(w, r, out, what) 301 } 302 303 // helper methods for querying targets 304 305 func (p *proxy) _queryTs(w http.ResponseWriter, r *http.Request, query url.Values) (cos.JSONRawMsgs, bool) { 306 var ( 307 err error 308 body []byte 309 ) 310 if r.Body != nil { 311 body, err = cmn.ReadBytes(r) 312 if err != nil { 313 p.writeErr(w, r, err) 314 return nil, true 315 } 316 } 317 args := allocBcArgs() 318 args.req = cmn.HreqArgs{Method: r.Method, Path: apc.URLPathDae.S, Query: query, Body: body} 319 args.timeout = cmn.Rom.MaxKeepalive() 320 results := p.bcastGroup(args) 321 freeBcArgs(args) 322 return p._tresRaw(w, r, results) 323 } 324 325 func (p *proxy) _tresRaw(w http.ResponseWriter, r *http.Request, results sliceResults) (tres cos.JSONRawMsgs, erred bool) { 326 tres = make(cos.JSONRawMsgs, len(results)) 327 for _, res := range results { 328 if res.status == http.StatusNotFound { 329 continue 330 } 331 if res.err != nil { 332 p.writeErr(w, r, res.toErr()) 333 freeBcastRes(results) 334 tres, erred = nil, true 335 return 336 } 337 tres[res.si.ID()] = res.bytes 338 } 339 freeBcastRes(results) 340 return 341 } 
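//
// POST /v1/cluster (below) multiplexes three operations keyed by apiItems[0]:
//   - apc.SelfJoin:  a node (re)joins on its own at startup
//   - apc.AdminJoin: administrative join - the primary handshakes with the node first
//   - apc.Keepalive: heartbeat; promoted to a self-join when the sender reports a restart
//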
342 343 // POST /v1/cluster - handles joins and keepalives 344 func (p *proxy) httpclupost(w http.ResponseWriter, r *http.Request) { 345 apiItems, err := p.parseURL(w, r, apc.URLPathClu.L, 1, true) 346 if err != nil { 347 return 348 } 349 if p.forwardCP(w, r, nil, "httpclupost") { 350 return 351 } 352 353 var ( 354 nsi *meta.Snode 355 action string 356 regReq cluMeta 357 smap = p.owner.smap.get() 358 config = cmn.GCO.Get() 359 apiOp = apiItems[0] 360 ) 361 if len(apiItems) > 1 && apiOp != apc.Keepalive { 362 p.writeErrURL(w, r) 363 return 364 } 365 if p.settingNewPrimary.Load() { 366 // ignore or fail 367 if apiOp != apc.Keepalive { 368 var s string 369 if apiOp == apc.AdminJoin { 370 s = " (retry in a few seconds)" 371 } 372 p.writeErr(w, r, errors.New("setting new primary - transitioning"+s), http.StatusServiceUnavailable) 373 } 374 return 375 } 376 377 switch apiOp { 378 case apc.Keepalive: 379 // fast path(?) 380 if len(apiItems) > 1 { 381 p.fastKalive(w, r, smap, config, apiItems[1]) 382 return 383 } 384 385 // slow path 386 if cmn.ReadJSON(w, r, &regReq) != nil { 387 return 388 } 389 nsi = regReq.SI 390 case apc.AdminJoin: // administrative join 391 if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil { 392 return 393 } 394 if cmn.ReadJSON(w, r, &regReq.SI) != nil { 395 return 396 } 397 nsi = regReq.SI 398 // must be reachable and must respond 399 si, err := p._getSI(nsi) 400 if err != nil { 401 p.writeErrf(w, r, "%s: failed to obtain node info from %s: %v", p.si, nsi.StringEx(), err) 402 return 403 } 404 // NOTE: node ID and 3-networks configuration are obtained from the node itself 405 *nsi = *si 406 case apc.SelfJoin: // auto-join at node startup 407 if cmn.ReadJSON(w, r, &regReq) != nil { 408 return 409 } 410 // NOTE: ditto 411 nsi = regReq.SI 412 if !p.ClusterStarted() { 413 p.reg.mu.Lock() 414 p.reg.pool = append(p.reg.pool, regReq) 415 p.reg.mu.Unlock() 416 } 417 default: 418 p.writeErrURL(w, r) 419 return 420 } 421 422 if err := nsi.Validate(); err != nil { 423 p.writeErr(w, r, err) 424 return 425 } 426 // given node and operation, set msg.Action 427 switch apiOp { 428 case apc.AdminJoin: 429 if nsi.IsProxy() { 430 action = apc.ActAdminJoinProxy 431 } else { 432 action = apc.ActAdminJoinTarget 433 } 434 case apc.SelfJoin: 435 if nsi.IsProxy() { 436 action = apc.ActSelfJoinProxy 437 } else { 438 action = apc.ActSelfJoinTarget 439 } 440 case apc.Keepalive: 441 action = apc.ActKeepaliveUpdate // (must be an extremely rare case) 442 } 443 444 // more validation && non-electability 445 if p.NodeStarted() { 446 bmd := p.owner.bmd.get() 447 if err := bmd.validateUUID(regReq.BMD, p.si, nsi, ""); err != nil { 448 p.writeErr(w, r, err) 449 return 450 } 451 } 452 var ( 453 nonElectable bool 454 ) 455 if nsi.IsProxy() { 456 s := r.URL.Query().Get(apc.QparamNonElectable) 457 if nonElectable, err = cos.ParseBool(s); err != nil { 458 nlog.Errorf("%s: failed to parse %s for non-electability: %v", p, s, err) 459 } 460 } 461 if _, err := cmn.ParseHost2IP(nsi.PubNet.Hostname); err != nil { 462 p.writeErrf(w, r, "%s: failed to %s %s: invalid hostname: %v", p.si, apiOp, nsi.StringEx(), err) 463 return 464 } 465 466 // node flags 467 if osi := smap.GetNode(nsi.ID()); osi != nil { 468 nsi.Flags = osi.Flags 469 } 470 if nonElectable { 471 nsi.Flags = nsi.Flags.Set(meta.SnodeNonElectable) 472 } 473 474 // handshake | check dup 475 if apiOp == apc.AdminJoin { 476 // call the node with cluster-metadata included 477 if ecode, err := p.adminJoinHandshake(smap, nsi, apiOp); err != nil { 478 p.writeErr(w, r,
err, ecode) 479 return 480 } 481 } else if apiOp == apc.SelfJoin { 482 // 483 // check for: a) different node, duplicate node ID, or b) same node, net-info change 484 // 485 if osi := smap.GetNode(nsi.ID()); osi != nil && !osi.Eq(nsi) { 486 ok, err := p._confirmSnode(osi, nsi) // handshake (expecting nsi in response) 487 if err != nil { 488 if !cos.IsRetriableConnErr(err) { 489 p.writeErrf(w, r, "failed to obtain node info: %v", err) 490 return 491 } 492 // starting up, not listening yet 493 // NOTE [ref0417] 494 // TODO: try to confirm asynchronously 495 } else if !ok { 496 p.writeErrf(w, r, "duplicate node ID %q (%s, %s)", nsi.ID(), osi.StringEx(), nsi.StringEx()) 497 return 498 } 499 nlog.Warningf("%s: self-joining %s [err %v, confirmed %t]", p, nsi.StringEx(), err, ok) 500 } 501 } 502 503 if !config.Rebalance.Enabled { 504 regReq.Flags = regReq.Flags.Clear(cifl.RebalanceInterrupted) 505 regReq.Flags = regReq.Flags.Clear(cifl.Restarted) 506 } 507 interrupted, restarted := regReq.Flags.IsSet(cifl.RebalanceInterrupted), regReq.Flags.IsSet(cifl.Restarted) 508 if nsi.IsTarget() && (interrupted || restarted) { 509 if a, b := p.ClusterStarted(), p.owner.rmd.starting.Load(); !a || b { 510 // handle via rmd.starting + resumeReb 511 if p.owner.rmd.interrupted.CAS(false, true) { 512 nlog.Warningf("%s: will resume rebalance %s(%t, %t)", p, nsi.StringEx(), interrupted, restarted) 513 } 514 } 515 } 516 // when keepalive becomes a new join 517 if restarted && apiOp == apc.Keepalive { 518 apiOp = apc.SelfJoin 519 } 520 521 msg := &apc.ActMsg{Action: action, Name: nsi.ID()} 522 523 p.owner.smap.mu.Lock() 524 upd, err := p._joinKalive(nsi, regReq.Smap, apiOp, nsi.Flags, &regReq, msg) 525 p.owner.smap.mu.Unlock() 526 if err != nil { 527 p.writeErr(w, r, err) 528 return 529 } 530 if !upd { 531 if apiOp == apc.AdminJoin { 532 // TODO: respond !updated (NOP) 533 p.writeJSON(w, r, apc.JoinNodeResult{DaemonID: nsi.ID()}, "") 534 } 535 return 536 } 537 538 nlog.Infof("%s: %s(%q) %s (%s)", p, apiOp, action, nsi.StringEx(), regReq.Smap) 539 540 if apiOp == apc.AdminJoin { 541 rebID, err := p.mcastJoined(nsi, msg, nsi.Flags, &regReq) 542 if err != nil { 543 p.writeErr(w, r, err) 544 return 545 } 546 p.writeJSON(w, r, apc.JoinNodeResult{DaemonID: nsi.ID(), RebalanceID: rebID}, "") 547 return 548 } 549 550 if apiOp == apc.SelfJoin { 551 // respond to the self-joining node with cluster-meta that does not include Smap 552 meta, err := p.cluMeta(cmetaFillOpt{skipSmap: true}) 553 if err != nil { 554 p.writeErr(w, r, err) 555 return 556 } 557 p.writeJSON(w, r, meta, path.Join(msg.Action, nsi.ID())) 558 } 559 560 go p.mcastJoined(nsi, msg, nsi.Flags, &regReq) 561 } 562 563 func (p *proxy) fastKalive(w http.ResponseWriter, r *http.Request, smap *smapX, config *cmn.Config, sid string) { 564 fast := p.readyToFastKalive.Load() 565 if !fast { 566 var ( 567 now = mono.NanoTime() 568 cfg = config.Keepalive 569 minUptime = max(cfg.Target.Interval.D(), cfg.Proxy.Interval.D()) << 1 570 ) 571 if fast = p.keepalive.cluUptime(now) > minUptime; fast { 572 p.readyToFastKalive.Store(true) // not resetting upon a change of primary 573 } 574 } 575 if fast { 576 var ( 577 callerID = r.Header.Get(apc.HdrCallerID) 578 callerSver = r.Header.Get(apc.HdrCallerSmapVer) 579 ) 580 if callerID == sid && callerSver != "" && callerSver == smap.vstr { 581 if si := smap.GetNode(sid); si != nil { 582 p.keepalive.heardFrom(sid) 583 return 584 } 585 } 586 } 587 p.writeErr(w, r, errFastKalive, 0, Silent) 588 } 589 590 // when joining manually: update the node with
cluster meta that does not include Smap 591 // (the latter gets finalized and metasync-ed upon success) 592 func (p *proxy) adminJoinHandshake(smap *smapX, nsi *meta.Snode, apiOp string) (int, error) { 593 cm, err := p.cluMeta(cmetaFillOpt{skipSmap: true}) 594 if err != nil { 595 return http.StatusInternalServerError, err 596 } 597 nlog.Infof("%s: %s %s => (%s)", p, apiOp, nsi.StringEx(), p.owner.smap.get().StringEx()) 598 599 cargs := allocCargs() 600 { 601 cargs.si = nsi 602 cargs.req = cmn.HreqArgs{Method: http.MethodPost, Path: apc.URLPathDaeAdminJoin.S, Body: cos.MustMarshal(cm)} 603 cargs.timeout = cmn.Rom.CplaneOperation() 604 } 605 res := p.call(cargs, smap) 606 err = res.err 607 status := res.status 608 if err != nil { 609 if cos.IsRetriableConnErr(res.err) { 610 err = fmt.Errorf("%s: failed to reach %s at %s:%s: %w", 611 p.si, nsi.StringEx(), nsi.PubNet.Hostname, nsi.PubNet.Port, res.err) 612 } else { 613 err = res.errorf("%s: failed to %s %s: %v", p.si, apiOp, nsi.StringEx(), res.err) 614 } 615 } 616 freeCargs(cargs) 617 freeCR(res) 618 return status, err 619 } 620 621 // executes under lock 622 func (p *proxy) _joinKalive(nsi *meta.Snode, regSmap *smapX, apiOp string, flags cos.BitFlags, regReq *cluMeta, msg *apc.ActMsg) (upd bool, err error) { 623 smap := p.owner.smap.get() 624 if !smap.isPrimary(p.si) { 625 err = newErrNotPrimary(p.si, smap, "cannot "+apiOp+" "+nsi.StringEx()) 626 return 627 } 628 629 keepalive := apiOp == apc.Keepalive 630 osi := smap.GetNode(nsi.ID()) 631 if osi == nil { 632 if keepalive { 633 nlog.Warningln(p.String(), "keepalive", nsi.StringEx(), "- adding back to the", smap.StringEx()) 634 } 635 } else { 636 if osi.Type() != nsi.Type() { 637 err = fmt.Errorf("unexpected node type: osi=%s, nsi=%s, %s (%t)", osi.StringEx(), nsi.StringEx(), smap.StringEx(), keepalive) 638 return 639 } 640 if keepalive { 641 upd = p.kalive(nsi, osi) 642 } else if regReq.Flags.IsSet(cifl.Restarted) { 643 upd = true 644 } else { 645 upd = p.rereg(nsi, osi) 646 } 647 if !upd { 648 return 649 } 650 } 651 // check for cluster integrity errors (cie) 652 if err = smap.validateUUID(p.si, regSmap, nsi.StringEx(), 80 /* ciError */); err != nil { 653 return 654 } 655 if apiOp == apc.Keepalive { 656 // whether IP is in use by a different node 657 // (but only for keep-alive - the other two opcodes have already been checked via handshake) 658 if _, err = smap.IsDupNet(nsi); err != nil { 659 err = errors.New(p.String() + ": " + err.Error()) 660 } 661 } 662 663 // when cluster's starting up 664 if a, b := p.ClusterStarted(), p.owner.rmd.starting.Load(); err == nil && (!a || b) { 665 clone := smap.clone() 666 // TODO [feature]: updated *nsi contents (e.g., different network) may not "survive" earlystart merge 667 clone.putNode(nsi, flags, false /*silent*/) 668 p.owner.smap.put(clone) 669 upd = false 670 if a { 671 aisMsg := p.newAmsg(msg, nil) 672 _ = p.metasyncer.sync(revsPair{clone, aisMsg}) 673 } 674 return 675 } 676 677 upd = err == nil 678 return 679 } 680 681 func (p *proxy) _confirmSnode(osi, nsi *meta.Snode) (bool, error) { 682 si, err := p._getSI(osi) 683 if err != nil { 684 return false, err 685 } 686 return nsi.Eq(si), nil 687 } 688 689 func (p *proxy) kalive(nsi, osi *meta.Snode) bool { 690 if !osi.Eq(nsi) { 691 ok, err := p._confirmSnode(osi, nsi) 692 if err != nil { 693 nlog.Errorf("%s: %s(%s) failed to obtain node info: %v", p, nsi.StringEx(), nsi.PubNet.URL, err) 694 return false 695 } 696 if !ok { 697 nlog.Errorf("%s: %s(%s) is trying to keepalive with duplicate ID", p,
nsi.StringEx(), nsi.PubNet.URL) 698 return false 699 } 700 nlog.Warningf("%s: renewing registration %s (info changed!)", p, nsi.StringEx()) 701 return true // NOTE: update cluster map 702 } 703 704 p.keepalive.heardFrom(nsi.ID()) 705 return false 706 } 707 708 func (p *proxy) rereg(nsi, osi *meta.Snode) bool { 709 if !p.NodeStarted() { 710 return true 711 } 712 if osi.Eq(nsi) { 713 nlog.Infoln(p.String()+":", nsi.StringEx(), "is already _in_") 714 return false 715 } 716 717 // NOTE: see also ref0417 (ais/earlystart) 718 nlog.Warningln(p.String()+":", "renewing", nsi.StringEx(), "=>", nsi.StrURLs()) 719 return true 720 } 721 722 func (p *proxy) mcastJoined(nsi *meta.Snode, msg *apc.ActMsg, flags cos.BitFlags, regReq *cluMeta) (xid string, err error) { 723 ctx := &smapModifier{ 724 pre: p._joinedPre, 725 post: p._joinedPost, 726 final: p._joinedFinal, 727 nsi: nsi, 728 msg: msg, 729 flags: flags, 730 interrupted: regReq.Flags.IsSet(cifl.RebalanceInterrupted), 731 restarted: regReq.Flags.IsSet(cifl.Restarted), 732 } 733 if err = p._earlyGFN(ctx, ctx.nsi); err != nil { 734 return 735 } 736 if err = p.owner.smap.modify(ctx); err != nil { 737 debug.AssertNoErr(err) 738 return 739 } 740 // with rebalance 741 if ctx.rmdCtx != nil && ctx.rmdCtx.cur != nil { 742 debug.Assert(ctx.rmdCtx.rebID != "") 743 xid = ctx.rmdCtx.rebID 744 return 745 } 746 // cleanup target state 747 if ctx.restarted || ctx.interrupted { 748 go p.cleanupMark(ctx) 749 } 750 if ctx.gfn { 751 aisMsg := p.newAmsgActVal(apc.ActStopGFN, nil) // "stop-gfn" timed 752 aisMsg.UUID = ctx.nsi.ID() 753 revs := revsPair{&smapX{Smap: meta.Smap{Version: ctx.nver}}, aisMsg} 754 _ = p.metasyncer.notify(false /*wait*/, revs) // async, failed-cnt always zero 755 } 756 return 757 } 758 759 func (p *proxy) _earlyGFN(ctx *smapModifier, si *meta.Snode /*being added or removed*/) error { 760 smap := p.owner.smap.get() 761 if !smap.isPrimary(p.si) { 762 return newErrNotPrimary(p.si, smap, fmt.Sprintf("cannot add %s", si)) 763 } 764 if si.IsProxy() { 765 return nil 766 } 767 if err := p.canRebalance(); err != nil { 768 if err == errRebalanceDisabled { 769 err = nil 770 } 771 return err 772 } 773 774 // early-GFN notification with an empty (version-only and not yet updated) Smap and 775 // message(new target's ID) 776 msg := p.newAmsgActVal(apc.ActStartGFN, nil) 777 msg.UUID = si.ID() 778 revs := revsPair{&smapX{Smap: meta.Smap{Version: smap.Version}}, msg} 779 if fcnt := p.metasyncer.notify(true /*wait*/, revs); fcnt > 0 { 780 return fmt.Errorf("failed to notify early-gfn (%d)", fcnt) 781 } 782 ctx.gfn = true // to undo if need be 783 return nil 784 } 785 786 // calls t.cleanupMark 787 func (p *proxy) cleanupMark(ctx *smapModifier) { 788 var ( 789 val = cleanmark{OldVer: ctx.smap.version(), NewVer: ctx.nver, 790 Interrupted: ctx.interrupted, Restarted: ctx.restarted, 791 } 792 msg = apc.ActMsg{Action: apc.ActCleanupMarkers, Value: &val} 793 cargs = allocCargs() 794 smap = p.owner.smap.get() 795 timeout = cmn.Rom.CplaneOperation() 796 sleep = timeout >> 1 797 ) 798 { 799 cargs.si = ctx.nsi 800 cargs.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)} 801 cargs.timeout = timeout 802 } 803 time.Sleep(sleep) 804 for i := range 4 { // retry 805 res := p.call(cargs, smap) 806 err := res.err 807 freeCR(res) 808 if err == nil { 809 break 810 } 811 if cos.IsRetriableConnErr(err) { 812 time.Sleep(sleep) 813 smap = p.owner.smap.get() 814 nlog.Warningf("%s: %v (cleanmark #%d)", p, err, i+1) 815 continue 816 } 817 
nlog.Errorln(err) 818 break 819 } 820 freeCargs(cargs) 821 } 822 823 func (p *proxy) _joinedPre(ctx *smapModifier, clone *smapX) error { 824 if !clone.isPrimary(p.si) { 825 return newErrNotPrimary(p.si, clone, fmt.Sprintf("cannot add %s", ctx.nsi)) 826 } 827 clone.putNode(ctx.nsi, ctx.flags, true /*silent*/) 828 if ctx.nsi.IsProxy() { 829 clone.staffIC() 830 } 831 return nil 832 } 833 834 // RMD is always transmitted to provide for its (RMD's) replication - 835 // done under Smap lock to serialize with respect to new joins. 836 func (p *proxy) _joinedPost(ctx *smapModifier, clone *smapX) { 837 if ctx.nsi.IsProxy() { 838 return 839 } 840 if err := p.canRebalance(); err != nil { 841 return 842 } 843 if !mustRebalance(ctx, clone) { 844 return 845 } 846 // new RMD 847 rmdCtx := &rmdModifier{ 848 pre: func(_ *rmdModifier, clone *rebMD) { 849 clone.TargetIDs = []string{ctx.nsi.ID()} 850 clone.inc() 851 }, 852 smapCtx: ctx, 853 p: p, 854 wait: true, 855 } 856 if _, err := p.owner.rmd.modify(rmdCtx); err != nil { 857 debug.AssertNoErr(err) 858 return 859 } 860 rmdCtx.listen(nil) 861 ctx.rmdCtx = rmdCtx // smap modifier to reference the rmd one directly 862 } 863 864 func (p *proxy) _joinedFinal(ctx *smapModifier, clone *smapX) { 865 var ( 866 tokens = p.authn.revokedTokenList() 867 bmd = p.owner.bmd.get() 868 etlMD = p.owner.etl.get() 869 aisMsg = p.newAmsg(ctx.msg, bmd) 870 pairs = make([]revsPair, 0, 5) 871 ) 872 // when targets join as well (redundant?, minor) 873 config, err := p.ensureConfigURLs() 874 if config == nil /*not updated*/ && err == nil { 875 config, err = p.owner.config.get() 876 } 877 if err != nil { 878 nlog.Errorln(err) 879 // proceed anyway 880 } else if config != nil { 881 pairs = append(pairs, revsPair{config, aisMsg}) 882 } 883 884 pairs = append(pairs, revsPair{clone, aisMsg}, revsPair{bmd, aisMsg}) 885 if etlMD != nil && etlMD.version() > 0 { 886 pairs = append(pairs, revsPair{etlMD, aisMsg}) 887 } 888 889 reb := ctx.rmdCtx != nil && ctx.rmdCtx.rebID != "" 890 if !reb { 891 // replicate RMD across (existing nodes will drop it upon version comparison) 892 rmd := p.owner.rmd.get() 893 pairs = append(pairs, revsPair{rmd, aisMsg}) 894 } else { 895 debug.Assert(ctx.rmdCtx.prev.version() < ctx.rmdCtx.cur.version()) 896 aisMsg.UUID = ctx.rmdCtx.rebID 897 pairs = append(pairs, revsPair{ctx.rmdCtx.cur, aisMsg}) 898 } 899 900 if tokens != nil { 901 pairs = append(pairs, revsPair{tokens, aisMsg}) 902 } 903 _ = p.metasyncer.sync(pairs...) 904 p.syncNewICOwners(ctx.smap, clone) 905 } 906 907 func (p *proxy) _syncFinal(ctx *smapModifier, clone *smapX) { 908 var ( 909 aisMsg = p.newAmsg(ctx.msg, nil) 910 pairs = make([]revsPair, 0, 2) 911 reb = ctx.rmdCtx != nil && ctx.rmdCtx.rebID != "" 912 ) 913 pairs = append(pairs, revsPair{clone, aisMsg}) 914 if reb { 915 debug.Assert(ctx.rmdCtx.prev.version() < ctx.rmdCtx.cur.version()) 916 aisMsg.UUID = ctx.rmdCtx.rebID 917 pairs = append(pairs, revsPair{ctx.rmdCtx.cur, aisMsg}) 918 } 919 debug.Assert(clone._sgl != nil) 920 921 config, err := p.ensureConfigURLs() 922 if config != nil /*updated*/ { 923 debug.AssertNoErr(err) 924 pairs = append(pairs, revsPair{config, aisMsg}) 925 } 926 927 wg := p.metasyncer.sync(pairs...) 
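// wait for the metasync round to complete only when the associated rmd modifier requested it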
928 if ctx.rmdCtx != nil && ctx.rmdCtx.wait { 929 wg.Wait() 930 } 931 } 932 933 ///////////////////// 934 // PUT /v1/cluster // 935 ///////////////////// 936 937 // - cluster membership, including maintenance and decommission 938 // - start/stop xactions 939 // - rebalance 940 // - cluster-wide configuration 941 // - cluster membership, xactions, rebalance, configuration 942 func (p *proxy) httpcluput(w http.ResponseWriter, r *http.Request) { 943 apiItems, err := p.parseURL(w, r, apc.URLPathClu.L, 0, true) 944 if err != nil { 945 return 946 } 947 if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil { 948 return 949 } 950 if nlog.Stopping() { 951 p.writeErr(w, r, fmt.Errorf("%s is stopping", p), http.StatusServiceUnavailable) 952 return 953 } 954 if !p.NodeStarted() { 955 p.writeErrStatusf(w, r, http.StatusServiceUnavailable, "%s is not ready yet (starting up)", p) 956 return 957 } 958 if len(apiItems) == 0 { 959 p.cluputJSON(w, r) 960 } else { 961 p.cluputQuery(w, r, apiItems[0]) 962 } 963 } 964 965 func (p *proxy) cluputJSON(w http.ResponseWriter, r *http.Request) { 966 msg, err := p.readActionMsg(w, r) 967 if err != nil { 968 return 969 } 970 if msg.Action != apc.ActSendOwnershipTbl { 971 // must be primary to execute all the rest actions 972 if p.forwardCP(w, r, msg, "") { 973 return 974 } 975 976 // not just 'cluster-started' - must be ready to rebalance as well 977 // with two distinct exceptions 978 withRR := (msg.Action != apc.ActShutdownCluster && msg.Action != apc.ActXactStop) 979 if err := p.pready(nil, withRR); err != nil { 980 p.writeErr(w, r, err, http.StatusServiceUnavailable) 981 return 982 } 983 } 984 985 switch msg.Action { 986 case apc.ActSetConfig: 987 toUpdate := &cmn.ConfigToSet{} 988 if err := cos.MorphMarshal(msg.Value, toUpdate); err != nil { 989 p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) 990 return 991 } 992 query := r.URL.Query() 993 if transient := cos.IsParseBool(query.Get(apc.ActTransient)); transient { 994 p.setCluCfgTransient(w, r, toUpdate, msg) 995 } else { 996 p.setCluCfgPersistent(w, r, toUpdate, msg) 997 } 998 case apc.ActResetConfig: 999 p.resetCluCfgPersistent(w, r, msg) 1000 case apc.ActRotateLogs: 1001 p.rotateLogs(w, r, msg) 1002 1003 case apc.ActShutdownCluster: 1004 args := allocBcArgs() 1005 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)} 1006 args.to = core.AllNodes 1007 _ = p.bcastGroup(args) 1008 freeBcArgs(args) 1009 // self 1010 p.termKalive(msg.Action) 1011 p.shutdown(msg.Action) 1012 case apc.ActDecommissionCluster: 1013 var ( 1014 opts apc.ActValRmNode 1015 args = allocBcArgs() 1016 ) 1017 if err := cos.MorphMarshal(msg.Value, &opts); err != nil { 1018 p.writeErr(w, r, err) 1019 return 1020 } 1021 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)} 1022 args.to = core.AllNodes 1023 _ = p.bcastGroup(args) 1024 freeBcArgs(args) 1025 // self 1026 p.termKalive(msg.Action) 1027 p.decommission(msg.Action, &opts) 1028 case apc.ActStartMaintenance, apc.ActDecommissionNode, apc.ActShutdownNode, apc.ActRmNodeUnsafe: 1029 p.rmNode(w, r, msg) 1030 case apc.ActStopMaintenance: 1031 p.stopMaintenance(w, r, msg) 1032 1033 case apc.ActResetStats: 1034 errorsOnly := msg.Value.(bool) 1035 p.statsT.ResetStats(errorsOnly) 1036 args := allocBcArgs() 1037 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: cos.MustMarshal(msg)} 1038 p.bcastAllNodes(w, r, args) 1039 freeBcArgs(args) 1040 case 
apc.ActXactStart: 1041 p.xstart(w, r, msg) 1042 case apc.ActXactStop: 1043 p.xstop(w, r, msg) 1044 case apc.ActSendOwnershipTbl: 1045 p.sendOwnTbl(w, r, msg) 1046 default: 1047 p.writeErrAct(w, r, msg.Action) 1048 } 1049 } 1050 1051 func (p *proxy) setCluCfgPersistent(w http.ResponseWriter, r *http.Request, toUpdate *cmn.ConfigToSet, msg *apc.ActMsg) { 1052 ctx := &configModifier{ 1053 pre: _setConfPre, 1054 final: p._syncConfFinal, 1055 msg: msg, 1056 toUpdate: toUpdate, 1057 wait: true, 1058 } 1059 // NOTE: critical cluster-wide config updates requiring restart (of the cluster) 1060 if toUpdate.Net != nil && toUpdate.Net.HTTP != nil { 1061 config := cmn.GCO.Get() 1062 from, _ := jsoniter.Marshal(config.Net.HTTP) 1063 to, _ := jsoniter.Marshal(toUpdate.Net.HTTP) 1064 whingeToUpdate("net.http", string(from), string(to)) 1065 1066 // complementary 1067 if toUpdate.Net.HTTP.UseHTTPS != nil { 1068 use := *toUpdate.Net.HTTP.UseHTTPS 1069 if config.Net.HTTP.UseHTTPS != use { 1070 if toUpdate.Proxy == nil { 1071 toUpdate.Proxy = &cmn.ProxyConfToSet{} 1072 } 1073 switchHTTPS(toUpdate.Proxy, &config.Proxy, use) 1074 } 1075 } 1076 } 1077 if toUpdate.Auth != nil { 1078 from, _ := jsoniter.Marshal(cmn.GCO.Get().Auth) 1079 to, _ := jsoniter.Marshal(toUpdate.Auth) 1080 whingeToUpdate("config.auth", string(from), string(to)) 1081 } 1082 1083 // do 1084 if _, err := p.owner.config.modify(ctx); err != nil { 1085 p.writeErr(w, r, err) 1086 } 1087 } 1088 1089 // switch http => https, or vice versa 1090 func switchHTTPS(toCfg *cmn.ProxyConfToSet, fromCfg *cmn.ProxyConf, use bool) { 1091 toScheme, fromScheme := "http", "https" 1092 if use { 1093 toScheme, fromScheme = "https", "http" 1094 } 1095 f := func(to *string, from string) *string { 1096 if to == nil && strings.HasPrefix(from, fromScheme) { 1097 s := strings.Replace(from, fromScheme, toScheme, 1) 1098 to = apc.Ptr(s) 1099 } 1100 return to 1101 } 1102 toCfg.PrimaryURL = f(toCfg.PrimaryURL, fromCfg.PrimaryURL) 1103 toCfg.OriginalURL = f(toCfg.OriginalURL, fromCfg.OriginalURL) 1104 toCfg.DiscoveryURL = f(toCfg.DiscoveryURL, fromCfg.DiscoveryURL) 1105 1106 nlog.Errorln("Warning: _prior_ to restart make sure to remove all copies of cluster maps") 1107 } 1108 1109 func whingeToUpdate(what, from, to string) { 1110 nlog.Warningf("Updating cluster %s configuration: setting %s", what, to) 1111 nlog.Warningf("Prior-to-update %s values: %s", what, from) 1112 nlog.Errorln("Warning: this update MAY require cluster restart") 1113 } 1114 1115 func (p *proxy) resetCluCfgPersistent(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1116 if err := p.owner.config.resetDaemonConfig(); err != nil { 1117 p.writeErr(w, r, err) 1118 return 1119 } 1120 body := cos.MustMarshal(msg) 1121 1122 args := allocBcArgs() 1123 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: body} 1124 p.bcastAllNodes(w, r, args) 1125 freeBcArgs(args) 1126 } 1127 1128 func (p *proxy) rotateLogs(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1129 nlog.Flush(nlog.ActRotate) 1130 body := cos.MustMarshal(msg) 1131 args := allocBcArgs() 1132 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: body} 1133 p.bcastAllNodes(w, r, args) 1134 freeBcArgs(args) 1135 } 1136 1137 func (p *proxy) setCluCfgTransient(w http.ResponseWriter, r *http.Request, toUpdate *cmn.ConfigToSet, msg *apc.ActMsg) { 1138 co := p.owner.config 1139 co.Lock() 1140 err := setConfig(toUpdate, true /* transient */) 1141 co.Unlock() 1142 if err != nil { 1143 
p.writeErr(w, r, err) 1144 return 1145 } 1146 1147 msg.Value = toUpdate 1148 args := allocBcArgs() 1149 args.req = cmn.HreqArgs{ 1150 Method: http.MethodPut, 1151 Path: apc.URLPathDae.S, 1152 Body: cos.MustMarshal(msg), 1153 Query: url.Values{apc.ActTransient: []string{"true"}}, 1154 } 1155 p.bcastAllNodes(w, r, args) 1156 freeBcArgs(args) 1157 } 1158 1159 func _setConfPre(ctx *configModifier, clone *globalConfig) (updated bool, err error) { 1160 if err = clone.Apply(ctx.toUpdate, apc.Cluster); err != nil { 1161 return 1162 } 1163 updated = true 1164 return 1165 } 1166 1167 func (p *proxy) _syncConfFinal(ctx *configModifier, clone *globalConfig) { 1168 wg := p.metasyncer.sync(revsPair{clone, p.newAmsg(ctx.msg, nil)}) 1169 if ctx.wait { 1170 wg.Wait() 1171 } 1172 } 1173 1174 // xstart: rebalance, resilver, other "startables" (see xaction/api.go) 1175 func (p *proxy) xstart(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1176 var xargs xact.ArgsMsg 1177 if err := cos.MorphMarshal(msg.Value, &xargs); err != nil { 1178 p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) 1179 return 1180 } 1181 xargs.Kind, _ = xact.GetKindName(xargs.Kind) // display name => kind 1182 1183 // rebalance 1184 if xargs.Kind == apc.ActRebalance { 1185 p.rebalanceCluster(w, r, msg) 1186 return 1187 } 1188 1189 args := allocBcArgs() 1190 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathXactions.S} 1191 1192 switch { 1193 case xargs.Kind == apc.ActBlobDl: 1194 // validate; select one target 1195 args.smap = p.owner.smap.get() 1196 tsi, err := p.blobdl(args.smap, &xargs, msg) 1197 if err != nil { 1198 freeBcArgs(args) 1199 p.writeErr(w, r, err) 1200 return 1201 } 1202 args._selected(tsi) 1203 args.req.Body = cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs, Name: msg.Name}) 1204 case xargs.Kind == apc.ActResilver && xargs.DaemonID != "": 1205 args.smap = p.owner.smap.get() 1206 tsi := args.smap.GetTarget(xargs.DaemonID) 1207 if tsi == nil { 1208 err := &errNodeNotFound{"cannot resilver", xargs.DaemonID, p.si, args.smap} 1209 p.writeErr(w, r, err) 1210 return 1211 } 1212 args._selected(tsi) 1213 args.req.Body = cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs}) 1214 default: 1215 // all targets, one common UUID for all 1216 args.to = core.Targets 1217 xargs.ID = cos.GenUUID() 1218 args.req.Body = cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs}) 1219 } 1220 1221 results := p.bcastGroup(args) 1222 freeBcArgs(args) 1223 1224 for _, res := range results { 1225 if res.err == nil { 1226 if xargs.Kind == apc.ActResilver && xargs.DaemonID != "" { 1227 // - UUID assigned by the selected target (see above) 1228 // - not running notif listener for blob downloads - may reconsider 1229 xargs.ID = string(res.bytes) 1230 } 1231 continue 1232 } 1233 p.writeErr(w, r, res.toErr()) 1234 freeBcastRes(results) 1235 return 1236 } 1237 freeBcastRes(results) 1238 1239 if xargs.ID != "" { 1240 smap := p.owner.smap.get() 1241 nl := xact.NewXactNL(xargs.ID, xargs.Kind, &smap.Smap, nil) 1242 p.ic.registerEqual(regIC{smap: smap, nl: nl}) 1243 1244 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(xargs.ID))) 1245 w.Write([]byte(xargs.ID)) 1246 } 1247 } 1248 1249 func (a *bcastArgs) _selected(tsi *meta.Snode) { 1250 nmap := make(meta.NodeMap, 1) 1251 nmap[tsi.ID()] = tsi 1252 a.nodes = []meta.NodeMap{nmap} 1253 a.to = core.SelectedNodes 1254 } 1255 1256 func (p *proxy) blobdl(smap *smapX, xargs *xact.ArgsMsg, msg *apc.ActMsg) (tsi *meta.Snode, err error) { 
1257 bck := meta.CloneBck(&xargs.Bck) 1258 if err := bck.Init(p.owner.bmd); err != nil { 1259 return nil, err 1260 } 1261 if err := cmn.ValidateRemoteBck(apc.ActBlobDl, &xargs.Bck); err != nil { 1262 return nil, err 1263 } 1264 objName := msg.Name 1265 tsi, _, err = smap.HrwMultiHome(xargs.Bck.MakeUname(objName)) 1266 return tsi, err 1267 } 1268 1269 func (p *proxy) xstop(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1270 var ( 1271 xargs = xact.ArgsMsg{} 1272 ) 1273 if err := cos.MorphMarshal(msg.Value, &xargs); err != nil { 1274 p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) 1275 return 1276 } 1277 xargs.Kind, _ = xact.GetKindName(xargs.Kind) // display name => kind 1278 1279 // (lso + tco) special 1280 p.lstca.abort(&xargs) 1281 1282 if xargs.Kind == apc.ActRebalance { 1283 // disallow aborting rebalance during 1284 // critical (meta.SnodeMaint => meta.SnodeMaintPostReb) and (meta.SnodeDecomm => removed) transitions 1285 smap := p.owner.smap.get() 1286 for _, tsi := range smap.Tmap { 1287 if tsi.Flags.IsAnySet(meta.SnodeMaint) && !tsi.Flags.IsAnySet(meta.SnodeMaintPostReb) { 1288 p.writeErrf(w, r, "cannot abort %s: putting %s in maintenance mode - rebalancing...", 1289 xargs.String(), tsi.StringEx()) 1290 return 1291 } 1292 if tsi.Flags.IsAnySet(meta.SnodeDecomm) { 1293 p.writeErrf(w, r, "cannot abort %s: decommissioning %s - rebalancing...", 1294 xargs.String(), tsi.StringEx()) 1295 return 1296 } 1297 } 1298 } 1299 1300 body := cos.MustMarshal(apc.ActMsg{Action: msg.Action, Value: xargs}) 1301 args := allocBcArgs() 1302 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathXactions.S, Body: body} 1303 args.to = core.Targets 1304 results := p.bcastGroup(args) 1305 freeBcArgs(args) 1306 1307 for _, res := range results { 1308 if res.err != nil { 1309 p.writeErr(w, r, res.toErr()) 1310 break 1311 } 1312 } 1313 freeBcastRes(results) 1314 } 1315 1316 func (p *proxy) rebalanceCluster(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1317 // note operational priority over config-disabled `errRebalanceDisabled` 1318 if err := p.canRebalance(); err != nil && err != errRebalanceDisabled { 1319 p.writeErr(w, r, err) 1320 return 1321 } 1322 smap := p.owner.smap.get() 1323 if smap.CountTargets() < 2 { 1324 p.writeErr(w, r, &errNotEnoughTargets{p.si, smap, 2}) 1325 return 1326 } 1327 if na := smap.CountActiveTs(); na < 2 { 1328 nlog.Warningf("%s: not enough active targets (%d) - proceeding to rebalance anyway", p, na) 1329 } 1330 rmdCtx := &rmdModifier{ 1331 pre: rmdInc, 1332 final: rmdSync, // metasync new rmd instance 1333 p: p, 1334 smapCtx: &smapModifier{smap: smap, msg: msg}, 1335 } 1336 _, err := p.owner.rmd.modify(rmdCtx) 1337 if err != nil { 1338 p.writeErr(w, r, err) 1339 return 1340 } 1341 debug.Assert(rmdCtx.rebID != "") 1342 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(rmdCtx.rebID))) 1343 w.Write([]byte(rmdCtx.rebID)) 1344 } 1345 1346 func (p *proxy) sendOwnTbl(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1347 var ( 1348 smap = p.owner.smap.get() 1349 dstID string 1350 ) 1351 if err := cos.MorphMarshal(msg.Value, &dstID); err != nil { 1352 p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) 1353 return 1354 } 1355 dst := smap.GetProxy(dstID) 1356 if dst == nil { 1357 p.writeErrf(w, r, "%s: unknown proxy node p[%s]", p.si, dstID) 1358 return 1359 } 1360 if !smap.IsIC(dst) { 1361 p.writeErrf(w, r, "%s: not an IC member", dst) 1362 return 1363 } 1364 if smap.IsIC(p.si) && 
!p.si.Eq(dst) { 1365 // node has older version than dst node handle locally 1366 if err := p.ic.sendOwnershipTbl(dst, smap); err != nil { 1367 p.writeErr(w, r, err) 1368 } 1369 return 1370 } 1371 // forward 1372 var ( 1373 err error 1374 cargs = allocCargs() 1375 ) 1376 { 1377 cargs.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathClu.S, Body: cos.MustMarshal(msg)} 1378 cargs.timeout = apc.DefaultTimeout 1379 } 1380 for pid, psi := range smap.Pmap { 1381 if !smap.IsIC(psi) || pid == dstID { 1382 continue 1383 } 1384 cargs.si = psi 1385 res := p.call(cargs, smap) 1386 if res.err != nil { 1387 err = res.toErr() 1388 } 1389 freeCR(res) 1390 } 1391 if err != nil { 1392 p.writeErr(w, r, err) 1393 } 1394 freeCargs(cargs) 1395 } 1396 1397 // gracefully remove node via apc.ActStartMaintenance, apc.ActDecommission, apc.ActShutdownNode 1398 func (p *proxy) rmNode(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1399 var ( 1400 opts apc.ActValRmNode 1401 smap = p.owner.smap.get() 1402 ) 1403 if err := cos.MorphMarshal(msg.Value, &opts); err != nil { 1404 p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) 1405 return 1406 } 1407 si := smap.GetNode(opts.DaemonID) 1408 if si == nil { 1409 err := cos.NewErrNotFound(p, "node "+opts.DaemonID) 1410 p.writeErr(w, r, err, http.StatusNotFound) 1411 return 1412 } 1413 var inMaint bool 1414 if smap.InMaintOrDecomm(si) { 1415 // only (maintenance => decommission|shutdown) permitted 1416 sname := si.StringEx() 1417 switch msg.Action { 1418 case apc.ActDecommissionNode, apc.ActDecommissionCluster, 1419 apc.ActShutdownNode, apc.ActShutdownCluster, apc.ActRmNodeUnsafe: 1420 onl := true 1421 flt := nlFilter{Kind: apc.ActRebalance, OnlyRunning: &onl} 1422 if nl := p.notifs.find(flt); nl != nil { 1423 p.writeErrf(w, r, "rebalance[%s] is currently running, please try (%s %s) later", 1424 nl.UUID(), msg.Action, si.StringEx()) 1425 return 1426 } 1427 if !smap.InMaint(si) { 1428 nlog.Errorln("Warning: " + sname + " is currently being decommissioned") 1429 } 1430 inMaint = true 1431 // proceeding anyway 1432 default: 1433 if smap.InMaint(si) { 1434 p.writeErrMsg(w, r, sname+" is already in maintenance mode") 1435 } else { 1436 p.writeErrMsg(w, r, sname+" is currently being decommissioned") 1437 } 1438 return 1439 } 1440 } 1441 if p.SID() == opts.DaemonID { 1442 p.writeErrf(w, r, "%s is the current primary, cannot perform action %q on itself", p, msg.Action) 1443 return 1444 } 1445 1446 nlog.Infof("%s: %s(%s) opts=%v", p, msg.Action, si.StringEx(), opts) 1447 1448 switch { 1449 case si.IsProxy(): 1450 if _, err := p.mcastMaint(msg, si, false /*reb*/, false /*maintPostReb*/); err != nil { 1451 p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err)) 1452 return 1453 } 1454 ecode, err := p.rmNodeFinal(msg, si, nil) 1455 if err != nil { 1456 p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err), ecode) 1457 } 1458 case msg.Action == apc.ActRmNodeUnsafe: // target unsafe 1459 if !opts.SkipRebalance { 1460 err := errors.New("unsafe must be unsafe") 1461 debug.AssertNoErr(err) 1462 p.writeErr(w, r, err) 1463 return 1464 } 1465 ecode, err := p.rmNodeFinal(msg, si, nil) 1466 if err != nil { 1467 p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err), ecode) 1468 } 1469 default: // target 1470 reb := !opts.SkipRebalance && cmn.GCO.Get().Rebalance.Enabled && !inMaint 1471 nlog.Infof("%s: %s reb=%t", p, msg.Action, reb) 1472 if reb { 1473 if err := p.canRebalance(); err != nil { 1474 p.writeErr(w, r, err) 1475 return 1476 } 
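// run the begin step (beginRmTarget) before rmTarget => mcastMaint (below) commits the Smap/RMD changes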
1477 if err := p.beginRmTarget(si, msg); err != nil { 1478 p.writeErr(w, r, err) 1479 return 1480 } 1481 } 1482 rebID, err := p.rmTarget(si, msg, reb) 1483 if err != nil { 1484 p.writeErr(w, r, cmn.NewErrFailedTo(p, msg.Action, si, err)) 1485 return 1486 } 1487 if rebID != "" { 1488 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(rebID))) 1489 w.Write(cos.UnsafeB(rebID)) 1490 } 1491 } 1492 } 1493 1494 func (p *proxy) rmTarget(si *meta.Snode, msg *apc.ActMsg, reb bool) (rebID string, err error) { 1495 var ctx *smapModifier 1496 if ctx, err = p.mcastMaint(msg, si, reb, false /*maintPostReb*/); err != nil { 1497 return 1498 } 1499 if !reb { 1500 _, err = p.rmNodeFinal(msg, si, ctx) 1501 } else if ctx.rmdCtx != nil { 1502 rebID = ctx.rmdCtx.rebID 1503 if rebID == "" && ctx.gfn { // stop early gfn 1504 aisMsg := p.newAmsgActVal(apc.ActStopGFN, nil) 1505 aisMsg.UUID = si.ID() 1506 revs := revsPair{&smapX{Smap: meta.Smap{Version: ctx.nver}}, aisMsg} 1507 _ = p.metasyncer.notify(false /*wait*/, revs) // async, failed-cnt always zero 1508 } 1509 } 1510 return 1511 } 1512 1513 func (p *proxy) mcastMaint(msg *apc.ActMsg, si *meta.Snode, reb, maintPostReb bool) (ctx *smapModifier, err error) { 1514 var flags cos.BitFlags 1515 switch msg.Action { 1516 case apc.ActDecommissionNode: 1517 flags = meta.SnodeDecomm 1518 case apc.ActShutdownNode, apc.ActStartMaintenance: 1519 flags = meta.SnodeMaint 1520 if maintPostReb { 1521 debug.Assert(si.IsTarget()) 1522 flags |= meta.SnodeMaintPostReb 1523 } 1524 default: 1525 err = fmt.Errorf(fmtErrInvaldAction, msg.Action, 1526 []string{apc.ActDecommissionNode, apc.ActStartMaintenance, apc.ActShutdownNode}) 1527 return 1528 } 1529 var dummy = meta.Snode{Flags: flags} 1530 nlog.Infof("%s mcast-maint: %s, %s reb=(%t, %t), nflags=%s", p, msg, si.StringEx(), reb, maintPostReb, dummy.Fl2S()) 1531 ctx = &smapModifier{ 1532 pre: p._markMaint, 1533 post: p._rebPostRm, // (rmdCtx.rmNode => p.rmNodeFinal when all done) 1534 final: p._syncFinal, 1535 sid: si.ID(), 1536 flags: flags, 1537 msg: msg, 1538 skipReb: !reb, 1539 } 1540 if err = p._earlyGFN(ctx, si); err != nil { 1541 return 1542 } 1543 if err = p.owner.smap.modify(ctx); err != nil { 1544 debug.AssertNoErr(err) 1545 return 1546 } 1547 return 1548 } 1549 1550 func (p *proxy) _markMaint(ctx *smapModifier, clone *smapX) error { 1551 if !clone.isPrimary(p.si) { 1552 return newErrNotPrimary(p.si, clone, fmt.Sprintf("cannot put %s in maintenance", ctx.sid)) 1553 } 1554 clone.setNodeFlags(ctx.sid, ctx.flags) 1555 clone.staffIC() 1556 return nil 1557 } 1558 1559 func (p *proxy) _rebPostRm(ctx *smapModifier, clone *smapX) { 1560 if ctx.skipReb { 1561 return 1562 } 1563 if !mustRebalance(ctx, clone) { 1564 return 1565 } 1566 rmdCtx := &rmdModifier{ 1567 pre: rmdInc, 1568 p: p, 1569 smapCtx: ctx, 1570 wait: true, 1571 } 1572 if _, err := p.owner.rmd.modify(rmdCtx); err != nil { 1573 debug.AssertNoErr(err) 1574 return 1575 } 1576 rmdCtx.listen(rmdCtx.postRm) 1577 ctx.rmdCtx = rmdCtx 1578 } 1579 1580 func (p *proxy) stopMaintenance(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) { 1581 var ( 1582 opts apc.ActValRmNode 1583 smap = p.owner.smap.get() 1584 ) 1585 if err := cos.MorphMarshal(msg.Value, &opts); err != nil { 1586 p.writeErrf(w, r, cmn.FmtErrMorphUnmarshal, p.si, msg.Action, msg.Value, err) 1587 return 1588 } 1589 si := smap.GetNode(opts.DaemonID) 1590 if si == nil { 1591 err := cos.NewErrNotFound(p, "node "+opts.DaemonID) 1592 p.writeErr(w, r, err, http.StatusNotFound) 1593 return 1594 } 1595 1596 
nlog.Infof("%s: %s(%s) opts=%v", p, msg.Action, si.StringEx(), opts) 1597 1598 if !smap.InMaint(si) { 1599 p.writeErrf(w, r, "node %s is not in maintenance mode - nothing to do", si.StringEx()) 1600 return 1601 } 1602 timeout := cmn.GCO.Get().Timeout.CplaneOperation.D() 1603 if _, status, err := p.reqHealth(si, timeout, nil, smap); err != nil { 1604 sleep, retries := timeout/2, 5 1605 time.Sleep(sleep) 1606 for range retries { // retry 1607 time.Sleep(sleep) 1608 _, status, err = p.reqHealth(si, timeout, nil, smap) 1609 if err == nil { 1610 break 1611 } 1612 if status != http.StatusServiceUnavailable { 1613 p.writeErrf(w, r, "%s is unreachable: %v(%d)", si, err, status) 1614 return 1615 } 1616 } 1617 if err != nil { 1618 debug.Assert(status == http.StatusServiceUnavailable) 1619 nlog.Errorf("%s: node %s takes unusually long time to start: %v(%d) - proceeding anyway", 1620 p.si, si, err, status) 1621 } 1622 } 1623 1624 rebID, err := p.mcastStopMaint(msg, &opts) 1625 if err != nil { 1626 p.writeErr(w, r, err) 1627 return 1628 } 1629 if rebID != "" { 1630 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(rebID))) 1631 w.Write(cos.UnsafeB(rebID)) 1632 } 1633 } 1634 1635 func (p *proxy) cluputQuery(w http.ResponseWriter, r *http.Request, action string) { 1636 if p.forwardCP(w, r, &apc.ActMsg{Action: action}, "") { 1637 return 1638 } 1639 switch action { 1640 case apc.Proxy: 1641 if err := p.pready(nil, true); err != nil { 1642 p.writeErr(w, r, err, http.StatusServiceUnavailable) 1643 return 1644 } 1645 // cluster-wide: designate a new primary proxy administratively 1646 p.cluSetPrimary(w, r) 1647 case apc.ActSetConfig: // set-config via query parameters and "?n1=v1&n2=v2..." 1648 if err := p.pready(nil, true); err != nil { 1649 p.writeErr(w, r, err, http.StatusServiceUnavailable) 1650 return 1651 } 1652 var ( 1653 query = r.URL.Query() 1654 toUpdate = &cmn.ConfigToSet{} 1655 msg = &apc.ActMsg{Action: action} 1656 ) 1657 if err := toUpdate.FillFromQuery(query); err != nil { 1658 p.writeErrf(w, r, err.Error()) 1659 return 1660 } 1661 if transient := cos.IsParseBool(query.Get(apc.ActTransient)); transient { 1662 p.setCluCfgTransient(w, r, toUpdate, msg) 1663 } else { 1664 p.setCluCfgPersistent(w, r, toUpdate, msg) 1665 } 1666 case apc.ActAttachRemAis, apc.ActDetachRemAis: 1667 p.attachDetachRemAis(w, r, action, r.URL.Query()) 1668 } 1669 } 1670 1671 func (p *proxy) attachDetachRemAis(w http.ResponseWriter, r *http.Request, action string, query url.Values) { 1672 what := query.Get(apc.QparamWhat) 1673 if what != apc.WhatRemoteAIS { 1674 p.writeErr(w, r, fmt.Errorf(fmtUnknownQue, what)) 1675 return 1676 } 1677 if !p.ClusterStarted() { 1678 const fmerr = "(config-backends modifying) remote cluster: (%t, %s)" 1679 var timeout time.Duration 1680 for { 1681 time.Sleep(cmn.Rom.MaxKeepalive()) 1682 timeout += cmn.Rom.MaxKeepalive() 1683 config := cmn.GCO.Get() 1684 if p.ClusterStarted() { 1685 break 1686 } 1687 if timeout > config.Timeout.Startup.D()/2 { 1688 p.writeErr(w, r, fmt.Errorf("%s: failed to attach "+fmerr, p, p.ClusterStarted(), config)) 1689 return 1690 } 1691 nlog.Errorf("%s: waiting to attach "+fmerr, p, p.ClusterStarted(), config) 1692 } 1693 } 1694 ctx := &configModifier{ 1695 pre: p._remaisConf, 1696 final: p._syncConfFinal, 1697 msg: &apc.ActMsg{Action: action}, 1698 query: query, 1699 hdr: r.Header, 1700 wait: true, 1701 } 1702 newConfig, err := p.owner.config.modify(ctx) 1703 if err != nil { 1704 p.writeErr(w, r, err) 1705 } else if newConfig != nil { 1706 go 
p._remais(&newConfig.ClusterConfig, false) 1707 } 1708 } 1709 1710 // the flow: attach/detach remais => modify cluster config => _remaisConf as the pre phase 1711 // of the transaction 1712 func (p *proxy) _remaisConf(ctx *configModifier, config *globalConfig) (bool, error) { 1713 var ( 1714 aisConf cmn.BackendConfAIS 1715 action = ctx.msg.Action 1716 v = config.Backend.Get(apc.AIS) 1717 ) 1718 if v == nil { 1719 if action == apc.ActDetachRemAis { 1720 return false, fmt.Errorf("%s: remote cluster config is empty", p.si) 1721 } 1722 aisConf = make(cmn.BackendConfAIS) 1723 } else { 1724 aisConf = cmn.BackendConfAIS{} 1725 cos.MustMorphMarshal(v, &aisConf) 1726 } 1727 1728 alias := ctx.hdr.Get(apc.HdrRemAisAlias) 1729 if action == apc.ActDetachRemAis { 1730 if _, ok := aisConf[alias]; !ok { 1731 return false, 1732 cmn.NewErrFailedTo(p, action, "remote cluster", errors.New("not found"), http.StatusNotFound) 1733 } 1734 delete(aisConf, alias) 1735 if len(aisConf) == 0 { 1736 aisConf = nil // unconfigure 1737 } 1738 } else { 1739 debug.Assert(action == apc.ActAttachRemAis) 1740 u := ctx.hdr.Get(apc.HdrRemAisURL) 1741 detail := fmt.Sprintf("remote cluster [alias %s => %v]", alias, u) 1742 1743 // validation rules: 1744 // rule #1: no two remote ais clusters can share the same alias (TODO: allow configuring multiple URLs per) 1745 for a, urls := range aisConf { 1746 if a != alias { 1747 continue 1748 } 1749 errmsg := fmt.Sprintf("%s: %s is already attached", p.si, detail) 1750 if !cos.StringInSlice(u, urls) { 1751 return false, errors.New(errmsg) 1752 } 1753 nlog.Warningln(errmsg + " - proceeding anyway") 1754 } 1755 // rule #2: aliases and UUIDs are two distinct non-overlapping sets 1756 p.remais.mu.RLock() 1757 for _, remais := range p.remais.A { 1758 debug.Assert(remais.Alias != alias) 1759 if alias == remais.UUID { 1760 p.remais.mu.RUnlock() 1761 return false, fmt.Errorf("%s: alias %q cannot be equal UUID of an already attached cluster [%s => %s]", 1762 p.si, alias, remais.Alias, remais.UUID) 1763 } 1764 } 1765 p.remais.mu.RUnlock() 1766 1767 parsed, err := url.ParseRequestURI(u) 1768 if err != nil { 1769 return false, cmn.NewErrFailedTo(p, action, detail, err) 1770 } 1771 if parsed.Scheme != "http" && parsed.Scheme != "https" { 1772 return false, cmn.NewErrFailedTo(p, action, detail, errors.New("invalid URL scheme")) 1773 } 1774 nlog.Infof("%s: %s %s", p, action, detail) 1775 aisConf[alias] = []string{u} 1776 } 1777 config.Backend.Set(apc.AIS, aisConf) 1778 1779 return true, nil 1780 } 1781 1782 func (p *proxy) mcastStopMaint(msg *apc.ActMsg, opts *apc.ActValRmNode) (rebID string, err error) { 1783 nlog.Infof("%s mcast-stopm: %s, %s, skip-reb=%t", p, msg, opts.DaemonID, opts.SkipRebalance) 1784 ctx := &smapModifier{ 1785 pre: p._stopMaintPre, 1786 post: p._stopMaintRMD, 1787 final: p._syncFinal, 1788 sid: opts.DaemonID, 1789 skipReb: opts.SkipRebalance, 1790 msg: msg, 1791 flags: meta.SnodeMaint | meta.SnodeMaintPostReb, // to clear node flags 1792 } 1793 err = p.owner.smap.modify(ctx) 1794 if ctx.rmdCtx != nil && ctx.rmdCtx.cur != nil { 1795 debug.Assert(ctx.rmdCtx.cur.version() > ctx.rmdCtx.prev.version() && ctx.rmdCtx.rebID != "") 1796 rebID = ctx.rmdCtx.rebID 1797 } 1798 return 1799 } 1800 1801 func (p *proxy) _stopMaintPre(ctx *smapModifier, clone *smapX) error { 1802 const efmt = "cannot take %s out of maintenance:" 1803 if !clone.isPrimary(p.si) { 1804 return newErrNotPrimary(p.si, clone, fmt.Sprintf(efmt, ctx.sid)) 1805 } 1806 node := clone.GetNode(ctx.sid) 1807 if node == nil { 1808 
ctx.status = http.StatusNotFound 1809 return &errNodeNotFound{fmt.Sprintf(efmt, ctx.sid), ctx.sid, p.si, clone} 1810 } 1811 clone.clearNodeFlags(ctx.sid, ctx.flags) 1812 if node.IsProxy() { 1813 clone.staffIC() 1814 } 1815 return nil 1816 } 1817 1818 func (p *proxy) _stopMaintRMD(ctx *smapModifier, clone *smapX) { 1819 if ctx.skipReb { 1820 nlog.Infoln("ctx.skip-reb", ctx.skipReb) 1821 return 1822 } 1823 if !cmn.GCO.Get().Rebalance.Enabled { 1824 return 1825 } 1826 if nlog.Stopping() { 1827 return 1828 } 1829 if clone.CountActiveTs() < 2 { 1830 return 1831 } 1832 rmdCtx := &rmdModifier{ 1833 pre: rmdInc, 1834 smapCtx: ctx, 1835 p: p, 1836 wait: true, 1837 } 1838 if _, err := p.owner.rmd.modify(rmdCtx); err != nil { 1839 debug.AssertNoErr(err) 1840 return 1841 } 1842 rmdCtx.listen(nil) 1843 ctx.rmdCtx = rmdCtx 1844 } 1845 1846 func (p *proxy) cluSetPrimary(w http.ResponseWriter, r *http.Request) { 1847 apiItems, err := p.parseURL(w, r, apc.URLPathCluProxy.L, 1, false) 1848 if err != nil { 1849 return 1850 } 1851 npid := apiItems[0] 1852 if p.forwardCP(w, r, nil, "designate new primary proxy '"+npid+"'") { 1853 return 1854 } 1855 1856 // am current primary - validating 1857 smap := p.owner.smap.get() 1858 npsi := smap.GetProxy(npid) 1859 if npsi == nil { 1860 p.writeErrf(w, r, "new primary proxy %s is not present in the %s", npid, smap.StringEx()) 1861 return 1862 } 1863 if npid == p.SID() { 1864 debug.Assert(p.SID() == smap.Primary.ID()) // must be forwardCP-ed 1865 nlog.Warningf("Request to set primary to %s(self) - nothing to do", npid) 1866 return 1867 } 1868 if smap.InMaintOrDecomm(npsi) { 1869 var err error 1870 if smap.InMaint(npsi) { 1871 err = fmt.Errorf("%s cannot become the new primary as it's currently under maintenance", npsi) 1872 } else { 1873 err = fmt.Errorf("%s cannot become the new primary as it's currently being decommissioned", npsi) 1874 } 1875 debug.AssertNoErr(err) 1876 p.writeErr(w, r, err, http.StatusServiceUnavailable) 1877 return 1878 } 1879 1880 // executing 1881 if p.settingNewPrimary.CAS(false, true) { 1882 p._setPrimary(w, r, npsi) 1883 p.settingNewPrimary.Store(false) 1884 } 1885 } 1886 1887 func (p *proxy) _setPrimary(w http.ResponseWriter, r *http.Request, npsi *meta.Snode) { 1888 // 1889 // (I.1) Prepare phase - inform other nodes. 1890 // 1891 urlPath := apc.URLPathDaeProxy.Join(npsi.ID()) 1892 q := url.Values{} 1893 q.Set(apc.QparamPrepare, "true") 1894 args := allocBcArgs() 1895 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: q} 1896 1897 cluMeta, errM := p.cluMeta(cmetaFillOpt{skipSmap: true, skipPrimeTime: true}) 1898 if errM != nil { 1899 p.writeErr(w, r, errM) 1900 return 1901 } 1902 args.req.Body = cos.MustMarshal(cluMeta) 1903 1904 args.to = core.AllNodes 1905 results := p.bcastGroup(args) 1906 freeBcArgs(args) 1907 for _, res := range results { 1908 if res.err == nil { 1909 continue 1910 } 1911 err := res.errorf("node %s failed to set primary %s in the prepare phase", res.si, npsi.StringEx()) 1912 p.writeErr(w, r, err) 1913 freeBcastRes(results) 1914 return 1915 } 1916 freeBcastRes(results) 1917 1918 // 1919 // (I.2) Prepare phase - local changes. 1920 // 1921 err := p.owner.smap.modify(&smapModifier{pre: func(_ *smapModifier, clone *smapX) error { 1922 clone.Primary = npsi 1923 p.metasyncer.becomeNonPrimary() 1924 return nil 1925 }}) 1926 debug.AssertNoErr(err) 1927 1928 // 1929 // (II) Commit phase. 

func (p *proxy) cluSetPrimary(w http.ResponseWriter, r *http.Request) {
	apiItems, err := p.parseURL(w, r, apc.URLPathCluProxy.L, 1, false)
	if err != nil {
		return
	}
	npid := apiItems[0]
	if p.forwardCP(w, r, nil, "designate new primary proxy '"+npid+"'") {
		return
	}

	// am current primary - validating
	smap := p.owner.smap.get()
	npsi := smap.GetProxy(npid)
	if npsi == nil {
		p.writeErrf(w, r, "new primary proxy %s is not present in the %s", npid, smap.StringEx())
		return
	}
	if npid == p.SID() {
		debug.Assert(p.SID() == smap.Primary.ID()) // must be forwardCP-ed
		nlog.Warningf("Request to set primary to %s(self) - nothing to do", npid)
		return
	}
	if smap.InMaintOrDecomm(npsi) {
		var err error
		if smap.InMaint(npsi) {
			err = fmt.Errorf("%s cannot become the new primary as it's currently under maintenance", npsi)
		} else {
			err = fmt.Errorf("%s cannot become the new primary as it's currently being decommissioned", npsi)
		}
		debug.AssertNoErr(err)
		p.writeErr(w, r, err, http.StatusServiceUnavailable)
		return
	}

	// executing
	if p.settingNewPrimary.CAS(false, true) {
		p._setPrimary(w, r, npsi)
		p.settingNewPrimary.Store(false)
	}
}

func (p *proxy) _setPrimary(w http.ResponseWriter, r *http.Request, npsi *meta.Snode) {
	//
	// (I.1) Prepare phase - inform other nodes.
	//
	urlPath := apc.URLPathDaeProxy.Join(npsi.ID())
	q := url.Values{}
	q.Set(apc.QparamPrepare, "true")
	args := allocBcArgs()
	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: q}

	cluMeta, errM := p.cluMeta(cmetaFillOpt{skipSmap: true, skipPrimeTime: true})
	if errM != nil {
		p.writeErr(w, r, errM)
		return
	}
	args.req.Body = cos.MustMarshal(cluMeta)

	args.to = core.AllNodes
	results := p.bcastGroup(args)
	freeBcArgs(args)
	for _, res := range results {
		if res.err == nil {
			continue
		}
		err := res.errorf("node %s failed to set primary %s in the prepare phase", res.si, npsi.StringEx())
		p.writeErr(w, r, err)
		freeBcastRes(results)
		return
	}
	freeBcastRes(results)

	//
	// (I.2) Prepare phase - local changes.
	//
	err := p.owner.smap.modify(&smapModifier{pre: func(_ *smapModifier, clone *smapX) error {
		clone.Primary = npsi
		p.metasyncer.becomeNonPrimary()
		return nil
	}})
	debug.AssertNoErr(err)

	//
	// (II) Commit phase.
	//
	q.Set(apc.QparamPrepare, "false")
	args = allocBcArgs()
	args.req = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: q}
	args.to = core.AllNodes
	results = p.bcastGroup(args)
	freeBcArgs(args)
	for _, res := range results {
		if res.err == nil {
			continue
		}
		if res.si.ID() == npsi.ID() {
			cos.ExitLogf("commit phase failure: new primary %s returned %v", npsi.StringEx(), res.err)
		} else {
			nlog.Errorf("Commit phase failure: %s returned %v when setting primary = %s",
				res.si.ID(), res.err, npsi.StringEx())
		}
	}
	freeBcastRes(results)
}
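
// NOTE (editorial, illustrative only): _setPrimary above runs a two-phase
// protocol - the same PUT is broadcast to all nodes twice, first with
// prepare=true and then with prepare=false (commit). The sketch below merely
// materializes those two request shapes for a hypothetical new-primary ID;
// it performs no broadcast and is not part of the original source.
func examplePrimaryPhases(npid string) (prepare, commit cmn.HreqArgs) {
	urlPath := apc.URLPathDaeProxy.Join(npid)

	qPrep := url.Values{}
	qPrep.Set(apc.QparamPrepare, "true")
	prepare = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: qPrep}

	qCommit := url.Values{}
	qCommit.Set(apc.QparamPrepare, "false")
	commit = cmn.HreqArgs{Method: http.MethodPut, Path: urlPath, Query: qCommit}
	return
}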

//////////////////////////////////////////
// DELETE /v1/cluster - self-unregister //
//////////////////////////////////////////

func (p *proxy) httpcludel(w http.ResponseWriter, r *http.Request) {
	apiItems, err := p.parseURL(w, r, apc.URLPathCluDaemon.L, 1, false)
	if err != nil {
		return
	}
	var (
		sid  = apiItems[0]
		smap = p.owner.smap.get()
		node = smap.GetNode(sid)
	)
	if node == nil {
		err = &errNodeNotFound{"cannot remove", sid, p.si, smap}
		p.writeErr(w, r, err, http.StatusNotFound)
		return
	}
	if smap.IsPrimary(node) {
		p.writeErrMsg(w, r, "cannot remove primary proxy", http.StatusBadRequest)
		return
	}
	if p.forwardCP(w, r, nil, sid) {
		return
	}
	if !p.NodeStarted() {
		p.writeErrStatusf(w, r, http.StatusServiceUnavailable, "%s is not ready yet (starting up)", p)
		return
	}

	// primary (and cluster) to start and finalize rebalancing status _prior_ to removing individual nodes
	if err := p.pready(smap, true); err != nil {
		p.writeErr(w, r, err, http.StatusServiceUnavailable)
		return
	}

	if err := p.checkAccess(w, r, nil, apc.AceAdmin); err != nil {
		return
	}
	if err := p.isIntraCall(r.Header, false /*from primary*/); err != nil {
		err = fmt.Errorf("expecting intra-cluster call for self-initiated removal, got %w", err)
		p.writeErr(w, r, err)
		return
	}
	cid := r.Header.Get(apc.HdrCallerID)
	if cid != sid {
		err = fmt.Errorf("expecting self-initiated removal (%s != %s)", cid, sid)
		p.writeErr(w, r, err)
		return
	}
	if ecode, err := p.mcastUnreg(&apc.ActMsg{Action: "self-initiated-removal"}, node); err != nil {
		p.writeErr(w, r, err, ecode)
	}
}

// post-rebalance or post no-rebalance - last step removing a node
// (with msg.Action defining semantics)
func (p *proxy) rmNodeFinal(msg *apc.ActMsg, si *meta.Snode, ctx *smapModifier) (int, error) {
	var (
		smap    = p.owner.smap.get()
		node    = smap.GetNode(si.ID())
		timeout = cmn.Rom.CplaneOperation()
	)
	if node == nil {
		txt := "cannot \"" + msg.Action + "\""
		return http.StatusNotFound, &errNodeNotFound{txt, si.ID(), p.si, smap}
	}

	var (
		err   error
		ecode int
		cargs = allocCargs()
		body  = cos.MustMarshal(msg)
		sname = node.StringEx()
	)
	cargs.si, cargs.timeout = node, timeout
	switch msg.Action {
	case apc.ActShutdownNode, apc.ActRmNodeUnsafe, apc.ActStartMaintenance, apc.ActDecommissionNode:
		cargs.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathDae.S, Body: body}
	default:
		return 0, fmt.Errorf(fmtErrInvaldAction, msg.Action,
			[]string{apc.ActShutdownNode, apc.ActStartMaintenance, apc.ActDecommissionNode, apc.ActRmNodeUnsafe})
	}

	nlog.InfoDepth(1, p.String()+":", msg.Action, sname)
	res := p.call(cargs, smap)
	err = res.unwrap()
	freeCargs(cargs)
	freeCR(res)

	if err != nil {
		emsg := fmt.Sprintf("%s: (%s %s) final: %v - proceeding anyway...", p, msg, sname, err)
		switch msg.Action {
		case apc.ActShutdownNode, apc.ActDecommissionNode: // expecting EOF
			if !cos.IsEOF(err) {
				nlog.Errorln(emsg)
			}
		case apc.ActRmNodeUnsafe:
			if cmn.Rom.FastV(4, cos.SmoduleAIS) {
				nlog.Errorln(emsg)
			}
		default:
			nlog.Errorln(emsg)
		}
		err = nil // NOTE: proceeding anyway
	}

	switch msg.Action {
	case apc.ActDecommissionNode, apc.ActRmNodeUnsafe:
		ecode, err = p.mcastUnreg(msg, node)
	case apc.ActStartMaintenance, apc.ActShutdownNode:
		if ctx != nil && ctx.rmdCtx != nil && ctx.rmdCtx.rebID != "" {
			// final step executing shutdown and start-maintenance transaction:
			// setting si.Flags |= cluster.SnodeMaintPostReb
			// (compare w/ rmTarget --> p.mcastMaint above)
			_, err = p.mcastMaint(msg, node, false /*reb*/, true /*maintPostReb*/)
		}
	}
	if err != nil {
		nlog.Errorf("%s: (%s %s) FATAL: failed to update %s: %v", p, msg, sname, p.owner.smap.get(), err)
	}
	return ecode, err
}

func (p *proxy) mcastUnreg(msg *apc.ActMsg, si *meta.Snode) (ecode int, err error) {
	nlog.Infof("%s mcast-unreg: %s, %s", p, msg, si.StringEx())
	ctx := &smapModifier{
		pre:     p._unregNodePre,
		final:   p._syncFinal,
		msg:     msg,
		sid:     si.ID(),
		skipReb: true,
	}
	err = p.owner.smap.modify(ctx)
	return ctx.status, err
}

func (p *proxy) _unregNodePre(ctx *smapModifier, clone *smapX) error {
	const verb = "remove"
	sid := ctx.sid
	if !clone.isPrimary(p.si) {
		return newErrNotPrimary(p.si, clone, fmt.Sprintf("cannot cancel %s %s", verb, sid))
	}
	node := clone.GetNode(sid)
	if node == nil {
		ctx.status = http.StatusNotFound
		return &errNodeNotFound{"failed to " + verb, sid, p.si, clone}
	}
	if node.IsProxy() {
		clone.delProxy(sid)
		nlog.Infof("%s %s (num proxies %d)", verb, node.StringEx(), clone.CountProxies())
		clone.staffIC()
	} else {
		clone.delTarget(sid)
		nlog.Infof("%s %s (num targets %d)", verb, node.StringEx(), clone.CountTargets())
	}
	p.rproxy.nodes.Delete(ctx.sid)
	return nil
}
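
// NOTE (editorial, illustrative only): httpcludel above accepts a node's
// self-initiated removal only when the caller ID equals the node ID in the
// URL path. A minimal sketch of that request; the base-URL concatenation is
// simplified and the remaining intra-cluster auth headers are omitted here.
func exampleSelfRemoveReq(proxyURL, sid string) (*http.Request, error) {
	req, err := http.NewRequest(http.MethodDelete, proxyURL+"/"+apc.URLPathCluDaemon.Join(sid), http.NoBody)
	if err != nil {
		return nil, err
	}
	req.Header.Set(apc.HdrCallerID, sid) // must match the sid being removed
	return req, nil
}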

// rebalance's `can`: factors not including cluster map
func (p *proxy) canRebalance() (err error) {
	if nlog.Stopping() {
		return fmt.Errorf("%s is stopping", p)
	}
	smap := p.owner.smap.get()
	if err = smap.validate(); err != nil {
		return
	}
	if !smap.IsPrimary(p.si) {
		err = newErrNotPrimary(p.si, smap)
		debug.AssertNoErr(err)
		return
	}
	// NOTE: cluster startup handles rebalance elsewhere (see p.resumeReb), and so
	// all rebalance-triggering events (shutdown, decommission, maintenance, etc.)
	// are not permitted and will fail during startup.
	if err = p.pready(smap, true); err != nil {
		return
	}
	if !cmn.GCO.Get().Rebalance.Enabled {
		err = errRebalanceDisabled
	}
	return
}

// rebalance's `must`: compares previous and current (cloned, updated) Smap
// TODO: bmd.num-buckets == 0 would be an easy one to check
func mustRebalance(ctx *smapModifier, cur *smapX) bool {
	if !cmn.GCO.Get().Rebalance.Enabled {
		return false
	}
	if nlog.Stopping() {
		return false
	}
	prev := ctx.smap
	if prev.CountActiveTs() == 0 {
		return false
	}
	if ctx.interrupted || ctx.restarted {
		return true
	}

	// active <=> inactive transition
	debug.Assert(prev.version() < cur.version())
	for _, tsi := range cur.Tmap {
		// added an active one or activated previously inactive
		if !tsi.InMaintOrDecomm() && prev.GetActiveNode(tsi.ID()) == nil {
			return true
		}
	}
	for _, tsi := range prev.Tmap {
		// removed an active one or deactivated previously active
		if !tsi.InMaintOrDecomm() && cur.GetActiveNode(tsi.ID()) == nil {
			return true
		}
	}
	return false
}
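
// NOTE (editorial, illustrative only): the two loops in mustRebalance above
// reduce, roughly, to a symmetric-difference check over the sets of active
// target IDs in the previous vs. the current Smap. The sketch below restates
// that check over plain ID sets, detached from the smapX/Snode types.
func exampleActiveSetDelta(prevActive, curActive map[string]struct{}) bool {
	for id := range curActive {
		if _, ok := prevActive[id]; !ok {
			return true // gained an active target => rebalance
		}
	}
	for id := range prevActive {
		if _, ok := curActive[id]; !ok {
			return true // lost an active target => rebalance
		}
	}
	return false
}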