github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/vote.go (about) 1 // Package ais provides core functionality for the AIStore object storage. 2 /* 3 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package ais 6 7 import ( 8 "fmt" 9 "net/http" 10 "net/url" 11 "runtime" 12 "strconv" 13 "time" 14 15 "github.com/NVIDIA/aistore/api/apc" 16 "github.com/NVIDIA/aistore/cmn" 17 "github.com/NVIDIA/aistore/cmn/cos" 18 "github.com/NVIDIA/aistore/cmn/debug" 19 "github.com/NVIDIA/aistore/cmn/nlog" 20 "github.com/NVIDIA/aistore/core" 21 "github.com/NVIDIA/aistore/core/meta" 22 "github.com/NVIDIA/aistore/xact/xreg" 23 "github.com/NVIDIA/aistore/xact/xs" 24 ) 25 26 const ( 27 VoteYes Vote = "YES" 28 VoteNo Vote = "NO" 29 ) 30 31 const maxRetryElectReq = 3 32 33 type ( 34 Vote string 35 36 VoteRecord struct { 37 Candidate string `json:"candidate"` 38 Primary string `json:"primary"` 39 Smap *smapX `json:"smap"` 40 StartTime time.Time `json:"start_time"` 41 Initiator string `json:"initiator"` 42 } 43 44 VoteInitiation VoteRecord 45 VoteResult VoteRecord 46 47 VoteMessage struct { 48 Record VoteRecord `json:"vote_record"` 49 } 50 51 VoteInitiationMessage struct { 52 Request VoteInitiation `json:"vote_initiation"` 53 } 54 55 VoteResultMessage struct { 56 Result VoteResult `json:"vote_result"` 57 } 58 59 voteResult struct { 60 yes bool 61 daemonID string 62 err error 63 } 64 ) 65 66 func voteInProgress() (xele core.Xact) { 67 if e := xreg.GetRunning(xreg.Flt{Kind: apc.ActElection}); e != nil { 68 xele = e.Get() 69 } 70 return 71 } 72 73 // 74 // voting: proxy 75 // 76 77 // [METHOD] /v1/vote 78 func (p *proxy) voteHandler(w http.ResponseWriter, r *http.Request) { 79 if r.Method != http.MethodGet && r.Method != http.MethodPut { 80 cmn.WriteErr405(w, r, http.MethodGet, http.MethodPut) 81 return 82 } 83 apiItems, err := p.parseURL(w, r, apc.URLPathVote.L, 1, false) 84 if err != nil { 85 return 86 } 87 item := apiItems[0] 88 if !p.NodeStarted() { 89 w.WriteHeader(http.StatusServiceUnavailable) 90 return 91 } 92 // MethodGet 93 if r.Method == http.MethodGet { 94 if item != apc.Proxy { 95 p.writeErrURL(w, r) 96 return 97 } 98 p.httpgetvote(w, r) 99 return 100 } 101 // MethodPut 102 switch item { 103 case apc.Voteres: 104 p.httpsetprimary(w, r) 105 case apc.VoteInit: 106 p.httpelect(w, r) 107 case apc.PriStop: 108 callerID := r.Header.Get(apc.HdrCallerID) 109 p.onPrimaryDown(p, callerID) 110 default: 111 p.writeErrURL(w, r) 112 } 113 } 114 115 // PUT /v1/vote/init (via sendElectionRequest) 116 func (p *proxy) httpelect(w http.ResponseWriter, r *http.Request) { 117 if _, err := p.parseURL(w, r, apc.URLPathVoteInit.L, 0, false); err != nil { 118 return 119 } 120 msg := VoteInitiationMessage{} 121 if err := cmn.ReadJSON(w, r, &msg); err != nil { 122 return 123 } 124 newSmap := msg.Request.Smap 125 if err := newSmap.validate(); err != nil { 126 p.writeErrf(w, r, "%s: invalid %s in the Vote Request, err: %v", p.si, newSmap, err) 127 return 128 } 129 smap := p.owner.smap.get() 130 caller := r.Header.Get(apc.HdrCallerName) 131 nlog.Infof("[vote] receive %s from %q (local: %s)", newSmap.StringEx(), caller, smap.StringEx()) 132 133 if !newSmap.isPresent(p.si) { 134 p.writeErrf(w, r, "%s: not present in the Vote Request, %s", p.si, newSmap) 135 return 136 } 137 debug.Assert(!newSmap.isPrimary(p.si)) 138 139 if err := p.owner.smap.synchronize(p.si, newSmap, nil /*ms payload*/, p.htrun.smapUpdatedCB); err != nil { 140 if isErrDowngrade(err) { 141 psi := newSmap.GetProxy(msg.Request.Candidate) 142 psi2 := p.owner.smap.get().GetProxy(msg.Request.Candidate) 143 if psi2.Eq(psi) { 144 err = nil 145 } 146 } 147 if err != nil { 148 p.writeErr(w, r, cmn.NewErrFailedTo(p, "synchronize", newSmap, err)) 149 return 150 } 151 } 152 153 smap = p.owner.smap.get() 154 psi, err := smap.HrwProxy(smap.Primary.ID()) 155 if err != nil { 156 p.writeErr(w, r, err) 157 return 158 } 159 160 // proceed with election iff: 161 if psi.ID() != p.SID() { 162 nlog.Warningf("%s: not next in line %s", p, psi) 163 return 164 } 165 if !p.ClusterStarted() { 166 nlog.Warningf("%s: not ready yet to be elected - starting up", p) 167 w.WriteHeader(http.StatusServiceUnavailable) 168 return 169 } 170 171 vr := &VoteRecord{ 172 Candidate: msg.Request.Candidate, 173 Primary: msg.Request.Primary, 174 StartTime: time.Now(), 175 Initiator: p.SID(), 176 } 177 // include resulting Smap in the response 178 vr.Smap = p.owner.smap.get() 179 180 // xaction (minimal and, unlike target xactions, not visible via API (TODO)) 181 go p.startElection(vr) 182 } 183 184 // Election Functions 185 186 func (p *proxy) startElection(vr *VoteRecord) { 187 if p.owner.smap.get().isPrimary(p.si) { 188 nlog.Infof("%s: already in primary state", p) 189 return 190 } 191 rns := xreg.RenewElection() 192 if rns.Err != nil { 193 nlog.Errorf("%s: %+v %v", p, vr, rns.Err) 194 debug.AssertNoErr(rns.Err) 195 return 196 } 197 if rns.IsRunning() { 198 return 199 } 200 xctn := rns.Entry.Get() 201 xele, ok := xctn.(*xs.Election) 202 debug.Assert(ok) 203 nlog.Infoln(xele.Name()) 204 p.elect(vr, xele) 205 xele.Finish() 206 } 207 208 func (p *proxy) elect(vr *VoteRecord, xele *xs.Election) { 209 var ( 210 smap *smapX 211 err error 212 curPrimary = vr.Smap.Primary 213 config = cmn.GCO.Get() 214 timeout = config.Timeout.CplaneOperation.D() / 2 215 ) 216 // 1. ping the current primary (not using apc.QparamAskPrimary as it might be transitioning) 217 for i := range 2 { 218 if i > 0 { 219 runtime.Gosched() 220 } 221 smap = p.owner.smap.get() 222 if smap.version() > vr.Smap.version() { 223 nlog.Warningf("%s: %s updated from %s, moving back to idle", p, smap, vr.Smap) 224 return 225 } 226 _, _, err = p.reqHealth(curPrimary, timeout, nil /*ask primary*/, smap) 227 if err == nil { 228 break 229 } 230 timeout = config.Timeout.CplaneOperation.D() 231 } 232 if err == nil { 233 // move back to idle 234 query := url.Values{apc.QparamAskPrimary: []string{"true"}} 235 _, _, err = p.reqHealth(curPrimary, timeout, query /*ask primary*/, smap) 236 if err == nil { 237 nlog.Infof("%s: current primary %s is up, moving back to idle", p, curPrimary) 238 } else { 239 errV := fmt.Errorf("%s: current primary(?) %s responds but does not consider itself primary", 240 p, curPrimary.StringEx()) 241 xele.AddErr(errV, 0) 242 } 243 return 244 } 245 nlog.Infof("%s: primary %s is confirmed down: [%v] - moving to election state phase 1 (prepare)", 246 p, curPrimary.StringEx(), err) 247 248 // 2. election phase 1 249 elected, votingErrors := p.electPhase1(vr) 250 if !elected { 251 errV := fmt.Errorf("%s: election phase 1 (prepare) failed: primary still %s w/ status unknown", 252 p, curPrimary.StringEx()) 253 xele.AddErr(errV, 0) 254 255 smap = p.owner.smap.get() 256 if smap.version() > vr.Smap.version() { 257 nlog.Warningf("%s: %s updated from %s, moving back to idle", p, smap, vr.Smap) 258 return 259 } 260 261 // best-effort 262 svm, _, slowp := p.bcastMaxVer(smap, nil, nil) 263 if svm.Smap != nil && !slowp { 264 if svm.Smap.UUID == smap.UUID && svm.Smap.version() > smap.version() && svm.Smap.validate() == nil { 265 nlog.Warningf("%s: upgrading local %s to cluster max-ver %s", 266 p, smap.StringEx(), svm.Smap.StringEx()) 267 if svm.Smap.Primary.ID() != smap.Primary.ID() { 268 nlog.Warningf("%s: new primary %s is already elected ...", 269 p, svm.Smap.Primary.StringEx()) 270 } 271 errV := p.owner.smap.synchronize(p.si, svm.Smap, nil /*ms payload*/, p.smapUpdatedCB) 272 if errV != nil { 273 cos.ExitLog(errV) 274 } 275 } 276 } 277 278 return 279 } 280 281 // 3. election phase 2 282 nlog.Infoln(p.String()+":", "moving to election state phase 2 (commit)") 283 confirmationErrors := p.electPhase2(vr) 284 for sid := range confirmationErrors { 285 if !votingErrors.Contains(sid) { 286 errV := fmt.Errorf("%s: error confirming the election: %s was healthy when voting", p, sid) 287 xele.AddErr(errV, 0) 288 } 289 } 290 291 // 4. become! 292 nlog.Infof("%s: becoming primary", p) 293 p.becomeNewPrimary(vr.Primary /*proxyIDToRemove*/) 294 } 295 296 // phase 1: prepare (via simple majority voting) 297 func (p *proxy) electPhase1(vr *VoteRecord) (winner bool, errors cos.StrSet) { 298 var ( 299 resCh = p.requestVotes(vr) 300 y, n int 301 ) 302 for res := range resCh { 303 if res.err != nil { 304 if errors == nil { 305 errors = cos.NewStrSet(res.daemonID) 306 } else { 307 errors.Set(res.daemonID) 308 } 309 n++ 310 } else { 311 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 312 nlog.Infof("Node %s responded with (winner: %t)", res.daemonID, res.yes) 313 } 314 if res.yes { 315 y++ 316 } else { 317 n++ 318 } 319 } 320 } 321 322 winner = y > n || (y+n == 0) // No Votes: Default Winner 323 nlog.Infof("Vote Results:\n Y: %d, N: %d\n Victory: %t\n", y, n, winner) 324 return 325 } 326 327 func (p *proxy) requestVotes(vr *VoteRecord) chan voteResult { 328 var ( 329 msg = VoteMessage{Record: *vr} 330 q = url.Values{} 331 ) 332 q.Set(apc.QparamPrimaryCandidate, p.SID()) 333 args := allocBcArgs() 334 args.req = cmn.HreqArgs{ 335 Method: http.MethodGet, 336 Path: apc.URLPathVoteProxy.S, 337 Body: cos.MustMarshal(&msg), 338 Query: q, 339 } 340 args.to = core.AllNodes 341 results := p.bcastGroup(args) 342 freeBcArgs(args) 343 resCh := make(chan voteResult, len(results)) 344 for _, res := range results { 345 if res.err != nil { 346 resCh <- voteResult{ 347 yes: false, 348 daemonID: res.si.ID(), 349 err: res.err, 350 } 351 } else { 352 resCh <- voteResult{ 353 yes: VoteYes == Vote(res.bytes), 354 daemonID: res.si.ID(), 355 err: nil, 356 } 357 } 358 } 359 freeBcastRes(results) 360 close(resCh) 361 return resCh 362 } 363 364 // phase 2: confirm and commit 365 func (p *proxy) electPhase2(vr *VoteRecord) cos.StrSet { 366 var ( 367 errors = cos.StrSet{} 368 msg = &VoteResultMessage{ 369 VoteResult{ 370 Candidate: vr.Candidate, 371 Primary: vr.Primary, 372 Smap: vr.Smap, 373 StartTime: time.Now(), 374 Initiator: p.SID(), 375 }, 376 } 377 ) 378 args := allocBcArgs() 379 args.req = cmn.HreqArgs{Method: http.MethodPut, Path: apc.URLPathVoteVoteres.S, Body: cos.MustMarshal(msg)} 380 args.to = core.AllNodes 381 results := p.bcastGroup(args) 382 freeBcArgs(args) 383 for _, res := range results { 384 if res.err == nil { 385 continue 386 } 387 nlog.Warningf("%s: failed to confirm election with %s: %v", p, res.si.StringEx(), res.err) 388 errors.Set(res.si.ID()) 389 } 390 freeBcastRes(results) 391 return errors 392 } 393 394 // 395 // voting: target 396 // 397 398 // [METHOD] /v1/vote 399 func (t *target) voteHandler(w http.ResponseWriter, r *http.Request) { 400 if r.Method != http.MethodGet && r.Method != http.MethodPut { 401 cmn.WriteErr405(w, r, http.MethodGet, http.MethodPut) 402 return 403 } 404 apiItems, err := t.parseURL(w, r, apc.URLPathVote.L, 1, false) 405 if err != nil { 406 return 407 } 408 switch { 409 case r.Method == http.MethodGet && apiItems[0] == apc.Proxy: 410 t.httpgetvote(w, r) 411 case r.Method == http.MethodPut && apiItems[0] == apc.Voteres: 412 t.httpsetprimary(w, r) 413 default: 414 t.writeErrURL(w, r) 415 } 416 } 417 418 // 419 // voting: common methods 420 // 421 422 func (h *htrun) onPrimaryDown(self *proxy, callerID string) { 423 smap := h.owner.smap.get() 424 if smap.validate() != nil { 425 return 426 } 427 clone := smap.clone() 428 s := "via keepalive" 429 if callerID != "" { 430 s = "via direct call" 431 if callerID != clone.Primary.ID() { 432 nlog.Errorf("%s (%s): non-primary caller reporting primary down (%s, %s, %s)", 433 h, s, callerID, clone.Primary.StringEx(), smap) 434 return 435 } 436 } 437 nlog.Infof("%s (%s): primary %s is no longer online and must be reelected", h, s, clone.Primary.StringEx()) 438 439 for { 440 if nlog.Stopping() { 441 return 442 } 443 // use HRW ordering 444 nextPrimaryProxy, err := clone.HrwProxy(clone.Primary.ID()) 445 if err != nil { 446 if !nlog.Stopping() { 447 nlog.Errorf("%s failed to execute HRW selection: %v", h, err) 448 } 449 return 450 } 451 452 // If this proxy is the next primary proxy candidate, it starts the election directly. 453 if nextPrimaryProxy.ID() == h.si.ID() { 454 debug.Assert(h.si.IsProxy()) 455 debug.Assert(h.SID() == self.SID()) 456 nlog.Infof("%s: starting election (candidate = self)", h) 457 vr := &VoteRecord{ 458 Candidate: nextPrimaryProxy.ID(), 459 Primary: clone.Primary.ID(), 460 StartTime: time.Now(), 461 Initiator: h.si.ID(), 462 } 463 vr.Smap = clone 464 self.startElection(vr) 465 return 466 } 467 468 nlog.Infof("%s: trying %s as the new primary candidate", h, meta.Pname(nextPrimaryProxy.ID())) 469 470 // ask the candidate to start election 471 vr := &VoteInitiation{ 472 Candidate: nextPrimaryProxy.ID(), 473 Primary: clone.Primary.ID(), 474 StartTime: time.Now(), 475 Initiator: h.si.ID(), 476 } 477 vr.Smap = clone 478 if h.sendElectionRequest(vr, nextPrimaryProxy) == nil { 479 return // the candidate has accepted the request and started election 480 } 481 482 // No response from the candidate (or it failed to start election) - remove 483 // it from the Smap and try the next candidate 484 // TODO: handle http.StatusServiceUnavailable from the candidate that is currently starting up 485 // (see httpelect) 486 if clone.GetProxy(nextPrimaryProxy.ID()) != nil { 487 clone.delProxy(nextPrimaryProxy.ID()) 488 } 489 } 490 } 491 492 // GET /v1/vote/proxy 493 func (h *htrun) httpgetvote(w http.ResponseWriter, r *http.Request) { 494 if _, err := h.parseURL(w, r, apc.URLPathVoteProxy.L, 0, false); err != nil { 495 return 496 } 497 msg := VoteMessage{} 498 if err := cmn.ReadJSON(w, r, &msg); err != nil { 499 return 500 } 501 candidate := msg.Record.Candidate 502 if candidate == "" { 503 h.writeErrf(w, r, "%s: unexpected: empty candidate field [%v]", h, msg.Record) 504 return 505 } 506 smap := h.owner.smap.get() 507 if smap.Primary == nil { 508 h.writeErrf(w, r, "%s: current primary undefined, %s", h, smap) 509 return 510 } 511 currPrimaryID := smap.Primary.ID() 512 if candidate == currPrimaryID { 513 h.writeErrf(w, r, "%s: candidate %q _is_ the current primary, %s", h, candidate, smap) 514 return 515 } 516 newSmap := msg.Record.Smap 517 psi := newSmap.GetProxy(candidate) 518 if psi == nil { 519 h.writeErrf(w, r, "%s: candidate %q not present in the VoteRecord %s", h, candidate, newSmap) 520 return 521 } 522 if !newSmap.isPresent(h.si) { 523 h.writeErrf(w, r, "%s: not present in the VoteRecord %s", h, newSmap) 524 return 525 } 526 527 if err := h.owner.smap.synchronize(h.si, newSmap, nil /*ms payload*/, h.smapUpdatedCB); err != nil { 528 // double-checking errDowngrade 529 if isErrDowngrade(err) { 530 newSmap2 := h.owner.smap.get() 531 psi2 := newSmap2.GetProxy(candidate) 532 if psi2.Eq(psi) { 533 err = nil // not an error - can vote Yes 534 } 535 } 536 if err != nil { 537 nlog.Errorf("%s: failed to synch %s, err %v - voting No", h, newSmap, err) 538 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(VoteNo))) 539 _, err := w.Write([]byte(VoteNo)) 540 debug.AssertNoErr(err) 541 return 542 } 543 } 544 545 vote, err := h.voteOnProxy(psi.ID(), currPrimaryID) 546 if err != nil { 547 h.writeErr(w, r, err) 548 return 549 } 550 if vote { 551 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(VoteYes))) 552 _, err = w.Write([]byte(VoteYes)) 553 } else { 554 w.Header().Set(cos.HdrContentLength, strconv.Itoa(len(VoteNo))) 555 _, err = w.Write([]byte(VoteNo)) 556 } 557 debug.AssertNoErr(err) 558 } 559 560 // PUT /v1/vote/result 561 func (h *htrun) httpsetprimary(w http.ResponseWriter, r *http.Request) { 562 if _, err := h.parseURL(w, r, apc.URLPathVoteVoteres.L, 0, false); err != nil { 563 return 564 } 565 msg := VoteResultMessage{} 566 if err := cmn.ReadJSON(w, r, &msg); err != nil { 567 return 568 } 569 vr := msg.Result 570 nlog.Infof("%s: received vote result: new primary %s (old %s)", h.si, vr.Candidate, vr.Primary) 571 572 ctx := &smapModifier{ 573 pre: h._votedPrimary, 574 nid: vr.Candidate, 575 sid: vr.Primary, 576 } 577 err := h.owner.smap.modify(ctx) 578 if err != nil { 579 h.writeErr(w, r, err) 580 } 581 } 582 583 func (h *htrun) _votedPrimary(ctx *smapModifier, clone *smapX) error { 584 newPrimary, oldPrimary := ctx.nid, ctx.sid 585 psi := clone.GetProxy(newPrimary) 586 if psi == nil { 587 return &errNodeNotFound{"cannot accept new primary election:", newPrimary, h.si, clone} 588 } 589 clone.Primary = psi 590 if oldPrimary != "" && clone.GetProxy(oldPrimary) != nil { 591 clone.delProxy(oldPrimary) 592 } 593 nlog.Infof("%s: voted-primary result: %s", h.si, clone) 594 return nil 595 } 596 597 func (h *htrun) sendElectionRequest(vr *VoteInitiation, nextPrimaryProxy *meta.Snode) (err error) { 598 var ( 599 msg = VoteInitiationMessage{Request: *vr} 600 body = cos.MustMarshal(&msg) 601 cargs = allocCargs() 602 ) 603 { 604 cargs.si = nextPrimaryProxy 605 cargs.req = cmn.HreqArgs{ 606 Method: http.MethodPut, 607 Base: nextPrimaryProxy.ControlNet.URL, 608 Path: apc.URLPathVoteInit.S, 609 Body: body, 610 } 611 cargs.timeout = apc.DefaultTimeout 612 } 613 res := h.call(cargs, vr.Smap) 614 err = res.err 615 freeCR(res) 616 defer freeCargs(cargs) 617 if err == nil || !cos.IsRetriableConnErr(err) { 618 return 619 } 620 // retry 621 sleep := cmn.Rom.CplaneOperation() / 2 622 for range maxRetryElectReq { 623 time.Sleep(sleep) 624 res = h.call(cargs, vr.Smap) 625 err = res.err 626 freeCR(res) 627 if err == nil { 628 return 629 } 630 if !cos.IsRetriableConnErr(err) { 631 break 632 } 633 sleep += sleep / 2 634 } 635 if !nlog.Stopping() { 636 nlog.Errorf("%s: failed to request election from the _next_ primary %s: %v", 637 h.si, nextPrimaryProxy.StringEx(), err) 638 } 639 return 640 } 641 642 func (h *htrun) voteOnProxy(daemonID, currPrimaryID string) (bool, error) { 643 // First: Check last keepalive timestamp. If the proxy was recently successfully reached, 644 // this will always vote no, as we believe the original proxy is still alive. 645 if !h.keepalive.timeToPing(currPrimaryID) { 646 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 647 nlog.Warningf("Primary %s is still alive", currPrimaryID) 648 } 649 return false, nil 650 } 651 652 // Second: Vote according to whether or not the candidate is the Highest Random Weight remaining 653 // in the Smap 654 smap := h.owner.smap.get() 655 nextPrimaryProxy, err := smap.HrwProxy(currPrimaryID) 656 if err != nil { 657 return false, fmt.Errorf("error executing HRW: %v", err) 658 } 659 660 vote := nextPrimaryProxy.ID() == daemonID 661 if cmn.Rom.FastV(4, cos.SmoduleAIS) { 662 nlog.Infof("%s: voting '%t' for %s", h, vote, daemonID) 663 } 664 return vote, nil 665 }