github.com/ergo-services/ergo@v1.999.224/gen/raft.go (about) 1 package gen 2 3 import ( 4 "fmt" 5 "math/rand" 6 "sort" 7 "time" 8 9 "github.com/ergo-services/ergo/etf" 10 "github.com/ergo-services/ergo/lib" 11 ) 12 13 const ( 14 DefaultRaftGetTimeout = 5 // in seconds 15 DefaultRaftAppendTimeout = 5 // in seconds 16 DefaultRaftHeartbeat = 3 // in seconds 17 ) 18 19 var ( 20 ErrRaftState = fmt.Errorf("incorrect raft state") 21 ErrRaftNoQuorum = fmt.Errorf("no quorum") 22 ErrRaftNoLeader = fmt.Errorf("no leader") 23 ErrRaftNoSerial = fmt.Errorf("no peers with requested serial") 24 ErrRaftBusy = fmt.Errorf("another append request is in progress") 25 ErrRaftWrongTimeout = fmt.Errorf("wrong timeout value") 26 ) 27 28 type RaftBehavior interface { 29 ServerBehavior 30 // 31 // Mandatory callbacks 32 // 33 34 InitRaft(process *RaftProcess, arr ...etf.Term) (RaftOptions, error) 35 36 // HandleAppend. Invokes on append request. To cancel this request by a leader, it must return RaftStatusDiscard. 37 HandleAppend(process *RaftProcess, ref etf.Ref, serial uint64, key string, value etf.Term) RaftStatus 38 39 // HandleGet 40 HandleGet(process *RaftProcess, serial uint64) (string, etf.Term, RaftStatus) 41 42 // 43 // Optional callbacks 44 // 45 46 // HandlePeer 47 HandlePeer(process *RaftProcess, peer etf.Pid, serial uint64) RaftStatus 48 49 // HandleQuorum 50 HandleQuorum(process *RaftProcess, quorum *RaftQuorum) RaftStatus 51 52 // HandleLeader 53 HandleLeader(process *RaftProcess, leader *RaftLeader) RaftStatus 54 55 // HandleCancel 56 HandleCancel(process *RaftProcess, ref etf.Ref, reason string) RaftStatus 57 58 // HandleSerial 59 HandleSerial(process *RaftProcess, ref etf.Ref, serial uint64, key string, value etf.Term) RaftStatus 60 61 // 62 // Server's callbacks 63 // 64 65 // HandleRaftCall this callback is invoked on ServerProcess.Call. This method is optional 66 // for the implementation 67 HandleRaftCall(process *RaftProcess, from ServerFrom, message etf.Term) (etf.Term, ServerStatus) 68 // HandleStageCast this callback is invoked on ServerProcess.Cast. This method is optional 69 // for the implementation 70 HandleRaftCast(process *RaftProcess, message etf.Term) ServerStatus 71 // HandleStageInfo this callback is invoked on Process.Send. This method is optional 72 // for the implementation 73 HandleRaftInfo(process *RaftProcess, message etf.Term) ServerStatus 74 // HandleRaftDirect this callback is invoked on Process.Direct. This method is optional 75 // for the implementation 76 HandleRaftDirect(process *RaftProcess, message interface{}) (interface{}, error) 77 } 78 79 type RaftStatus error 80 type RaftQuorumState int 81 82 var ( 83 RaftStatusOK RaftStatus // nil 84 RaftStatusStop RaftStatus = fmt.Errorf("stop") 85 RaftStatusDiscard RaftStatus = fmt.Errorf("discard") 86 87 RaftQuorumState3 RaftQuorumState = 3 // minimum quorum that could make leader election 88 RaftQuorumState5 RaftQuorumState = 5 89 RaftQuorumState7 RaftQuorumState = 7 90 RaftQuorumState9 RaftQuorumState = 9 91 RaftQuorumState11 RaftQuorumState = 11 // maximal quorum 92 93 cleanVoteTimeout = 1 * time.Second 94 cleanLeaderVoteTimeout = 1 * time.Second 95 quorumChangeDeferMaxTime = 450 // in millisecond. uses as max value in range of 50.. 96 ) 97 98 type Raft struct { 99 Server 100 } 101 102 type RaftProcess struct { 103 ServerProcess 104 options RaftOptions 105 behavior RaftBehavior 106 107 quorum *RaftQuorum 108 quorumCandidates *quorumCandidates 109 quorumVotes map[RaftQuorumState]*quorum 110 quorumChangeDefer bool 111 quorumChangeAttempt int 112 113 leader etf.Pid 114 election *leaderElection 115 round int // "log term" in terms of Raft spec 116 117 // get requests 118 requests map[etf.Ref]CancelFunc 119 120 // append requests 121 requestsAppend map[string]*requestAppend 122 requestsAppendQueue []requestAppendQueued 123 124 // leader sends heartbeat messages and keep the last sending timestamp 125 heartbeatLeader int64 126 heartbeatCancel CancelFunc 127 } 128 129 type leaderElection struct { 130 votes map[etf.Pid]etf.Pid 131 results map[etf.Pid]bool 132 round int 133 leader etf.Pid // leader elected 134 voted int // number of peers voted for the leader 135 cancel CancelFunc 136 } 137 138 type requestAppend struct { 139 ref etf.Ref 140 from etf.Pid 141 origin etf.Pid 142 value etf.Term 143 peers map[etf.Pid]bool 144 cancel CancelFunc 145 } 146 147 type requestAppendQueued struct { 148 from etf.Pid 149 request *messageRaftRequestAppend 150 } 151 152 type quorumCandidates struct { 153 candidates map[etf.Pid]*candidate 154 } 155 156 type candidate struct { 157 monitor etf.Ref 158 serial uint64 159 joined bool 160 heartbeat int64 161 failures int 162 } 163 164 type RaftLeader struct { 165 Leader etf.Pid 166 Serial uint64 167 State RaftQuorumState 168 } 169 170 type RaftQuorum struct { 171 Member bool 172 State RaftQuorumState 173 Peers []etf.Pid // the number of participants in quorum could be 3,5,7,9,11 174 } 175 type quorum struct { 176 RaftQuorum 177 votes map[etf.Pid]int // 1 - sent, 2 - recv, 3 - sent and recv 178 origin etf.Pid // where the voting has come from. it must receive our voice in the last order 179 lastVote int64 // time.Now().UnixMilli() 180 } 181 182 type RaftOptions struct { 183 ID string // raft cluster id 184 Peers []ProcessID 185 Serial uint64 // serial number ("log id" in terms of Raft spec) 186 } 187 188 type messageRaft struct { 189 Request etf.Atom 190 Pid etf.Pid 191 Command interface{} 192 } 193 194 type messageRaftClusterInit struct{} 195 type messageRaftClusterJoin struct { 196 ID string // cluster id 197 Serial uint64 198 } 199 type messageRaftClusterJoinReply struct { 200 ID string // cluster id 201 Serial uint64 202 Peers []etf.Pid 203 QuorumState int 204 QuorumPeers []etf.Pid 205 } 206 type messageRaftQuorumVote struct { 207 ID string // cluster id 208 Serial uint64 209 State int 210 Candidates []etf.Pid 211 } 212 type messageRaftQuorumChange struct{} 213 type messageRaftQuorumBuilt struct { 214 ID string // cluster id 215 State int 216 Round int // last round 217 Peers []etf.Pid 218 } 219 type messageRaftQuorumLeave struct { 220 ID string 221 DueToPid etf.Pid 222 } 223 224 type messageRaftQuorumCleanVote struct { 225 state RaftQuorumState 226 } 227 228 type messageRaftLeaderHeartbeat struct { 229 ID string 230 Serial uint64 231 } 232 233 type messageRaftLeaderVote struct { 234 ID string // cluster id 235 State int //quorum state 236 Leader etf.Pid // offered leader 237 Round int 238 } 239 type messageRaftLeaderElected struct { 240 ID string // cluster id 241 Leader etf.Pid // elected leader 242 Voted int // number of votes for this leader 243 Round int 244 } 245 246 type messageRaftRequestGet struct { 247 ID string // cluster id 248 Ref etf.Ref 249 Origin etf.Pid 250 Serial uint64 251 } 252 type messageRaftRequestReply struct { 253 ID string // cluster id 254 Ref etf.Ref 255 Serial uint64 256 Key string 257 Value etf.Term 258 } 259 type messageRaftRequestAppend struct { 260 ID string // cluster id 261 Ref etf.Ref 262 Origin etf.Pid 263 Key string 264 Value etf.Term 265 Deadline int64 // timestamp in milliseconds 266 } 267 268 type messageRaftAppendReady struct { 269 ID string // cluster id 270 Ref etf.Ref 271 Key string 272 } 273 274 type messageRaftAppendCommit struct { 275 ID string // cluster id 276 Ref etf.Ref 277 Key string 278 Serial uint64 279 Broadcast etf.Pid // quorum member who is in charge of broadcasting 280 } 281 282 type messageRaftAppendBroadcast struct { 283 ID string 284 Ref etf.Ref 285 Serial uint64 286 Key string 287 Value etf.Term 288 } 289 290 type messageRaftRequestClean struct { 291 ref etf.Ref 292 } 293 type messageRaftAppendClean struct { 294 key string 295 ref etf.Ref 296 } 297 type messageRaftElectionClean struct { 298 round int 299 } 300 type messageRaftHeartbeat struct{} 301 302 // 303 // RaftProcess quorum routines and APIs 304 // 305 306 // Join makes a join requst to the given peer, which is supposed to be in a raft cluster 307 func (rp *RaftProcess) Join(peer interface{}) error { 308 // QUODBG fmt.Println(rp.Name(), "CLU send join to", peer) 309 join := etf.Tuple{ 310 etf.Atom("$cluster_join"), 311 rp.Self(), 312 etf.Tuple{ 313 rp.options.ID, 314 }, 315 } 316 return rp.Cast(peer, join) 317 } 318 319 // Peers returns list of the processes in the raft cluster. Note, this list is sorted by the Serial value on them in the descending order 320 func (rp *RaftProcess) Peers() []etf.Pid { 321 return rp.quorumCandidates.List() 322 } 323 324 // Quorum returns current quorum. It returns nil if quorum hasn't built yet. 325 func (rp *RaftProcess) Quorum() *RaftQuorum { 326 var q RaftQuorum 327 if rp.quorum == nil { 328 return nil 329 } 330 q.Member = rp.quorum.Member 331 q.State = rp.quorum.State 332 q.Peers = make([]etf.Pid, len(rp.quorum.Peers)) 333 for i := range rp.quorum.Peers { 334 q.Peers[i] = rp.quorum.Peers[i] 335 } 336 return &q 337 } 338 339 // Leader returns current leader in the quorum. It returns nil If this process is not a quorum or if leader election is still in progress 340 func (rp *RaftProcess) Leader() *RaftLeader { 341 var leader RaftLeader 342 343 if rp.quorum == nil || rp.quorum.Member == false { 344 return nil 345 } 346 347 noLeader := etf.Pid{} 348 if rp.leader == noLeader { 349 return nil 350 } 351 leader.Leader = rp.leader 352 leader.State = rp.quorum.State 353 leader.Serial = rp.options.Serial 354 if rp.leader != rp.Self() { 355 // must be present among the peers 356 c := rp.quorumCandidates.GetOnline(rp.leader) 357 if c == nil { 358 panic("internal error. elected leader has been lost") 359 } 360 leader.Serial = c.serial 361 } 362 363 return &leader 364 } 365 366 // Get makes a request to the quorum member to get the data with the given serial number and 367 // sets the timeout to the DefaultRaftGetTimeout = 5 sec. It returns ErrRaftNoQuorum if quorum 368 // forming is still in progress. 369 func (rp *RaftProcess) Get(serial uint64) (etf.Ref, error) { 370 return rp.GetWithTimeout(serial, DefaultRaftGetTimeout) 371 } 372 373 // Get makes a request to the quorum member to get the data with the given serial number and 374 // timeout in seconds. Returns a reference of this request. Once requested data has arrived 375 // the callback HandleSerial will be invoked. 376 // If a timeout occurred the callback HandleCancel will be invoked with reason "timeout" 377 func (rp *RaftProcess) GetWithTimeout(serial uint64, timeout int) (etf.Ref, error) { 378 var ref etf.Ref 379 if rp.quorum == nil { 380 return ref, ErrRaftNoQuorum 381 } 382 383 peers := []etf.Pid{} 384 for _, pid := range rp.quorum.Peers { 385 if pid == rp.Self() { 386 continue 387 } 388 if c := rp.quorumCandidates.GetOnline(pid); c != nil { 389 if serial > c.serial { 390 continue 391 } 392 peers = append(peers, pid) 393 } 394 } 395 if len(peers) == 0 { 396 return ref, ErrRaftNoSerial 397 } 398 399 // get random member of quorum and send the request 400 n := 0 401 if len(peers) > 1 { 402 rand.Intn(len(peers) - 1) 403 } 404 peer := peers[n] 405 ref = rp.MakeRef() 406 requestGet := etf.Tuple{ 407 etf.Atom("$request_get"), 408 rp.Self(), 409 etf.Tuple{ 410 rp.options.ID, 411 ref, 412 rp.Self(), // origin 413 serial, 414 }, 415 } 416 417 if err := rp.Cast(peer, requestGet); err != nil { 418 return ref, err 419 } 420 cancel := rp.CastAfter(rp.Self, messageRaftRequestClean{ref: ref}, time.Duration(timeout)*time.Second) 421 rp.requests[ref] = cancel 422 return ref, nil 423 } 424 425 // Append 426 func (rp *RaftProcess) Append(key string, value etf.Term) (etf.Ref, error) { 427 return rp.AppendWithTimeout(key, value, DefaultRaftAppendTimeout) 428 } 429 430 // AppendWithTimeout 431 func (rp *RaftProcess) AppendWithTimeout(key string, value etf.Term, timeout int) (etf.Ref, error) { 432 var ref etf.Ref 433 if timeout < 1 { 434 return ref, ErrRaftWrongTimeout 435 } 436 437 if _, exist := rp.requestsAppend[key]; exist { 438 return ref, ErrRaftBusy 439 } 440 if rp.quorum == nil { 441 return ref, ErrRaftNoQuorum 442 } 443 noLeader := etf.Pid{} 444 if rp.quorum.Member == true && rp.leader == noLeader { 445 return ref, ErrRaftNoLeader 446 } 447 t := int(time.Duration(timeout) * time.Second) 448 deadline := time.Now().Add(time.Duration(t - t/int(rp.quorum.State))).UnixMilli() 449 ref = rp.MakeRef() 450 451 // if Append request has made on a leader 452 if rp.leader == rp.Self() { 453 // DBGAPN fmt.Println(rp.Self(), "DBGAPN append request", ref, "made on a leader") 454 dataAppend := &messageRaftRequestAppend{ 455 Ref: ref, 456 Origin: rp.Self(), 457 Key: key, 458 Value: value, 459 Deadline: deadline, 460 } 461 rp.handleAppendLeader(rp.Self(), dataAppend) 462 return ref, nil 463 } 464 465 peer := rp.leader 466 // if Member == false => rp.leader == noLeader 467 if rp.quorum.Member == false { 468 // this raft process runs as a Client. send this request to the quorum member 469 n := rand.Intn(len(rp.quorum.Peers) - 1) 470 peer = rp.quorum.Peers[n] 471 deadline = time.Now().Add(time.Duration(t - t/(int(rp.quorum.State)+1))).UnixMilli() 472 } 473 dataAppend := etf.Tuple{ 474 etf.Atom("$request_append"), 475 rp.Self(), 476 etf.Tuple{ 477 rp.options.ID, 478 ref, 479 rp.Self(), 480 key, 481 value, 482 deadline, 483 }, 484 } 485 // DBGAPN fmt.Println(rp.Self(), "DPGAPN sent $request_append", ref, "to the peer", peer) 486 if err := rp.Cast(peer, dataAppend); err != nil { 487 return ref, err 488 } 489 490 peers := make(map[etf.Pid]bool) 491 if rp.quorum.Member == true { 492 // this process will be in charge of broadcasting 493 // so we should keep the set of peers in this quorum in order 494 // to exlude them on the broadcasting 495 for _, pid := range rp.quorum.Peers { 496 if pid == rp.Self() { 497 continue 498 } 499 peers[pid] = true 500 } 501 } 502 503 clean := messageRaftAppendClean{key: key, ref: ref} 504 after := time.Duration(timeout) * time.Second 505 cancel := rp.CastAfter(rp.Self, clean, after) 506 requestAppend := &requestAppend{ 507 ref: ref, 508 origin: rp.Self(), 509 value: value, 510 peers: peers, 511 cancel: cancel, 512 } 513 rp.requestsAppend[key] = requestAppend 514 return ref, nil 515 } 516 517 // Serial returns current value of serial for this raft process 518 func (rp *RaftProcess) Serial() uint64 { 519 return rp.options.Serial 520 } 521 522 // private routines 523 524 func (rp *RaftProcess) handleRaftRequest(m messageRaft) error { 525 switch m.Request { 526 case etf.Atom("$cluster_join"): 527 join := &messageRaftClusterJoin{} 528 if err := etf.TermIntoStruct(m.Command, &join); err != nil { 529 return lib.ErrUnsupportedRequest 530 } 531 532 if join.ID != rp.options.ID { 533 // this peer belongs to another quorum id 534 return RaftStatusOK 535 } 536 537 if rp.quorum != nil && rp.quorum.Member { 538 // if we got $cluster_join from a quorum member, it means 539 // the quorum we had belonging is not existed anymore 540 if rp.isQuorumMember(m.Pid) == true { 541 rp.quorum = nil 542 rp.handleQuorum() 543 rp.quorumChangeStart(false) 544 } 545 } 546 547 rp.quorumCandidates.Set(rp, m.Pid) 548 rp.quorumCandidates.SetOnline(rp, m.Pid, join.Serial) 549 550 if status := rp.behavior.HandlePeer(rp, m.Pid, join.Serial); status != RaftStatusOK { 551 return status 552 } 553 554 // send peer list even if this peer is already present in our candidates list 555 // just to exchange updated data 556 peers := rp.quorumCandidates.List() 557 quorumState := 0 558 quorumPeers := []etf.Pid{} 559 if rp.quorum != nil { 560 quorumState = int(rp.quorum.State) 561 quorumPeers = rp.quorum.Peers 562 } 563 reply := etf.Tuple{ 564 etf.Atom("$cluster_join_reply"), 565 rp.Self(), 566 etf.Tuple{ 567 rp.options.ID, 568 rp.options.Serial, 569 peers, 570 quorumState, 571 quorumPeers, 572 }, 573 } 574 // QUODBG fmt.Println(rp.Name(), "GOT CLU JOIN from", m.Pid, "send peers", peers) 575 rp.Cast(m.Pid, reply) 576 return RaftStatusOK 577 578 case etf.Atom("$cluster_join_reply"): 579 580 reply := &messageRaftClusterJoinReply{} 581 if err := etf.TermIntoStruct(m.Command, &reply); err != nil { 582 return lib.ErrUnsupportedRequest 583 } 584 585 if reply.ID != rp.options.ID { 586 // this peer belongs to another quorum id. ignore it. 587 return RaftStatusOK 588 } 589 590 // QUODBG fmt.Println(rp.Name(), "GOT CLU JOIN REPL from", m.Pid, "got peers", reply.Peers) 591 canAcceptQuorum := true 592 593 // check if there is another quorum in this cluster 594 if rp.quorum != nil { 595 // doesnt matter we compare the number of peers or quorum state 596 // reply.QuorumState <= rp.quorum.State 597 if len(reply.QuorumPeers) <= len(rp.quorum.Peers) { 598 canAcceptQuorum = false 599 } 600 } 601 602 // check peers 603 for _, peer := range reply.Peers { 604 if peer == rp.Self() { 605 continue 606 } 607 // check if we dont have some of them among the online peers 608 if c := rp.quorumCandidates.GetOnline(peer); c != nil { 609 continue 610 } 611 rp.quorumCandidates.Set(rp, peer) 612 canAcceptQuorum = false 613 } 614 615 rp.quorumCandidates.Set(rp, m.Pid) 616 rp.quorumCandidates.SetOnline(rp, m.Pid, reply.Serial) 617 618 if status := rp.behavior.HandlePeer(rp, m.Pid, reply.Serial); status != RaftStatusOK { 619 return status 620 } 621 622 // try to rebuild quorum since the number of peers has changed 623 rp.quorumChangeStart(false) 624 625 // accept quorum if this peer is belongs to the existing quorum 626 // and set membership to false 627 switch RaftQuorumState(reply.QuorumState) { 628 case RaftQuorumState3, RaftQuorumState5: 629 break 630 case RaftQuorumState7, RaftQuorumState9, RaftQuorumState11: 631 break 632 default: 633 canAcceptQuorum = false 634 } 635 if canAcceptQuorum == true { 636 rp.election = nil 637 rp.quorum = &RaftQuorum{ 638 State: RaftQuorumState(reply.QuorumState), 639 Peers: reply.QuorumPeers, 640 Member: false, 641 } 642 return rp.handleQuorum() 643 } 644 return RaftStatusOK 645 646 case etf.Atom("$quorum_vote"): 647 vote := &messageRaftQuorumVote{} 648 if err := etf.TermIntoStruct(m.Command, &vote); err != nil { 649 return lib.ErrUnsupportedRequest 650 } 651 if vote.ID != rp.options.ID { 652 // ignore this request 653 return RaftStatusOK 654 } 655 return rp.quorumVote(m.Pid, vote) 656 657 case etf.Atom("$quorum_built"): 658 built := &messageRaftQuorumBuilt{} 659 if err := etf.TermIntoStruct(m.Command, &built); err != nil { 660 return lib.ErrUnsupportedRequest 661 } 662 // QUODBG fmt.Println(rp.Name(), "GOT QUO BUILT from", m.Pid) 663 if built.ID != rp.options.ID { 664 // this process is not belong this quorum 665 return RaftStatusOK 666 } 667 duplicates := make(map[etf.Pid]bool) 668 matchCandidates := true 669 for _, pid := range built.Peers { 670 if _, exist := duplicates[pid]; exist { 671 // duplicate found 672 return RaftStatusOK 673 } 674 if pid == rp.Self() { 675 panic("raft internal error. got quorum built message") 676 } 677 if c := rp.quorumCandidates.GetOnline(pid); c != nil { 678 c.failures = 0 679 c.heartbeat = time.Now().Unix() 680 continue 681 } 682 rp.quorumCandidates.Set(rp, pid) 683 matchCandidates = false 684 } 685 if len(built.Peers) != built.State { 686 // ignore wrong peer list 687 lib.Warning("[%s] got quorum state doesn't match with the peer list", rp.Self()) 688 return RaftStatusOK 689 } 690 candidateQuorumState := RaftQuorumState3 691 switch built.State { 692 case 11: 693 candidateQuorumState = RaftQuorumState11 694 case 9: 695 candidateQuorumState = RaftQuorumState9 696 case 7: 697 candidateQuorumState = RaftQuorumState7 698 case 5: 699 candidateQuorumState = RaftQuorumState5 700 case 3: 701 candidateQuorumState = RaftQuorumState3 702 default: 703 // ignore wrong state 704 return RaftStatusOK 705 } 706 707 rp.quorumChangeStart(false) 708 709 if built.Round > rp.round { 710 // update rp.round 711 rp.round = built.Round 712 } 713 714 // we do accept quorum if it was built using 715 // the peers we got registered as candidates 716 if matchCandidates == true { 717 rp.election = nil 718 if rp.quorum == nil { 719 rp.quorum = &RaftQuorum{} 720 rp.quorum.State = candidateQuorumState 721 rp.quorum.Member = false 722 rp.quorum.Peers = built.Peers 723 // QUODBG fmt.Println(rp.Name(), "QUO BUILT. NOT A MEMBER", rp.quorum.State, rp.quorum.Peers) 724 return rp.handleQuorum() 725 } 726 // QUODBG fmt.Println(rp.Name(), "QUO BUILT. NOT A MEMBER", rp.quorum.State, rp.quorum.Peers) 727 728 changed := false 729 if rp.quorum.State != candidateQuorumState { 730 changed = true 731 } 732 rp.quorum.State = candidateQuorumState 733 734 if rp.quorum.Member != false { 735 changed = true 736 } 737 rp.quorum.Member = false 738 739 rp.quorum.Peers = built.Peers 740 if changed == true { 741 return rp.handleQuorum() 742 } 743 return RaftStatusOK 744 } 745 746 if rp.quorum != nil { 747 rp.quorum = nil 748 rp.election = nil 749 return rp.handleQuorum() 750 } 751 return RaftStatusOK 752 753 case etf.Atom("$leader_heartbeat"): 754 heartbeat := &messageRaftLeaderHeartbeat{} 755 if err := etf.TermIntoStruct(m.Command, &heartbeat); err != nil { 756 return lib.ErrUnsupportedRequest 757 } 758 759 if rp.options.ID != heartbeat.ID { 760 return RaftStatusOK 761 } 762 763 c := rp.quorumCandidates.GetOnline(m.Pid) 764 if c == nil { 765 // HRTDBG fmt.Println(rp.Self(), "HRT from unknown/offline peer", m.Pid) 766 rp.quorumCandidates.Set(rp, m.Pid) 767 return RaftStatusOK 768 } 769 // HRTDBG fmt.Println(rp.Self(), "HRT from", m.Pid, "serial", c.serial) 770 c.heartbeat = time.Now().Unix() 771 c.serial = heartbeat.Serial 772 c.failures = 0 773 return RaftStatusOK 774 775 case etf.Atom("$quorum_leave"): 776 leave := &messageRaftQuorumLeave{} 777 if err := etf.TermIntoStruct(m.Command, &leave); err != nil { 778 return lib.ErrUnsupportedRequest 779 } 780 if rp.quorum == nil { 781 return RaftStatusOK 782 } 783 784 if rp.options.ID != leave.ID { 785 return RaftStatusOK 786 } 787 788 // check if it came from the quorum member 789 if rp.isQuorumMember(m.Pid) == false { 790 return RaftStatusOK 791 } 792 793 // QUODBG fmt.Println(rp.Self(), "QUO got leave from", m.Pid, "due to", leave.DueToPid) 794 rp.quorumCandidates.SetOffline(rp, leave.DueToPid) 795 796 member := rp.quorum.Member 797 rp.quorum = nil 798 rp.handleQuorum() 799 // only quorum member can restart quorum building if some of the member has left 800 if member == true { 801 rp.quorumChangeStart(false) 802 } 803 return RaftStatusOK 804 805 case etf.Atom("$leader_vote"): 806 vote := &messageRaftLeaderVote{} 807 if err := etf.TermIntoStruct(m.Command, &vote); err != nil { 808 return lib.ErrUnsupportedRequest 809 } 810 811 if rp.options.ID != vote.ID { 812 lib.Warning("[%s] ignore 'leader vote' message being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 813 return RaftStatusOK 814 } 815 816 if rp.quorum == nil { 817 rp.election = nil 818 // no quorum 819 // LDRDBG fmt.Println(rp.Self(), "LDR NO QUO ignore vote from", m.Pid, "round", vote.Round, "for", vote.Leader) 820 // Seems we have received leader_vote before the quorum_built message. 821 // Ignore this vote but update its round value to start a new leader election. 822 // Otherwise, the new election will be started with the same round value but without 823 // votes, which have been ignored before the quorum was built. 824 if vote.Round > rp.round { 825 rp.round = vote.Round 826 } 827 return RaftStatusOK 828 } 829 830 if rp.quorum.State != RaftQuorumState(vote.State) { 831 // vote within another quorum. seems the quorum has been changed during this election. 832 // ignore it 833 // LDRDBG fmt.Println(rp.Self(), "LDR ignore vote from", m.Pid, "with another quorum", vote.State, "current quorum", rp.quorum.State) 834 if vote.Round > rp.round { 835 rp.round = vote.Round 836 } 837 return RaftStatusOK 838 } 839 if rp.election != nil && rp.election.round > vote.Round { 840 // ignore it. current election round is greater 841 // LDRDBG fmt.Println(rp.Self(), "LDR ignore vote from", m.Pid, "with round", vote.Round, "current election round", rp.election.round) 842 return RaftStatusOK 843 } 844 if rp.round > vote.Round { 845 // newbie is trying to start a new election :) 846 // LDRDBG fmt.Println(rp.Self(), "LDR ignore vote from newbie", m.Pid, "with round", vote.Round, "current round", rp.round) 847 return RaftStatusOK 848 } 849 850 // check if m.Pid is belongs to the quorum 851 belongs := false 852 for _, pid := range rp.quorum.Peers { 853 if pid == m.Pid { 854 belongs = true 855 break 856 } 857 } 858 859 if belongs == false { 860 // there might be a case if we got vote message before the quorum_built 861 lib.Warning("[%s] got ignore from the peer, which doesn't belong to the quorum %s", rp.Self(), m.Pid) 862 if vote.Round > rp.round { 863 rp.round = vote.Round 864 } 865 return RaftStatusOK 866 } 867 868 // start new election 869 new_election := false 870 switch { 871 case rp.election == nil: 872 new_election = true 873 case rp.election != nil: 874 // TODO case with existing leader whithin this quorum. if some of the quorum member 875 // got leader heartbeat timeout it starts new election but this process has no problem 876 // with the leader. 877 if vote.Round > rp.election.round { 878 // overwrite election if it has greater round number 879 rp.election.cancel() 880 new_election = true 881 } 882 } 883 if new_election { 884 // LDRDBG fmt.Println(rp.Self(), "LDR accept election from", m.Pid, "round", vote.Round, " with vote for:", vote.Leader) 885 rp.election = &leaderElection{ 886 votes: make(map[etf.Pid]etf.Pid), 887 results: make(map[etf.Pid]bool), 888 round: vote.Round, 889 } 890 rp.election.cancel = rp.CastAfter(rp.Self, messageRaftElectionClean{round: vote.Round}, cleanLeaderVoteTimeout) 891 rp.handleElectionVote() 892 } 893 894 if _, exist := rp.election.votes[m.Pid]; exist { 895 lib.Warning("[%s] ignore duplicate vote for %s from %s during %d round", rp.Self(), 896 vote.Leader, m.Pid, vote.Round) 897 return RaftStatusOK 898 } 899 900 rp.election.votes[m.Pid] = vote.Leader 901 // LDRDBG fmt.Println(rp.Self(), "LDR got vote from", m.Pid, "for", vote.Leader, "round", vote.Round, "quorum", vote.State) 902 if len(rp.quorum.Peers) != len(rp.election.votes) { 903 // make sure if we got all votes 904 return RaftStatusOK 905 } 906 if len(rp.election.votes) != len(rp.quorum.Peers) { 907 // waiting for all votes from the quorum members) 908 return RaftStatusOK 909 } 910 911 // got all votes. count them to get the quorum leader 912 countVotes := make(map[etf.Pid]int) 913 for _, vote_for := range rp.election.votes { 914 c, _ := countVotes[vote_for] 915 countVotes[vote_for] = c + 1 916 } 917 leaderPid := etf.Pid{} 918 leaderVoted := 0 919 leaderSplit := false 920 for leader, voted := range countVotes { 921 if leaderVoted == voted { 922 leaderSplit = true 923 continue 924 } 925 if leaderVoted < voted { 926 leaderVoted = voted 927 leaderPid = leader 928 leaderSplit = false 929 } 930 } 931 // LDRDBG fmt.Println(rp.Self(), "LDR got all votes. round", vote.Round, "quorum", vote.State) 932 if leaderSplit { 933 // LDRDBG fmt.Println(rp.Self(), "LDR got split voices. round", vote.Round, "quorum", vote.State) 934 // got more than one leader 935 // start new leader election with round++ 936 rp.handleElectionStart(vote.Round + 1) 937 return RaftStatusOK 938 } 939 940 noLeader := etf.Pid{} 941 if rp.election.leader == noLeader { 942 rp.election.leader = leaderPid 943 rp.election.voted = leaderVoted 944 } else { 945 if rp.election.leader != leaderPid || rp.election.voted != leaderVoted { 946 // our result defers from the others which we already received 947 // start new leader election with round++ 948 lib.Warning("[%s] got different result from %s. cheating detected", rp.Self(), m.Pid) 949 rp.handleElectionStart(vote.Round + 1) 950 return RaftStatusOK 951 } 952 } 953 954 // LDRDBG fmt.Println(rp.Self(), "LDR election done. round", rp.election.round, "Leader", leaderPid, "with", leaderVoted, "voices", "quorum", vote.State) 955 rp.election.results[rp.Self()] = true 956 957 // send to all quorum members our choice 958 elected := etf.Tuple{ 959 etf.Atom("$leader_elected"), 960 rp.Self(), 961 etf.Tuple{ 962 rp.options.ID, 963 leaderPid, 964 leaderVoted, 965 rp.election.round, 966 }, 967 } 968 for _, pid := range rp.quorum.Peers { 969 if pid == rp.Self() { 970 continue 971 } 972 rp.Cast(pid, elected) 973 // LDRDBG fmt.Println(rp.Self(), "LDR elected", leaderPid, "sent result to", pid, "wait the others") 974 } 975 976 if len(rp.election.votes) != len(rp.election.results) { 977 // we should wait for result from all the election members 978 return RaftStatusOK 979 } 980 981 // leader has been elected 982 // LDRDBG fmt.Println(rp.Self(), "LDR finished. leader", rp.election.leader, "round", rp.election.round, "quorum", rp.quorum.State) 983 rp.round = rp.election.round 984 rp.election.cancel() 985 if rp.leader != rp.election.leader { 986 rp.leader = rp.election.leader 987 l := rp.Leader() 988 rp.election = nil 989 return rp.behavior.HandleLeader(rp, l) 990 } 991 rp.election = nil 992 return RaftStatusOK 993 994 case etf.Atom("$leader_elected"): 995 elected := &messageRaftLeaderElected{} 996 if err := etf.TermIntoStruct(m.Command, &elected); err != nil { 997 return lib.ErrUnsupportedRequest 998 } 999 1000 if rp.options.ID != elected.ID { 1001 lib.Warning("[%s] ignore 'leader elected' message being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1002 return RaftStatusOK 1003 } 1004 1005 if rp.quorum == nil { 1006 rp.election = nil 1007 // no quorum 1008 // LDRDBG fmt.Println(rp.Self, "LDR NO QUO ignore election result", elected, "from", m.Pid) 1009 return RaftStatusOK 1010 } 1011 1012 if rp.election == nil { 1013 lib.Warning("[%s] ignore election result from %s. no election on this peer", rp.Self(), m.Pid) 1014 return RaftStatusOK 1015 } 1016 1017 if elected.Round != rp.election.round { 1018 // round value must be the same. seemd another election is started 1019 lib.Warning("[%s] ignore election result from %s with another round value %d (current election round %d)", rp.Self(), m.Pid, elected.Round, rp.election.round) 1020 if elected.Round > rp.round { 1021 // update round value to the greatest one 1022 rp.round = elected.Round 1023 } 1024 return RaftStatusOK 1025 } 1026 1027 noLeader := etf.Pid{} 1028 if rp.election.leader == noLeader { 1029 rp.election.leader = elected.Leader 1030 rp.election.voted = elected.Voted 1031 } else { 1032 if rp.election.leader != elected.Leader || rp.election.voted != elected.Voted { 1033 // elected leader must be the same in all election results 1034 lib.Warning("[%s] ignore election result from %s with different leader which must be the same", rp.Self(), m.Pid) 1035 return RaftStatusOK 1036 } 1037 } 1038 1039 if _, exist := rp.election.results[m.Pid]; exist { 1040 // duplicate 1041 lib.Warning("[%s] ignore duplicate election result from %s during %d round", rp.Self(), 1042 m.Pid, elected.Round) 1043 return RaftStatusOK 1044 } 1045 1046 if _, exist := rp.election.votes[m.Pid]; exist == false { 1047 // Got election result before the vote from m.Pid 1048 // Check if m.Pid belongs to the quorum 1049 if rp.election.round > rp.round { 1050 rp.round = rp.election.round 1051 } 1052 belongs := false 1053 for _, pid := range rp.quorum.Peers { 1054 if pid == m.Pid { 1055 belongs = true 1056 break 1057 } 1058 } 1059 if belongs == false { 1060 // got from unknown peer 1061 lib.Warning("[%s] ignore election result from %s which doesn't belong this quorum", rp.Self(), m.Pid) 1062 return RaftStatusOK 1063 } 1064 1065 // keep it and wait for the vote from this peer 1066 rp.election.results[m.Pid] = true 1067 return RaftStatusOK 1068 } 1069 rp.election.results[m.Pid] = true 1070 1071 if len(rp.quorum.Peers) != len(rp.election.votes) { 1072 // make sure if we got all votes 1073 return RaftStatusOK 1074 } 1075 1076 if len(rp.election.votes) != len(rp.election.results) { 1077 // we should wait for result from all the election members 1078 return RaftStatusOK 1079 } 1080 1081 // leader has been elected 1082 // LDRDBG fmt.Println(rp.Self(), "LDR finished. leader", rp.election.leader, "round", rp.election.round, "quorum", rp.quorum.State) 1083 rp.election.cancel() // cancel timer 1084 rp.round = rp.election.round 1085 if rp.leader != rp.election.leader { 1086 rp.leader = rp.election.leader 1087 rp.election = nil 1088 l := rp.Leader() 1089 return rp.behavior.HandleLeader(rp, l) 1090 } 1091 rp.election = nil 1092 return RaftStatusOK 1093 1094 case etf.Atom("$request_get"): 1095 requestGet := &messageRaftRequestGet{} 1096 if err := etf.TermIntoStruct(m.Command, &requestGet); err != nil { 1097 return lib.ErrUnsupportedRequest 1098 } 1099 1100 if rp.options.ID != requestGet.ID { 1101 lib.Warning("[%s] got 'get' request being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1102 return RaftStatusOK 1103 } 1104 1105 if rp.quorum == nil { 1106 // no quorum 1107 return RaftStatusOK 1108 } 1109 1110 if rp.quorum.Member == false { 1111 // not a quorum member. couldn't handle this request 1112 lib.Warning("[%s] got 'get' request being not a member of the quorum (from %s)", rp.Self(), m.Pid) 1113 return RaftStatusOK 1114 } 1115 //fmt.Println(rp.Self(), "GET request", requestGet.Ref, "from", m.Pid, "serial", requestGet.Serial) 1116 1117 key, value, status := rp.behavior.HandleGet(rp, requestGet.Serial) 1118 if status != RaftStatusOK { 1119 // do nothing 1120 return status 1121 } 1122 if value == nil { 1123 // not found. 1124 if m.Pid != requestGet.Origin { 1125 // its already forwarded request. just ignore it 1126 return RaftStatusOK 1127 } 1128 1129 // forward this request to another qourum member 1130 forwardGet := etf.Tuple{ 1131 etf.Atom("$request_get"), 1132 rp.Self(), 1133 etf.Tuple{ 1134 requestGet.ID, 1135 requestGet.Ref, 1136 requestGet.Origin, 1137 requestGet.Serial, 1138 }, 1139 } 1140 1141 // get random quorum member excluding m.Pid and requestGet.Origin 1142 peers := []etf.Pid{} 1143 for _, pid := range rp.quorum.Peers { 1144 if pid == m.Pid { 1145 continue 1146 } 1147 if pid == requestGet.Origin { 1148 continue 1149 } 1150 if pid == rp.Self() { 1151 continue 1152 } 1153 peers = append(peers, pid) 1154 } 1155 1156 if len(peers) == 0 { 1157 return RaftStatusOK 1158 } 1159 1160 n := 0 1161 if len(peers) > 1 { 1162 n = rand.Intn(len(peers) - 1) 1163 } 1164 peer := peers[n] 1165 //fmt.Println(rp.Self(), "GET forward", requestGet.Ref, "to", peer, "serial", requestGet.Serial) 1166 rp.Cast(peer, forwardGet) 1167 return RaftStatusOK 1168 } 1169 1170 requestReply := etf.Tuple{ 1171 etf.Atom("$request_reply"), 1172 rp.Self(), 1173 etf.Tuple{ 1174 requestGet.ID, 1175 requestGet.Ref, 1176 requestGet.Serial, 1177 key, 1178 value, 1179 }, 1180 } 1181 rp.Cast(requestGet.Origin, requestReply) 1182 1183 // update serial of this peer 1184 if c := rp.quorumCandidates.GetOnline(requestGet.Origin); c != nil { 1185 if c.serial < requestGet.Serial { 1186 c.serial = requestGet.Serial 1187 } 1188 } else { 1189 rp.quorumCandidates.Set(rp, requestGet.Origin) 1190 } 1191 return RaftStatusOK 1192 1193 case etf.Atom("$request_reply"): 1194 requestReply := &messageRaftRequestReply{} 1195 if err := etf.TermIntoStruct(m.Command, &requestReply); err != nil { 1196 return lib.ErrUnsupportedRequest 1197 } 1198 1199 if rp.options.ID != requestReply.ID { 1200 lib.Warning("[%s] got 'reply' being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1201 return RaftStatusOK 1202 } 1203 cancel, exist := rp.requests[requestReply.Ref] 1204 if exist == false { 1205 // might be timed out already. do nothing 1206 return RaftStatusOK 1207 } 1208 // cancel timer 1209 cancel() 1210 if rp.options.Serial < requestReply.Serial { 1211 rp.options.Serial = requestReply.Serial 1212 } 1213 // call HandleSerial 1214 return rp.behavior.HandleSerial(rp, requestReply.Ref, requestReply.Serial, 1215 requestReply.Key, requestReply.Value) 1216 1217 case etf.Atom("$request_append"): 1218 requestAppend := &messageRaftRequestAppend{} 1219 if err := etf.TermIntoStruct(m.Command, &requestAppend); err != nil { 1220 return lib.ErrUnsupportedRequest 1221 } 1222 1223 if rp.options.ID != requestAppend.ID { 1224 lib.Warning("[%s] got 'append' request being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1225 return RaftStatusOK 1226 } 1227 1228 if rp.quorum == nil { 1229 // no quorum. ignore it 1230 return RaftStatusOK 1231 } 1232 1233 // 1234 // There are 3 options: 1235 // 1236 1237 // 1) This process is a leader -> handleAppendLeader() 1238 // a) increment serial. send this request to all quorum members (except the origin peer) 1239 // b) wait for the request_append_ready from the quorum peers 1240 // c) call the callback HandleAppend 1241 // d) send request_append_commit(serial) to all quorum members (including the origin peer) 1242 if rp.leader == rp.Self() { 1243 return rp.handleAppendLeader(m.Pid, requestAppend) 1244 } 1245 1246 // 2) This process is not a leader, is a quorum member, and request has 1247 // received from the leader -> handleAppendQuorum() 1248 // a) accept this request and reply with request_append_ready 1249 // b) wait for the request_append_commit 1250 // c) call the callback HandleAppend 1251 // d) send request_append to the peers that are not in the quorum 1252 if rp.quorum.Member == true && m.Pid == rp.leader { 1253 return rp.handleAppendQuorum(requestAppend) 1254 } 1255 1256 // 3) This process neither a leader or a quorum member. 1257 // Or this process is a quorum member but request has received not from 1258 // the leader of this quorum. 1259 // It also could happened if quorum has changed during the delivering this request. 1260 1261 // Forward this request to the quorum member (if this process not a quorum member) 1262 // or to the leader (if this process is a quorum member) 1263 1264 forwardAppend := etf.Tuple{ 1265 etf.Atom("$request_append"), 1266 rp.Self(), 1267 etf.Tuple{ 1268 requestAppend.ID, 1269 requestAppend.Ref, 1270 requestAppend.Origin, 1271 requestAppend.Key, 1272 requestAppend.Value, 1273 requestAppend.Deadline, 1274 }, 1275 } 1276 1277 if rp.quorum.Member == true { 1278 // DBGAPN fmt.Println(rp.Self(), "DPGAPN forward $request_append", requestAppend.Ref, "to the leader", rp.leader) 1279 noLeader := etf.Pid{} 1280 if rp.leader == noLeader { 1281 // no leader in this quorum yet. ignore this request 1282 return RaftStatusOK 1283 } 1284 // This request has received not from the quorum leader. 1285 // Forward this request to the leader 1286 rp.Cast(rp.leader, forwardAppend) 1287 return RaftStatusOK 1288 } 1289 1290 // exclude requestAppend.Origin and m.Pid 1291 peers := []etf.Pid{} 1292 for _, pid := range rp.quorum.Peers { 1293 if pid == m.Pid { 1294 continue 1295 } 1296 if pid == requestAppend.Origin { 1297 continue 1298 } 1299 peers = append(peers, pid) 1300 } 1301 n := rand.Intn(len(peers) - 1) 1302 peer := peers[n] 1303 // DBGAPN fmt.Println(rp.Self(), "DPGAPN forward $request_append", requestAppend.Ref, "to the quorum member", peer) 1304 rp.Cast(peer, forwardAppend) 1305 return RaftStatusOK 1306 1307 case etf.Atom("$request_append_ready"): 1308 appendReady := &messageRaftAppendReady{} 1309 if err := etf.TermIntoStruct(m.Command, &appendReady); err != nil { 1310 return lib.ErrUnsupportedRequest 1311 } 1312 1313 if rp.options.ID != appendReady.ID { 1314 lib.Warning("[%s] got 'append_ready' message being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1315 return RaftStatusOK 1316 } 1317 1318 if rp.quorum == nil { 1319 // no quorum. ignore it 1320 return RaftStatusOK 1321 } 1322 1323 requestAppend, exist := rp.requestsAppend[appendReady.Key] 1324 if exist == false { 1325 // there might be timeout happened. ignore this message 1326 return RaftStatusOK 1327 } 1328 1329 if requestAppend.ref != appendReady.Ref { 1330 // there might be timeout happened for the previous append request for this key 1331 // and another append request arrived during previous append request handling 1332 return RaftStatusOK 1333 } 1334 1335 if rp.leader != rp.Self() { 1336 // i'm not a leader. seems leader election happened during this request handling 1337 requestAppend.cancel() 1338 delete(rp.requestsAppend, appendReady.Key) 1339 return RaftStatusOK 1340 } 1341 requestAppend.peers[m.Pid] = true 1342 commit := true 1343 for _, confirmed := range requestAppend.peers { 1344 if confirmed { 1345 continue 1346 } 1347 commit = false 1348 break 1349 } 1350 1351 if commit == false { 1352 return RaftStatusOK 1353 } 1354 1355 // received confirmations from all the peers are involved to this append handling. 1356 // call HandleAppend 1357 status := rp.behavior.HandleAppend(rp, requestAppend.ref, rp.options.Serial+1, 1358 appendReady.Key, requestAppend.value) 1359 switch status { 1360 case RaftStatusOK: 1361 rp.options.Serial++ 1362 // sent them $request_append_commit including the origin 1363 request := etf.Tuple{ 1364 etf.Atom("$request_append_commit"), 1365 rp.Self(), 1366 etf.Tuple{ 1367 rp.options.ID, 1368 requestAppend.ref, 1369 appendReady.Key, 1370 rp.options.Serial, 1371 requestAppend.from, 1372 }, 1373 } 1374 for pid, _ := range requestAppend.peers { 1375 if pid == rp.Self() { 1376 continue 1377 } 1378 rp.Cast(pid, request) 1379 // DBGAPN fmt.Println(rp.Self(), "DBGAPN sent append_commit to", pid, "with serial", rp.options.Serial) 1380 if c := rp.quorumCandidates.GetOnline(pid); c != nil { 1381 if c.serial < rp.options.Serial { 1382 c.serial = rp.options.Serial 1383 } 1384 } 1385 } 1386 requestAppend.cancel() 1387 delete(rp.requestsAppend, appendReady.Key) 1388 if requestAppend.from == rp.Self() { 1389 rp.handleBroadcastCommit(appendReady.Key, requestAppend, rp.options.Serial) 1390 } 1391 if len(rp.requestsAppendQueue) == 0 { 1392 return RaftStatusOK 1393 } 1394 1395 // handle queued append request 1396 handled := 0 1397 for i := range rp.requestsAppendQueue { 1398 handled = i 1399 queued := rp.requestsAppendQueue[i] 1400 if queued.request.Deadline < time.Now().UnixMilli() { 1401 // expired request 1402 lib.Warning("[%s] append request %s is expired", rp.Self(), queued.request.Ref) 1403 continue 1404 } 1405 rp.handleAppendLeader(queued.from, queued.request) 1406 break 1407 } 1408 rp.requestsAppendQueue = rp.requestsAppendQueue[handled+1:] 1409 if len(rp.requestsAppendQueue) == 0 { 1410 rp.requestsAppendQueue = nil 1411 } 1412 return RaftStatusOK 1413 1414 case RaftStatusDiscard: 1415 requestAppend.cancel() 1416 delete(rp.requestsAppend, appendReady.Key) 1417 return RaftStatusOK 1418 } 1419 1420 return status 1421 1422 case etf.Atom("$request_append_commit"): 1423 appendCommit := &messageRaftAppendCommit{} 1424 if err := etf.TermIntoStruct(m.Command, &appendCommit); err != nil { 1425 return lib.ErrUnsupportedRequest 1426 } 1427 1428 if rp.options.ID != appendCommit.ID { 1429 lib.Warning("[%s] got 'append_commit' message being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1430 return RaftStatusOK 1431 } 1432 1433 requestAppend, exist := rp.requestsAppend[appendCommit.Key] 1434 if exist == false { 1435 // seems timeout happened and this request was cleaned up 1436 return RaftStatusOK 1437 } 1438 requestAppend.cancel() 1439 delete(rp.requestsAppend, appendCommit.Key) 1440 1441 if rp.options.Serial >= appendCommit.Serial { 1442 lib.Warning("[%s] got append commit with serial (%d) greater or equal we have (%d). fork happened. stopping this process", rp.Self(), appendCommit.Serial, rp.options.Serial) 1443 return fmt.Errorf("raft fork happened") 1444 } 1445 1446 rp.options.Serial = appendCommit.Serial 1447 status := rp.behavior.HandleAppend(rp, requestAppend.ref, appendCommit.Serial, 1448 appendCommit.Key, requestAppend.value) 1449 if status == RaftStatusDiscard { 1450 lib.Warning("[%s] RaftStatusDiscard can be used by a leader only", rp.Self()) 1451 status = RaftStatusOK 1452 } 1453 if appendCommit.Broadcast != rp.Self() { 1454 return status 1455 } 1456 1457 rp.handleBroadcastCommit(appendCommit.Key, requestAppend, appendCommit.Serial) 1458 return status 1459 1460 case etf.Atom("$request_append_broadcast"): 1461 broadcast := &messageRaftAppendBroadcast{} 1462 if err := etf.TermIntoStruct(m.Command, &broadcast); err != nil { 1463 return lib.ErrUnsupportedRequest 1464 } 1465 1466 if rp.options.ID != broadcast.ID { 1467 lib.Warning("[%s] got 'append_broadcast' message being not a member of the given raft cluster (from %s)", rp.Self(), m.Pid) 1468 return RaftStatusOK 1469 } 1470 1471 rp.options.Serial = broadcast.Serial 1472 return rp.behavior.HandleAppend(rp, broadcast.Ref, broadcast.Serial, 1473 broadcast.Key, broadcast.Value) 1474 1475 } 1476 1477 return lib.ErrUnsupportedRequest 1478 } 1479 1480 func (rp *RaftProcess) handleElectionStart(round int) { 1481 if rp.quorum == nil { 1482 // no quorum. can't start election 1483 return 1484 } 1485 if rp.quorum.Member == false { 1486 // not a quorum member 1487 return 1488 } 1489 if rp.election != nil { 1490 if rp.election.round >= round { 1491 // already in progress 1492 return 1493 } 1494 rp.election.cancel() 1495 } 1496 if rp.round > round { 1497 round = rp.round 1498 } 1499 // LDRDBG fmt.Println(rp.Self(), "LDR start. round", round, "Q", rp.quorum.State) 1500 rp.election = &leaderElection{ 1501 votes: make(map[etf.Pid]etf.Pid), 1502 results: make(map[etf.Pid]bool), 1503 round: round, 1504 } 1505 rp.handleElectionVote() 1506 cancel := rp.CastAfter(rp.Self, messageRaftElectionClean{round: round}, cleanLeaderVoteTimeout) 1507 rp.election.cancel = cancel 1508 } 1509 1510 func (rp *RaftProcess) handleElectionVote() { 1511 if rp.quorum == nil || rp.election == nil { 1512 return 1513 } 1514 1515 mapPeers := make(map[etf.Pid]bool) 1516 for _, p := range rp.quorum.Peers { 1517 mapPeers[p] = true 1518 } 1519 1520 voted_for := etf.Pid{} 1521 c := rp.quorumCandidates.List() // ordered by serial in desk order 1522 for _, pid := range c { 1523 // check if this candidate is a member of quorum 1524 if _, exist := mapPeers[pid]; exist == false { 1525 continue 1526 } 1527 // get the first member since it has biggest serial 1528 voted_for = pid 1529 break 1530 } 1531 1532 // LDRDBG fmt.Println(rp.Self(), "LDR voted for:", voted_for, "quorum", rp.quorum.State) 1533 leaderVote := etf.Tuple{ 1534 etf.Atom("$leader_vote"), 1535 rp.Self(), 1536 etf.Tuple{ 1537 rp.options.ID, 1538 int(rp.quorum.State), 1539 voted_for, 1540 rp.election.round, 1541 }, 1542 } 1543 for _, pid := range rp.quorum.Peers { 1544 if pid == rp.Self() { 1545 continue 1546 } 1547 // LDRDBG fmt.Println(rp.Self(), "LDR sent vote for", voted_for, "to", pid, "round", rp.election.round, "quorum", rp.quorum.State) 1548 rp.Cast(pid, leaderVote) 1549 } 1550 rp.election.votes[rp.Self()] = voted_for 1551 } 1552 1553 func (rp *RaftProcess) handleBroadcastCommit(key string, request *requestAppend, serial uint64) { 1554 // DBGAPN fmt.Println(rp.Self(), "broadcasting", request.ref) 1555 // the origin process is in charge of broadcasting this result among 1556 // the peers who aren't quorum members. 1557 commit := etf.Tuple{ 1558 etf.Atom("$request_append_broadcast"), 1559 rp.Self(), 1560 etf.Tuple{ 1561 rp.options.ID, 1562 request.ref, 1563 serial, 1564 key, 1565 request.value, 1566 }, 1567 } 1568 allPeers := rp.quorumCandidates.List() 1569 for _, pid := range allPeers { 1570 if _, exist := request.peers[pid]; exist { 1571 continue 1572 } 1573 if pid == rp.Self() { 1574 continue 1575 } 1576 rp.Cast(pid, commit) 1577 // DBGAPN fmt.Println(rp.Self(), "DBGAPN sent $request_append_broadcast", request.ref, "to", pid) 1578 c := rp.quorumCandidates.GetOnline(pid) 1579 if c != nil && c.serial < serial { 1580 c.serial = serial 1581 } 1582 } 1583 } 1584 1585 func (rp *RaftProcess) handleAppendLeader(from etf.Pid, request *messageRaftRequestAppend) RaftStatus { 1586 // DBGAPN fmt.Println(rp.Self(), "DBGAPN handle append", request.Ref, "on leader.", request.Key, request.Value) 1587 if _, exist := rp.requestsAppend[request.Key]; exist { 1588 // another append request with this key is still in progress. append to the queue 1589 queued := requestAppendQueued{ 1590 from: from, 1591 request: request, 1592 } 1593 rp.requestsAppendQueue = append(rp.requestsAppendQueue, queued) 1594 lq := len(rp.requestsAppendQueue) 1595 if lq > 10 { 1596 lib.Warning("[%s] append request queue is getting long. queued request %d", rp.Self(), lq) 1597 } 1598 return RaftStatusOK 1599 } 1600 now := time.Now().UnixMilli() 1601 if now >= request.Deadline { 1602 // deadline has been passed. ignore this request 1603 return RaftStatusOK 1604 } 1605 1606 sendRequestAppend := etf.Tuple{ 1607 etf.Atom("$request_append"), 1608 rp.Self(), 1609 etf.Tuple{ 1610 rp.options.ID, 1611 request.Ref, 1612 request.Origin, 1613 request.Key, 1614 request.Value, 1615 request.Deadline, 1616 }, 1617 } 1618 1619 peers := make(map[etf.Pid]bool) 1620 for _, pid := range rp.quorum.Peers { 1621 if pid == rp.Self() { 1622 continue 1623 } 1624 peers[pid] = false 1625 if pid == request.Origin { 1626 peers[pid] = true // do not wait append_ready for the Origin 1627 continue 1628 } 1629 rp.Cast(pid, sendRequestAppend) 1630 } 1631 1632 // if 'from' is not a quorum member the leader is in charge of broadcasting 1633 if _, exist := peers[from]; exist == false { 1634 from = rp.Self() 1635 } 1636 1637 after := time.Duration(request.Deadline-now) * time.Millisecond 1638 clean := messageRaftAppendClean{key: request.Key, ref: request.Ref} 1639 cancel := rp.CastAfter(rp.Self(), clean, after) 1640 requestAppend := &requestAppend{ 1641 ref: request.Ref, 1642 from: from, 1643 origin: request.Origin, 1644 value: request.Value, 1645 peers: peers, 1646 cancel: cancel, 1647 } 1648 rp.requestsAppend[request.Key] = requestAppend 1649 1650 return RaftStatusOK 1651 } 1652 1653 func (rp *RaftProcess) handleAppendQuorum(request *messageRaftRequestAppend) RaftStatus { 1654 // DBGAPN fmt.Println(rp.Self(), "DBGAPN handle append", request.Ref, "on a quorum member.", request.Key, request.Value) 1655 if r, exist := rp.requestsAppend[request.Key]; exist { 1656 r.cancel() 1657 delete(rp.requestsAppend, request.Key) 1658 } 1659 1660 ready := etf.Tuple{ 1661 etf.Atom("$request_append_ready"), 1662 rp.Self(), 1663 etf.Tuple{ 1664 rp.options.ID, 1665 request.Ref, 1666 request.Key, 1667 }, 1668 } 1669 rp.Cast(rp.leader, ready) 1670 clean := messageRaftAppendClean{key: request.Key, ref: request.Ref} 1671 after := time.Duration(DefaultRaftAppendTimeout) * time.Second 1672 if d := time.Duration(request.Deadline-time.Now().UnixMilli()) * time.Millisecond; d > after { 1673 after = d 1674 } 1675 cancel := rp.CastAfter(rp.Self, clean, after) 1676 1677 peers := make(map[etf.Pid]bool) 1678 for _, pid := range rp.quorum.Peers { 1679 peers[pid] = true 1680 } 1681 1682 requestAppend := &requestAppend{ 1683 ref: request.Ref, 1684 origin: request.Origin, 1685 value: request.Value, 1686 peers: peers, 1687 cancel: cancel, 1688 } 1689 rp.requestsAppend[request.Key] = requestAppend 1690 return RaftStatusOK 1691 } 1692 1693 func (rp *RaftProcess) quorumChangeStart(nextAttempt bool) { 1694 if rp.quorumChangeDefer == false { 1695 if nextAttempt { 1696 // increase timeout for the next attempt to build a new quorum 1697 rp.quorumChangeAttempt++ 1698 } else { 1699 rp.quorumChangeAttempt = 1 1700 } 1701 maxTime := rp.quorumChangeAttempt * quorumChangeDeferMaxTime 1702 after := time.Duration(50+rand.Intn(maxTime)) * time.Millisecond 1703 rp.CastAfter(rp.Self(), messageRaftQuorumChange{}, after) 1704 rp.quorumChangeDefer = true 1705 } 1706 } 1707 1708 func (rp *RaftProcess) quorumChange() RaftStatus { 1709 l := len(rp.quorumCandidates.List()) 1710 1711 candidateRaftQuorumState := RaftQuorumState3 1712 switch { 1713 case l > 9: 1714 if rp.quorum != nil && rp.quorum.State == RaftQuorumState11 { 1715 // do nothing 1716 return RaftStatusOK 1717 } 1718 candidateRaftQuorumState = RaftQuorumState11 1719 l = 10 // to create quorum of 11 we need 10 candidates + itself. 1720 1721 case l > 7: 1722 if rp.quorum != nil && rp.quorum.State == RaftQuorumState9 { 1723 // do nothing 1724 return RaftStatusOK 1725 } 1726 candidateRaftQuorumState = RaftQuorumState9 1727 l = 8 // quorum of 9 => 8 candidates + itself 1728 case l > 5: 1729 if rp.quorum != nil && rp.quorum.State == RaftQuorumState7 { 1730 // do nothing 1731 return RaftStatusOK 1732 } 1733 candidateRaftQuorumState = RaftQuorumState7 1734 l = 6 // quorum of 7 => 6 candidates + itself 1735 case l > 3: 1736 if rp.quorum != nil && rp.quorum.State == RaftQuorumState5 { 1737 // do nothing 1738 return RaftStatusOK 1739 } 1740 candidateRaftQuorumState = RaftQuorumState5 1741 l = 4 // quorum of 5 => 4 candidates + itself 1742 case l > 1: 1743 if rp.quorum != nil && rp.quorum.State == RaftQuorumState3 { 1744 // do nothing 1745 return RaftStatusOK 1746 } 1747 candidateRaftQuorumState = RaftQuorumState3 1748 l = 2 // quorum of 3 => 2 candidates + itself 1749 default: 1750 // not enougth candidates to create a quorum 1751 if rp.quorum != nil { 1752 rp.quorum = nil 1753 return rp.handleQuorum() 1754 } 1755 // QUODBG fmt.Println(rp.Name(), "QUO VOTE. NOT ENO CAND", rp.quorumCandidates.List()) 1756 1757 // try send cluster_join again to receive an updated peer list 1758 rp.CastAfter(rp.Self(), messageRaftClusterInit{}, 5*time.Second) 1759 return RaftStatusOK 1760 } 1761 1762 if _, exist := rp.quorumVotes[candidateRaftQuorumState]; exist { 1763 // voting for this state is already in progress 1764 return RaftStatusOK 1765 } 1766 1767 quorumCandidates := make([]etf.Pid, 0, l+1) 1768 quorumCandidates = append(quorumCandidates, rp.Self()) 1769 candidates := rp.quorumCandidates.List() 1770 quorumCandidates = append(quorumCandidates, candidates[:l]...) 1771 // QUODBG fmt.Println(rp.Name(), "QUO VOTE INIT", candidateRaftQuorumState, quorumCandidates) 1772 1773 // send quorumVote to all candidates (except itself) 1774 quorum := &quorum{ 1775 votes: make(map[etf.Pid]int), 1776 origin: rp.Self(), 1777 } 1778 quorum.State = candidateRaftQuorumState 1779 quorum.Peers = quorumCandidates 1780 rp.quorumVotes[candidateRaftQuorumState] = quorum 1781 rp.quorumSendVote(quorum) 1782 rp.CastAfter(rp.Self(), messageRaftQuorumCleanVote{state: quorum.State}, cleanVoteTimeout) 1783 return RaftStatusOK 1784 } 1785 1786 func (rp *RaftProcess) quorumSendVote(q *quorum) bool { 1787 empty := etf.Pid{} 1788 if q.origin == empty { 1789 // do not send its vote until the origin vote will be received 1790 return false 1791 } 1792 1793 allVoted := true 1794 quorumVote := etf.Tuple{ 1795 etf.Atom("$quorum_vote"), 1796 rp.Self(), 1797 etf.Tuple{ 1798 rp.options.ID, 1799 rp.options.Serial, 1800 int(q.State), 1801 q.Peers, 1802 }, 1803 } 1804 1805 for _, pid := range q.Peers { 1806 if pid == rp.Self() { 1807 continue // do not send to itself 1808 } 1809 1810 if pid == q.origin { 1811 continue 1812 } 1813 v, _ := q.votes[pid] 1814 1815 // check if already sent vote to this peer 1816 if v&1 == 0 { 1817 // QUODBG fmt.Println(rp.Name(), "SEND VOTE to", pid, q.Peers) 1818 rp.Cast(pid, quorumVote) 1819 // mark as sent 1820 v |= 1 1821 q.votes[pid] = v 1822 } 1823 1824 if v != 3 { // 2(010) - recv, 1(001) - sent, 3(011) - recv & sent 1825 allVoted = false 1826 } 1827 } 1828 1829 if allVoted == true && q.origin != rp.Self() { 1830 // send vote to origin 1831 // QUODBG fmt.Println(rp.Name(), "SEND VOTE to origin", q.origin, q.Peers) 1832 rp.Cast(q.origin, quorumVote) 1833 } 1834 1835 return allVoted 1836 } 1837 1838 func (rp *RaftProcess) quorumVote(from etf.Pid, vote *messageRaftQuorumVote) RaftStatus { 1839 if vote.State != len(vote.Candidates) { 1840 lib.Warning("[%s] quorum state and number of candidates are mismatch", rp.Self()) 1841 rp.quorumCandidates.SetOffline(rp, from) 1842 return RaftStatusOK 1843 } 1844 1845 if c := rp.quorumCandidates.GetOnline(from); c == nil { 1846 // there is a race conditioned case when we received a vote before 1847 // the quorum_join_reply message. just ignore it. they will start 1848 // another round of quorum forming 1849 return RaftStatusOK 1850 } else { 1851 c.heartbeat = time.Now().Unix() 1852 c.failures = 0 1853 } 1854 candidatesRaftQuorumState := RaftQuorumState3 1855 switch vote.State { 1856 case 3: 1857 candidatesRaftQuorumState = RaftQuorumState3 1858 case 5: 1859 candidatesRaftQuorumState = RaftQuorumState5 1860 case 7: 1861 candidatesRaftQuorumState = RaftQuorumState7 1862 case 9: 1863 candidatesRaftQuorumState = RaftQuorumState9 1864 case 11: 1865 candidatesRaftQuorumState = RaftQuorumState11 1866 default: 1867 lib.Warning("[%s] wrong number of candidates in the request. removing %s from quorum candidates list", rp.Self(), from) 1868 rp.quorumCandidates.SetOffline(rp, from) 1869 return RaftStatusOK 1870 } 1871 1872 // do not vote if requested quorum is less than existing one 1873 if rp.quorum != nil && candidatesRaftQuorumState <= rp.quorum.State { 1874 // There is a case when a peer is involved in more than one voting, 1875 // and this peer just sent a vote for another voting process which 1876 // is still in progress. 1877 // Do not send $quorum_voted message if this peer is already a member 1878 // of accepted quorum 1879 member := false 1880 for _, pid := range rp.quorum.Peers { 1881 if pid == from { 1882 member = true 1883 break 1884 } 1885 } 1886 if member == true { 1887 return RaftStatusOK 1888 } 1889 1890 // QUODBG fmt.Println(rp.Name(), "SKIP VOTE from", from, candidatesRaftQuorumState, rp.quorum.State) 1891 built := etf.Tuple{ 1892 etf.Atom("$quorum_built"), 1893 rp.Self(), 1894 etf.Tuple{ 1895 rp.options.ID, 1896 int(rp.quorum.State), 1897 rp.round, 1898 rp.quorum.Peers, 1899 }, 1900 } 1901 rp.Cast(from, built) 1902 return RaftStatusOK 1903 } 1904 1905 q, exist := rp.quorumVotes[candidatesRaftQuorumState] 1906 if exist == false { 1907 // 1908 // Received the first vote 1909 // 1910 if len(rp.quorumVotes) > 5 { 1911 // can't be more than 5 (there could be only votes for 3,5,7,9,11) 1912 lib.Warning("[%s] too many votes %#v", rp.quorumVotes) 1913 return RaftStatusOK 1914 } 1915 1916 q = &quorum{} 1917 q.State = candidatesRaftQuorumState 1918 q.Peers = vote.Candidates 1919 1920 if from == vote.Candidates[0] { 1921 // Origin vote (received from the peer initiated this voting process). 1922 // Otherwise keep this field empty, which means this quorum 1923 // will be overwritten if we get another voting from the peer 1924 // initiated that voting (with a different set/order of peers) 1925 q.origin = from 1926 } 1927 1928 if rp.quorumValidateVote(from, q, vote) == false { 1929 // do not create this voting if those peers aren't valid (haven't registered yet) 1930 return RaftStatusOK 1931 } 1932 q.lastVote = time.Now().UnixMilli() 1933 // QUODBG fmt.Println(rp.Name(), "QUO VOTE (NEW)", from, vote) 1934 rp.quorumVotes[candidatesRaftQuorumState] = q 1935 rp.CastAfter(rp.Self(), messageRaftQuorumCleanVote{state: q.State}, cleanVoteTimeout) 1936 1937 } else { 1938 empty := etf.Pid{} 1939 if q.origin == empty && from == vote.Candidates[0] { 1940 // got origin vote. 1941 q.origin = from 1942 1943 // check if this vote has the same set of peers 1944 same := true 1945 for i := range q.Peers { 1946 if vote.Candidates[i] != q.Peers[i] { 1947 same = false 1948 break 1949 } 1950 } 1951 // if it differs overwrite quorum by the new voting 1952 if same == false { 1953 q.Peers = vote.Candidates 1954 q.votes = nil 1955 } 1956 } 1957 1958 if rp.quorumValidateVote(from, q, vote) == false { 1959 return RaftStatusOK 1960 } 1961 q.lastVote = time.Now().UnixMilli() 1962 // QUODBG fmt.Println(rp.Name(), "QUO VOTE", from, vote) 1963 } 1964 1965 // returns true if we got votes from all the peers whithin this quorum 1966 if rp.quorumSendVote(q) == true { 1967 // 1968 // Quorum built 1969 // 1970 // QUODBG fmt.Println(rp.Name(), "QUO BUILT", q.State, q.Peers) 1971 if rp.quorum == nil { 1972 rp.quorum = &RaftQuorum{} 1973 } 1974 rp.quorum.Member = true 1975 rp.quorum.State = q.State 1976 rp.quorum.Peers = q.Peers 1977 delete(rp.quorumVotes, q.State) 1978 1979 // all candidates who don't belong to this quorum should be known that quorum is built. 1980 mapPeers := make(map[etf.Pid]bool) 1981 for _, peer := range rp.quorum.Peers { 1982 mapPeers[peer] = true 1983 } 1984 allCandidates := rp.quorumCandidates.List() 1985 for _, peer := range allCandidates { 1986 if _, exist := mapPeers[peer]; exist { 1987 // this peer belongs to the quorum. skip it 1988 continue 1989 } 1990 built := etf.Tuple{ 1991 etf.Atom("$quorum_built"), 1992 rp.Self(), 1993 etf.Tuple{ 1994 rp.options.ID, 1995 int(rp.quorum.State), 1996 rp.round, 1997 rp.quorum.Peers, 1998 }, 1999 } 2000 rp.Cast(peer, built) 2001 2002 } 2003 2004 rp.handleElectionStart(rp.round + 1) 2005 return rp.handleQuorum() 2006 } 2007 2008 return RaftStatusOK 2009 } 2010 2011 func (rp *RaftProcess) clusterHeal() { 2012 for _, pid := range rp.quorumCandidates.ListOffline() { 2013 // c can't be nil here 2014 c := rp.quorumCandidates.Get(pid) 2015 if c.heartbeat == 0 { 2016 continue 2017 } 2018 diff := time.Now().Unix() - c.heartbeat 2019 switch { 2020 case diff < 0: 2021 // heartbeat was set in the future 2022 continue 2023 case diff > 300: // > 5 min 2024 rp.Join(pid) 2025 // the next attempt will be in an hour 2026 c.heartbeat = time.Now().Unix() + 3600 2027 } 2028 } 2029 } 2030 2031 func (rp *RaftProcess) handleQuorum() RaftStatus { 2032 q := rp.Quorum() 2033 if status := rp.behavior.HandleQuorum(rp, q); status != RaftStatusOK { 2034 return status 2035 } 2036 2037 noLeader := etf.Pid{} 2038 if rp.leader != noLeader { 2039 rp.leader = noLeader 2040 if status := rp.behavior.HandleLeader(rp, nil); status != RaftStatusOK { 2041 return status 2042 } 2043 } 2044 2045 if q == nil || q.Member == false { 2046 return RaftStatusOK 2047 } 2048 2049 if rp.election == nil { 2050 rp.handleElectionStart(rp.round + 1) 2051 } 2052 2053 return RaftStatusOK 2054 } 2055 2056 func (rp *RaftProcess) handleHeartbeat() { 2057 if rp.heartbeatCancel != nil { 2058 rp.heartbeatCancel() 2059 rp.heartbeatCancel = nil 2060 } 2061 2062 defer func() { 2063 after := DefaultRaftHeartbeat * time.Second 2064 cancel := rp.CastAfter(rp.Self(), messageRaftHeartbeat{}, after) 2065 rp.heartbeatCancel = cancel 2066 rp.clusterHeal() 2067 }() 2068 2069 if rp.quorum == nil || rp.quorum.Member == false { 2070 return 2071 } 2072 2073 noLeader := etf.Pid{} 2074 if rp.leader == noLeader { 2075 // leader election is still in progress. do nothing atm. 2076 return 2077 } 2078 2079 if rp.leader == rp.Self() { 2080 // send a heartbeat to all quorum members if this process is a leader of this quorum 2081 heartbeat := etf.Tuple{ 2082 etf.Atom("$leader_heartbeat"), 2083 rp.Self(), 2084 etf.Tuple{ 2085 rp.options.ID, 2086 rp.options.Serial, 2087 }, 2088 } 2089 for _, pid := range rp.quorum.Peers { 2090 if pid == rp.Self() { 2091 continue 2092 } 2093 rp.Cast(pid, heartbeat) 2094 } 2095 return 2096 } 2097 2098 // check leader's heartbeat 2099 c := rp.quorumCandidates.GetOnline(rp.leader) 2100 if c != nil { 2101 diff := time.Now().Unix() - c.heartbeat 2102 if c.heartbeat == 0 { 2103 diff = 0 2104 } 2105 2106 if diff < DefaultRaftHeartbeat*3 { 2107 return 2108 } 2109 2110 // long time no see heartbeats from the leader 2111 c.joined = false 2112 rp.quorumCandidates.SetOffline(rp, rp.leader) 2113 } 2114 2115 // HRTDBG fmt.Println(rp.Self(), "HRT lost leader", rp.leader) 2116 leave := etf.Tuple{ 2117 etf.Atom("$quorum_leave"), 2118 rp.Self(), 2119 etf.Tuple{ 2120 rp.options.ID, 2121 rp.leader, 2122 }, 2123 } 2124 2125 // tell everyone in the raft cluster 2126 for _, peer := range rp.quorumCandidates.List() { 2127 rp.Cast(peer, leave) 2128 } 2129 rp.quorum = nil 2130 rp.handleQuorum() 2131 rp.quorumChangeStart(false) 2132 } 2133 2134 func (rp *RaftProcess) isQuorumMember(pid etf.Pid) bool { 2135 if rp.quorum == nil { 2136 return false 2137 } 2138 for _, peer := range rp.quorum.Peers { 2139 if pid == peer { 2140 return true 2141 } 2142 } 2143 return false 2144 } 2145 2146 func (rp *RaftProcess) quorumValidateVote(from etf.Pid, q *quorum, vote *messageRaftQuorumVote) bool { 2147 duplicates := make(map[etf.Pid]bool) 2148 validFrom := false 2149 validTo := false 2150 validSerial := false 2151 candidatesMatch := true 2152 newVote := false 2153 if q.votes == nil { 2154 q.votes = make(map[etf.Pid]int) 2155 newVote = true 2156 } 2157 2158 empty := etf.Pid{} 2159 if q.origin != empty && newVote == true && vote.Candidates[0] != from { 2160 return false 2161 } 2162 2163 for i, pid := range vote.Candidates { 2164 if pid == rp.Self() { 2165 validTo = true 2166 continue 2167 } 2168 2169 // quorum peers must be matched with the vote's cadidates 2170 if q.Peers[i] != vote.Candidates[i] { 2171 candidatesMatch = false 2172 } 2173 2174 // check if received vote has the same set of peers. 2175 // if this is the first vote for the given q.State the pid 2176 // will be added to the vote map 2177 _, exist := q.votes[pid] 2178 if exist == false { 2179 if newVote { 2180 q.votes[pid] = 0 2181 } else { 2182 candidatesMatch = false 2183 } 2184 } 2185 2186 if _, exist := duplicates[pid]; exist { 2187 lib.Warning("[%s] got vote with duplicates from %s", rp.Name(), from) 2188 rp.quorumCandidates.SetOffline(rp, from) 2189 return false 2190 } 2191 duplicates[pid] = false 2192 2193 c := rp.quorumCandidates.GetOnline(pid) 2194 if c == nil { 2195 candidatesMatch = false 2196 rp.quorumCandidates.Set(rp, pid) 2197 continue 2198 } 2199 if pid == from { 2200 if c.serial > vote.Serial { 2201 // invalid serial 2202 continue 2203 } 2204 c.serial = vote.Serial 2205 validFrom = true 2206 validSerial = true 2207 } 2208 } 2209 2210 if candidatesMatch == false { 2211 // can't accept this vote 2212 // QUODBG fmt.Println(rp.Name(), "QUO CAND MISMATCH", from, vote.Candidates) 2213 return false 2214 } 2215 2216 if validSerial == false { 2217 lib.Warning("[%s] got vote from %s with invalid serial", rp.Name(), from) 2218 rp.quorumCandidates.SetOffline(rp, from) 2219 return false 2220 } 2221 2222 if validFrom == false || validTo == false { 2223 lib.Warning("[%s] got vote from %s with invalid data", rp.Name(), from) 2224 rp.quorumCandidates.SetOffline(rp, from) 2225 return false 2226 } 2227 2228 // mark as recv 2229 v, _ := q.votes[from] 2230 q.votes[from] = v | 2 2231 2232 return true 2233 } 2234 2235 // 2236 // Server callbacks 2237 // 2238 2239 func (r *Raft) Init(process *ServerProcess, args ...etf.Term) error { 2240 var options RaftOptions 2241 2242 behavior, ok := process.Behavior().(RaftBehavior) 2243 if !ok { 2244 return fmt.Errorf("Raft: not a RaftBehavior") 2245 } 2246 2247 raftProcess := &RaftProcess{ 2248 ServerProcess: *process, 2249 behavior: behavior, 2250 quorumCandidates: createQuorumCandidates(), 2251 quorumVotes: make(map[RaftQuorumState]*quorum), 2252 requests: make(map[etf.Ref]CancelFunc), 2253 requestsAppend: make(map[string]*requestAppend), 2254 } 2255 2256 // do not inherit parent State 2257 raftProcess.State = nil 2258 options, err := behavior.InitRaft(raftProcess, args...) 2259 if err != nil { 2260 return err 2261 } 2262 2263 raftProcess.options = options 2264 process.State = raftProcess 2265 2266 process.Cast(process.Self(), messageRaftClusterInit{}) 2267 //process.SetTrapExit(true) 2268 raftProcess.handleHeartbeat() 2269 return nil 2270 } 2271 2272 // HandleCall 2273 func (r *Raft) HandleCall(process *ServerProcess, from ServerFrom, message etf.Term) (etf.Term, ServerStatus) { 2274 rp := process.State.(*RaftProcess) 2275 return rp.behavior.HandleRaftCall(rp, from, message) 2276 } 2277 2278 // HandleCast 2279 func (r *Raft) HandleCast(process *ServerProcess, message etf.Term) ServerStatus { 2280 var mRaft messageRaft 2281 var status RaftStatus 2282 2283 rp := process.State.(*RaftProcess) 2284 switch m := message.(type) { 2285 case messageRaftClusterInit: 2286 if rp.quorum != nil { 2287 return ServerStatusOK 2288 } 2289 if len(rp.quorumVotes) > 0 { 2290 return ServerStatusOK 2291 } 2292 for _, peer := range rp.options.Peers { 2293 rp.Join(peer) 2294 } 2295 return ServerStatusOK 2296 2297 case messageRaftQuorumCleanVote: 2298 q, exist := rp.quorumVotes[m.state] 2299 if exist == true && q.lastVote > 0 { 2300 diff := time.Duration(time.Now().UnixMilli()-q.lastVote) * time.Millisecond 2301 // if voting is still in progress cast itself again with shifted timeout 2302 // according to cleanVoteTimeout 2303 if cleanVoteTimeout > diff { 2304 nextCleanVoteTimeout := cleanVoteTimeout - diff 2305 rp.CastAfter(rp.Self(), messageRaftQuorumCleanVote{state: q.State}, nextCleanVoteTimeout) 2306 return ServerStatusOK 2307 } 2308 } 2309 2310 if q != nil { 2311 // QUODBG fmt.Println(rp.Name(), "CLN VOTE", m.state, q.Peers) 2312 delete(rp.quorumVotes, m.state) 2313 for _, peer := range q.Peers { 2314 v, _ := q.votes[peer] 2315 if v&2 > 0 { // vote received 2316 continue 2317 } 2318 // no vote from this peer. there are two options 2319 // 1. this peer has switched to the other quorum building 2320 // 2. something wrong with this peer (raft process could be stuck). 2321 c := rp.quorumCandidates.GetOnline(peer) 2322 if c == nil { 2323 // already offline 2324 continue 2325 } 2326 c.failures++ 2327 if c.failures > 10 { 2328 // QUODBG fmt.Println(rp.Self(), "too many failures with", peer) 2329 rp.quorumCandidates.SetOffline(rp, peer) 2330 } 2331 } 2332 } 2333 if len(rp.quorumVotes) == 0 { 2334 // make another attempt to build new quorum 2335 rp.quorumChangeStart(true) 2336 } 2337 case messageRaftQuorumChange: 2338 rp.quorumChangeDefer = false 2339 status = rp.quorumChange() 2340 2341 case messageRaftRequestClean: 2342 delete(rp.requests, m.ref) 2343 status = rp.behavior.HandleCancel(rp, m.ref, "timeout") 2344 2345 case messageRaftAppendClean: 2346 request, exist := rp.requestsAppend[m.key] 2347 if exist == false { 2348 // do nothing 2349 return ServerStatusOK 2350 } 2351 if request.ref != m.ref { 2352 return ServerStatusOK 2353 } 2354 if request.origin == rp.Self() { 2355 status = rp.behavior.HandleCancel(rp, request.ref, "timeout") 2356 break 2357 } 2358 delete(rp.requestsAppend, m.key) 2359 return ServerStatusOK 2360 case messageRaftElectionClean: 2361 if rp.quorum == nil { 2362 return ServerStatusOK 2363 } 2364 if rp.election == nil && rp.quorum.Member { 2365 // restart election 2366 rp.handleElectionStart(rp.round + 1) 2367 return ServerStatusOK 2368 } 2369 if m.round != rp.election.round { 2370 // new election round happened 2371 // LDRDBG fmt.Println(rp.Self(), "LDR clean election. skip. new election round", rp.election.round) 2372 return ServerStatusOK 2373 } 2374 // LDRDBG fmt.Println(rp.Self(), "LDR clean election. round", rp.election.round) 2375 rp.election = nil 2376 return ServerStatusOK 2377 2378 case messageRaftHeartbeat: 2379 rp.handleHeartbeat() 2380 return ServerStatusOK 2381 2382 default: 2383 if err := etf.TermIntoStruct(message, &mRaft); err != nil { 2384 status = rp.behavior.HandleRaftInfo(rp, message) 2385 break 2386 } 2387 if mRaft.Pid == process.Self() { 2388 lib.Warning("[%s] got raft command from itself %#v", process.Self(), mRaft) 2389 return ServerStatusOK 2390 } 2391 status = rp.handleRaftRequest(mRaft) 2392 if status == lib.ErrUnsupportedRequest { 2393 status = rp.behavior.HandleRaftCast(rp, message) 2394 } 2395 } 2396 2397 switch status { 2398 case nil, RaftStatusOK: 2399 return ServerStatusOK 2400 case RaftStatusStop: 2401 return ServerStatusStop 2402 case lib.ErrUnsupportedRequest: 2403 return rp.behavior.HandleRaftInfo(rp, message) 2404 default: 2405 return ServerStatus(status) 2406 } 2407 2408 } 2409 2410 // HandleInfo 2411 func (r *Raft) HandleInfo(process *ServerProcess, message etf.Term) ServerStatus { 2412 var status RaftStatus 2413 2414 rp := process.State.(*RaftProcess) 2415 switch m := message.(type) { 2416 case MessageDown: 2417 can := rp.quorumCandidates.GetOnline(m.Pid) 2418 if can == nil { 2419 break 2420 } 2421 if can.monitor != m.Ref { 2422 status = rp.behavior.HandleRaftInfo(rp, message) 2423 break 2424 } 2425 rp.quorumCandidates.SetOffline(rp, m.Pid) 2426 if rp.quorum == nil { 2427 return ServerStatusOK 2428 } 2429 for _, peer := range rp.quorum.Peers { 2430 // check if this pid belongs to the quorum 2431 if peer != m.Pid { 2432 continue 2433 } 2434 2435 // start to build new quorum 2436 // QUODBG fmt.Println(rp.Name(), "QUO PEER DOWN", m.Pid) 2437 rp.handleQuorum() 2438 rp.quorumChangeStart(false) 2439 break 2440 } 2441 return ServerStatusOK 2442 2443 default: 2444 status = rp.behavior.HandleRaftInfo(rp, message) 2445 } 2446 2447 switch status { 2448 case nil, RaftStatusOK: 2449 return ServerStatusOK 2450 case RaftStatusStop: 2451 return ServerStatusStop 2452 default: 2453 return ServerStatus(status) 2454 } 2455 } 2456 2457 // 2458 // default Raft callbacks 2459 // 2460 2461 // HandleQuorum 2462 func (r *Raft) HandleQuorum(process *RaftProcess, quorum *RaftQuorum) RaftStatus { 2463 return RaftStatusOK 2464 } 2465 2466 // HandleLeader 2467 func (r *Raft) HandleLeader(process *RaftProcess, leader *RaftLeader) RaftStatus { 2468 return RaftStatusOK 2469 } 2470 2471 // HandlePeer 2472 func (r *Raft) HandlePeer(process *RaftProcess, peer etf.Pid, serial uint64) RaftStatus { 2473 return RaftStatusOK 2474 } 2475 2476 // HandleSerial 2477 func (r *Raft) HandleSerial(process *RaftProcess, ref etf.Ref, serial uint64, key string, value etf.Term) RaftStatus { 2478 lib.Warning("HandleSerial: unhandled key-value message with ref %s and serial %d", ref, serial) 2479 return RaftStatusOK 2480 } 2481 2482 // HandleCancel 2483 func (r *Raft) HandleCancel(process *RaftProcess, ref etf.Ref, reason string) RaftStatus { 2484 lib.Warning("HandleCancel: unhandled cancel with ref %s and reason %q", ref, reason) 2485 return RaftStatusOK 2486 } 2487 2488 // HandleRaftCall 2489 func (r *Raft) HandleRaftCall(process *RaftProcess, from ServerFrom, message etf.Term) (etf.Term, ServerStatus) { 2490 lib.Warning("HandleRaftCall: unhandled message (from %#v) %#v", from, message) 2491 return etf.Atom("ok"), ServerStatusOK 2492 } 2493 2494 // HandleRaftCast 2495 func (r *Raft) HandleRaftCast(process *RaftProcess, message etf.Term) ServerStatus { 2496 lib.Warning("HandleRaftCast: unhandled message %#v", message) 2497 return ServerStatusOK 2498 } 2499 2500 // HandleRaftInfo 2501 func (r *Raft) HandleRaftInfo(process *RaftProcess, message etf.Term) ServerStatus { 2502 lib.Warning("HandleRaftInfo: unhandled message %#v", message) 2503 return ServerStatusOK 2504 } 2505 2506 // HandleRaftDirect 2507 func (r *Raft) HandleRaftDirect(process *RaftProcess, message interface{}) (interface{}, error) { 2508 return nil, lib.ErrUnsupportedRequest 2509 } 2510 2511 // 2512 // internals 2513 // 2514 2515 func createQuorumCandidates() *quorumCandidates { 2516 qc := &quorumCandidates{ 2517 candidates: make(map[etf.Pid]*candidate), 2518 } 2519 return qc 2520 } 2521 2522 func (qc *quorumCandidates) Set(rp *RaftProcess, peer etf.Pid) { 2523 c, exist := qc.candidates[peer] 2524 if exist == true { 2525 diff := time.Now().Unix() - c.heartbeat 2526 if diff > DefaultRaftHeartbeat { 2527 rp.Join(peer) 2528 } 2529 return 2530 } 2531 c = &candidate{ 2532 heartbeat: time.Now().Unix(), 2533 } 2534 qc.candidates[peer] = c 2535 rp.Join(peer) 2536 } 2537 2538 func (qc *quorumCandidates) SetOnline(rp *RaftProcess, peer etf.Pid, serial uint64) bool { 2539 c, exist := qc.candidates[peer] 2540 if exist == false { 2541 return false 2542 } 2543 mon := rp.MonitorProcess(peer) 2544 c.serial = serial 2545 c.monitor = mon 2546 c.joined = true 2547 c.heartbeat = time.Now().Unix() 2548 c.failures = 0 2549 return true 2550 } 2551 2552 func (qc *quorumCandidates) SetOffline(rp *RaftProcess, peer etf.Pid) { 2553 c, exist := qc.candidates[peer] 2554 if exist == false { 2555 return 2556 } 2557 // QUODBG fmt.Println(rp.Self(), "peer", peer, "has left") 2558 emptyRef := etf.Ref{} 2559 if c.monitor != emptyRef { 2560 rp.DemonitorProcess(c.monitor) 2561 c.monitor = emptyRef 2562 } 2563 c.joined = false 2564 c.failures = 0 2565 c.heartbeat = time.Now().Unix() 2566 return 2567 } 2568 2569 func (qc *quorumCandidates) GetOnline(peer etf.Pid) *candidate { 2570 c, exist := qc.candidates[peer] 2571 if exist && c.joined == false { 2572 return nil 2573 } 2574 return c 2575 } 2576 func (qc *quorumCandidates) Get(peer etf.Pid) *candidate { 2577 c, exist := qc.candidates[peer] 2578 if exist == false { 2579 return nil 2580 } 2581 return c 2582 } 2583 2584 // List returns list of online peers 2585 func (qc *quorumCandidates) List() []etf.Pid { 2586 type c struct { 2587 pid etf.Pid 2588 serial uint64 2589 } 2590 list := []c{} 2591 for k, v := range qc.candidates { 2592 if v.joined == false { 2593 continue 2594 } 2595 list = append(list, c{pid: k, serial: v.serial}) 2596 } 2597 2598 // sort candidates by serial number in desc order 2599 sort.Slice(list, func(a, b int) bool { return list[a].serial > list[b].serial }) 2600 pids := []etf.Pid{} 2601 for i := range list { 2602 pids = append(pids, list[i].pid) 2603 } 2604 return pids 2605 } 2606 2607 func (qc *quorumCandidates) ListOffline() []etf.Pid { 2608 list := []etf.Pid{} 2609 for pid, c := range qc.candidates { 2610 if c.joined == true { 2611 continue 2612 } 2613 list = append(list, pid) 2614 } 2615 return list 2616 }