github.com/kisexp/xdchain@v0.0.0-20211206025815-490d6b732aa7/raft/handler.go

package raft

import (
	"context"
	"errors"
	"fmt"
	"net"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/coreos/etcd/etcdserver/stats"
	"github.com/coreos/etcd/pkg/fileutil"
	raftTypes "github.com/coreos/etcd/pkg/types"
	etcdRaft "github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/coreos/etcd/rafthttp"
	"github.com/coreos/etcd/snap"
	"github.com/coreos/etcd/wal"
	mapset "github.com/deckarep/golang-set"
	"github.com/syndtr/goleveldb/leveldb"

	"github.com/kisexp/xdchain/core"
	"github.com/kisexp/xdchain/core/types"
	"github.com/kisexp/xdchain/eth/downloader"
	"github.com/kisexp/xdchain/event"
	"github.com/kisexp/xdchain/log"
	"github.com/kisexp/xdchain/p2p"
	"github.com/kisexp/xdchain/p2p/enode"
	"github.com/kisexp/xdchain/p2p/enr"
	"github.com/kisexp/xdchain/rlp"
)

type ProtocolManager struct {
	mu       sync.RWMutex // For protecting concurrent JS access to "local peer" and "remote peer" state
	quitSync chan struct{}
	stopped  bool

	// Static configuration
	joinExisting   bool // Whether to join an existing cluster when a WAL doesn't already exist
	bootstrapNodes []*enode.Node
	raftId         uint16
	raftPort       uint16

	// Local peer state (protected by mu vs concurrent access via JS)
	address       *Address
	role          int    // Role: minter or verifier
	appliedIndex  uint64 // The index of the last-applied raft entry
	snapshotIndex uint64 // The index of the latest snapshot.

	// Remote peer state (protected by mu vs concurrent access via JS)
	leader       uint16
	peers        map[uint16]*Peer
	removedPeers mapset.Set // *Permanently removed* peers

	// P2P transport
	p2pServer *p2p.Server
	useDns    bool

	// Blockchain services
	blockchain *core.BlockChain
	downloader *downloader.Downloader
	minter     *minter

	// Blockchain events
	eventMux      *event.TypeMux
	minedBlockSub *event.TypeMuxSubscription

	// Raft proposal events
	blockProposalC      chan *types.Block      // for mined blocks to raft
	confChangeProposalC chan raftpb.ConfChange // for config changes from js console to raft

	// Raft transport
	unsafeRawNode etcdRaft.Node
	transport     *rafthttp.Transport
	httpstopc     chan struct{}
	httpdonec     chan struct{}

	// Raft snapshotting
	snapshotter *snap.Snapshotter
	snapdir     string
	confState   raftpb.ConfState

	// Raft write-ahead log
	waldir string
	wal    *wal.WAL

	// Storage
	quorumRaftDb *leveldb.DB             // Persistent storage for last-applied raft index
	raftStorage  *etcdRaft.MemoryStorage // Volatile raft storage
}

var errNoLeaderElected = errors.New("no leader is currently elected")

//
// Public interface
//

func NewProtocolManager(raftId uint16, raftPort uint16, blockchain *core.BlockChain, mux *event.TypeMux, bootstrapNodes []*enode.Node, joinExisting bool, raftLogDir string, minter *minter, downloader *downloader.Downloader, useDns bool, p2pServer *p2p.Server) (*ProtocolManager, error) {
	waldir := fmt.Sprintf("%s/raft-wal", raftLogDir)
	snapdir := fmt.Sprintf("%s/raft-snap", raftLogDir)
	quorumRaftDbLoc := fmt.Sprintf("%s/quorum-raft-state", raftLogDir)

	manager := &ProtocolManager{
		bootstrapNodes:      bootstrapNodes,
		peers:               make(map[uint16]*Peer),
		leader:              uint16(etcdRaft.None),
		removedPeers:        mapset.NewSet(),
		joinExisting:        joinExisting,
		blockchain:          blockchain,
		eventMux:            mux,
		blockProposalC:      make(chan *types.Block, 10),
		confChangeProposalC: make(chan raftpb.ConfChange),
		httpstopc:           make(chan struct{}),
		httpdonec:           make(chan struct{}),
		waldir:              waldir,
		snapdir:             snapdir,
		snapshotter:         snap.New(snapdir),
		raftId:              raftId,
		raftPort:            raftPort,
		quitSync:            make(chan struct{}),
		raftStorage:         etcdRaft.NewMemoryStorage(),
		minter:              minter,
		downloader:          downloader,
		useDns:              useDns,
		p2pServer:           p2pServer,
	}

	if db, err := openQuorumRaftDb(quorumRaftDbLoc); err != nil {
		return nil, err
	} else {
		manager.quorumRaftDb = db
	}

	return manager, nil
}

func (pm *ProtocolManager) Start() {
	log.Info("starting raft protocol handler")

	pm.minedBlockSub = pm.eventMux.Subscribe(core.NewMinedBlockEvent{})
	pm.startRaft()
	// update raft peers info to p2p server
	pm.p2pServer.SetCheckPeerInRaft(pm.peerExist)
	go pm.minedBroadcastLoop()
}

func (pm *ProtocolManager) Stop() {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	defer log.Info("raft protocol handler stopped")

	if pm.stopped {
		return
	}

	log.Info("stopping raft protocol handler...")

	for raftId, peer := range pm.peers {
		pm.disconnectFromPeer(raftId, peer)
	}

	pm.minedBlockSub.Unsubscribe()

	if pm.transport != nil {
		pm.transport.Stop()
	}

	close(pm.httpstopc)
	<-pm.httpdonec
	close(pm.quitSync)

	if pm.unsafeRawNode != nil {
		pm.unsafeRawNode.Stop()
	}

	pm.quorumRaftDb.Close()

	pm.p2pServer = nil

	pm.minter.stop()

	pm.stopped = true
}

func (pm *ProtocolManager) NodeInfo() *RaftNodeInfo {
	pm.mu.RLock() // as we read role and peers
	defer pm.mu.RUnlock()

	roleDescription := ""
	if pm.role == minterRole {
		roleDescription = "minter"
	} else if pm.isVerifierNode() {
		roleDescription = "verifier"
	} else if pm.isLearnerNode() {
		roleDescription = "learner"
	}

	peerAddresses := make([]*Address, len(pm.peers))
	peerIdx := 0
	for _, peer := range pm.peers {
		peerAddresses[peerIdx] = peer.address
		peerIdx += 1
	}

	removedPeerIfaces := pm.removedPeers
	removedPeerIds := make([]uint16, removedPeerIfaces.Cardinality())
	i := 0
	for removedIface := range removedPeerIfaces.Iterator().C {
		removedPeerIds[i] = removedIface.(uint16)
		i++
	}

	//
	// NOTE: before exposing any new fields here, make sure that the underlying
	// ProtocolManager members are protected from concurrent access by pm.mu!
	//
	return &RaftNodeInfo{
		ClusterSize:    len(pm.peers) + 1,
		Role:           roleDescription,
		Address:        pm.address,
		PeerAddresses:  peerAddresses,
		RemovedPeerIds: removedPeerIds,
		AppliedIndex:   pm.appliedIndex,
		SnapshotIndex:  pm.snapshotIndex,
	}
}

// There seems to be a very rare race in raft where during `etcdRaft.StartNode`
// it will call back our `Process` method before it's finished returning the
// `raft.Node`, `pm.unsafeRawNode`, to us. This re-entrance through a separate
// thread will cause a nil pointer dereference. To work around this, this
// getter method should be used instead of reading `pm.unsafeRawNode` directly.
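//
// Callers in this file (e.g. Process, WriteMsg and the event loop) therefore
// go through this getter rather than reading pm.unsafeRawNode directly.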
func (pm *ProtocolManager) rawNode() etcdRaft.Node {
	for pm.unsafeRawNode == nil {
		time.Sleep(100 * time.Millisecond)
	}

	return pm.unsafeRawNode
}

func (pm *ProtocolManager) nextRaftId() uint16 {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	maxId := pm.raftId

	for peerId := range pm.peers {
		if maxId < peerId {
			maxId = peerId
		}
	}

	removedPeerIfaces := pm.removedPeers
	for removedIface := range removedPeerIfaces.Iterator().C {
		removedId := removedIface.(uint16)

		if maxId < removedId {
			maxId = removedId
		}
	}

	return maxId + 1
}

func (pm *ProtocolManager) isRaftIdRemoved(id uint16) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	return pm.removedPeers.Contains(id)
}

func (pm *ProtocolManager) isRaftIdUsed(raftId uint16) bool {
	if pm.raftId == raftId || pm.isRaftIdRemoved(raftId) {
		return true
	}

	pm.mu.RLock()
	defer pm.mu.RUnlock()

	return pm.peers[raftId] != nil
}

func (pm *ProtocolManager) isNodeAlreadyInCluster(node *enode.Node) error {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	thisEnode := enode.MustParse(pm.p2pServer.NodeInfo().Enode)
	if thisEnode.EnodeID() == node.EnodeID() {
		return fmt.Errorf("enode is this enode (self): node with this enode has already been added to the cluster: %s", node.ID())
	}

	for _, peer := range pm.peers {
		peerRaftId := peer.address.RaftId
		peerNode := peer.p2pNode

		if peerNode.ID() == node.ID() {
			return fmt.Errorf("node with this enode has already been added to the cluster: %s", node.ID())
		}

		if peerNode.IP().Equal(node.IP()) {
			if peerNode.TCP() == node.TCP() {
				return fmt.Errorf("existing node %v with raft ID %v is already using eth p2p at %v:%v", peerNode.ID(), peerRaftId, node.IP(), node.TCP())
			} else if peer.address.RaftPort == enr.RaftPort(node.RaftPort()) {
				return fmt.Errorf("existing node %v with raft ID %v is already using raft at %v:%v", peerNode.ID(), peerRaftId, node.IP(), node.RaftPort())
			}
		}
	}

	return nil
}

func (pm *ProtocolManager) peerExist(node *enode.Node) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	for _, p := range pm.peers {
		if node.ID() == p.p2pNode.ID() {
			return true
		}
	}
	return false
}

func (pm *ProtocolManager) ProposeNewPeer(enodeURL string, isLearner bool) (uint16, error) {
	if pm.isLearnerNode() {
		return 0, errors.New("learner node can't add peer or learner")
	}
	node, err := enode.ParseV4(enodeURL)
	if err != nil {
		return 0, err
	}

	if !pm.useDns {
		// hostname is not allowed if DNS is not enabled
		if node.Host() != "" {
			return 0, fmt.Errorf("raft must enable dns to use hostname")
		}
		if len(node.IP()) != 4 {
			return 0, fmt.Errorf("expected IPv4 address (with length 4), but got IP of length %v", len(node.IP()))
		}
	}

	if !node.HasRaftPort() {
		return 0, fmt.Errorf("enodeId is missing raftport querystring parameter: %v", enodeURL)
	}

	if err := pm.isNodeAlreadyInCluster(node); err != nil {
		return 0, err
	}

	raftId := pm.nextRaftId()
	address := newAddress(raftId, node.RaftPort(), node, pm.useDns)

	confChangeType := raftpb.ConfChangeAddNode

	if isLearner {
		confChangeType = raftpb.ConfChangeAddLearnerNode
	}

	pm.confChangeProposalC <- raftpb.ConfChange{
		Type:    confChangeType,
		NodeID:  uint64(raftId),
		Context: address.toBytes(),
	}

	return raftId, nil
}

func (pm *ProtocolManager) ProposePeerRemoval(raftId uint16) error {
	if pm.isLearnerNode() && raftId != pm.raftId {
		return errors.New("learner node can't remove other peer")
	}
	pm.confChangeProposalC <- raftpb.ConfChange{
		Type:   raftpb.ConfChangeRemoveNode,
		NodeID: uint64(raftId),
	}
	return nil
}

func (pm *ProtocolManager) PromoteToPeer(raftId uint16) (bool, error) {
	if pm.isLearnerNode() {
		return false, errors.New("learner node can't promote to peer")
	}

	if !pm.isLearner(raftId) {
		return false, fmt.Errorf("%d is not a learner. only learner can be promoted to peer", raftId)
	}

	pm.confChangeProposalC <- raftpb.ConfChange{
		Type:   raftpb.ConfChangeAddNode,
		NodeID: uint64(raftId),
	}
	return true, nil
}

//
// MsgWriter interface (necessary for p2p.Send)
//

func (pm *ProtocolManager) WriteMsg(msg p2p.Msg) error {
	// read *into* buffer
	var buffer = make([]byte, msg.Size)
	msg.Payload.Read(buffer)

	return pm.rawNode().Propose(context.TODO(), buffer)
}

//
// Raft interface
//

func (pm *ProtocolManager) Process(ctx context.Context, m raftpb.Message) error {
	return pm.rawNode().Step(ctx, m)
}

func (pm *ProtocolManager) IsIDRemoved(id uint64) bool {
	return pm.isRaftIdRemoved(uint16(id))
}

func (pm *ProtocolManager) ReportUnreachable(id uint64) {
	log.Info("peer is currently unreachable", "peer id", id)

	pm.rawNode().ReportUnreachable(id)
}

func (pm *ProtocolManager) ReportSnapshot(id uint64, status etcdRaft.SnapshotStatus) {
	if status == etcdRaft.SnapshotFailure {
		log.Info("failed to send snapshot", "raft peer", id)
	} else if status == etcdRaft.SnapshotFinish {
		log.Info("finished sending snapshot", "raft peer", id)
	}

	pm.rawNode().ReportSnapshot(id, status)
}

//
// Private methods
//

func (pm *ProtocolManager) startRaft() {
	if !fileutil.Exist(pm.snapdir) {
		if err := os.Mkdir(pm.snapdir, 0750); err != nil {
			fatalf("cannot create dir for snapshot (%v)", err)
		}
	}
	walExisted := wal.Exist(pm.waldir)
	lastAppliedIndex := pm.loadAppliedIndex()

	id := raftTypes.ID(pm.raftId).String()
	ss := stats.NewServerStats(id, id)

	pm.transport = &rafthttp.Transport{
		ID:          raftTypes.ID(pm.raftId),
		ClusterID:   0x1000,
		Raft:        pm,
		ServerStats: ss,
		LeaderStats: stats.NewLeaderStats(strconv.Itoa(int(pm.raftId))),
		ErrorC:      make(chan error),
	}
	pm.transport.Start()

	// We load the snapshot to connect to prev peers before replaying the WAL,
	// which typically goes further into the future than the snapshot.

	var maybeRaftSnapshot *raftpb.Snapshot

	if walExisted {
		maybeRaftSnapshot = pm.loadSnapshot() // re-establishes peer connections
	}

	loadedWal, entries := pm.replayWAL(maybeRaftSnapshot)
	pm.wal = loadedWal

	if walExisted {

		// If we shut down but didn't manage to flush the state to disk, then it will be the case that we will only sync
		// up to the snapshot. In this case, we can replay the raft entries that we have saved to replay the blocks
		// back into our chain.
		// We output errors but cannot do much if one occurs, since we can't fork to a different
		// chain and all other nodes in the network have confirmed these blocks.
		if maybeRaftSnapshot != nil {
			currentChainHead := pm.blockchain.CurrentBlock().Number()
			for _, entry := range entries {
				if entry.Type == raftpb.EntryNormal {
					var block types.Block
					if err := rlp.DecodeBytes(entry.Data, &block); err != nil {
						log.Error("error decoding block: ", "err", err)
						continue
					}

					if thisBlockHead := pm.blockchain.GetBlockByHash(block.Hash()); thisBlockHead != nil {
						// check if the block already exists in the local chain
						// and the block number is greater than current chain head
						if thisBlockHeadNum := thisBlockHead.Number(); thisBlockHeadNum.Cmp(currentChainHead) > 0 {
							// insert the block only if it's already seen
							blocks := []*types.Block{&block}
							if _, err := pm.blockchain.InsertChain(blocks); err != nil {
								log.Error("error inserting the block into the chain", "number", block.NumberU64(), "hash", block.Hash(), "err", err)
							}
						}
					}
				}
			}
		}

		if hardState, _, err := pm.raftStorage.InitialState(); err != nil {
			panic(fmt.Sprintf("failed to read initial state from raft while restarting: %v", err))
		} else {
			if lastPersistedCommittedIndex := hardState.Commit; lastPersistedCommittedIndex < lastAppliedIndex {
				log.Info("rolling back applied index to last-durably-committed", "last applied index", lastAppliedIndex, "last persisted index", lastPersistedCommittedIndex)

				// Roll back our applied index. See the logic and explanation around
				// the single call to `pm.applyNewChainHead` for more context.
				lastAppliedIndex = lastPersistedCommittedIndex
			}

			// fix raft applied index out of range
			firstIndex, err := pm.raftStorage.FirstIndex()
			if err != nil {
				panic(fmt.Sprintf("failed to read last persisted applied index from raft while restarting: %v", err))
			}
			lastPersistedAppliedIndex := firstIndex - 1
			if lastPersistedAppliedIndex > lastAppliedIndex {
				log.Debug("set lastAppliedIndex to lastPersistedAppliedIndex", "last applied index", lastAppliedIndex, "last persisted applied index", lastPersistedAppliedIndex)

				lastAppliedIndex = lastPersistedAppliedIndex
				pm.advanceAppliedIndex(lastAppliedIndex)
			}
		}
	}

	// NOTE: cockroach sets this to false for now until they've "worked out the
	// bugs"
	enablePreVote := true

	raftConfig := &etcdRaft.Config{
		Applied:       lastAppliedIndex,
		ID:            uint64(pm.raftId),
		ElectionTick:  10, // NOTE: cockroach sets this to 15
		HeartbeatTick: 1,  // NOTE: cockroach sets this to 5
		Storage:       pm.raftStorage,

		// NOTE, from cockroach:
		// "PreVote and CheckQuorum are two ways of achieving the same thing.
		// PreVote is more compatible with quiesced ranges, so we want to switch
		// to it once we've worked out the bugs."
		//
		// TODO: vendor again?
		// PreVote: enablePreVote,
		CheckQuorum: !enablePreVote,

		// MaxSizePerMsg controls how many Raft log entries the leader will send to
		// followers in a single MsgApp.
		MaxSizePerMsg: 4096, // NOTE: in cockroachdb this is 16*1024

		// MaxInflightMsgs controls how many in-flight messages Raft will send to
		// a follower without hearing a response. The total number of Raft log
		// entries is a combination of this setting and MaxSizePerMsg.
		//
		// NOTE: Cockroach's settings (MaxSizePerMsg of 4k and MaxInflightMsgs
		// of 4) provide for up to 64 KB of raft log to be sent without
		// acknowledgement. With an average entry size of 1 KB that translates
		// to ~64 commands that might be executed in the handling of a single
		// etcdraft.Ready operation.
		MaxInflightMsgs: 256, // NOTE: in cockroachdb this is 4
	}

	log.Info("startRaft", "raft ID", raftConfig.ID)

	if walExisted {
		log.Info("remounting an existing raft log; connecting to peers.")

		pm.unsafeRawNode = etcdRaft.RestartNode(raftConfig)
	} else if pm.joinExisting {
		log.Info("newly joining an existing cluster; waiting for connections.")
		pm.unsafeRawNode = etcdRaft.StartNode(raftConfig, nil)
	} else {
		if numPeers := len(pm.bootstrapNodes); numPeers == 0 {
			panic("exiting due to empty raft peers list")
		} else {
			log.Info("starting a new raft log", "initial cluster size of", numPeers)
		}

		raftPeers, peerAddresses, localAddress := pm.makeInitialRaftPeers()

		pm.setLocalAddress(localAddress)

		// We add all peers up-front even though we will see a ConfChangeAddNode
		// for each shortly. This is because raft's ConfState will contain all of
		// these nodes before we see these log entries, and we always want our
		// snapshots to have all addresses for each of the nodes in the ConfState.
		for _, peerAddress := range peerAddresses {
			pm.addPeer(peerAddress)
		}
		pm.unsafeRawNode = etcdRaft.StartNode(raftConfig, raftPeers)
	}
	log.Info("raft node started")
	go pm.serveRaft()
	go pm.serveLocalProposals()
	go pm.eventLoop()
	go pm.handleRoleChange(pm.rawNode().RoleChan().Out())
}

func (pm *ProtocolManager) setLocalAddress(addr *Address) {
	pm.mu.Lock()
	pm.address = addr
	pm.mu.Unlock()
	// By setting `URLs` on the raft transport, we advertise our URL (in an HTTP
	// header) to any recipient. This is necessary for a newcomer to the cluster
	// to be able to accept a snapshot from us to bootstrap them.
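	//
	// Illustration only (hypothetical values): an Address with Hostname
	// "10.0.0.5" and RaftPort 50400 is rendered by pm.raftUrl as
	// "http://10.0.0.5:50400", and that is the URL advertised here.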
	if urls, err := raftTypes.NewURLs([]string{pm.raftUrl(addr)}); err == nil {
		pm.transport.URLs = urls
	} else {
		panic(fmt.Sprintf("error: could not create URL from local address: %v", addr))
	}
}

func (pm *ProtocolManager) serveRaft() {
	urlString := fmt.Sprintf("http://0.0.0.0:%d", pm.raftPort)
	url, err := url.Parse(urlString)
	if err != nil {
		fatalf("Failed parsing URL (%v)", err)
	}

	listener, err := newStoppableListener(url.Host, pm.httpstopc)
	if err != nil {
		fatalf("Failed to listen rafthttp (%v)", err)
	}
	err = (&http.Server{Handler: pm.transport.Handler()}).Serve(listener)
	select {
	case <-pm.httpstopc:
	default:
		fatalf("Failed to serve rafthttp (%v)", err)
	}
	close(pm.httpdonec)
}

func (pm *ProtocolManager) isLearner(rid uint16) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()
	for _, n := range pm.confState.Learners {
		if uint16(n) == rid {
			return true
		}
	}
	return false
}

func (pm *ProtocolManager) isLearnerNode() bool {
	return pm.isLearner(pm.raftId)
}

func (pm *ProtocolManager) isVerifierNode() bool {
	return pm.isVerifier(pm.raftId)
}

func (pm *ProtocolManager) isVerifier(rid uint16) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()
	for _, n := range pm.confState.Nodes {
		if uint16(n) == rid {
			return true
		}
	}
	return false
}

func (pm *ProtocolManager) handleRoleChange(roleC <-chan interface{}) {
	for {
		select {
		case role := <-roleC:
			intRole, ok := role.(int)

			if !ok {
				panic("Couldn't cast role to int")
			}
			if intRole == minterRole {
				log.EmitCheckpoint(log.BecameMinter)
				pm.minter.start()
			} else { // verifier
				if pm.isVerifierNode() {
					log.EmitCheckpoint(log.BecameVerifier)
				} else {
					log.EmitCheckpoint(log.BecameLearner)
				}
				pm.minter.stop()
			}

			pm.mu.Lock()
			pm.role = intRole
			pm.mu.Unlock()
		case <-pm.quitSync:
			return
		}
	}
}

func (pm *ProtocolManager) minedBroadcastLoop() {
	for obj := range pm.minedBlockSub.Chan() {
		switch ev := obj.Data.(type) {
		case core.NewMinedBlockEvent:
			select {
			case pm.blockProposalC <- ev.Block:
			case <-pm.quitSync:
				return
			}
		}
	}
}

// Serve two channels to handle new blocks and raft configuration changes originating locally.
func (pm *ProtocolManager) serveLocalProposals() {
	//
	// TODO: does it matter that this will restart from 0 whenever we restart a cluster?
	//
	var confChangeCount uint64

	for {
		select {
		case block, ok := <-pm.blockProposalC:
			if !ok {
				log.Info("error: read from blockProposalC failed")
				return
			}

			size, r, err := rlp.EncodeToReader(block)
			if err != nil {
				panic(fmt.Sprintf("error: failed to send RLP-encoded block: %s", err.Error()))
			}
			var buffer = make([]byte, uint32(size))
			r.Read(buffer)

			// blocks until accepted by the raft state machine
			pm.rawNode().Propose(context.TODO(), buffer)
		case cc, ok := <-pm.confChangeProposalC:
			if !ok {
				log.Info("error: read from confChangeProposalC failed")
				return
			}

			confChangeCount++
			cc.ID = confChangeCount
			pm.rawNode().ProposeConfChange(context.TODO(), cc)
		case <-pm.quitSync:
			return
		}
	}
}

func (pm *ProtocolManager) entriesToApply(allEntries []raftpb.Entry) (entriesToApply []raftpb.Entry) {
	if len(allEntries) == 0 {
		return
	}

	first := allEntries[0].Index
	pm.mu.RLock()
	lastApplied := pm.appliedIndex
	pm.mu.RUnlock()

	if first > lastApplied+1 {
		fatalf("first index of committed entry[%d] should <= appliedIndex[%d] + 1", first, lastApplied)
	}

	firstToApply := lastApplied - first + 1

	if firstToApply < uint64(len(allEntries)) {
		entriesToApply = allEntries[firstToApply:]
	}
	return
}

func (pm *ProtocolManager) raftUrl(address *Address) string {
	if parsedIp := net.ParseIP(address.Hostname); parsedIp != nil {
		if ipv4 := parsedIp.To4(); ipv4 != nil {
			// this is an IPv4 address
			return fmt.Sprintf("http://%s:%d", ipv4, address.RaftPort)
		}
		// this is an IPv6 address
		return fmt.Sprintf("http://[%s]:%d", parsedIp, address.RaftPort)
	}
	return fmt.Sprintf("http://%s:%d", address.Hostname, address.RaftPort)
}

func (pm *ProtocolManager) addPeer(address *Address) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	raftId := address.RaftId

	// Quorum - RAFT - derive pubkey from nodeId
	pubKey, err := enode.HexPubkey(address.NodeId.String())
	if err != nil {
		log.Error("error decoding pub key from enodeId", "enodeId", address.NodeId.String(), "err", err)
		panic(err)
	}

	// Add P2P connection:
	p2pNode := enode.NewV4Hostname(pubKey, address.Hostname, int(address.P2pPort), 0, int(address.RaftPort))
	pm.p2pServer.AddPeer(p2pNode)

	// Add raft transport connection:
	pm.transport.AddPeer(raftTypes.ID(raftId), []string{pm.raftUrl(address)})
	pm.peers[raftId] = &Peer{address, p2pNode}
}

func (pm *ProtocolManager) disconnectFromPeer(raftId uint16, peer *Peer) {
	pm.p2pServer.RemovePeer(peer.p2pNode)
	pm.transport.RemovePeer(raftTypes.ID(raftId))
}

func (pm *ProtocolManager) removePeer(raftId uint16) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	if peer := pm.peers[raftId]; peer != nil {
		pm.disconnectFromPeer(raftId, peer)

		delete(pm.peers, raftId)
	}

	// This is only necessary sometimes, but it's idempotent. Also, we *always*
	// do this, and not just when there's still a peer in the map, because we
	// need to do it for our *own* raft ID before we get booted from the cluster
	// so that snapshots are identical on all nodes. It's important for a booted
	// node to have a snapshot identical to every other node because that node
	// can potentially re-enter the cluster with a new raft ID.
	pm.removedPeers.Add(raftId)
}

func (pm *ProtocolManager) eventLoop() {
	ticker := time.NewTicker(tickerMS * time.Millisecond)
	defer ticker.Stop()
	defer pm.wal.Close()

	exitAfterApplying := false

	for {
		select {
		case <-ticker.C:
			pm.rawNode().Tick()

		// when the node is first ready it gives us entries to commit and messages
		// to immediately publish
		case rd := <-pm.rawNode().Ready():
			pm.wal.Save(rd.HardState, rd.Entries)

			if rd.SoftState != nil {
				pm.updateLeader(rd.SoftState.Lead)
			}

			if snap := rd.Snapshot; !etcdRaft.IsEmptySnap(snap) {
				pm.saveRaftSnapshot(snap)
				pm.applyRaftSnapshot(snap)
				pm.advanceAppliedIndex(snap.Metadata.Index)
			}

			// 1: Write HardState, Entries, and Snapshot to persistent storage if they
			// are not empty.
			pm.raftStorage.Append(rd.Entries)

			// 2: Send all Messages to the nodes named in the To field.
			pm.transport.Send(rd.Messages)

			// 3: Apply Snapshot (if any) and CommittedEntries to the state machine.
			for _, entry := range pm.entriesToApply(rd.CommittedEntries) {
				switch entry.Type {
				case raftpb.EntryNormal:
					if len(entry.Data) == 0 {
						break
					}
					var block types.Block
					err := rlp.DecodeBytes(entry.Data, &block)
					if err != nil {
						log.Error("error decoding block", "err", err)
					}

					if pm.blockchain.HasBlock(block.Hash(), block.NumberU64()) {
						// This can happen:
						//
						// if (1) we crashed after applying this block to the chain, but
						//        before writing appliedIndex to LDB.
						// or (2) we crashed in a scenario where we applied further than
						//        raft *durably persisted* its committed index (see
						//        https://github.com/coreos/etcd/pull/7899). In this
						//        scenario, when the node comes back up, we will re-apply
						//        a few entries.

						headBlockHash := pm.blockchain.CurrentBlock().Hash()
						log.Warn("not applying already-applied block", "block hash", block.Hash(), "parent", block.ParentHash(), "head", headBlockHash)
					} else {
						if !pm.applyNewChainHead(&block) {
							// return false only if insert chain is interrupted
							// stop eventloop
							return
						}
					}

				case raftpb.EntryConfChange:
					var cc raftpb.ConfChange
					cc.Unmarshal(entry.Data)
					raftId := uint16(cc.NodeID)

					pm.confState = *pm.rawNode().ApplyConfChange(cc)
					log.Info("confChange", "confState", pm.confState)
					forceSnapshot := false

					switch cc.Type {
					case raftpb.ConfChangeAddNode, raftpb.ConfChangeAddLearnerNode:
						confChangeTypeName := raftpb.ConfChangeType_name[int32(cc.Type)]
						log.Info(confChangeTypeName, "raft id", raftId)
						if pm.isRaftIdRemoved(raftId) {
							log.Info("ignoring "+confChangeTypeName+" for permanently-removed peer", "raft id", raftId)
						} else if pm.isRaftIdUsed(raftId) && raftId <= uint16(len(pm.bootstrapNodes)) {
							// See initial cluster logic in startRaft() for more information.
							log.Info("ignoring expected "+confChangeTypeName+" for initial peer", "raft id", raftId)
							// We need a snapshot to exist to reconnect to peers on start-up after a crash.
							forceSnapshot = true
						} else { // add peer or add learner or promote learner to voter
							forceSnapshot = true
							// if raft id exists as peer, you are promoting learner to peer
							if pm.isRaftIdUsed(raftId) {
								log.Info("promote learner node to voter node", "raft id", raftId)
							} else {
								// if raft id does not exist, you are adding peer/learner
								log.Info("add peer/learner -> "+confChangeTypeName, "raft id", raftId)
								pm.addPeer(bytesToAddress(cc.Context))
							}
						}

					case raftpb.ConfChangeRemoveNode:
						if pm.isRaftIdRemoved(raftId) {
							log.Info("ignoring ConfChangeRemoveNode for already-removed peer", "raft id", raftId)
						} else {
							log.Info("removing peer due to ConfChangeRemoveNode", "raft id", raftId)

							forceSnapshot = true

							if raftId == pm.raftId {
								exitAfterApplying = true
							}

							pm.removePeer(raftId)
						}

					case raftpb.ConfChangeUpdateNode:
						// NOTE: remember to forceSnapshot in this case, if we add support
						// for this.
						fatalf("not yet handled: ConfChangeUpdateNode")
					}

					if forceSnapshot {
						// We force a snapshot here to persist our updated confState, so we
						// know our fellow cluster members when we come back online.
						//
						// It is critical here to snapshot *before* writing our applied
						// index in LevelDB, otherwise a crash while/before snapshotting
						// (after advancing our applied index) would result in the loss of a
						// cluster member upon restart: we would re-mount with an old
						// ConfState.
						pm.triggerSnapshot(entry.Index)
					}
				}

				pm.advanceAppliedIndex(entry.Index)
			}

			pm.maybeTriggerSnapshot()

			if exitAfterApplying {
				log.Warn("permanently removing self from the cluster")
				pm.Stop()
				log.Warn("permanently exited the cluster")

				return
			}

			// 4: Call Node.Advance() to signal readiness for the next batch of
			// updates.
			pm.rawNode().Advance()

		case <-pm.quitSync:
			return
		}
	}
}

func (pm *ProtocolManager) makeInitialRaftPeers() (raftPeers []etcdRaft.Peer, peerAddresses []*Address, localAddress *Address) {
	initialNodes := pm.bootstrapNodes
	raftPeers = make([]etcdRaft.Peer, len(initialNodes))  // Entire cluster
	peerAddresses = make([]*Address, len(initialNodes)-1) // Cluster without *this* node

	peersSeen := 0
	for i, node := range initialNodes {
		raftId := uint16(i + 1)
		// We initially get the raftPort from the enode ID's query string. As an alternative, we can move away from
		// requiring the use of static peers for the initial set, and load them from e.g. another JSON file which
		// contains pairs of enodes and raft ports, or we can get this initial peer list from commandline flags.
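		//
		// Illustration only (hypothetical key and ports): such an enode URL looks like
		//   enode://<128-hex-char-pubkey>@127.0.0.1:21000?raftport=50401
		// where the raftport query parameter is what node.RaftPort() returns.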
		address := newAddress(raftId, node.RaftPort(), node, pm.useDns)
		raftPeers[i] = etcdRaft.Peer{
			ID:      uint64(raftId),
			Context: address.toBytes(),
		}

		if raftId == pm.raftId {
			localAddress = address
		} else {
			peerAddresses[peersSeen] = address
			peersSeen += 1
		}
	}

	return
}

func blockExtendsChain(block *types.Block, chain *core.BlockChain) bool {
	return block.ParentHash() == chain.CurrentBlock().Hash()
}

func (pm *ProtocolManager) applyNewChainHead(block *types.Block) bool {
	if !blockExtendsChain(block, pm.blockchain) {
		headBlock := pm.blockchain.CurrentBlock()

		log.Info("Non-extending block", "block", block.Hash(), "parent", block.ParentHash(), "head", headBlock.Hash())

		pm.minter.invalidRaftOrderingChan <- InvalidRaftOrdering{headBlock: headBlock, invalidBlock: block}
	} else {
		if existingBlock := pm.blockchain.GetBlockByHash(block.Hash()); nil == existingBlock {
			if err := pm.blockchain.Validator().ValidateBody(block); err != nil {
				panic(fmt.Sprintf("failed to validate block %x (%v)", block.Hash(), err))
			}
		}

		for _, tx := range block.Transactions() {
			log.EmitCheckpoint(log.TxAccepted, "tx", tx.Hash().Hex())
		}

		_, err := pm.blockchain.InsertChain([]*types.Block{block})

		if err != nil {
			if err == core.ErrAbortBlocksProcessing {
				log.Error(fmt.Sprintf("failed to extend chain: %s", err.Error()))
				return false
			}
			panic(fmt.Sprintf("failed to extend chain: %s", err.Error()))
		}

		log.EmitCheckpoint(log.BlockCreated, "block", fmt.Sprintf("%x", block.Hash()))
	}
	return true
}

// Sets new appliedIndex in-memory, *and* writes this appliedIndex to LevelDB.
func (pm *ProtocolManager) advanceAppliedIndex(index uint64) {
	pm.writeAppliedIndex(index)

	pm.mu.Lock()
	pm.appliedIndex = index
	pm.mu.Unlock()
}

func (pm *ProtocolManager) updateLeader(leader uint64) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	pm.leader = uint16(leader)
}

// The Address for the current leader, or an error if no leader is elected.
func (pm *ProtocolManager) LeaderAddress() (*Address, error) {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	if minterRole == pm.role {
		return pm.address, nil
	} else if l, ok := pm.peers[pm.leader]; ok {
		return l.address, nil
	}
	// We expect to reach this if pm.leader is 0, which is how etcd denotes the lack of a leader.
	return nil, errNoLeaderElected
}

// Returns the raft id for a given enodeId
func (pm *ProtocolManager) FetchRaftId(enodeId string) (uint16, error) {
	node, err := enode.ParseV4(enodeId)
	if err != nil {
		return 0, err
	}
	for raftId, peer := range pm.peers {
		if peer.p2pNode.ID() == node.ID() {
			return raftId, nil
		}
	}
	return 0, fmt.Errorf("node not found in the cluster: %v", enodeId)
}
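
// Illustrative sketch only (not part of the original file): typical use of the
// public interface above, assuming the caller has already wired up the
// blockchain, event mux, minter, downloader and p2p server elsewhere in the node:
//
//	pm, err := NewProtocolManager(raftId, raftPort, blockchain, mux, bootstrapNodes,
//		joinExisting, raftLogDir, minter, downloader, useDns, p2pServer)
//	if err != nil {
//		// handle error
//	}
//	pm.Start()                                    // starts raft, proposal serving and the event loop
//	newId, err := pm.ProposeNewPeer(enodeURL, false) // false proposes a voting peer, true a learner
//	...
//	pm.Stop()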