github.com/hyperion-hyn/go-ethereum@v2.4.0+incompatible/raft/handler.go

package raft

import (
	"context"
	"errors"
	"fmt"
	"net"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/coreos/etcd/etcdserver/stats"
	"github.com/coreos/etcd/pkg/fileutil"
	raftTypes "github.com/coreos/etcd/pkg/types"
	etcdRaft "github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/coreos/etcd/rafthttp"
	"github.com/coreos/etcd/snap"
	"github.com/coreos/etcd/wal"
	mapset "github.com/deckarep/golang-set"
	"github.com/syndtr/goleveldb/leveldb"

	"github.com/ethereum/go-ethereum/core"
	"github.com/ethereum/go-ethereum/core/types"
	"github.com/ethereum/go-ethereum/eth/downloader"
	"github.com/ethereum/go-ethereum/event"
	"github.com/ethereum/go-ethereum/log"
	"github.com/ethereum/go-ethereum/p2p"
	"github.com/ethereum/go-ethereum/p2p/enode"
	"github.com/ethereum/go-ethereum/p2p/enr"
	"github.com/ethereum/go-ethereum/rlp"
)

type ProtocolManager struct {
	mu       sync.RWMutex // For protecting concurrent JS access to "local peer" and "remote peer" state
	quitSync chan struct{}
	stopped  bool

	// Static configuration
	joinExisting   bool // Whether to join an existing cluster when a WAL doesn't already exist
	bootstrapNodes []*enode.Node
	raftId         uint16
	raftPort       uint16

	// Local peer state (protected by mu vs concurrent access via JS)
	address       *Address
	role          int    // Role: minter or verifier
	appliedIndex  uint64 // The index of the last-applied raft entry
	snapshotIndex uint64 // The index of the latest snapshot.

	// Remote peer state (protected by mu vs concurrent access via JS)
	leader       uint16
	peers        map[uint16]*Peer
	removedPeers mapset.Set // *Permanently removed* peers

	// P2P transport
	p2pServer *p2p.Server // Initialized in start()
	useDns    bool

	// Blockchain services
	blockchain *core.BlockChain
	downloader *downloader.Downloader
	minter     *minter

	// Blockchain events
	eventMux      *event.TypeMux
	minedBlockSub *event.TypeMuxSubscription

	// Raft proposal events
	blockProposalC      chan *types.Block      // for mined blocks to raft
	confChangeProposalC chan raftpb.ConfChange // for config changes from js console to raft

	// Raft transport
	unsafeRawNode etcdRaft.Node
	transport     *rafthttp.Transport
	httpstopc     chan struct{}
	httpdonec     chan struct{}

	// Raft snapshotting
	snapshotter *snap.Snapshotter
	snapdir     string
	confState   raftpb.ConfState

	// Raft write-ahead log
	waldir string
	wal    *wal.WAL

	// Storage
	quorumRaftDb *leveldb.DB             // Persistent storage for last-applied raft index
	raftStorage  *etcdRaft.MemoryStorage // Volatile raft storage
}

var errNoLeaderElected = errors.New("no leader is currently elected")

//
// Public interface
//
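// For orientation only: a rough sketch of how a node wires up and runs this
// protocol manager. The surrounding service values (blockchain, mux, minter,
// downloader, p2pServer, and the various flags) are assumed to be supplied by
// the caller's node-setup code, which lives outside this file.
//
//	pm, err := NewProtocolManager(raftId, raftPort, blockchain, mux, bootstrapNodes,
//		joinExisting, datadir, minter, downloader, useDns)
//	if err != nil {
//		// handle error
//	}
//	pm.Start(p2pServer) // starts raft, the transport, and the broadcast loop
//	defer pm.Stop()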
func NewProtocolManager(raftId uint16, raftPort uint16, blockchain *core.BlockChain, mux *event.TypeMux, bootstrapNodes []*enode.Node, joinExisting bool, datadir string, minter *minter, downloader *downloader.Downloader, useDns bool) (*ProtocolManager, error) {
	waldir := fmt.Sprintf("%s/raft-wal", datadir)
	snapdir := fmt.Sprintf("%s/raft-snap", datadir)
	quorumRaftDbLoc := fmt.Sprintf("%s/quorum-raft-state", datadir)

	manager := &ProtocolManager{
		bootstrapNodes:      bootstrapNodes,
		peers:               make(map[uint16]*Peer),
		leader:              uint16(etcdRaft.None),
		removedPeers:        mapset.NewSet(),
		joinExisting:        joinExisting,
		blockchain:          blockchain,
		eventMux:            mux,
		blockProposalC:      make(chan *types.Block, 10),
		confChangeProposalC: make(chan raftpb.ConfChange),
		httpstopc:           make(chan struct{}),
		httpdonec:           make(chan struct{}),
		waldir:              waldir,
		snapdir:             snapdir,
		snapshotter:         snap.New(snapdir),
		raftId:              raftId,
		raftPort:            raftPort,
		quitSync:            make(chan struct{}),
		raftStorage:         etcdRaft.NewMemoryStorage(),
		minter:              minter,
		downloader:          downloader,
		useDns:              useDns,
	}

	if db, err := openQuorumRaftDb(quorumRaftDbLoc); err != nil {
		return nil, err
	} else {
		manager.quorumRaftDb = db
	}

	return manager, nil
}

func (pm *ProtocolManager) Start(p2pServer *p2p.Server) {
	log.Info("starting raft protocol handler")

	pm.p2pServer = p2pServer
	pm.minedBlockSub = pm.eventMux.Subscribe(core.NewMinedBlockEvent{})
	pm.startRaft()
	// update raft peers info to p2p server
	pm.p2pServer.SetCheckPeerInRaft(pm.peerExist)
	go pm.minedBroadcastLoop()
}

func (pm *ProtocolManager) Stop() {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	defer log.Info("raft protocol handler stopped")

	if pm.stopped {
		return
	}

	log.Info("stopping raft protocol handler...")

	for raftId, peer := range pm.peers {
		pm.disconnectFromPeer(raftId, peer)
	}

	pm.minedBlockSub.Unsubscribe()

	if pm.transport != nil {
		pm.transport.Stop()
	}

	close(pm.httpstopc)
	<-pm.httpdonec
	close(pm.quitSync)

	if pm.unsafeRawNode != nil {
		pm.unsafeRawNode.Stop()
	}

	pm.quorumRaftDb.Close()

	pm.p2pServer = nil

	pm.minter.stop()

	pm.stopped = true
}

func (pm *ProtocolManager) NodeInfo() *RaftNodeInfo {
	pm.mu.RLock() // as we read role and peers
	defer pm.mu.RUnlock()

	roleDescription := ""
	if pm.role == minterRole {
		roleDescription = "minter"
	} else if pm.isVerifierNode() {
		roleDescription = "verifier"
	} else if pm.isLearnerNode() {
		roleDescription = "learner"
	}

	peerAddresses := make([]*Address, len(pm.peers))
	peerIdx := 0
	for _, peer := range pm.peers {
		peerAddresses[peerIdx] = peer.address
		peerIdx += 1
	}

	removedPeerIfaces := pm.removedPeers
	removedPeerIds := make([]uint16, removedPeerIfaces.Cardinality())
	i := 0
	for removedIface := range removedPeerIfaces.Iterator().C {
		removedPeerIds[i] = removedIface.(uint16)
		i++
	}

	//
	// NOTE: before exposing any new fields here, make sure that the underlying
	// ProtocolManager members are protected from concurrent access by pm.mu!
	//
	return &RaftNodeInfo{
		ClusterSize:    len(pm.peers) + 1,
		Role:           roleDescription,
		Address:        pm.address,
		PeerAddresses:  peerAddresses,
		RemovedPeerIds: removedPeerIds,
		AppliedIndex:   pm.appliedIndex,
		SnapshotIndex:  pm.snapshotIndex,
	}
}
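
// For illustration only: on a healthy three-node cluster, the minter's
// NodeInfo would come back roughly like the value below. The concrete numbers
// and addresses are hypothetical; the field set mirrors the literal returned
// above.
//
//	&RaftNodeInfo{
//		ClusterSize:    3,
//		Role:           "minter",
//		Address:        <this node's *Address>,
//		PeerAddresses:  <the two remote *Address values>,
//		RemovedPeerIds: []uint16{},
//		AppliedIndex:   1042,
//		SnapshotIndex:  1000,
//	}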

// There seems to be a very rare race in raft where during `etcdRaft.StartNode`
// it will call back our `Process` method before it's finished returning the
// `raft.Node`, `pm.unsafeRawNode`, to us. This re-entrance through a separate
// thread will cause a nil pointer dereference. To work around this, this
// getter method should be used instead of reading `pm.unsafeRawNode` directly.
func (pm *ProtocolManager) rawNode() etcdRaft.Node {
	for pm.unsafeRawNode == nil {
		time.Sleep(100 * time.Millisecond)
	}

	return pm.unsafeRawNode
}

func (pm *ProtocolManager) nextRaftId() uint16 {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	maxId := pm.raftId

	for peerId := range pm.peers {
		if maxId < peerId {
			maxId = peerId
		}
	}

	removedPeerIfaces := pm.removedPeers
	for removedIface := range removedPeerIfaces.Iterator().C {
		removedId := removedIface.(uint16)

		if maxId < removedId {
			maxId = removedId
		}
	}

	return maxId + 1
}

func (pm *ProtocolManager) isRaftIdRemoved(id uint16) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	return pm.removedPeers.Contains(id)
}

func (pm *ProtocolManager) isRaftIdUsed(raftId uint16) bool {
	if pm.raftId == raftId || pm.isRaftIdRemoved(raftId) {
		return true
	}

	pm.mu.RLock()
	defer pm.mu.RUnlock()

	return pm.peers[raftId] != nil
}

func (pm *ProtocolManager) isNodeAlreadyInCluster(node *enode.Node) error {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	for _, peer := range pm.peers {
		peerRaftId := peer.address.RaftId
		peerNode := peer.p2pNode

		if peerNode.ID() == node.ID() {
			return fmt.Errorf("node with this enode has already been added to the cluster: %s", node.ID())
		}

		if peerNode.IP().Equal(node.IP()) {
			if peerNode.TCP() == node.TCP() {
				return fmt.Errorf("existing node %v with raft ID %v is already using eth p2p at %v:%v", peerNode.ID(), peerRaftId, node.IP(), node.TCP())
			} else if peer.address.RaftPort == enr.RaftPort(node.RaftPort()) {
				return fmt.Errorf("existing node %v with raft ID %v is already using raft at %v:%v", peerNode.ID(), peerRaftId, node.IP(), node.RaftPort())
			}
		}
	}

	return nil
}

func (pm *ProtocolManager) peerExist(node *enode.Node) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	for _, p := range pm.peers {
		if node.ID() == p.p2pNode.ID() {
			return true
		}
	}
	return false
}

func (pm *ProtocolManager) ProposeNewPeer(enodeId string, isLearner bool) (uint16, error) {
	if pm.isLearnerNode() {
		return 0, errors.New("learner node can't add peer or learner")
	}
	parsedUrl, _ := url.Parse(enodeId)
	node, err := enode.ParseV4(enodeId)
	if err != nil {
		return 0, err
	}

	// use the hostname instead of the IP, since if DNS is not enabled, the hostname should *be* the IP
	ip := net.ParseIP(parsedUrl.Hostname())
	if !pm.useDns && (len(ip.To4()) != 4) {
		return 0, fmt.Errorf("expected IPv4 address (with length 4), but got IP of length %v", len(node.IP()))
	}

	if !node.HasRaftPort() {
		return 0, fmt.Errorf("enodeId is missing raftport querystring parameter: %v", enodeId)
	}

	if err := pm.isNodeAlreadyInCluster(node); err != nil {
		return 0, err
	}

	raftId := pm.nextRaftId()
	address := newAddress(raftId, node.RaftPort(), node, pm.useDns)

	confChangeType := raftpb.ConfChangeAddNode

	if isLearner {
		confChangeType = raftpb.ConfChangeAddLearnerNode
	}

	pm.confChangeProposalC <- raftpb.ConfChange{
		Type:    confChangeType,
		NodeID:  uint64(raftId),
		Context: address.toBytes(pm.useDns),
	}

	return raftId, nil
}
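
// For reference, the enode URLs handed to ProposeNewPeer above (and the ones
// used for the bootstrap nodes) are expected to carry the raft port as a
// "raftport" query-string parameter; node.HasRaftPort() reflects its presence.
// An illustrative URL with hypothetical values:
//
//	enode://<128-hex-char-node-id>@127.0.0.1:21000?discport=0&raftport=50401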

func (pm *ProtocolManager) ProposePeerRemoval(raftId uint16) error {
	if pm.isLearnerNode() && raftId != pm.raftId {
		return errors.New("learner node can't remove other peer")
	}
	pm.confChangeProposalC <- raftpb.ConfChange{
		Type:   raftpb.ConfChangeRemoveNode,
		NodeID: uint64(raftId),
	}
	return nil
}

func (pm *ProtocolManager) PromoteToPeer(raftId uint16) (bool, error) {
	if pm.isLearnerNode() {
		return false, errors.New("learner node can't promote to peer")
	}

	if !pm.isLearner(raftId) {
		return false, fmt.Errorf("%d is not a learner. only learner can be promoted to peer", raftId)
	}

	pm.confChangeProposalC <- raftpb.ConfChange{
		Type:   raftpb.ConfChangeAddNode,
		NodeID: uint64(raftId),
	}
	return true, nil
}

//
// MsgWriter interface (necessary for p2p.Send)
//

func (pm *ProtocolManager) WriteMsg(msg p2p.Msg) error {
	// read *into* buffer
	var buffer = make([]byte, msg.Size)
	msg.Payload.Read(buffer)

	return pm.rawNode().Propose(context.TODO(), buffer)
}

//
// Raft interface
//

func (pm *ProtocolManager) Process(ctx context.Context, m raftpb.Message) error {
	return pm.rawNode().Step(ctx, m)
}

func (pm *ProtocolManager) IsIDRemoved(id uint64) bool {
	return pm.isRaftIdRemoved(uint16(id))
}

func (pm *ProtocolManager) ReportUnreachable(id uint64) {
	log.Info("peer is currently unreachable", "peer id", id)

	pm.rawNode().ReportUnreachable(id)
}

func (pm *ProtocolManager) ReportSnapshot(id uint64, status etcdRaft.SnapshotStatus) {
	if status == etcdRaft.SnapshotFailure {
		log.Info("failed to send snapshot", "raft peer", id)
	} else if status == etcdRaft.SnapshotFinish {
		log.Info("finished sending snapshot", "raft peer", id)
	}

	pm.rawNode().ReportSnapshot(id, status)
}
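
// The four methods above (Process, IsIDRemoved, ReportUnreachable,
// ReportSnapshot) are what lets startRaft hand this ProtocolManager to the
// rafthttp transport as `Raft: pm`. If one wanted to make that contract
// explicit, a compile-time assertion along these lines would do it (shown
// here only as a sketch; it is not part of this file):
//
//	var _ rafthttp.Raft = (*ProtocolManager)(nil)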

//
// Private methods
//

func (pm *ProtocolManager) startRaft() {
	if !fileutil.Exist(pm.snapdir) {
		if err := os.Mkdir(pm.snapdir, 0750); err != nil {
			fatalf("cannot create dir for snapshot (%v)", err)
		}
	}
	walExisted := wal.Exist(pm.waldir)
	lastAppliedIndex := pm.loadAppliedIndex()

	id := raftTypes.ID(pm.raftId).String()
	ss := stats.NewServerStats(id, id)

	pm.transport = &rafthttp.Transport{
		ID:          raftTypes.ID(pm.raftId),
		ClusterID:   0x1000,
		Raft:        pm,
		ServerStats: ss,
		LeaderStats: stats.NewLeaderStats(strconv.Itoa(int(pm.raftId))),
		ErrorC:      make(chan error),
	}
	pm.transport.Start()

	// We load the snapshot to connect to prev peers before replaying the WAL,
	// which typically goes further into the future than the snapshot.

	var maybeRaftSnapshot *raftpb.Snapshot

	if walExisted {
		maybeRaftSnapshot = pm.loadSnapshot() // re-establishes peer connections
	}

	loadedWal, entries := pm.replayWAL(maybeRaftSnapshot)
	pm.wal = loadedWal

	if walExisted {

		// If we shut down but didn't manage to flush the state to disk, then it will be the case that we will only sync
		// up to the snapshot. In this case, we can replay the raft entries that we have saved to replay the blocks
		// back into our chain. We output errors but cannot do much if one occurs, since we can't fork to a different
		// chain and all other nodes in the network have confirmed these blocks.
		if maybeRaftSnapshot != nil {
			currentChainHead := pm.blockchain.CurrentBlock().Number()
			for _, entry := range entries {
				if entry.Type == raftpb.EntryNormal {
					var block types.Block
					if err := rlp.DecodeBytes(entry.Data, &block); err != nil {
						log.Error("error decoding block: ", "err", err)
						continue
					}

					if thisBlockHead := pm.blockchain.GetBlockByHash(block.Hash()); thisBlockHead != nil {
						// check if the block already exists in the local chain
						// and the block number is greater than the current chain head
						if thisBlockHeadNum := thisBlockHead.Number(); thisBlockHeadNum.Cmp(currentChainHead) > 0 {
							// insert the block only if it's already seen
							blocks := []*types.Block{&block}
							if _, err := pm.blockchain.InsertChain(blocks); err != nil {
								log.Error("error inserting the block into the chain", "number", block.NumberU64(), "hash", block.Hash(), "err", err)
							}
						}
					}
				}
			}
		}

		if hardState, _, err := pm.raftStorage.InitialState(); err != nil {
			panic(fmt.Sprintf("failed to read initial state from raft while restarting: %v", err))
		} else {
			if lastPersistedCommittedIndex := hardState.Commit; lastPersistedCommittedIndex < lastAppliedIndex {
				log.Info("rolling back applied index to last-durably-committed", "last applied index", lastAppliedIndex, "last persisted index", lastPersistedCommittedIndex)

				// Roll back our applied index. See the logic and explanation around
				// the single call to `pm.applyNewChainHead` for more context.
				lastAppliedIndex = lastPersistedCommittedIndex
			}

			// fix raft applied index out of range
			firstIndex, err := pm.raftStorage.FirstIndex()
			if err != nil {
				panic(fmt.Sprintf("failed to read last persisted applied index from raft while restarting: %v", err))
			}
			lastPersistedAppliedIndex := firstIndex - 1
			if lastPersistedAppliedIndex > lastAppliedIndex {
				log.Debug("set lastAppliedIndex to lastPersistedAppliedIndex", "last applied index", lastAppliedIndex, "last persisted applied index", lastPersistedAppliedIndex)

				lastAppliedIndex = lastPersistedAppliedIndex
				pm.advanceAppliedIndex(lastAppliedIndex)
			}
		}
	}

	// NOTE: cockroach sets this to false for now until they've "worked out the
	// bugs"
	enablePreVote := true

	raftConfig := &etcdRaft.Config{
		Applied:       lastAppliedIndex,
		ID:            uint64(pm.raftId),
		ElectionTick:  10, // NOTE: cockroach sets this to 15
		HeartbeatTick: 1,  // NOTE: cockroach sets this to 5
		Storage:       pm.raftStorage,

		// NOTE, from cockroach:
		// "PreVote and CheckQuorum are two ways of achieving the same thing.
		// PreVote is more compatible with quiesced ranges, so we want to switch
		// to it once we've worked out the bugs."
		//
		// TODO: vendor again?
		// PreVote: enablePreVote,
		CheckQuorum: !enablePreVote,

		// MaxSizePerMsg controls how many Raft log entries the leader will send to
		// followers in a single MsgApp.
		MaxSizePerMsg: 4096, // NOTE: in cockroachdb this is 16*1024

		// MaxInflightMsgs controls how many in-flight messages Raft will send to
		// a follower without hearing a response. The total number of Raft log
		// entries is a combination of this setting and MaxSizePerMsg.
		//
		// NOTE: Cockroach's settings (MaxSizePerMsg of 4k and MaxInflightMsgs
		// of 4) provide for up to 64 KB of raft log to be sent without
		// acknowledgement. With an average entry size of 1 KB that translates
		// to ~64 commands that might be executed in the handling of a single
		// etcdraft.Ready operation.
		MaxInflightMsgs: 256, // NOTE: in cockroachdb this is 4
	}

	log.Info("startRaft", "raft ID", raftConfig.ID)

	if walExisted {
		log.Info("remounting an existing raft log; connecting to peers.")

		pm.unsafeRawNode = etcdRaft.RestartNode(raftConfig)
	} else if pm.joinExisting {
		log.Info("newly joining an existing cluster; waiting for connections.")
		pm.unsafeRawNode = etcdRaft.StartNode(raftConfig, nil)
	} else {
		if numPeers := len(pm.bootstrapNodes); numPeers == 0 {
			panic("exiting due to empty raft peers list")
		} else {
			log.Info("starting a new raft log", "initial cluster size of", numPeers)
		}

		raftPeers, peerAddresses, localAddress := pm.makeInitialRaftPeers()

		pm.setLocalAddress(localAddress)

		// We add all peers up-front even though we will see a ConfChangeAddNode
		// for each shortly. This is because raft's ConfState will contain all of
		// these nodes before we see these log entries, and we always want our
		// snapshots to have all addresses for each of the nodes in the ConfState.
		for _, peerAddress := range peerAddresses {
			pm.addPeer(peerAddress)
		}
		pm.unsafeRawNode = etcdRaft.StartNode(raftConfig, raftPeers)
	}
	log.Info("raft node started")
	go pm.serveRaft()
	go pm.serveLocalProposals()
	go pm.eventLoop()
	go pm.handleRoleChange(pm.rawNode().RoleChan().Out())
}
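
// Descriptive note on the goroutines started at the end of startRaft above:
// serveRaft listens for incoming rafthttp traffic on pm.raftPort,
// serveLocalProposals feeds locally mined blocks and console-initiated config
// changes into raft, eventLoop drives the etcd Ready() cycle (WAL, storage,
// transport, state-machine application), and handleRoleChange starts or stops
// the minter as this node's role changes.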

func (pm *ProtocolManager) setLocalAddress(addr *Address) {
	pm.mu.Lock()
	pm.address = addr
	pm.mu.Unlock()
	// By setting `URLs` on the raft transport, we advertise our URL (in an HTTP
	// header) to any recipient. This is necessary for a newcomer to the cluster
	// to be able to accept a snapshot from us to bootstrap them.
	if urls, err := raftTypes.NewURLs([]string{pm.raftUrl(addr)}); err == nil {
		pm.transport.URLs = urls
	} else {
		panic(fmt.Sprintf("error: could not create URL from local address: %v", addr))
	}
}

func (pm *ProtocolManager) serveRaft() {
	urlString := fmt.Sprintf("http://0.0.0.0:%d", pm.raftPort)
	url, err := url.Parse(urlString)
	if err != nil {
		fatalf("Failed parsing URL (%v)", err)
	}

	listener, err := newStoppableListener(url.Host, pm.httpstopc)
	if err != nil {
		fatalf("Failed to listen rafthttp (%v)", err)
	}
	err = (&http.Server{Handler: pm.transport.Handler()}).Serve(listener)
	select {
	case <-pm.httpstopc:
	default:
		fatalf("Failed to serve rafthttp (%v)", err)
	}
	close(pm.httpdonec)
}

func (pm *ProtocolManager) isLearner(rid uint16) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()
	for _, n := range pm.confState.Learners {
		if uint16(n) == rid {
			return true
		}
	}
	return false
}

func (pm *ProtocolManager) isLearnerNode() bool {
	return pm.isLearner(pm.raftId)
}

func (pm *ProtocolManager) isVerifierNode() bool {
	return pm.isVerifier(pm.raftId)
}

func (pm *ProtocolManager) isVerifier(rid uint16) bool {
	pm.mu.RLock()
	defer pm.mu.RUnlock()
	for _, n := range pm.confState.Nodes {
		if uint16(n) == rid {
			return true
		}
	}
	return false
}

func (pm *ProtocolManager) handleRoleChange(roleC <-chan interface{}) {
	for {
		select {
		case role := <-roleC:
			intRole, ok := role.(int)

			if !ok {
				panic("Couldn't cast role to int")
			}

			if intRole == minterRole {
				log.EmitCheckpoint(log.BecameMinter)
				pm.minter.start()
			} else { // verifier
				if pm.isVerifierNode() {
					log.EmitCheckpoint(log.BecameVerifier)
				} else {
					log.EmitCheckpoint(log.BecameLearner)
				}
				pm.minter.stop()
			}

			pm.mu.Lock()
			pm.role = intRole
			pm.mu.Unlock()

		case <-pm.quitSync:
			return
		}
	}
}

func (pm *ProtocolManager) minedBroadcastLoop() {
	for obj := range pm.minedBlockSub.Chan() {
		switch ev := obj.Data.(type) {
		case core.NewMinedBlockEvent:
			select {
			case pm.blockProposalC <- ev.Block:
			case <-pm.quitSync:
				return
			}
		}
	}
}
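
// A brief sketch of how a locally mined block flows through this handler: the
// minter (elsewhere in this package) is expected to post
// core.NewMinedBlockEvent on the event mux, the loop above forwards the block
// into blockProposalC, serveLocalProposals (below) RLP-encodes it and proposes
// it to raft, and once the entry commits the eventLoop decodes it and calls
// applyNewChainHead to insert it into the chain. Roughly:
//
//	minter --NewMinedBlockEvent--> minedBroadcastLoop --> blockProposalC
//	  --> serveLocalProposals --> rawNode().Propose --> (raft commit)
//	  --> eventLoop --> applyNewChainHead --> blockchain.InsertChain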

// Serve two channels to handle new blocks and raft configuration changes originating locally.
func (pm *ProtocolManager) serveLocalProposals() {
	//
	// TODO: does it matter that this will restart from 0 whenever we restart a cluster?
	//
	var confChangeCount uint64

	for {
		select {
		case block, ok := <-pm.blockProposalC:
			if !ok {
				log.Info("error: read from blockProposalC failed")
				return
			}

			size, r, err := rlp.EncodeToReader(block)
			if err != nil {
				panic(fmt.Sprintf("error: failed to send RLP-encoded block: %s", err.Error()))
			}
			var buffer = make([]byte, uint32(size))
			r.Read(buffer)

			// blocks until accepted by the raft state machine
			pm.rawNode().Propose(context.TODO(), buffer)
		case cc, ok := <-pm.confChangeProposalC:
			if !ok {
				log.Info("error: read from confChangeProposalC failed")
				return
			}

			confChangeCount++
			cc.ID = confChangeCount
			pm.rawNode().ProposeConfChange(context.TODO(), cc)
		case <-pm.quitSync:
			return
		}
	}
}

func (pm *ProtocolManager) entriesToApply(allEntries []raftpb.Entry) (entriesToApply []raftpb.Entry) {
	if len(allEntries) == 0 {
		return
	}

	first := allEntries[0].Index
	pm.mu.RLock()
	lastApplied := pm.appliedIndex
	pm.mu.RUnlock()

	if first > lastApplied+1 {
		fatalf("first index of committed entry[%d] should <= appliedIndex[%d] + 1", first, lastApplied)
	}

	firstToApply := lastApplied - first + 1

	if firstToApply < uint64(len(allEntries)) {
		entriesToApply = allEntries[firstToApply:]
	}
	return
}

func (pm *ProtocolManager) raftUrl(address *Address) string {
	if !pm.useDns {
		parsedIp := net.ParseIP(address.Hostname)
		return fmt.Sprintf("http://%s:%d", parsedIp.To4(), address.RaftPort)
	}

	if parsedIp := net.ParseIP(address.Hostname); parsedIp != nil {
		if ipv4 := parsedIp.To4(); ipv4 != nil {
			// this is an IPv4 address
			return fmt.Sprintf("http://%s:%d", ipv4, address.RaftPort)
		}
		// this is an IPv6 address
		return fmt.Sprintf("http://[%s]:%d", parsedIp, address.RaftPort)
	}
	return fmt.Sprintf("http://%s:%d", address.Hostname, address.RaftPort)
}
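
// For illustration, raftUrl above yields URLs of the following shapes (the
// values are hypothetical): with DNS disabled, the hostname is parsed as an
// IPv4 address and the result looks like http://127.0.0.1:50401; with DNS
// enabled, an IPv4 literal gives the same form, an IPv6 literal is bracketed
// as http://[::1]:50401, and anything else is used verbatim as a hostname,
// e.g. http://node1.example.com:50401.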

func (pm *ProtocolManager) addPeer(address *Address) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	raftId := address.RaftId

	// Quorum - RAFT - derive pubkey from nodeId
	pubKey, err := enode.HexPubkey(address.NodeId.String())
	if err != nil {
		log.Error("error decoding pub key from enodeId", "enodeId", address.NodeId.String(), "err", err)
		panic(err)
	}

	// Add P2P connection:
	p2pNode := enode.NewV4Hostname(pubKey, address.Hostname, int(address.P2pPort), 0, int(address.RaftPort))
	pm.p2pServer.AddPeer(p2pNode)

	// Add raft transport connection:
	pm.transport.AddPeer(raftTypes.ID(raftId), []string{pm.raftUrl(address)})
	pm.peers[raftId] = &Peer{address, p2pNode}
}

func (pm *ProtocolManager) disconnectFromPeer(raftId uint16, peer *Peer) {
	pm.p2pServer.RemovePeer(peer.p2pNode)
	pm.transport.RemovePeer(raftTypes.ID(raftId))
}

func (pm *ProtocolManager) removePeer(raftId uint16) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	if peer := pm.peers[raftId]; peer != nil {
		pm.disconnectFromPeer(raftId, peer)

		delete(pm.peers, raftId)
	}

	// This is only necessary sometimes, but it's idempotent. Also, we *always*
	// do this, and not just when there's still a peer in the map, because we
	// need to do it for our *own* raft ID before we get booted from the cluster
	// so that snapshots are identical on all nodes. It's important for a booted
	// node to have a snapshot identical to every other node because that node
	// can potentially re-enter the cluster with a new raft ID.
	pm.removedPeers.Add(raftId)
}

func (pm *ProtocolManager) eventLoop() {
	ticker := time.NewTicker(tickerMS * time.Millisecond)
	defer ticker.Stop()
	defer pm.wal.Close()

	exitAfterApplying := false

	for {
		select {
		case <-ticker.C:
			pm.rawNode().Tick()

		// when the node is first ready it gives us entries to commit and messages
		// to immediately publish
		case rd := <-pm.rawNode().Ready():
			pm.wal.Save(rd.HardState, rd.Entries)

			if rd.SoftState != nil {
				pm.updateLeader(rd.SoftState.Lead)
			}

			if snap := rd.Snapshot; !etcdRaft.IsEmptySnap(snap) {
				pm.saveRaftSnapshot(snap)
				pm.applyRaftSnapshot(snap)
				pm.advanceAppliedIndex(snap.Metadata.Index)
			}

			// 1: Write HardState, Entries, and Snapshot to persistent storage if they
			// are not empty.
			pm.raftStorage.Append(rd.Entries)

			// 2: Send all Messages to the nodes named in the To field.
			pm.transport.Send(rd.Messages)

			// 3: Apply Snapshot (if any) and CommittedEntries to the state machine.
			for _, entry := range pm.entriesToApply(rd.CommittedEntries) {
				switch entry.Type {
				case raftpb.EntryNormal:
					if len(entry.Data) == 0 {
						break
					}
					var block types.Block
					err := rlp.DecodeBytes(entry.Data, &block)
					if err != nil {
						log.Error("error decoding block: ", "err", err)
					}

					if pm.blockchain.HasBlock(block.Hash(), block.NumberU64()) {
						// This can happen:
						//
						// if (1) we crashed after applying this block to the chain, but
						//        before writing appliedIndex to LDB.
						// or (2) we crashed in a scenario where we applied further than
						//        raft *durably persisted* its committed index (see
						//        https://github.com/coreos/etcd/pull/7899). In this
						//        scenario, when the node comes back up, we will re-apply
						//        a few entries.

						headBlockHash := pm.blockchain.CurrentBlock().Hash()
						log.Warn("not applying already-applied block", "block hash", block.Hash(), "parent", block.ParentHash(), "head", headBlockHash)
					} else {
						if !pm.applyNewChainHead(&block) {
							// return false only if insert chain is interrupted
							// stop eventloop
							return
						}
					}

				case raftpb.EntryConfChange:
					var cc raftpb.ConfChange
					cc.Unmarshal(entry.Data)
					raftId := uint16(cc.NodeID)

					pm.confState = *pm.rawNode().ApplyConfChange(cc)
					log.Info("confChange", "confState", pm.confState)
					forceSnapshot := false

					switch cc.Type {
					case raftpb.ConfChangeAddNode, raftpb.ConfChangeAddLearnerNode:
						confChangeTypeName := raftpb.ConfChangeType_name[int32(cc.Type)]
						log.Info(confChangeTypeName, "raft id", raftId)
						if pm.isRaftIdRemoved(raftId) {
							log.Info("ignoring "+confChangeTypeName+" for permanently-removed peer", "raft id", raftId)
						} else if pm.isRaftIdUsed(raftId) && raftId <= uint16(len(pm.bootstrapNodes)) {
							// See initial cluster logic in startRaft() for more information.
							log.Info("ignoring expected "+confChangeTypeName+" for initial peer", "raft id", raftId)
							// We need a snapshot to exist to reconnect to peers on start-up after a crash.
							forceSnapshot = true
						} else { // add peer or add learner or promote learner to voter
							forceSnapshot = true
							// if raft id exists as peer, you are promoting learner to peer
							if pm.isRaftIdUsed(raftId) {
								log.Info("promote learner node to voter node", "raft id", raftId)
							} else {
								// if raft id does not exist, you are adding peer/learner
								log.Info("add peer/learner -> "+confChangeTypeName, "raft id", raftId)
								pm.addPeer(bytesToAddress(cc.Context))
							}
						}

					case raftpb.ConfChangeRemoveNode:
						if pm.isRaftIdRemoved(raftId) {
							log.Info("ignoring ConfChangeRemoveNode for already-removed peer", "raft id", raftId)
						} else {
							log.Info("removing peer due to ConfChangeRemoveNode", "raft id", raftId)

							forceSnapshot = true

							if raftId == pm.raftId {
								exitAfterApplying = true
							}

							pm.removePeer(raftId)
						}

					case raftpb.ConfChangeUpdateNode:
						// NOTE: remember to forceSnapshot in this case, if we add support
						// for this.
						fatalf("not yet handled: ConfChangeUpdateNode")
					}

					if forceSnapshot {
						// We force a snapshot here to persist our updated confState, so we
						// know our fellow cluster members when we come back online.
						//
						// It is critical here to snapshot *before* writing our applied
						// index in LevelDB, otherwise a crash while/before snapshotting
						// (after advancing our applied index) would result in the loss of a
						// cluster member upon restart: we would re-mount with an old
						// ConfState.
						pm.triggerSnapshot(entry.Index)
					}
				}

				pm.advanceAppliedIndex(entry.Index)
			}

			pm.maybeTriggerSnapshot()

			if exitAfterApplying {
				log.Warn("permanently removing self from the cluster")
				pm.Stop()
				log.Warn("permanently exited the cluster")

				return
			}

			// 4: Call Node.Advance() to signal readiness for the next batch of
			// updates.
			pm.rawNode().Advance()

		case <-pm.quitSync:
			return
		}
	}
}
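
// A note on the initial raft ID assignment done below: the bootstrap node at
// index i of the static peer list is given raft ID i+1, so a cluster started
// from three static peers uses IDs 1, 2 and 3, and pm.raftId picks out which
// of those entries is the local node. For example (hypothetical list):
//
//	bootstrapNodes[0] -> raft ID 1
//	bootstrapNodes[1] -> raft ID 2
//	bootstrapNodes[2] -> raft ID 3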

func (pm *ProtocolManager) makeInitialRaftPeers() (raftPeers []etcdRaft.Peer, peerAddresses []*Address, localAddress *Address) {
	initialNodes := pm.bootstrapNodes
	raftPeers = make([]etcdRaft.Peer, len(initialNodes))  // Entire cluster
	peerAddresses = make([]*Address, len(initialNodes)-1) // Cluster without *this* node

	peersSeen := 0
	for i, node := range initialNodes {
		raftId := uint16(i + 1)
		// We initially get the raftPort from the enode ID's query string. As an alternative, we can move away from
		// requiring the use of static peers for the initial set, and load them from e.g. another JSON file which
		// contains pairs of enodes and raft ports, or we can get this initial peer list from commandline flags.
		address := newAddress(raftId, node.RaftPort(), node, pm.useDns)
		raftPeers[i] = etcdRaft.Peer{
			ID:      uint64(raftId),
			Context: address.toBytes(pm.useDns),
		}

		if raftId == pm.raftId {
			localAddress = address
		} else {
			peerAddresses[peersSeen] = address
			peersSeen += 1
		}
	}

	return
}

func blockExtendsChain(block *types.Block, chain *core.BlockChain) bool {
	return block.ParentHash() == chain.CurrentBlock().Hash()
}

func (pm *ProtocolManager) applyNewChainHead(block *types.Block) bool {
	if !blockExtendsChain(block, pm.blockchain) {
		headBlock := pm.blockchain.CurrentBlock()

		log.Info("Non-extending block", "block", block.Hash(), "parent", block.ParentHash(), "head", headBlock.Hash())

		pm.minter.invalidRaftOrderingChan <- InvalidRaftOrdering{headBlock: headBlock, invalidBlock: block}
	} else {
		if existingBlock := pm.blockchain.GetBlockByHash(block.Hash()); nil == existingBlock {
			if err := pm.blockchain.Validator().ValidateBody(block); err != nil {
				panic(fmt.Sprintf("failed to validate block %x (%v)", block.Hash(), err))
			}
		}

		for _, tx := range block.Transactions() {
			log.EmitCheckpoint(log.TxAccepted, "tx", tx.Hash().Hex())
		}

		_, err := pm.blockchain.InsertChain([]*types.Block{block})

		if err != nil {
			if err == core.ErrAbortBlocksProcessing {
				log.Error(fmt.Sprintf("failed to extend chain: %s", err.Error()))
				return false
			}
			panic(fmt.Sprintf("failed to extend chain: %s", err.Error()))
		}

		log.EmitCheckpoint(log.BlockCreated, "block", fmt.Sprintf("%x", block.Hash()))
	}
	return true
}

// Sets new appliedIndex in-memory, *and* writes this appliedIndex to LevelDB.
func (pm *ProtocolManager) advanceAppliedIndex(index uint64) {
	pm.writeAppliedIndex(index)

	pm.mu.Lock()
	pm.appliedIndex = index
	pm.mu.Unlock()
}

func (pm *ProtocolManager) updateLeader(leader uint64) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	pm.leader = uint16(leader)
}

// The Address for the current leader, or an error if no leader is elected.
func (pm *ProtocolManager) LeaderAddress() (*Address, error) {
	pm.mu.RLock()
	defer pm.mu.RUnlock()

	if minterRole == pm.role {
		return pm.address, nil
	} else if l, ok := pm.peers[pm.leader]; ok {
		return l.address, nil
	}
	// We expect to reach this if pm.leader is 0, which is how etcd denotes the lack of a leader.
	return nil, errNoLeaderElected
}

// Returns the raft id for a given enodeId
func (pm *ProtocolManager) FetchRaftId(enodeId string) (uint16, error) {
	node, err := enode.ParseV4(enodeId)
	if err != nil {
		return 0, err
	}
	for raftId, peer := range pm.peers {
		if peer.p2pNode.ID() == node.ID() {
			return raftId, nil
		}
	}
	return 0, fmt.Errorf("node not found in the cluster: %v", enodeId)
}