github.com/baptiste-b-pegasys/quorum/v22@v22.4.2/raft/handler.go

     1  package raft
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net"
     8  	"net/http"
     9  	"net/url"
    10  	"os"
    11  	"strconv"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/coreos/etcd/etcdserver/stats"
    16  	"github.com/coreos/etcd/pkg/fileutil"
    17  	raftTypes "github.com/coreos/etcd/pkg/types"
    18  	etcdRaft "github.com/coreos/etcd/raft"
    19  	"github.com/coreos/etcd/raft/raftpb"
    20  	"github.com/coreos/etcd/rafthttp"
    21  	"github.com/coreos/etcd/snap"
    22  	"github.com/coreos/etcd/wal"
    23  	mapset "github.com/deckarep/golang-set"
    24  	"github.com/syndtr/goleveldb/leveldb"
    25  
    26  	"github.com/ethereum/go-ethereum/core"
    27  	"github.com/ethereum/go-ethereum/core/types"
    28  	"github.com/ethereum/go-ethereum/eth/downloader"
    29  	"github.com/ethereum/go-ethereum/event"
    30  	"github.com/ethereum/go-ethereum/log"
    31  	"github.com/ethereum/go-ethereum/p2p"
    32  	"github.com/ethereum/go-ethereum/p2p/enode"
    33  	"github.com/ethereum/go-ethereum/p2p/enr"
    34  	"github.com/ethereum/go-ethereum/rlp"
    35  )
    36  
    37  type ProtocolManager struct {
    38  	mu       sync.RWMutex // For protecting concurrent JS access to "local peer" and "remote peer" state
    39  	quitSync chan struct{}
    40  	stopped  bool
    41  
    42  	// Static configuration
    43  	joinExisting   bool // Whether to join an existing cluster when a WAL doesn't already exist
    44  	bootstrapNodes []*enode.Node
    45  	raftId         uint16
    46  	raftPort       uint16
    47  
    48  	// Local peer state (protected by mu vs concurrent access via JS)
    49  	address       *Address
    50  	role          int    // Role: minter or verifier
    51  	appliedIndex  uint64 // The index of the last-applied raft entry
    52  	snapshotIndex uint64 // The index of the latest snapshot.
    53  
    54  	// Remote peer state (protected by mu vs concurrent access via JS)
    55  	leader       uint16
    56  	peers        map[uint16]*Peer
    57  	removedPeers mapset.Set // *Permanently removed* peers
    58  
    59  	// P2P transport
    60  	p2pServer *p2p.Server
    61  	useDns    bool
    62  
    63  	// Blockchain services
    64  	blockchain *core.BlockChain
    65  	downloader *downloader.Downloader
    66  	minter     *minter
    67  
    68  	// Blockchain events
    69  	eventMux      *event.TypeMux
    70  	minedBlockSub *event.TypeMuxSubscription
    71  
    72  	// Raft proposal events
    73  	blockProposalC      chan *types.Block      // for mined blocks to raft
    74  	confChangeProposalC chan raftpb.ConfChange // for config changes from js console to raft
    75  
    76  	// Raft transport
    77  	unsafeRawNode etcdRaft.Node
    78  	transport     *rafthttp.Transport
    79  	httpstopc     chan struct{}
    80  	httpdonec     chan struct{}
    81  
    82  	// Raft snapshotting
    83  	snapshotter *snap.Snapshotter
    84  	snapdir     string
    85  	confState   raftpb.ConfState
    86  
    87  	// Raft write-ahead log
    88  	waldir string
    89  	wal    *wal.WAL
    90  
    91  	// Storage
    92  	quorumRaftDb *leveldb.DB             // Persistent storage for last-applied raft index
    93  	raftStorage  *etcdRaft.MemoryStorage // Volatile raft storage
    94  }
    95  
    96  var errNoLeaderElected = errors.New("no leader is currently elected")
    97  
    98  //
    99  // Public interface
   100  //
   101  
   102  func NewProtocolManager(raftId uint16, raftPort uint16, blockchain *core.BlockChain, mux *event.TypeMux, bootstrapNodes []*enode.Node, joinExisting bool, raftLogDir string, minter *minter, downloader *downloader.Downloader, useDns bool, p2pServer *p2p.Server) (*ProtocolManager, error) {
   103  	waldir := fmt.Sprintf("%s/raft-wal", raftLogDir)
   104  	snapdir := fmt.Sprintf("%s/raft-snap", raftLogDir)
   105  	quorumRaftDbLoc := fmt.Sprintf("%s/quorum-raft-state", raftLogDir)
   106  
   107  	manager := &ProtocolManager{
   108  		bootstrapNodes:      bootstrapNodes,
   109  		peers:               make(map[uint16]*Peer),
   110  		leader:              uint16(etcdRaft.None),
   111  		removedPeers:        mapset.NewSet(),
   112  		joinExisting:        joinExisting,
   113  		blockchain:          blockchain,
   114  		eventMux:            mux,
   115  		blockProposalC:      make(chan *types.Block, 10),
   116  		confChangeProposalC: make(chan raftpb.ConfChange),
   117  		httpstopc:           make(chan struct{}),
   118  		httpdonec:           make(chan struct{}),
   119  		waldir:              waldir,
   120  		snapdir:             snapdir,
   121  		snapshotter:         snap.New(snapdir),
   122  		raftId:              raftId,
   123  		raftPort:            raftPort,
   124  		quitSync:            make(chan struct{}),
   125  		raftStorage:         etcdRaft.NewMemoryStorage(),
   126  		minter:              minter,
   127  		downloader:          downloader,
   128  		useDns:              useDns,
   129  		p2pServer:           p2pServer,
   130  	}
   131  
   132  	if db, err := openQuorumRaftDb(quorumRaftDbLoc); err != nil {
   133  		return nil, err
   134  	} else {
   135  		manager.quorumRaftDb = db
   136  	}
   137  
   138  	return manager, nil
   139  }
   140  
   141  func (pm *ProtocolManager) Start() {
   142  	log.Info("starting raft protocol handler")
   143  
   144  	pm.minedBlockSub = pm.eventMux.Subscribe(core.NewMinedBlockEvent{})
   145  	pm.startRaft()
   146  	// register peerExist with the p2p server so it can check whether a node belongs to the raft cluster
   147  	pm.p2pServer.SetCheckPeerInRaft(pm.peerExist)
   148  	go pm.minedBroadcastLoop()
   149  }
   150  
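// Stop tears down the raft protocol handler. It is guarded by pm.stopped and is safe to
// call more than once; besides the normal shutdown path, it is also invoked from eventLoop
// when this node applies a ConfChangeRemoveNode for its own raft ID and leaves the cluster.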
   151  func (pm *ProtocolManager) Stop() {
   152  	pm.mu.Lock()
   153  	defer pm.mu.Unlock()
   154  
   155  	defer log.Info("raft protocol handler stopped")
   156  
   157  	if pm.stopped {
   158  		return
   159  	}
   160  
   161  	log.Info("stopping raft protocol handler...")
   162  
   163  	for raftId, peer := range pm.peers {
   164  		pm.disconnectFromPeer(raftId, peer)
   165  	}
   166  
   167  	pm.minedBlockSub.Unsubscribe()
   168  
   169  	if pm.transport != nil {
   170  		pm.transport.Stop()
   171  	}
   172  
   173  	close(pm.httpstopc)
   174  	<-pm.httpdonec
   175  	close(pm.quitSync)
   176  
   177  	if pm.unsafeRawNode != nil {
   178  		pm.unsafeRawNode.Stop()
   179  	}
   180  
   181  	pm.quorumRaftDb.Close()
   182  
   183  	pm.p2pServer = nil
   184  
   185  	pm.minter.stop()
   186  
   187  	pm.stopped = true
   188  }
   189  
   190  func (pm *ProtocolManager) NodeInfo() *RaftNodeInfo {
   191  	pm.mu.RLock() // as we read role and peers
   192  	defer pm.mu.RUnlock()
   193  
   194  	roleDescription := ""
   195  	if pm.role == minterRole {
   196  		roleDescription = "minter"
   197  	} else if pm.isVerifierNode() {
   198  		roleDescription = "verifier"
   199  	} else if pm.isLearnerNode() {
   200  		roleDescription = "learner"
   201  	}
   202  
   203  	peerAddresses := make([]*Address, len(pm.peers))
   204  	peerIdx := 0
   205  	for _, peer := range pm.peers {
   206  		peerAddresses[peerIdx] = peer.address
   207  		peerIdx += 1
   208  	}
   209  
   210  	removedPeerIfaces := pm.removedPeers
   211  	removedPeerIds := make([]uint16, removedPeerIfaces.Cardinality())
   212  	i := 0
   213  	for removedIface := range removedPeerIfaces.Iterator().C {
   214  		removedPeerIds[i] = removedIface.(uint16)
   215  		i++
   216  	}
   217  
   218  	//
   219  	// NOTE: before exposing any new fields here, make sure that the underlying
   220  	// ProtocolManager members are protected from concurrent access by pm.mu!
   221  	//
   222  	return &RaftNodeInfo{
   223  		ClusterSize:    len(pm.peers) + 1,
   224  		Role:           roleDescription,
   225  		Address:        pm.address,
   226  		PeerAddresses:  peerAddresses,
   227  		RemovedPeerIds: removedPeerIds,
   228  		AppliedIndex:   pm.appliedIndex,
   229  		SnapshotIndex:  pm.snapshotIndex,
   230  	}
   231  }
   232  
   233  // There seems to be a very rare race in raft where during `etcdRaft.StartNode`
   234  // it will call back our `Process` method before it's finished returning the
   235  // `raft.Node`, `pm.unsafeRawNode`, to us. This re-entrance through a separate
   236  // thread will cause a nil pointer dereference. To work around this, this
   237  // getter method should be used instead of reading `pm.unsafeRawNode` directly.
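// Process, ReportUnreachable and ReportSnapshot below all go through this getter, so
// incoming rafthttp callbacks simply block here until StartNode/RestartNode has returned
// and pm.unsafeRawNode has been set.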
   238  func (pm *ProtocolManager) rawNode() etcdRaft.Node {
   239  	for pm.unsafeRawNode == nil {
   240  		time.Sleep(100 * time.Millisecond)
   241  	}
   242  
   243  	return pm.unsafeRawNode
   244  }
   245  
   246  func (pm *ProtocolManager) nextRaftId() uint16 {
   247  	pm.mu.RLock()
   248  	defer pm.mu.RUnlock()
   249  
   250  	maxId := pm.raftId
   251  
   252  	for peerId := range pm.peers {
   253  		if maxId < peerId {
   254  			maxId = peerId
   255  		}
   256  	}
   257  
   258  	removedPeerIfaces := pm.removedPeers
   259  	for removedIface := range removedPeerIfaces.Iterator().C {
   260  		removedId := removedIface.(uint16)
   261  
   262  		if maxId < removedId {
   263  			maxId = removedId
   264  		}
   265  	}
   266  
   267  	return maxId + 1
   268  }
   269  
   270  func (pm *ProtocolManager) isRaftIdRemoved(id uint16) bool {
   271  	pm.mu.RLock()
   272  	defer pm.mu.RUnlock()
   273  
   274  	return pm.removedPeers.Contains(id)
   275  }
   276  
   277  func (pm *ProtocolManager) isRaftIdUsed(raftId uint16) bool {
   278  	if pm.raftId == raftId || pm.isRaftIdRemoved(raftId) {
   279  		return true
   280  	}
   281  
   282  	pm.mu.RLock()
   283  	defer pm.mu.RUnlock()
   284  
   285  	return pm.peers[raftId] != nil
   286  }
   287  
   288  func (pm *ProtocolManager) isNodeAlreadyInCluster(node *enode.Node) error {
   289  	pm.mu.RLock()
   290  	defer pm.mu.RUnlock()
   291  
   292  	thisEnode := enode.MustParse(pm.p2pServer.NodeInfo().Enode)
   293  	if thisEnode.EnodeID() == node.EnodeID() {
   294  		return fmt.Errorf("enode is this enode (self): node with this enode has already been added to the cluster: %s", node.ID())
   295  	}
   296  
   297  	for _, peer := range pm.peers {
   298  		peerRaftId := peer.address.RaftId
   299  		peerNode := peer.p2pNode
   300  
   301  		if peerNode.ID() == node.ID() {
   302  			return fmt.Errorf("node with this enode has already been added to the cluster: %s", node.ID())
   303  		}
   304  
   305  		if peerNode.IP().Equal(node.IP()) {
   306  			if peerNode.TCP() == node.TCP() {
   307  				return fmt.Errorf("existing node %v with raft ID %v is already using eth p2p at %v:%v", peerNode.ID(), peerRaftId, node.IP(), node.TCP())
   308  			} else if peer.address.RaftPort == enr.RaftPort(node.RaftPort()) {
   309  				return fmt.Errorf("existing node %v with raft ID %v is already using raft at %v:%v", peerNode.ID(), peerRaftId, node.IP(), node.RaftPort())
   310  			}
   311  		}
   312  	}
   313  
   314  	return nil
   315  }
   316  
   317  func (pm *ProtocolManager) peerExist(node *enode.Node) bool {
   318  	pm.mu.RLock()
   319  	defer pm.mu.RUnlock()
   320  
   321  	for _, p := range pm.peers {
   322  		if node.ID() == p.p2pNode.ID() {
   323  			return true
   324  		}
   325  	}
   326  	return false
   327  }
   328  
   329  func (pm *ProtocolManager) ProposeNewPeer(enodeURL string, isLearner bool) (uint16, error) {
   330  	if pm.isLearnerNode() {
   331  		return 0, errors.New("learner node can't add peer or learner")
   332  	}
   333  	node, err := enode.ParseV4(enodeURL)
   334  	if err != nil {
   335  		return 0, err
   336  	}
   337  
   338  	if !pm.useDns {
   339  		// hostname is not allowed if DNS is not enabled
   340  		if node.Host() != "" {
   341  			return 0, fmt.Errorf("raft must enable dns to use hostname")
   342  		}
   343  		if len(node.IP()) != 4 {
   344  			return 0, fmt.Errorf("expected IPv4 address (with length 4), but got IP of length %v", len(node.IP()))
   345  		}
   346  	}
   347  
   348  	if !node.HasRaftPort() {
   349  		return 0, fmt.Errorf("enodeId is missing raftport querystring parameter: %v", enodeURL)
   350  	}
   351  
   352  	if err := pm.isNodeAlreadyInCluster(node); err != nil {
   353  		return 0, err
   354  	}
   355  
   356  	raftId := pm.nextRaftId()
   357  	address := newAddress(raftId, node.RaftPort(), node, pm.useDns)
   358  
   359  	confChangeType := raftpb.ConfChangeAddNode
   360  
   361  	if isLearner {
   362  		confChangeType = raftpb.ConfChangeAddLearnerNode
   363  	}
   364  
   365  	pm.confChangeProposalC <- raftpb.ConfChange{
   366  		Type:    confChangeType,
   367  		NodeID:  uint64(raftId),
   368  		Context: address.toBytes(),
   369  	}
   370  
   371  	return raftId, nil
   372  }
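
// A minimal usage sketch (illustrative only; the enode URL is a placeholder, not a real
// node): proposing a new verifier from within this package. The URL must carry the
// "raftport" query-string parameter checked above.
func exampleProposeNewPeer(pm *ProtocolManager) {
	enodeURL := "enode://<128-hex-char-node-id>@127.0.0.1:21003?discport=0&raftport=50404"
	raftId, err := pm.ProposeNewPeer(enodeURL, false) // false: add a full peer, not a learner
	if err != nil {
		log.Error("propose new peer failed", "err", err)
		return
	}
	log.Info("proposed new raft peer", "raft id", raftId)
}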
   373  
   374  func (pm *ProtocolManager) ProposePeerRemoval(raftId uint16) error {
   375  	if pm.isLearnerNode() && raftId != pm.raftId {
   376  		return errors.New("learner node can't remove other peer")
   377  	}
   378  	pm.confChangeProposalC <- raftpb.ConfChange{
   379  		Type:   raftpb.ConfChangeRemoveNode,
   380  		NodeID: uint64(raftId),
   381  	}
   382  	return nil
   383  }
   384  
   385  func (pm *ProtocolManager) PromoteToPeer(raftId uint16) (bool, error) {
   386  	if pm.isLearnerNode() {
   387  		return false, errors.New("learner node can't promote to peer")
   388  	}
   389  
   390  	if !pm.isLearner(raftId) {
   391  		return false, fmt.Errorf("%d is not a learner. only learner can be promoted to peer", raftId)
   392  	}
   393  
   394  	pm.confChangeProposalC <- raftpb.ConfChange{
   395  		Type:   raftpb.ConfChangeAddNode,
   396  		NodeID: uint64(raftId),
   397  	}
   398  	return true, nil
   399  }
   400  
   401  //
   402  // MsgWriter interface (necessary for p2p.Send)
   403  //
   404  
   405  func (pm *ProtocolManager) WriteMsg(msg p2p.Msg) error {
   406  	// read *into* buffer
   407  	var buffer = make([]byte, msg.Size)
   408  	msg.Payload.Read(buffer)
   409  
   410  	return pm.rawNode().Propose(context.TODO(), buffer)
   411  }
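
// A minimal sketch of how this is exercised (illustrative): because ProtocolManager
// satisfies p2p.MsgWriter, p2p.Send can RLP-encode an arbitrary payload and hand the raw
// bytes to WriteMsg above, which proposes them to raft. The message code 0 is arbitrary;
// WriteMsg only looks at the payload.
func exampleProposeViaP2P(pm *ProtocolManager, block *types.Block) error {
	return p2p.Send(pm, 0, block)
}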
   412  
   413  //
   414  // Raft interface
   415  //
   416  
   417  func (pm *ProtocolManager) Process(ctx context.Context, m raftpb.Message) error {
   418  	return pm.rawNode().Step(ctx, m)
   419  }
   420  
   421  func (pm *ProtocolManager) IsIDRemoved(id uint64) bool {
   422  	return pm.isRaftIdRemoved(uint16(id))
   423  }
   424  
   425  func (pm *ProtocolManager) ReportUnreachable(id uint64) {
   426  	log.Info("peer is currently unreachable", "peer id", id)
   427  
   428  	pm.rawNode().ReportUnreachable(id)
   429  }
   430  
   431  func (pm *ProtocolManager) ReportSnapshot(id uint64, status etcdRaft.SnapshotStatus) {
   432  	if status == etcdRaft.SnapshotFailure {
   433  		log.Info("failed to send snapshot", "raft peer", id)
   434  	} else if status == etcdRaft.SnapshotFinish {
   435  		log.Info("finished sending snapshot", "raft peer", id)
   436  	}
   437  
   438  	pm.rawNode().ReportSnapshot(id, status)
   439  }
   440  
   441  //
   442  // Private methods
   443  //
   444  
   445  func (pm *ProtocolManager) startRaft() {
   446  	if !fileutil.Exist(pm.snapdir) {
   447  		if err := os.Mkdir(pm.snapdir, 0750); err != nil {
   448  			fatalf("cannot create dir for snapshot (%v)", err)
   449  		}
   450  	}
   451  	walExisted := wal.Exist(pm.waldir)
   452  	lastAppliedIndex := pm.loadAppliedIndex()
   453  
   454  	id := raftTypes.ID(pm.raftId).String()
   455  	ss := stats.NewServerStats(id, id)
   456  
   457  	pm.transport = &rafthttp.Transport{
   458  		ID:          raftTypes.ID(pm.raftId),
   459  		ClusterID:   0x1000,
   460  		Raft:        pm,
   461  		ServerStats: ss,
   462  		LeaderStats: stats.NewLeaderStats(strconv.Itoa(int(pm.raftId))),
   463  		ErrorC:      make(chan error),
   464  	}
   465  	pm.transport.Start()
   466  
   467  	// We load the snapshot to connect to prev peers before replaying the WAL,
   468  	// which typically goes further into the future than the snapshot.
   469  
   470  	var maybeRaftSnapshot *raftpb.Snapshot
   471  
   472  	if walExisted {
   473  		maybeRaftSnapshot = pm.loadSnapshot() // re-establishes peer connections
   474  	}
   475  
   476  	loadedWal, entries := pm.replayWAL(maybeRaftSnapshot)
   477  	pm.wal = loadedWal
   478  
   479  	if walExisted {
   480  
   481  		// If we shut down before managing to flush the state to disk, we will only have synced up to the
   482  		// snapshot. In that case, we can replay the raft entries we have saved to apply the blocks back into
   483  		// our chain. We log errors but cannot do much if one occurs, since we can't fork to a different chain
   484  		// and all other nodes in the network have already confirmed these blocks.
   485  		if maybeRaftSnapshot != nil {
   486  			currentChainHead := pm.blockchain.CurrentBlock().Number()
   487  			for _, entry := range entries {
   488  				if entry.Type == raftpb.EntryNormal {
   489  					var block types.Block
   490  					if err := rlp.DecodeBytes(entry.Data, &block); err != nil {
   491  						log.Error("error decoding block: ", "err", err)
   492  						continue
   493  					}
   494  
   495  					if thisBlockHead := pm.blockchain.GetBlockByHash(block.Hash()); thisBlockHead != nil {
   496  					// check whether the block already exists in the local chain
   497  					// and its number is greater than the current chain head
   498  						if thisBlockHeadNum := thisBlockHead.Number(); thisBlockHeadNum.Cmp(currentChainHead) > 0 {
   499  							// insert the block only if it has already been seen
   500  							blocks := []*types.Block{&block}
   501  							if _, err := pm.blockchain.InsertChain(blocks); err != nil {
   502  								log.Error("error inserting the block into the chain", "number", block.NumberU64(), "hash", block.Hash(), "err", err)
   503  							}
   504  						}
   505  					}
   506  				}
   507  			}
   508  		}
   509  
   510  		if hardState, _, err := pm.raftStorage.InitialState(); err != nil {
   511  			panic(fmt.Sprintf("failed to read initial state from raft while restarting: %v", err))
   512  		} else {
   513  			if lastPersistedCommittedIndex := hardState.Commit; lastPersistedCommittedIndex < lastAppliedIndex {
   514  				log.Info("rolling back applied index to last-durably-committed", "last applied index", lastAppliedIndex, "last persisted index", lastPersistedCommittedIndex)
   515  
   516  				// Roll back our applied index. See the logic and explanation around
   517  				// the single call to `pm.applyNewChainHead` for more context.
   518  				lastAppliedIndex = lastPersistedCommittedIndex
   519  			}
   520  
   521  			// fix raft applied index out of range
   522  			firstIndex, err := pm.raftStorage.FirstIndex()
   523  			if err != nil {
   524  				panic(fmt.Sprintf("failed to read last persisted applied index from raft while restarting: %v", err))
   525  			}
   526  			lastPersistedAppliedIndex := firstIndex - 1
   527  			if lastPersistedAppliedIndex > lastAppliedIndex {
   528  				log.Debug("set lastAppliedIndex to lastPersistedAppliedIndex", "last applied index", lastAppliedIndex, "last persisted applied index", lastPersistedAppliedIndex)
   529  
   530  				lastAppliedIndex = lastPersistedAppliedIndex
   531  				pm.advanceAppliedIndex(lastAppliedIndex)
   532  			}
   533  		}
   534  	}
   535  
   536  	// NOTE: cockroach sets this to false for now until they've "worked out the
   537  	//       bugs"
   538  	enablePreVote := true
   539  
   540  	raftConfig := &etcdRaft.Config{
   541  		Applied:       lastAppliedIndex,
   542  		ID:            uint64(pm.raftId),
   543  		ElectionTick:  10, // NOTE: cockroach sets this to 15
   544  		HeartbeatTick: 1,  // NOTE: cockroach sets this to 5
   545  		Storage:       pm.raftStorage,
   546  
   547  		// NOTE, from cockroach:
   548  		// "PreVote and CheckQuorum are two ways of achieving the same thing.
   549  		// PreVote is more compatible with quiesced ranges, so we want to switch
   550  		// to it once we've worked out the bugs."
   551  		//
   552  		// TODO: vendor again?
   553  		// PreVote:     enablePreVote,
   554  		CheckQuorum: !enablePreVote,
   555  
   556  		// MaxSizePerMsg controls how many Raft log entries the leader will send to
   557  		// followers in a single MsgApp.
   558  		MaxSizePerMsg: 4096, // NOTE: in cockroachdb this is 16*1024
   559  
   560  		// MaxInflightMsgs controls how many in-flight messages Raft will send to
   561  		// a follower without hearing a response. The total number of Raft log
   562  		// entries is a combination of this setting and MaxSizePerMsg.
   563  		//
   564  		// NOTE: Cockroach's settings (MaxSizePerMsg of 16k and MaxInflightMsgs
   565  		// of 4) provide for up to 64 KB of raft log to be sent without
   566  		// acknowledgement. With an average entry size of 1 KB that translates
   567  		// to ~64 commands that might be executed in the handling of a single
   568  		// etcdraft.Ready operation.
   569  		MaxInflightMsgs: 256, // NOTE: in cockroachdb this is 4
   570  	}
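	// With the values above, up to MaxSizePerMsg * MaxInflightMsgs = 4096 B * 256 = 1 MiB of
	// raft log may be outstanding to a single follower before the leader stops sending and
	// waits for acknowledgements.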
   571  
   572  	log.Info("startRaft", "raft ID", raftConfig.ID)
   573  
   574  	if walExisted {
   575  		log.Info("remounting an existing raft log; connecting to peers.")
   576  
   577  		pm.unsafeRawNode = etcdRaft.RestartNode(raftConfig)
   578  	} else if pm.joinExisting {
   579  		log.Info("newly joining an existing cluster; waiting for connections.")
   580  		pm.unsafeRawNode = etcdRaft.StartNode(raftConfig, nil)
   581  	} else {
   582  		if numPeers := len(pm.bootstrapNodes); numPeers == 0 {
   583  			panic("exiting due to empty raft peers list")
   584  		} else {
   585  			log.Info("starting a new raft log", "initial cluster size of", numPeers)
   586  		}
   587  
   588  		raftPeers, peerAddresses, localAddress := pm.makeInitialRaftPeers()
   589  
   590  		pm.setLocalAddress(localAddress)
   591  
   592  		// We add all peers up-front even though we will see a ConfChangeAddNode
   593  		// for each shortly. This is because raft's ConfState will contain all of
   594  		// these nodes before we see these log entries, and we always want our
   595  		// snapshots to have all addresses for each of the nodes in the ConfState.
   596  		for _, peerAddress := range peerAddresses {
   597  			pm.addPeer(peerAddress)
   598  		}
   599  		pm.unsafeRawNode = etcdRaft.StartNode(raftConfig, raftPeers)
   600  	}
   601  	log.Info("raft node started")
   602  	go pm.serveRaft()
   603  	go pm.serveLocalProposals()
   604  	go pm.eventLoop()
   605  	go pm.handleRoleChange(pm.rawNode().RoleChan().Out())
   606  }
   607  
   608  func (pm *ProtocolManager) setLocalAddress(addr *Address) {
   609  	pm.mu.Lock()
   610  	pm.address = addr
   611  	pm.mu.Unlock()
   612  	// By setting `URLs` on the raft transport, we advertise our URL (in an HTTP
   613  	// header) to any recipient. This is necessary for a newcomer to the cluster
   614  	// to be able to accept a snapshot from us to bootstrap them.
   615  	if urls, err := raftTypes.NewURLs([]string{pm.raftUrl(addr)}); err == nil {
   616  		pm.transport.URLs = urls
   617  	} else {
   618  		panic(fmt.Sprintf("error: could not create URL from local address: %v", addr))
   619  	}
   620  }
   621  
   622  func (pm *ProtocolManager) serveRaft() {
   623  	urlString := fmt.Sprintf("http://0.0.0.0:%d", pm.raftPort)
   624  	url, err := url.Parse(urlString)
   625  	if err != nil {
   626  		fatalf("Failed parsing URL (%v)", err)
   627  	}
   628  
   629  	listener, err := newStoppableListener(url.Host, pm.httpstopc)
   630  	if err != nil {
   631  		fatalf("Failed to listen rafthttp (%v)", err)
   632  	}
   633  	err = (&http.Server{Handler: pm.transport.Handler()}).Serve(listener)
   634  	select {
   635  	case <-pm.httpstopc:
   636  	default:
   637  		fatalf("Failed to serve rafthttp (%v)", err)
   638  	}
   639  	close(pm.httpdonec)
   640  }
   641  
   642  func (pm *ProtocolManager) isLearner(rid uint16) bool {
   643  	pm.mu.RLock()
   644  	defer pm.mu.RUnlock()
   645  	for _, n := range pm.confState.Learners {
   646  		if uint16(n) == rid {
   647  			return true
   648  		}
   649  	}
   650  	return false
   651  }
   652  
   653  func (pm *ProtocolManager) isLearnerNode() bool {
   654  	return pm.isLearner(pm.raftId)
   655  }
   656  
   657  func (pm *ProtocolManager) isVerifierNode() bool {
   658  	return pm.isVerifier(pm.raftId)
   659  }
   660  
   661  func (pm *ProtocolManager) isVerifier(rid uint16) bool {
   662  	pm.mu.RLock()
   663  	defer pm.mu.RUnlock()
   664  	for _, n := range pm.confState.Nodes {
   665  		if uint16(n) == rid {
   666  			return true
   667  		}
   668  	}
   669  	return false
   670  }
   671  
   672  func (pm *ProtocolManager) handleRoleChange(roleC <-chan interface{}) {
   673  	for {
   674  		select {
   675  		case role := <-roleC:
   676  			intRole, ok := role.(int)
   677  
   678  			if !ok {
   679  				panic("Couldn't cast role to int")
   680  			}
   681  			if intRole == minterRole {
   682  				log.EmitCheckpoint(log.BecameMinter)
   683  				pm.minter.start()
   684  			} else { // verifier
   685  				if pm.isVerifierNode() {
   686  					log.EmitCheckpoint(log.BecameVerifier)
   687  				} else {
   688  					log.EmitCheckpoint(log.BecameLearner)
   689  				}
   690  				pm.minter.stop()
   691  			}
   692  
   693  			pm.mu.Lock()
   694  			pm.role = intRole
   695  			pm.mu.Unlock()
   696  		case <-pm.quitSync:
   697  			return
   698  		}
   699  	}
   700  }
   701  
   702  func (pm *ProtocolManager) minedBroadcastLoop() {
   703  	for obj := range pm.minedBlockSub.Chan() {
   704  		switch ev := obj.Data.(type) {
   705  		case core.NewMinedBlockEvent:
   706  			select {
   707  			case pm.blockProposalC <- ev.Block:
   708  			case <-pm.quitSync:
   709  				return
   710  			}
   711  		}
   712  	}
   713  }
   714  
   715  // Serve two channels to handle new blocks and raft configuration changes originating locally.
   716  func (pm *ProtocolManager) serveLocalProposals() {
   717  	//
   718  	// TODO: does it matter that this will restart from 0 whenever we restart a cluster?
   719  	//
   720  	var confChangeCount uint64
   721  
   722  	for {
   723  		select {
   724  		case block, ok := <-pm.blockProposalC:
   725  			if !ok {
   726  				log.Info("error: read from blockProposalC failed")
   727  				return
   728  			}
   729  
   730  			size, r, err := rlp.EncodeToReader(block)
   731  			if err != nil {
   732  				panic(fmt.Sprintf("error: failed to send RLP-encoded block: %s", err.Error()))
   733  			}
   734  			var buffer = make([]byte, uint32(size))
   735  			r.Read(buffer)
   736  
   737  			// blocks until accepted by the raft state machine
   738  			pm.rawNode().Propose(context.TODO(), buffer)
   739  		case cc, ok := <-pm.confChangeProposalC:
   740  			if !ok {
   741  				log.Info("error: read from confChangeProposalC failed")
   742  				return
   743  			}
   744  
   745  			confChangeCount++
   746  			cc.ID = confChangeCount
   747  			pm.rawNode().ProposeConfChange(context.TODO(), cc)
   748  		case <-pm.quitSync:
   749  			return
   750  		}
   751  	}
   752  }
   753  
   754  func (pm *ProtocolManager) entriesToApply(allEntries []raftpb.Entry) (entriesToApply []raftpb.Entry) {
   755  	if len(allEntries) == 0 {
   756  		return
   757  	}
   758  
   759  	first := allEntries[0].Index
   760  	pm.mu.RLock()
   761  	lastApplied := pm.appliedIndex
   762  	pm.mu.RUnlock()
   763  
   764  	if first > lastApplied+1 {
   765  		fatalf("first index of committed entry[%d] should be <= appliedIndex[%d] + 1", first, lastApplied)
   766  	}
   767  
   768  	firstToApply := lastApplied - first + 1
   769  
   770  	if firstToApply < uint64(len(allEntries)) {
   771  		entriesToApply = allEntries[firstToApply:]
   772  	}
   773  	return
   774  }
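// Worked example for the arithmetic above: if raft hands us committed entries with
// indices 10..14 while pm.appliedIndex is 11, then first = 10 and firstToApply = 11-10+1 = 2,
// so entriesToApply returns the entries with indices 12..14; everything at or below the
// applied index is skipped rather than re-applied.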
   775  
   776  func (pm *ProtocolManager) raftUrl(address *Address) string {
   777  	if parsedIp := net.ParseIP(address.Hostname); parsedIp != nil {
   778  		if ipv4 := parsedIp.To4(); ipv4 != nil {
   779  			// this is an IPv4 address
   780  			return fmt.Sprintf("http://%s:%d", ipv4, address.RaftPort)
   781  		}
   782  		// this is an IPv6 address
   783  		return fmt.Sprintf("http://[%s]:%d", parsedIp, address.RaftPort)
   784  	}
   785  	return fmt.Sprintf("http://%s:%d", address.Hostname, address.RaftPort)
   786  }
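// For illustration, with an Address whose RaftPort is 50401 the formats above yield:
//
//	Hostname "127.0.0.1"   -> "http://127.0.0.1:50401"     (IPv4)
//	Hostname "2001:db8::1" -> "http://[2001:db8::1]:50401" (IPv6, bracketed)
//	Hostname "node1"       -> "http://node1:50401"         (hostname; only when DNS is enabled)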
   787  
   788  func (pm *ProtocolManager) addPeer(address *Address) {
   789  	pm.mu.Lock()
   790  	defer pm.mu.Unlock()
   791  
   792  	raftId := address.RaftId
   793  
   794  	// Quorum - RAFT - derive pubkey from nodeId
   795  	pubKey, err := enode.HexPubkey(address.NodeId.String())
   796  	if err != nil {
   797  		log.Error("error decoding pub key from enodeId", "enodeId", address.NodeId.String(), "err", err)
   798  		panic(err)
   799  	}
   800  
   801  	// Add P2P connection:
   802  	p2pNode := enode.NewV4Hostname(pubKey, address.Hostname, int(address.P2pPort), 0, int(address.RaftPort))
   803  	pm.p2pServer.AddPeer(p2pNode)
   804  
   805  	// Add raft transport connection:
   806  	pm.transport.AddPeer(raftTypes.ID(raftId), []string{pm.raftUrl(address)})
   807  	pm.peers[raftId] = &Peer{address, p2pNode}
   808  }
   809  
   810  func (pm *ProtocolManager) disconnectFromPeer(raftId uint16, peer *Peer) {
   811  	pm.p2pServer.RemovePeer(peer.p2pNode)
   812  	pm.transport.RemovePeer(raftTypes.ID(raftId))
   813  }
   814  
   815  func (pm *ProtocolManager) removePeer(raftId uint16) {
   816  	pm.mu.Lock()
   817  	defer pm.mu.Unlock()
   818  
   819  	if peer := pm.peers[raftId]; peer != nil {
   820  		pm.disconnectFromPeer(raftId, peer)
   821  
   822  		delete(pm.peers, raftId)
   823  	}
   824  
   825  	// This is only necessary sometimes, but it's idempotent. Also, we *always*
   826  	// do this, and not just when there's still a peer in the map, because we
   827  	// need to do it for our *own* raft ID before we get booted from the cluster
   828  	// so that snapshots are identical on all nodes. It's important for a booted
   829  	// node to have a snapshot identical to every other node because that node
   830  	// can potentially re-enter the cluster with a new raft ID.
   831  	pm.removedPeers.Add(raftId)
   832  }
   833  
   834  func (pm *ProtocolManager) eventLoop() {
   835  	ticker := time.NewTicker(tickerMS * time.Millisecond)
   836  	defer ticker.Stop()
   837  	defer pm.wal.Close()
   838  
   839  	exitAfterApplying := false
   840  
   841  	for {
   842  		select {
   843  		case <-ticker.C:
   844  			pm.rawNode().Tick()
   845  
   846  			// when the node is first ready it gives us entries to commit and messages
   847  			// to immediately publish
   848  		case rd := <-pm.rawNode().Ready():
   849  			pm.wal.Save(rd.HardState, rd.Entries)
   850  
   851  			if rd.SoftState != nil {
   852  				pm.updateLeader(rd.SoftState.Lead)
   853  			}
   854  
   855  			if snap := rd.Snapshot; !etcdRaft.IsEmptySnap(snap) {
   856  				pm.saveRaftSnapshot(snap)
   857  				pm.applyRaftSnapshot(snap)
   858  				pm.advanceAppliedIndex(snap.Metadata.Index)
   859  			}
   860  
   861  			// 1: Write HardState, Entries, and Snapshot to persistent storage if they
   862  			// are not empty.
   863  			pm.raftStorage.Append(rd.Entries)
   864  
   865  			// 2: Send all Messages to the nodes named in the To field.
   866  			pm.transport.Send(rd.Messages)
   867  
   868  			// 3: Apply Snapshot (if any) and CommittedEntries to the state machine.
   869  			for _, entry := range pm.entriesToApply(rd.CommittedEntries) {
   870  				switch entry.Type {
   871  				case raftpb.EntryNormal:
   872  					if len(entry.Data) == 0 {
   873  						break
   874  					}
   875  					var block types.Block
   876  					err := rlp.DecodeBytes(entry.Data, &block)
   877  					if err != nil {
   878  						log.Error("error decoding block", "err", err)
   879  					}
   880  
   881  					if pm.blockchain.HasBlock(block.Hash(), block.NumberU64()) {
   882  						// This can happen:
   883  						//
   884  						// if (1) we crashed after applying this block to the chain, but
   885  						//        before writing appliedIndex to LDB.
   886  						// or (2) we crashed in a scenario where we applied further than
   887  						//        raft *durably persisted* its committed index (see
   888  						//        https://github.com/coreos/etcd/pull/7899). In this
   889  						//        scenario, when the node comes back up, we will re-apply
   890  						//        a few entries.
   891  
   892  						headBlockHash := pm.blockchain.CurrentBlock().Hash()
   893  						log.Warn("not applying already-applied block", "block hash", block.Hash(), "parent", block.ParentHash(), "head", headBlockHash)
   894  					} else {
   895  						if !pm.applyNewChainHead(&block) {
   896  						// applyNewChainHead returns false only if chain insertion was interrupted;
   897  						// stop the event loop in that case
   898  							return
   899  						}
   900  					}
   901  
   902  				case raftpb.EntryConfChange:
   903  					var cc raftpb.ConfChange
   904  					cc.Unmarshal(entry.Data)
   905  					raftId := uint16(cc.NodeID)
   906  
   907  					pm.confState = *pm.rawNode().ApplyConfChange(cc)
   908  					log.Info("confChange", "confState", pm.confState)
   909  					forceSnapshot := false
   910  
   911  					switch cc.Type {
   912  					case raftpb.ConfChangeAddNode, raftpb.ConfChangeAddLearnerNode:
   913  						confChangeTypeName := raftpb.ConfChangeType_name[int32(cc.Type)]
   914  						log.Info(confChangeTypeName, "raft id", raftId)
   915  						if pm.isRaftIdRemoved(raftId) {
   916  							log.Info("ignoring "+confChangeTypeName+" for permanently-removed peer", "raft id", raftId)
   917  						} else if pm.isRaftIdUsed(raftId) && raftId <= uint16(len(pm.bootstrapNodes)) {
   918  							// See initial cluster logic in startRaft() for more information.
   919  							log.Info("ignoring expected "+confChangeTypeName+" for initial peer", "raft id", raftId)
   920  							// We need a snapshot to exist to reconnect to peers on start-up after a crash.
   921  							forceSnapshot = true
   922  						} else { // add peer or add learner or promote learner to voter
   923  							forceSnapshot = true
   924  							// if the raft id is already in use, this ConfChangeAddNode promotes a learner to a voter
   925  							if pm.isRaftIdUsed(raftId) {
   926  								log.Info("promote learner node to voter node", "raft id", raftId)
   927  							} else {
   928  								// if the raft id does not exist yet, we are adding a new peer or learner
   929  								log.Info("add peer/learner -> "+confChangeTypeName, "raft id", raftId)
   930  								pm.addPeer(bytesToAddress(cc.Context))
   931  							}
   932  						}
   933  
   934  					case raftpb.ConfChangeRemoveNode:
   935  						if pm.isRaftIdRemoved(raftId) {
   936  							log.Info("ignoring ConfChangeRemoveNode for already-removed peer", "raft id", raftId)
   937  						} else {
   938  							log.Info("removing peer due to ConfChangeRemoveNode", "raft id", raftId)
   939  
   940  							forceSnapshot = true
   941  
   942  							if raftId == pm.raftId {
   943  								exitAfterApplying = true
   944  							}
   945  
   946  							pm.removePeer(raftId)
   947  						}
   948  
   949  					case raftpb.ConfChangeUpdateNode:
   950  						// NOTE: remember to forceSnapshot in this case, if we add support
   951  						// for this.
   952  						fatalf("not yet handled: ConfChangeUpdateNode")
   953  					}
   954  
   955  					if forceSnapshot {
   956  						// We force a snapshot here to persist our updated confState, so we
   957  						// know our fellow cluster members when we come back online.
   958  						//
   959  						// It is critical here to snapshot *before* writing our applied
   960  						// index in LevelDB, otherwise a crash while/before snapshotting
   961  						// (after advancing our applied index) would result in the loss of a
   962  						// cluster member upon restart: we would re-mount with an old
   963  						// ConfState.
   964  						pm.triggerSnapshot(entry.Index)
   965  					}
   966  				}
   967  
   968  				pm.advanceAppliedIndex(entry.Index)
   969  			}
   970  
   971  			pm.maybeTriggerSnapshot()
   972  
   973  			if exitAfterApplying {
   974  				log.Warn("permanently removing self from the cluster")
   975  				pm.Stop()
   976  				log.Warn("permanently exited the cluster")
   977  
   978  				return
   979  			}
   980  
   981  			// 4: Call Node.Advance() to signal readiness for the next batch of
   982  			// updates.
   983  			pm.rawNode().Advance()
   984  
   985  		case <-pm.quitSync:
   986  			return
   987  		}
   988  	}
   989  }
   990  
   991  func (pm *ProtocolManager) makeInitialRaftPeers() (raftPeers []etcdRaft.Peer, peerAddresses []*Address, localAddress *Address) {
   992  	initialNodes := pm.bootstrapNodes
   993  	raftPeers = make([]etcdRaft.Peer, len(initialNodes))  // Entire cluster
   994  	peerAddresses = make([]*Address, len(initialNodes)-1) // Cluster without *this* node
   995  
   996  	peersSeen := 0
   997  	for i, node := range initialNodes {
   998  		raftId := uint16(i + 1)
   999  		// We initially get the raftPort from the enode ID's query string. As an alternative, we can move away from
  1000  		// requiring the use of static peers for the initial set, and load them from e.g. another JSON file which
  1001  		// contains pairs of enodes and raft ports, or we can get this initial peer list from commandline flags.
  1002  		address := newAddress(raftId, node.RaftPort(), node, pm.useDns)
  1003  		raftPeers[i] = etcdRaft.Peer{
  1004  			ID:      uint64(raftId),
  1005  			Context: address.toBytes(),
  1006  		}
  1007  
  1008  		if raftId == pm.raftId {
  1009  			localAddress = address
  1010  		} else {
  1011  			peerAddresses[peersSeen] = address
  1012  			peersSeen += 1
  1013  		}
  1014  	}
  1015  
  1016  	return
  1017  }
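// For illustration: with three bootstrap nodes, they are assigned raft IDs 1, 2 and 3 by
// their position in the static peer list. If this node's raftId is 2, localAddress is the
// second entry and peerAddresses holds the addresses for raft IDs 1 and 3.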
  1018  
  1019  func blockExtendsChain(block *types.Block, chain *core.BlockChain) bool {
  1020  	return block.ParentHash() == chain.CurrentBlock().Hash()
  1021  }
  1022  
  1023  func (pm *ProtocolManager) applyNewChainHead(block *types.Block) bool {
  1024  	if !blockExtendsChain(block, pm.blockchain) {
  1025  		headBlock := pm.blockchain.CurrentBlock()
  1026  
  1027  		log.Info("Non-extending block", "block", block.Hash(), "parent", block.ParentHash(), "head", headBlock.Hash())
  1028  
  1029  		pm.minter.invalidRaftOrderingChan <- InvalidRaftOrdering{headBlock: headBlock, invalidBlock: block}
  1030  	} else {
  1031  		if existingBlock := pm.blockchain.GetBlockByHash(block.Hash()); nil == existingBlock {
  1032  			if err := pm.blockchain.Validator().ValidateBody(block); err != nil {
  1033  				panic(fmt.Sprintf("failed to validate block %x (%v)", block.Hash(), err))
  1034  			}
  1035  		}
  1036  
  1037  		for _, tx := range block.Transactions() {
  1038  			log.EmitCheckpoint(log.TxAccepted, "tx", tx.Hash().Hex())
  1039  		}
  1040  
  1041  		_, err := pm.blockchain.InsertChain([]*types.Block{block})
  1042  
  1043  		if err != nil {
  1044  			if err == core.ErrAbortBlocksProcessing {
  1045  				log.Error(fmt.Sprintf("failed to extend chain: %s", err.Error()))
  1046  				return false
  1047  			}
  1048  			panic(fmt.Sprintf("failed to extend chain: %s", err.Error()))
  1049  		}
  1050  
  1051  		log.EmitCheckpoint(log.BlockCreated, "block", fmt.Sprintf("%x", block.Hash()))
  1052  	}
  1053  	return true
  1054  }
  1055  
  1056  // Sets new appliedIndex in-memory, *and* writes this appliedIndex to LevelDB.
  1057  func (pm *ProtocolManager) advanceAppliedIndex(index uint64) {
  1058  	pm.writeAppliedIndex(index)
  1059  
  1060  	pm.mu.Lock()
  1061  	pm.appliedIndex = index
  1062  	pm.mu.Unlock()
  1063  }
  1064  
  1065  func (pm *ProtocolManager) updateLeader(leader uint64) {
  1066  	pm.mu.Lock()
  1067  	defer pm.mu.Unlock()
  1068  
  1069  	pm.leader = uint16(leader)
  1070  }
  1071  
  1072  // The Address for the current leader, or an error if no leader is elected.
  1073  func (pm *ProtocolManager) LeaderAddress() (*Address, error) {
  1074  	pm.mu.RLock()
  1075  	defer pm.mu.RUnlock()
  1076  
  1077  	if minterRole == pm.role {
  1078  		return pm.address, nil
  1079  	} else if l, ok := pm.peers[pm.leader]; ok {
  1080  		return l.address, nil
  1081  	}
  1082  	// We expect to reach this if pm.leader is 0, which is how etcd denotes the lack of a leader.
  1083  	return nil, errNoLeaderElected
  1084  }
  1085  
  1086  // Returns the raft id for a given enodeId
  1087  func (pm *ProtocolManager) FetchRaftId(enodeId string) (uint16, error) {
  1088  	node, err := enode.ParseV4(enodeId)
  1089  	if err != nil {
  1090  		return 0, err
  1091  	}
  1092  	for raftId, peer := range pm.peers {
  1093  		if peer.p2pNode.ID() == node.ID() {
  1094  			return raftId, nil
  1095  		}
  1096  	}
  1097  	return 0, fmt.Errorf("node not found in the cluster: %v", enodeId)
  1098  }