github.com/aergoio/aergo@v1.3.1/consensus/impl/raftv2/cluster.go (about)

     1  package raftv2
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"crypto/sha1"
     7  	"encoding/binary"
     8  	"encoding/json"
     9  	"errors"
    10  	"fmt"
    11  	"github.com/aergoio/aergo/cmd/aergocli/util"
    12  	"github.com/aergoio/aergo/internal/enc"
    13  	"github.com/aergoio/aergo/message"
    14  	"sort"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/aergoio/aergo/consensus"
    19  	"github.com/aergoio/aergo/pkg/component"
    20  	"github.com/aergoio/aergo/types"
    21  	raftlib "github.com/aergoio/etcd/raft"
    22  	"github.com/aergoio/etcd/raft/raftpb"
    23  )
    24  
var (
	// MaxConfChangeTimeOut is the timeout of a membership change request. In
	// this file it is used both as the deadline of the proposal context and
	// as the maximum time to wait for the reply of a proposal.
	MaxConfChangeTimeOut = time.Second * 100

	// Errors about the member set of the cluster.
	ErrClusterHasNoMember   = errors.New("cluster has no member")
	ErrNotExistRaftMember   = errors.New("not exist member of raft cluster")
	ErrNoEnableSyncPeer     = errors.New("no peer to sync chain")
	ErrMemberAlreadyApplied = errors.New("member is already added")

	// Errors about membership change (conf change) requests and raft identity.
	ErrInvalidMembershipReqType = errors.New("invalid type of membership change request")
	ErrPendingConfChange        = errors.New("pending membership change request is in progree. try again when it is finished")
	ErrConChangeTimeOut         = errors.New("timeouted membership change request")
	ErrConfChangeChannelBusy    = errors.New("channel of conf change propose is busy")
	ErrCCMemberIsNil            = errors.New("memeber is nil")
	ErrNotMatchedRaftName       = errors.New("mismatched name of raft identity")
	ErrNotMatchedRaftPeerID     = errors.New("mismatched peerid of raft identity")
	ErrNotExitRaftProgress      = errors.New("progress of this node doesn't exist")
	ErrUnhealtyNodeExist        = errors.New("can't add some node if unhealthy nodes exist")
	ErrRemoveHealthyNode        = errors.New("remove of a healthy node may cause the cluster to hang")
)
    44  
const (
	// Labels given to the three member sets kept by a Cluster
	// (members, appliedMembers and removedMembers respectively).
	MembersNameInit    = "init"
	MembersNameApplied = "applied"
	MembersNameRemoved = "removed"
	// InvalidClusterID is the cluster ID value that is rejected as "not set".
	InvalidClusterID   = 0
)
    51  
// RaftInfo is the summary of the raft state of this node that getRaftInfo
// builds and toConsensusInfo serializes to JSON.
//
// Leader is the name of the leader member, or "id=<raft id>" when the leader
// is not found among the members. Total is the cluster size, Name and RaftId
// identify this node, and Status holds the raw JSON of the raft status (it is
// only filled when status was requested and a raft server is attached).
type RaftInfo struct {
	Leader string
	Total  uint32
	Name   string
	RaftId string
	Status *json.RawMessage
}
    59  
// NotifyFn is a callback that receives raft cluster events. The cluster calls
// it (when it is non-nil) after an applied member is added or a member is removed.
type NotifyFn func(event *message.RaftClusterEvent)
    61  
// Cluster represents a cluster of block producers and holds the raft cluster
// membership of this node.
//
// The implementation was copied from dpos/bp.
// TODO refactoring
type Cluster struct {
	component.ICompSyncRequester
	// Mutex guards the member sets and the pending conf change of the cluster.
	sync.Mutex
	cdb consensus.ChainDB

	chainID        []byte
	chainTimestamp int64
	rs             *raftServer

	appliedIndex uint64
	appliedTerm  uint64

	// identity is the raft identity (name, raft ID, peer ID, cluster ID) of this node.
	identity consensus.RaftIdentity

	// Size is the member count of the cluster. It is incremented when a member
	// is added to members and decremented when a member is removed.
	Size uint32

	// @ MatchClusterAndConfState
	// Cluster members must match the nodes of the raft ConfState. Otherwise a
	// conf change may fail or be skipped when it is compared with the cluster members.
	// A mismatch between cluster and ConfState occurs when a node joins an
	// existing cluster: the joined node starts from the latest members, but its
	// ConfState is empty.
	// If a snapshot is written before all conf change logs are applied, the
	// mismatched state is written to disk, and problems will happen after
	// recovery from that snapshot.

	// members is used for:
	//   1. booting
	//   2. sending cluster info to a remote node
	members *Members // using for 1. booting
	//           2. send cluster info to remote

	// appliedMembers is used for:
	//   1. verifying a runtime conf change
	//   2. creating a snapshot
	//   3. recovering from a snapshot
	appliedMembers *Members // using for 1. verifying runtime confchange.
	// 			 2. creating snapshot
	//           3. recover from snapshot

	// removedMembers keeps the members that were removed from the cluster, so
	// that raft http can reject messages from a removed member.
	// TODO for p2p
	removedMembers *Members

	changeSeq   uint64
	// confChangeC carries membership change proposals to the raft side.
	confChangeC chan *consensus.ConfChangePropose

	// savedChange is the pending membership change proposal, or nil when there is none.
	savedChange *consensus.ConfChangePropose

	// notifyFn, when non-nil, is called on member add/remove events.
	notifyFn NotifyFn
}
   104  
// Members is a named set of cluster members that can be looked up by raft ID,
// by member name or by peer ID.
type Members struct {
	// name labels the set (for example "init", "applied" or "removed") and is
	// used as the prefix of its string form.
	name      string
	MapByID   map[uint64]*consensus.Member // restore from DB or snapshot
	MapByName map[string]*consensus.Member

	Index map[types.PeerID]uint64 // peer ID to raft ID mapping

	Addresses []string //for raft server TODO remove
}
   114  
   115  func newMembers(name string) *Members {
   116  	return &Members{
   117  		name:      name,
   118  		MapByID:   make(map[uint64]*consensus.Member),
   119  		MapByName: make(map[string]*consensus.Member),
   120  		Index:     make(map[types.PeerID]uint64),
   121  		Addresses: make([]string, 0),
   122  	}
   123  }
   124  
// len returns the number of members in the set (the size of MapByID).
func (mbrs *Members) len() int {
	return len(mbrs.MapByID)
}
   128  
   129  func (mbrs *Members) ToArray() []*consensus.Member {
   130  	count := len(mbrs.MapByID)
   131  
   132  	var arrs = make([]*consensus.Member, count)
   133  
   134  	i := 0
   135  	for _, m := range mbrs.MapByID {
   136  		arrs[i] = m
   137  		i++
   138  	}
   139  
   140  	sort.Sort(consensus.MembersByName(arrs))
   141  
   142  	return arrs
   143  }
   144  
   145  func (mbrs *Members) ToMemberAttrArray() []*types.MemberAttr {
   146  	count := len(mbrs.MapByID)
   147  
   148  	var arrs = make([]*types.MemberAttr, count)
   149  
   150  	mbrArray := mbrs.ToArray()
   151  
   152  	i := 0
   153  	for _, m := range mbrArray {
   154  		arrs[i] = &m.MemberAttr
   155  		i++
   156  	}
   157  
   158  	return arrs
   159  }
   160  
   161  func (mbrs *Members) toString() string {
   162  	var buf string
   163  
   164  	buf += fmt.Sprintf("%s", mbrs.name)
   165  
   166  	if mbrs == nil {
   167  		return "[]"
   168  	}
   169  
   170  	mbrsArr := mbrs.ToArray()
   171  	sort.Sort(consensus.MembersByName(mbrsArr))
   172  
   173  	buf += fmt.Sprintf("[")
   174  	for _, bp := range mbrsArr {
   175  		buf += fmt.Sprintf("%s", bp.ToString())
   176  	}
   177  	buf += fmt.Sprintf("]")
   178  
   179  	return buf
   180  }
   181  
   182  func NewCluster(chainID []byte, bf *BlockFactory, raftName string, p2pPeerID types.PeerID, chainTimestamp int64, notifyFn NotifyFn) *Cluster {
   183  	cl := &Cluster{
   184  		chainID:            chainID,
   185  		chainTimestamp:     chainTimestamp,
   186  		ICompSyncRequester: bf,
   187  		identity:           consensus.RaftIdentity{Name: raftName},
   188  		members:            newMembers(MembersNameInit),
   189  		appliedMembers:     newMembers(MembersNameApplied),
   190  		removedMembers:     newMembers(MembersNameRemoved),
   191  		confChangeC:        make(chan *consensus.ConfChangePropose),
   192  	}
   193  	if bf != nil {
   194  		cl.cdb = bf.ChainWAL
   195  	}
   196  
   197  	if len(p2pPeerID) > 0 {
   198  		cl.identity.PeerID = types.IDB58Encode(p2pPeerID)
   199  	}
   200  	cl.notifyFn = notifyFn
   201  
   202  	return cl
   203  }
   204  
   205  func NewClusterFromMemberAttrs(clusterID uint64, chainID []byte, memberAttrs []*types.MemberAttr) (*Cluster, error) {
   206  	cl := NewCluster(chainID, nil, "", "", 0, nil)
   207  
   208  	for _, mbrAttr := range memberAttrs {
   209  		var mbr consensus.Member
   210  
   211  		mbr.SetAttr(mbrAttr)
   212  
   213  		if err := cl.isValidMember(&mbr); err != nil {
   214  			logger.Error().Err(err).Str("mbr", mbr.ToString()).Msg("fail to add member")
   215  			return nil, err
   216  		}
   217  
   218  		if err := cl.addMember(&mbr, false); err != nil {
   219  			logger.Error().Err(err).Str("mbr", mbr.ToString()).Msg("fail to add member")
   220  			return nil, err
   221  		}
   222  	}
   223  
   224  	if clusterID == InvalidClusterID {
   225  		return nil, ErrClusterNotReady
   226  	}
   227  	cl.identity.ClusterID = clusterID
   228  
   229  	return cl, nil
   230  }
   231  
// ClusterID returns the cluster ID stored in the raft identity of this node.
func (cl *Cluster) ClusterID() uint64 {
	return cl.identity.ClusterID
}
   235  
// NodeName returns the raft name of this node.
func (cl *Cluster) NodeName() string {
	return cl.identity.Name
}
   239  
// NodeID returns the raft ID of this node.
func (cl *Cluster) NodeID() uint64 {
	return cl.identity.ID
}
   243  
// NodePeerID returns the peer ID string stored in the raft identity of this node.
func (cl *Cluster) NodePeerID() string {
	return cl.identity.PeerID
}
   247  
// SetNodeID sets the raft ID of this node. It does not take the cluster lock.
func (cl *Cluster) SetNodeID(nodeid uint64) {
	cl.identity.ID = nodeid
}
   251  
// SetClusterID sets the cluster ID of the raft identity and logs the new value
// at debug level. It does not take the cluster lock.
func (cl *Cluster) SetClusterID(clusterid uint64) {
	logger.Debug().Str("id", EtcdIDToString(clusterid)).Msg("set cluster ID")

	cl.identity.ClusterID = clusterid
}
   257  
   258  // RecoverIdentity reset node id and name of cluster.
   259  // raft identity is saved in WAL and reset when server is restarted
   260  func (cl *Cluster) RecoverIdentity(id *consensus.RaftIdentity) error {
   261  	cl.Lock()
   262  	defer cl.Unlock()
   263  
   264  	// check name
   265  	if cl.identity.Name != id.Name {
   266  		return ErrNotMatchedRaftName
   267  	}
   268  
   269  	if cl.identity.PeerID != id.PeerID {
   270  		return ErrNotMatchedRaftPeerID
   271  	}
   272  
   273  	if id.ClusterID == 0 {
   274  		return ErrInvalidRaftIdentity
   275  	}
   276  
   277  	cl.identity = *id
   278  
   279  	logger.Info().Str("identity", id.ToString()).Msg("recover raft identity of this node")
   280  
   281  	return nil
   282  }
   283  
   284  func (cl *Cluster) Recover(snapshot *raftpb.Snapshot) (bool, error) {
   285  	var snapdata = &consensus.SnapshotData{}
   286  
   287  	if err := snapdata.Decode(snapshot.Data); err != nil {
   288  		return false, err
   289  	}
   290  
   291  	logger.Info().Str("snap", snapdata.ToString()).Msg("cluster recover from snapshot")
   292  
   293  	if cl.isAllMembersEqual(snapdata.Members, snapdata.RemovedMembers) {
   294  		logger.Info().Msg("cluster recover skipped since all members are equal to previous configure")
   295  		return true, nil
   296  	}
   297  
   298  	cl.ResetMembers()
   299  
   300  	// members restore
   301  	for _, mbr := range snapdata.Members {
   302  		if err := cl.addMember(mbr, true); err != nil {
   303  			return false, err
   304  		}
   305  	}
   306  
   307  	for _, mbr := range snapdata.RemovedMembers {
   308  		cl.RemovedMembers().add(mbr)
   309  	}
   310  
   311  	logger.Info().Str("info", cl.toStringWithLock()).Msg("cluster recovered")
   312  
   313  	return false, nil
   314  }
   315  
   316  func (cl *Cluster) ResetMembers() {
   317  	cl.Lock()
   318  	defer cl.Unlock()
   319  
   320  	cl.members = newMembers(MembersNameInit)
   321  	cl.appliedMembers = newMembers(MembersNameApplied)
   322  	cl.removedMembers = newMembers(MembersNameRemoved)
   323  
   324  	cl.Size = 0
   325  }
   326  
   327  func (cl *Cluster) isMatch(confstate *raftpb.ConfState) bool {
   328  	var matched int
   329  
   330  	if len(cl.AppliedMembers().MapByID) != len(confstate.Nodes) {
   331  		return false
   332  	}
   333  
   334  	for _, confID := range confstate.Nodes {
   335  		if _, ok := cl.AppliedMembers().MapByID[confID]; !ok {
   336  			return false
   337  		}
   338  
   339  		matched++
   340  	}
   341  
   342  	return true
   343  }
   344  
// Members returns the member set named "init" (cl.members). The set itself is
// returned, not a copy, and no lock is taken.
func (cl *Cluster) Members() *Members {
	return cl.members
}
   348  
// AppliedMembers returns the set of applied members. The set itself is
// returned, not a copy, and no lock is taken.
func (cl *Cluster) AppliedMembers() *Members {
	return cl.appliedMembers
}
   352  
// RemovedMembers returns the set of removed members. The set itself is
// returned, not a copy, and no lock is taken.
func (cl *Cluster) RemovedMembers() *Members {
	return cl.removedMembers
}
   356  
// Quorum returns the majority count of the cluster: Size/2 + 1
// (integer division, so 1 for an empty cluster).
func (cl *Cluster) Quorum() uint32 {
	return cl.Size/2 + 1
}
   360  
   361  func (cl *Cluster) getStartPeers() ([]raftlib.Peer, error) {
   362  	cl.Lock()
   363  	defer cl.Unlock()
   364  
   365  	if cl.Size == 0 {
   366  		return nil, ErrClusterHasNoMember
   367  	}
   368  
   369  	rpeers := make([]raftlib.Peer, cl.Size)
   370  
   371  	var i int
   372  	for _, member := range cl.members.MapByID {
   373  		data, err := json.Marshal(member)
   374  		if err != nil {
   375  			return nil, err
   376  		}
   377  		rpeers[i] = raftlib.Peer{ID: uint64(member.ID), Context: data}
   378  		i++
   379  	}
   380  
   381  	return rpeers, nil
   382  }
   383  
   384  // getAnyPeerAddressToSync returns peer address that has block of no for sync
   385  func (cl *Cluster) getAnyPeerAddressToSync() (types.PeerID, error) {
   386  	cl.Lock()
   387  	defer cl.Unlock()
   388  
   389  	for _, member := range cl.Members().MapByID {
   390  		if member.Name != cl.NodeName() {
   391  			return member.GetPeerID(), nil
   392  		}
   393  	}
   394  
   395  	return "", ErrNoEnableSyncPeer
   396  }
   397  
   398  func (cl *Cluster) isValidMember(member *consensus.Member) error {
   399  	cl.Lock()
   400  	defer cl.Unlock()
   401  
   402  	mbrs := cl.members
   403  
   404  	for _, prevMember := range mbrs.MapByID {
   405  		if prevMember.HasDuplicatedAttr(member) {
   406  			logger.Error().Str("prev", prevMember.ToString()).Str("cur", member.ToString()).Msg("duplicated configuration for raft BP member")
   407  			return ErrDupBP
   408  		}
   409  	}
   410  
   411  	// check if peerID of this node is valid
   412  	if cl.NodeName() == member.Name && enc.ToString([]byte(member.GetPeerID())) != cl.NodePeerID() {
   413  		logger.Error().Str("config", member.GetPeerID().String()).Str("cluster peerid", cl.NodePeerID()).Msg("peerID value is not matched with P2P")
   414  		return ErrInvalidRaftPeerID
   415  	}
   416  
   417  	return nil
   418  }
   419  
// addMember adds member to the cluster.
//
// When applied is true the member is added to the applied members and the
// notify callback (if any) receives a BPAdded event; ErrMemberAlreadyApplied
// is returned when the member ID is already applied. In every case the member
// is also added to the "init" member set, and Size is incremented, unless its
// ID is already there.
//
// It panics when applied is true and the peer ID bytes of the member cannot be
// converted to a peer ID; at that point the member is already in the applied set.
func (cl *Cluster) addMember(member *consensus.Member, applied bool) error {
	logger.Info().Str("member", member.ToString()).Bool("applied", applied).Msg("member add")

	cl.Lock()
	defer cl.Unlock()

	if applied {
		if cl.AppliedMembers().isExist(member.ID) {
			return ErrMemberAlreadyApplied
		}
		logger.Debug().Str("member", member.ToString()).Msg("add to applied members")
		cl.AppliedMembers().add(member)

		// notify to p2p TODO temporary code
		peerID, err := types.IDFromBytes(member.PeerID)
		if err != nil {
			panic("invalid member peerid " + enc.ToString(member.PeerID))
		}

		if cl.notifyFn != nil {
			cl.notifyFn(&message.RaftClusterEvent{BPAdded: []types.PeerID{peerID}})
		}
	}

	// the member may already be known from the initial configuration
	if cl.members.isExist(member.ID) {
		logger.Debug().Str("member", member.ToString()).Msg("omit adding to init members")
		return nil
	}

	cl.members.add(member)
	cl.Size++

	return nil
}
   454  
// removeMember moves member out of the cluster: it is deleted from the applied
// and "init" member sets, recorded in the removed members, Size is decremented
// and the notify callback (if any) receives a BPRemoved event.
// It always returns nil.
//
// It panics when the peer ID bytes of the member cannot be converted to a peer
// ID; at that point the member sets and Size have already been updated.
//
// NOTE(review): Size is decremented unconditionally, even when member was not
// present in the member sets — confirm callers only pass existing members.
func (cl *Cluster) removeMember(member *consensus.Member) error {
	logger.Info().Str("member", member.ToString()).Msg("member remove")

	cl.Lock()
	defer cl.Unlock()

	cl.AppliedMembers().remove(member)
	cl.members.remove(member)
	cl.removedMembers.add(member)

	cl.Size--
	// notify to p2p TODO temporary code
	peerID, err := types.IDFromBytes(member.PeerID)
	if err != nil {
		panic("invalid member peerid " + enc.ToString(member.PeerID))
	}

	if cl.notifyFn != nil {
		cl.notifyFn(&message.RaftClusterEvent{BPRemoved: []types.PeerID{peerID}})
	}

	return nil
}
   478  
// ValidateAndMergeExistingCluster checks that this node belongs to existingCl
// and, if so, adopts the membership of existingCl.
//
// It returns false when the chain IDs differ or when no member of existingCl
// has the name of this node. Otherwise this cluster takes over the member set
// (the same *Members value, not a copy), the size and the cluster ID of
// existingCl, sets its own raft ID to the ID registered for its name, and
// returns true.
//
// Only the lock of cl is taken; existingCl is read without locking.
func (cl *Cluster) ValidateAndMergeExistingCluster(existingCl *Cluster) bool {
	cl.Lock()
	defer cl.Unlock()

	if !bytes.Equal(existingCl.chainID, cl.chainID) {
		logger.Error().Msg("My chainID is different from the existing cluster")
		return false
	}

	// check if this node is already added in existing cluster
	remoteMember := existingCl.Members().getMemberByName(cl.NodeName())
	if remoteMember == nil {
		logger.Error().Msg("This node doesn't exist in the existing cluster")
		return false
	}

	// TODO check my network config is equal to member of remote
	// NOTE(review): a peer ID mismatch is only logged here; the merge still
	// proceeds and true is returned — confirm this is intended.
	if enc.ToString(remoteMember.PeerID) != cl.NodePeerID() {
		logger.Error().Msg("peerid is different with peerid of member of existing cluster")
	}

	cl.members = existingCl.Members()
	cl.Size = existingCl.Size

	myNodeID := existingCl.getNodeID(cl.NodeName())

	// reset self nodeID of cluster
	cl.SetNodeID(myNodeID)
	cl.SetClusterID(existingCl.ClusterID())

	// the lock is already held here, so the non-locking string form is used
	logger.Debug().Str("my", cl.toStringWithLock()).Msg("cluster merged with existing cluster")
	return true
}
   513  
   514  func (cl *Cluster) getMemberAttrs() ([]*types.MemberAttr, error) {
   515  	cl.Lock()
   516  	defer cl.Unlock()
   517  
   518  	attrs := make([]*types.MemberAttr, cl.members.len())
   519  
   520  	if cl.members.len() == 0 {
   521  		return nil, ErrClusterHasNoMember
   522  	}
   523  
   524  	var i = 0
   525  	for _, mbr := range cl.members.MapByID {
   526  		// copy attr since it can be modified
   527  		attr := mbr.MemberAttr
   528  		attrs[i] = &attr
   529  		i++
   530  	}
   531  
   532  	return attrs, nil
   533  }
   534  
// IsIDRemoved returns true if the given raft ID belongs to a member recorded
// in the removed members of the cluster. No lock is taken.
func (cl *Cluster) IsIDRemoved(id uint64) bool {
	return cl.RemovedMembers().isExist(id)
}
   539  
   540  // GenerateID generate cluster ID by hashing IDs of all initial members
   541  func (cl *Cluster) GenerateID(useBackup bool) {
   542  	var buf []byte
   543  
   544  	if useBackup {
   545  		blk, err := cl.cdb.GetBestBlock()
   546  		if err != nil || blk == nil {
   547  			logger.Fatal().Msg("failed to get best block from backup datafiles")
   548  		}
   549  
   550  		buf = append(buf, blk.BlockHash()...)
   551  	}
   552  
   553  	mbrs := cl.Members().ToArray()
   554  	sort.Sort(consensus.MembersByName(mbrs))
   555  
   556  	for _, mbr := range mbrs {
   557  		logger.Debug().Str("id", EtcdIDToString(mbr.GetID())).Msg("member ID")
   558  
   559  		buf = append(buf, types.Uint64ToBytes(mbr.GetID())...)
   560  	}
   561  
   562  	hash := sha1.Sum(buf)
   563  	cl.identity.ClusterID = binary.LittleEndian.Uint64(hash[:8])
   564  
   565  	logger.Info().Str("id", EtcdIDToString(cl.ClusterID())).Msg("generate cluster ID")
   566  }
   567  
   568  func (cl *Cluster) isAllMembersEqual(members []*consensus.Member, RemovedMembers []*consensus.Member) bool {
   569  	membersEqual := func(x []*consensus.Member, y []*consensus.Member) bool {
   570  		if len(x) != len(y) {
   571  			return false
   572  		}
   573  
   574  		for i, mX := range x {
   575  			mY := y[i]
   576  			if !mX.Equal(mY) {
   577  				return false
   578  			}
   579  		}
   580  
   581  		return true
   582  	}
   583  
   584  	clMembers := cl.AppliedMembers().ToArray()
   585  	clRemovedMembers := cl.RemovedMembers().ToArray()
   586  
   587  	sort.Sort(consensus.MembersByName(members))
   588  	sort.Sort(consensus.MembersByName(RemovedMembers))
   589  
   590  	if !membersEqual(clMembers, members) {
   591  		return false
   592  	}
   593  
   594  	if !membersEqual(clRemovedMembers, RemovedMembers) {
   595  		return false
   596  	}
   597  
   598  	return true
   599  }
   600  
// add registers member in the set under its raft ID, its name and its peer ID,
// overwriting any entry stored under the same key. The address of the member
// is appended to Addresses unconditionally, so adding the same member twice
// records its address twice.
func (mbrs *Members) add(member *consensus.Member) {
	mbrs.MapByID[member.ID] = member
	mbrs.MapByName[member.Name] = member
	mbrs.Index[member.GetPeerID()] = member.ID
	mbrs.Addresses = append(mbrs.Addresses, member.Address)
}
   607  
// remove deletes the entries stored under the raft ID, the name and the peer
// ID of member. Deleting an absent entry is a no-op. Addresses is left
// untouched, so the address of a removed member stays in that list.
func (mbrs *Members) remove(member *consensus.Member) {
	delete(mbrs.MapByID, member.ID)
	delete(mbrs.MapByName, member.Name)
	delete(mbrs.Index, member.GetPeerID())
}
   613  
   614  func (mbrs *Members) getMemberByName(name string) *consensus.Member {
   615  	member, ok := mbrs.MapByName[name]
   616  	if !ok {
   617  		return nil
   618  	}
   619  
   620  	return member
   621  }
   622  
// isExist reports whether the set holds a (non-nil) member with raft ID id.
func (mbrs *Members) isExist(id uint64) bool {
	return mbrs.getMember(id) != nil
}
   626  
   627  func (mbrs *Members) getMember(id uint64) *consensus.Member {
   628  	member, ok := mbrs.MapByID[id]
   629  	if !ok {
   630  		return nil
   631  	}
   632  
   633  	return member
   634  }
   635  
   636  func (mbrs *Members) getMemberByPeerID(pid types.PeerID) *consensus.Member {
   637  	return mbrs.getMember(mbrs.Index[pid])
   638  }
   639  
   640  func (mbrs *Members) getMemberPeerAddress(id uint64) (types.PeerID, error) {
   641  	member := mbrs.getMember(id)
   642  	if member == nil {
   643  		return "", ErrNotExistRaftMember
   644  	}
   645  
   646  	return member.GetPeerID(), nil
   647  }
   648  
// hasDuplicatedMember returns ErrDupBP (after logging both members) if any
// member of the set has an attribute duplicated with the given member,
// as decided by HasDuplicatedAttr. It returns nil when there is no duplicate.
func (mbrs *Members) hasDuplicatedMember(m *consensus.Member) error {
	for _, prevMember := range mbrs.MapByID {
		if prevMember.HasDuplicatedAttr(m) {
			logger.Error().Str("old", prevMember.ToString()).Str("new", m.ToString()).Msg("duplicated attribute for new member")
			return ErrDupBP
		}
	}
	return nil
}
   659  
   660  func MaxUint64(x, y uint64) uint64 {
   661  	if x < y {
   662  		return y
   663  	}
   664  	return x
   665  }
   666  
   667  /*
   668  // hasSynced get result of GetPeers request from P2P service and check if chain of this node is synchronized with majority of members
   669  func (cc *Cluster) hasSynced() (bool, error) {
   670  	var peers map[types.PeerID]*message.PeerInfo
   671  	var err error
   672  	var peerBestNo uint64 = 0
   673  
   674  	if cc.Size == 1 {
   675  		return true, nil
   676  	}
   677  
   678  	// request GetPeers to p2p
   679  	getBPPeers := func() (map[types.PeerID]*message.PeerInfo, error) {
   680  		peers := make(map[types.PeerID]*message.PeerInfo)
   681  
   682  		result, err := cc.RequestFuture(message.P2PSvc, &message.GetPeers{}, time.Second, "raft cluster sync test").Result()
   683  		if err != nil {
   684  			return nil, err
   685  		}
   686  
   687  		msg := result.(*message.GetPeersRsp)
   688  
   689  		for _, peerElem := range msg.Peers {
   690  			peerID := types.PeerID(peerElem.Addr.PeerID)
   691  			state := peerElem.State
   692  
   693  			if peerElem.Self {
   694  				continue
   695  			}
   696  
   697  			if state.Get() != types.RUNNING {
   698  				logger.Debug().Str("peer", p2putil.ShortForm(peerID)).Msg("peer is not running")
   699  				continue
   700  
   701  			}
   702  
   703  			// check if peer is not bp
   704  			if _, ok := cc.Index[peerID]; !ok {
   705  				continue
   706  			}
   707  
   708  			peers[peerID] = peerElem
   709  
   710  			peerBestNo = MaxUint64(peerElem.LastBlockNumber, peerBestNo)
   711  		}
   712  
   713  		return peers, nil
   714  	}
   715  
   716  	if peers, err = getBPPeers(); err != nil {
   717  		return false, err
   718  	}
   719  
   720  	if uint16(len(peers)) < (cc.Quorum() - 1) {
   721  		logger.Debug().Msg("a majority of peers are not connected")
   722  		return false, nil
   723  	}
   724  
   725  	var best *types.Block
   726  	if best, err = cc.cdb.GetBestBlock(); err != nil {
   727  		return false, err
   728  	}
   729  
   730  	if best.BlockNo()+DefaultMarginChainDiff < peerBestNo {
   731  		logger.Debug().Uint64("best", best.BlockNo()).Uint64("peerbest", peerBestNo).Msg("chain was not synced with majority of peers")
   732  		return false, nil
   733  	}
   734  
   735  	logger.Debug().Uint64("best", best.BlockNo()).Uint64("peerbest", peerBestNo).Int("margin", DefaultMarginChainDiff).Msg("chain has been synced with majority of peers")
   736  
   737  	return true, nil
   738  }
   739  */
   740  func (cl *Cluster) toStringWithLock() string {
   741  	var buf string
   742  
   743  	buf = fmt.Sprintf("total=%d, cluserID=%x, NodeName=%s, RaftID=%x, ", cl.Size, cl.ClusterID(), cl.NodeName(), cl.NodeID())
   744  	buf += "members: " + cl.members.toString()
   745  	buf += ", appliedMembers: " + cl.appliedMembers.toString()
   746  
   747  	return buf
   748  }
   749  
// toString returns the string form of the cluster. It takes the cluster lock
// for the duration of the call, so it must not be called with the lock held.
func (cl *Cluster) toString() string {
	cl.Lock()
	defer cl.Unlock()

	return cl.toStringWithLock()
}
   756  
   757  func (cl *Cluster) getNodeID(name string) uint64 {
   758  	m, ok := cl.Members().MapByName[name]
   759  	if !ok {
   760  		return consensus.InvalidMemberID
   761  	}
   762  
   763  	return m.ID
   764  }
   765  
   766  func (cl *Cluster) getRaftInfo(withStatus bool) *RaftInfo {
   767  	cl.Lock()
   768  	defer cl.Unlock()
   769  
   770  	var leader uint64
   771  	if cl.rs != nil {
   772  		leader = cl.rs.GetLeader()
   773  	}
   774  
   775  	var leaderName string
   776  	var m *consensus.Member
   777  
   778  	if m = cl.Members().getMember(leader); m != nil {
   779  		leaderName = m.Name
   780  	} else {
   781  		leaderName = "id=" + EtcdIDToString(leader)
   782  	}
   783  
   784  	rinfo := &RaftInfo{Leader: leaderName, Total: cl.Size, Name: cl.NodeName(), RaftId: EtcdIDToString(cl.NodeID())}
   785  
   786  	if withStatus && cl.rs != nil {
   787  		b, err := cl.rs.Status().MarshalJSON()
   788  		if err != nil {
   789  			logger.Error().Err(err).Msg("failed to marshalEntryData raft consensus")
   790  		} else {
   791  			m := json.RawMessage(b)
   792  			rinfo.Status = &m
   793  		}
   794  	}
   795  	return rinfo
   796  }
   797  
   798  func (cl *Cluster) toConsensusInfo() *types.ConsensusInfo {
   799  	emptyCons := types.ConsensusInfo{
   800  		Type: GetName(),
   801  	}
   802  
   803  	type PeerInfo struct {
   804  		Name   string
   805  		RaftID string
   806  		PeerID string
   807  		Addr   string
   808  	}
   809  
   810  	b, err := json.Marshal(cl.getRaftInfo(true))
   811  	if err != nil {
   812  		logger.Error().Err(err).Msg("failed to marshalEntryData raft consensus")
   813  		return &emptyCons
   814  	}
   815  
   816  	cl.Lock()
   817  	defer cl.Unlock()
   818  
   819  	cons := emptyCons
   820  	cons.Info = string(b)
   821  
   822  	var i int = 0
   823  	if cl.Size != 0 {
   824  		bps := make([]string, cl.Size)
   825  
   826  		for id, m := range cl.Members().MapByID {
   827  			bp := &PeerInfo{Name: m.Name, RaftID: EtcdIDToString(m.ID), PeerID: m.GetPeerID().Pretty(), Addr: m.Address}
   828  			b, err = json.Marshal(bp)
   829  			if err != nil {
   830  				logger.Error().Err(err).Str("raftid", EtcdIDToString(id)).Msg("failed to marshalEntryData raft consensus bp")
   831  				return &emptyCons
   832  			}
   833  			bps[i] = string(b)
   834  
   835  			i++
   836  		}
   837  		cons.Bps = bps
   838  	}
   839  
   840  	return &cons
   841  }
   842  
   843  func (cl *Cluster) NewMemberFromAddReq(req *types.MembershipChange) (*consensus.Member, error) {
   844  	if len(req.Attr.Name) == 0 || len(req.Attr.Address) == 0 || len(req.Attr.PeerID) == 0 {
   845  		return nil, consensus.ErrInvalidMemberAttr
   846  	}
   847  
   848  	return consensus.NewMember(req.Attr.Name, req.Attr.Address, types.PeerID(req.Attr.PeerID), cl.chainID, time.Now().UnixNano()), nil
   849  }
   850  
   851  func (cl *Cluster) NewMemberFromRemoveReq(req *types.MembershipChange) (*consensus.Member, error) {
   852  	if req.Attr.ID == consensus.InvalidMemberID {
   853  		return nil, consensus.ErrInvalidMemberID
   854  	}
   855  
   856  	member := consensus.NewMember("", "", types.PeerID(""), cl.chainID, 0)
   857  	member.SetMemberID(req.Attr.ID)
   858  
   859  	return member, nil
   860  }
   861  
// ChangeMembership requests the membership change described by req.
//
// Under the cluster lock it builds the proposal, checks that the cluster is
// able to change membership and submits the proposal; any error of these
// steps is returned. When nowait is true it returns (nil, nil) right after
// the proposal is submitted. Otherwise it waits for the reply of the proposal
// and returns the changed member or the error of the reply.
func (cl *Cluster) ChangeMembership(req *types.MembershipChange, nowait bool) (*consensus.Member, error) {
	var (
		proposal *consensus.ConfChangePropose
		err      error
	)

	// submit runs the whole make/check/submit sequence under the cluster lock;
	// it fills the proposal and err variables of the enclosing function.
	submit := func() error {
		cl.Lock()
		defer cl.Unlock()

		if proposal, err = cl.makeProposal(req, nowait); err != nil {
			logger.Error().Uint64("requestID", req.GetRequestID()).Msg("failed to make proposal for membership change")
			return err
		}

		if err = cl.isEnableChangeMembership(proposal.Cc); err != nil {
			logger.Error().Err(err).Msg("failed cluster availability check to change membership")
			return err
		}

		if err = cl.submitProposal(proposal, nowait); err != nil {
			return err
		}

		return nil
	}

	if err = submit(); err != nil {
		return nil, err
	}

	if nowait {
		return nil, nil
	}

	// the lock is released here, so waiting does not block other cluster operations
	return cl.recvConfChangeReply(proposal.ReplyC)
}
   899  
// makeProposal builds the conf change proposal for the membership change
// request req. It does not take the cluster lock; ChangeMembership calls it
// with the lock held.
//
// It returns ErrPendingConfChange when another proposal is still saved,
// ErrInvalidMembershipReqType for an unknown request type, or the error of
// creating the member, creating the raft conf change or validating the change.
// The proposal gets a reply channel (buffer size 1) only when nowait is false;
// with nowait its ReplyC is nil.
func (cl *Cluster) makeProposal(req *types.MembershipChange, nowait bool) (*consensus.ConfChangePropose, error) {
	if cl.savedChange != nil {
		logger.Error().Str("cc", types.RaftConfChangeToString(cl.savedChange.Cc)).Msg("already exist pending conf change")
		return nil, ErrPendingConfChange
	}

	var (
		replyC chan *consensus.ConfChangeReply
		member *consensus.Member
		err    error
	)

	switch req.Type {
	case types.MembershipChangeType_ADD_MEMBER:
		member, err = cl.NewMemberFromAddReq(req)

	case types.MembershipChangeType_REMOVE_MEMBER:
		member, err = cl.NewMemberFromRemoveReq(req)

	default:
		return nil, ErrInvalidMembershipReqType
	}

	if err != nil {
		logger.Error().Err(err).Uint64("requestID", req.GetRequestID()).Msg("failed to make new member")
		return nil, err
	}

	// make raft confChange
	cc, err := cl.makeConfChange(req.GetRequestID(), req.Type, member)
	if err != nil {
		logger.Error().Err(err).Uint64("requestID", req.GetRequestID()).Msg("failed to make conf change of raft")
		return nil, err
	}

	// validate member change
	if err = cl.validateChangeMembership(cc, member, false); err != nil {
		logger.Error().Err(err).Uint64("requestID", req.GetRequestID()).Msg("failed to validate request of membership change")
		return nil, err
	}

	if !nowait {
		replyC = make(chan *consensus.ConfChangeReply, 1)
	}

	// TODO check cancel
	// NOTE(review): cancel is deferred, so it runs when this function returns
	// and the context stored in the returned proposal is already cancelled by
	// the time the caller sees it — confirm that consumers of proposal.Ctx
	// do not rely on it.
	ctx, cancel := context.WithTimeout(context.Background(), MaxConfChangeTimeOut)
	defer cancel()

	// send proposeC (confChange, replyC)
	proposal := consensus.ConfChangePropose{Ctx: ctx, Cc: cc, ReplyC: replyC}

	return &proposal, nil
}
   954  
// submitProposal saves proposal as the pending conf change and hands it to
// the conf change channel. It does not take the cluster lock; ChangeMembership
// calls it with the lock held.
//
// It returns ErrPendingConfChange when another proposal is still saved. The
// send on the channel never blocks: when no receiver is ready the proposal is
// dropped, its reply channel is closed (unless nowait), the saved proposal is
// reset and ErrConfChangeChannelBusy is returned.
func (cl *Cluster) submitProposal(proposal *consensus.ConfChangePropose, nowait bool) error {
	if cl.savedChange != nil {
		return ErrPendingConfChange
	}

	cl.saveConfChangePropose(proposal)

	select {
	case cl.confChangeC <- proposal:
		logger.Info().Uint64("requestID", proposal.Cc.ID).Msg("proposal of conf change is sent to raft")
	default:
		logger.Error().Uint64("requestID", proposal.Cc.ID).Msg("proposal of conf change is dropped. confChange channel is busy")

		// with nowait the proposal has no reply channel to close
		if !nowait {
			close(proposal.ReplyC)
		}
		cl.resetSavedConfChangePropose()
		return ErrConfChangeChannelBusy
	}

	return nil
}
   977  
   978  func (cl *Cluster) recvConfChangeReply(replyC chan *consensus.ConfChangeReply) (*consensus.Member, error) {
   979  	select {
   980  	case reply, ok := <-replyC:
   981  		if !ok {
   982  			logger.Panic().Msg("reply channel of change request must not be closed")
   983  		}
   984  
   985  		if reply.Err != nil {
   986  			logger.Error().Err(reply.Err).Msg("failed conf change")
   987  			return nil, reply.Err
   988  		}
   989  
   990  		logger.Info().Str("cluster", cl.toString()).Str("target", reply.Member.ToString()).Msg("reply of conf change is succeed")
   991  
   992  		return reply.Member, nil
   993  	case <-time.After(MaxConfChangeTimeOut):
   994  		// saved conf change must be reset in raft server after request completes
   995  		logger.Warn().Msg("proposal of conf change is time-out")
   996  
   997  		return nil, ErrConChangeTimeOut
   998  	}
   999  }
  1000  
  1001  func (cl *Cluster) AfterConfChange(cc *raftpb.ConfChange, member *consensus.Member, err error) {
  1002  	cl.Lock()
  1003  	defer cl.Unlock()
  1004  
  1005  	// TODO XXX if leader is rebooted, savedChange will be nil, so need to handle this situation
  1006  	if cl.savedChange == nil || cl.savedChange.Cc.ID != cc.ID {
  1007  		return
  1008  	}
  1009  
  1010  	propose := cl.savedChange
  1011  
  1012  	logger.Info().Str("req", util.JSON(propose.Cc)).Msg("conf change succeed")
  1013  
  1014  	cl.resetSavedConfChangePropose()
  1015  
  1016  	if propose.ReplyC != nil {
  1017  		propose.ReplyC <- &consensus.ConfChangeReply{Member: member, Err: err}
  1018  		close(propose.ReplyC)
  1019  	}
  1020  }
  1021  
  1022  func (cl *Cluster) saveConfChangePropose(ccPropose *consensus.ConfChangePropose) {
  1023  	logger.Debug().Uint64("ccid", ccPropose.Cc.ID).Msg("this conf change propose is saved in cluster")
  1024  	cl.savedChange = ccPropose
  1025  }
  1026  
  1027  func (cl *Cluster) resetSavedConfChangePropose() {
  1028  	var ccid uint64
  1029  
  1030  	if cl.savedChange == nil {
  1031  		return
  1032  	}
  1033  
  1034  	ccid = cl.savedChange.Cc.ID
  1035  
  1036  	logger.Debug().Uint64("requestID", ccid).Msg("reset saved conf change propose")
  1037  
  1038  	cl.savedChange = nil
  1039  }
  1040  
  1041  var (
  1042  	ErrRaftStatusEmpty = errors.New("raft status is empty")
  1043  )
  1044  
  1045  // isEnableChangeMembership check if membership change request can stop cluster.
  1046  // case add : current avaliable node < (n + 1)/ 2 + 1
  1047  // case remove : avaliable node except node to remove < (n - 1) / 2 - 1
  1048  //
  1049  // Default :
  1050  // - Add : 1 node라도 장애 node or slow node가 존재하면 add는 불가
  1051  //         slow node기준 - block 높이가 100이상 차이 나는 경우
  1052  // - Remove :
  1053  //         현재 cluster가 available 해야함
  1054  //		   node를 뺄때는 (정상node - 1) >= (n - 1) / 2 + 1 이어야함
  1055  //         slow node는 항상 뺄수 있다. slow node를 뺌으로써 cluster를 정상으로 만들기 위함
  1056  // - force 모드: 무조건 실행 한다.
  1057  func (cl *Cluster) isEnableChangeMembership(cc *raftpb.ConfChange) error {
  1058  	status := cl.rs.Status()
  1059  	if status.ID == 0 {
  1060  		logger.Debug().Msg("raft node is not initialized")
  1061  		return ErrRaftStatusEmpty
  1062  	}
  1063  
  1064  	cp, err := cl.rs.GetClusterProgress()
  1065  	if err != nil {
  1066  		logger.Error().Err(err).Msg("failed to get cluster progress")
  1067  		return err
  1068  	}
  1069  
  1070  	logger.Info().Str("info", cp.ToString()).Msg("cluster progress")
  1071  
  1072  	getHealthyMembers := func(cp *ClusterProgress) int {
  1073  		var healthy int
  1074  
  1075  		for _, mp := range cp.MemberProgresses {
  1076  			if mp.Status == MemberProgressStateHealthy {
  1077  				healthy++
  1078  			}
  1079  		}
  1080  
  1081  		return healthy
  1082  	}
  1083  
  1084  	isClusterAvilable := func(total int, healthy int) bool {
  1085  		quorum := total/2 + 1
  1086  
  1087  		logger.Info().Int("quorum", quorum).Int("total", total).Int("healthy", healthy).Msg("cluster quorum")
  1088  
  1089  		return healthy >= quorum
  1090  	}
  1091  
  1092  	healthy := getHealthyMembers(cp)
  1093  
  1094  	if !isClusterAvilable(cp.N, healthy) {
  1095  		logger.Warn().Msg("curretn cluster status doesn't satisfy quorum")
  1096  	}
  1097  
  1098  	switch {
  1099  	case cc.Type == raftpb.ConfChangeAddNode:
  1100  		for _, mp := range cp.MemberProgresses {
  1101  			if mp.Status != MemberProgressStateHealthy {
  1102  				logger.Error().Uint64("slowgap", MaxSlowNodeGap).Str("unhealthy member", mp.ToString()).Msg("exist unhealthy member in cluster. If you want add some node, fix the unhealthy node and try again")
  1103  				return ErrUnhealtyNodeExist
  1104  			}
  1105  		}
  1106  
  1107  		return nil
  1108  	case cc.Type == raftpb.ConfChangeRemoveNode:
  1109  		mp, ok := cp.MemberProgresses[cc.NodeID]
  1110  		if !ok {
  1111  			logger.Error().Uint64("id", cc.NodeID).Msg("not exist progress of member")
  1112  			return ErrNotExitRaftProgress
  1113  		}
  1114  
  1115  		if mp.Status != MemberProgressStateHealthy {
  1116  			logger.Warn().Uint64("memberid", mp.MemberID).Msg("try to remove slow node")
  1117  			return nil
  1118  		}
  1119  
  1120  		if !isClusterAvilable(cp.N-1, healthy-1) {
  1121  			logger.Error().Msg("can't remove healthy node. If you remove this node, cluster can be stop.")
  1122  			return ErrRemoveHealthyNode
  1123  		}
  1124  
  1125  		return nil
  1126  	default:
  1127  		logger.Error().Msg("type of conf change is invalid")
  1128  		return ErrInvalidMembershipReqType
  1129  	}
  1130  }
  1131  
  1132  func (cl *Cluster) validateChangeMembership(cc *raftpb.ConfChange, member *consensus.Member, needlock bool) error {
  1133  	if member == nil {
  1134  		return ErrCCMemberIsNil
  1135  	}
  1136  
  1137  	if needlock {
  1138  		cl.Lock()
  1139  		defer cl.Unlock()
  1140  	}
  1141  
  1142  	appliedMembers := cl.AppliedMembers()
  1143  
  1144  	if member.ID == consensus.InvalidMemberID {
  1145  		return consensus.ErrInvalidMemberID
  1146  	}
  1147  	if cl.RemovedMembers().isExist(member.ID) {
  1148  		return ErrCCAlreadyRemoved
  1149  	}
  1150  
  1151  	switch cc.Type {
  1152  	case raftpb.ConfChangeAddNode:
  1153  		if !member.IsValid() {
  1154  			logger.Error().Str("member", member.ToString()).Msg("member has invalid fields")
  1155  			return ErrInvalidMember
  1156  		}
  1157  
  1158  		if m := appliedMembers.getMember(member.ID); m != nil {
  1159  			return ErrCCAlreadyAdded
  1160  		}
  1161  
  1162  		if err := appliedMembers.hasDuplicatedMember(member); err != nil {
  1163  			return err
  1164  		}
  1165  
  1166  	case raftpb.ConfChangeRemoveNode:
  1167  		var m *consensus.Member
  1168  
  1169  		if m = appliedMembers.getMember(member.ID); m == nil {
  1170  			return ErrCCNoMemberToRemove
  1171  		}
  1172  
  1173  		*member = *m
  1174  	default:
  1175  		return ErrInvCCType
  1176  	}
  1177  
  1178  	// - TODO UPDATE
  1179  	return nil
  1180  }
  1181  
  1182  func (cl *Cluster) makeConfChange(reqID uint64, reqType types.MembershipChangeType, member *consensus.Member) (*raftpb.ConfChange, error) {
  1183  	var changeType raftpb.ConfChangeType
  1184  	switch reqType {
  1185  	case types.MembershipChangeType_ADD_MEMBER:
  1186  		changeType = raftpb.ConfChangeAddNode
  1187  	case types.MembershipChangeType_REMOVE_MEMBER:
  1188  		changeType = raftpb.ConfChangeRemoveNode
  1189  	default:
  1190  		return nil, ErrInvalidMembershipReqType
  1191  	}
  1192  
  1193  	logger.Debug().Uint64("requestID", reqID).Str("member", member.ToString()).Msg("conf change target member")
  1194  
  1195  	cl.changeSeq++
  1196  
  1197  	data, err := json.Marshal(member)
  1198  	if err != nil {
  1199  		return nil, err
  1200  	}
  1201  
  1202  	// generateConfChangeID
  1203  	cc := &raftpb.ConfChange{ID: reqID, Type: changeType, NodeID: uint64(member.ID), Context: data}
  1204  
  1205  	return cc, nil
  1206  }
  1207  
  1208  func EtcdIDToString(id uint64) string {
  1209  	return fmt.Sprintf("%x", id)
  1210  }