github.com/okex/exchain@v1.8.0/libs/tendermint/consensus/consensus.go (about)

     1  package consensus
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/okex/exchain/libs/system/trace"
    10  	cfg "github.com/okex/exchain/libs/tendermint/config"
    11  	cstypes "github.com/okex/exchain/libs/tendermint/consensus/types"
    12  	"github.com/okex/exchain/libs/tendermint/crypto"
    13  	tmevents "github.com/okex/exchain/libs/tendermint/libs/events"
    14  	"github.com/okex/exchain/libs/tendermint/libs/log"
    15  	"github.com/okex/exchain/libs/tendermint/libs/service"
    16  	"github.com/okex/exchain/libs/tendermint/p2p"
    17  	sm "github.com/okex/exchain/libs/tendermint/state"
    18  	"github.com/okex/exchain/libs/tendermint/types"
    19  	"github.com/pkg/errors"
    20  	"github.com/spf13/viper"
    21  )
    22  
    23  //-----------------------------------------------------------------------------
    24  // Errors
    25  
// Package-level sentinel errors and the view-change switch.
var (
	// Exported sentinels; each message names the failing condition.
	ErrInvalidProposalSignature = errors.New("error invalid proposal signature")
	ErrInvalidProposalPOLRound  = errors.New("error invalid proposal POL round")
	ErrAddingVote               = errors.New("error adding vote")
	ErrVoteHeightMismatch       = errors.New("error vote height mismatch")

	// errPubKeyIsNotSet is logged by recordMetrics when a private validator is
	// configured but its public key has not been cached.
	errPubKeyIsNotSet = errors.New("pubkey is not set. Look for \"Can't get private validator pubkey\" errors")

	// activeViewChange is the flag behind SetActiveVC/GetActiveVC. It is a
	// plain bool with no synchronization of its own.
	activeViewChange = false
)
    36  
// SetActiveVC sets the package-level activeViewChange flag to value.
// NOTE(review): the flag is written without any locking — confirm it is only
// set before concurrent readers start.
func SetActiveVC(value bool) {
	activeViewChange = value
}
    40  
// GetActiveVC reports the current value of the package-level
// activeViewChange flag (false unless SetActiveVC changed it).
func GetActiveVC() bool {
	return activeViewChange
}
    44  
// preBlockTaskRes pairs a block with its part set. Values of this type travel
// over State.taskResultChan.
// NOTE(review): presumably the output of a pre-made block task — confirm
// against the routine that fills the channel.
type preBlockTaskRes struct {
	block      *types.Block
	blockParts *types.PartSet
}
    49  
    50  //-----------------------------------------------------------------------------
    51  
// msgQueueSize is the buffer capacity NewState gives to the peer, internal
// and stats message channels. EnablePrerunTx is the viper key NewState reads
// to initialise State.prerunTx.
const (
	msgQueueSize   = 1000
	EnablePrerunTx = "enable-preruntx"
)
    56  
// msgInfo wraps a consensus Message together with the ID of the peer it is
// attributed to. It is the element type of the peer, internal and stats
// message queues on State; such messages may update the state.
type msgInfo struct {
	Msg    Message `json:"msg"`
	PeerID p2p.ID  `json:"peer_key"`
}
    62  
// timeoutInfo is an internally generated message which may update the state.
// It records a duration together with the height/round/step it refers to and
// whether it relates to an active view change.
type timeoutInfo struct {
	Duration         time.Duration         `json:"duration"`
	Height           int64                 `json:"height"`
	Round            int                   `json:"round"`
	Step             cstypes.RoundStepType `json:"step"`
	ActiveViewChange bool                  `json:"active-view-change"`
}
    71  
// String renders the timeout as "<duration> ; <height>/<round> <step>".
// The ActiveViewChange field is not part of the output.
func (ti *timeoutInfo) String() string {
	return fmt.Sprintf("%v ; %d/%d %v", ti.Duration, ti.Height, ti.Round, ti.Step)
}
    75  
// txNotifier is the consensus-side interface to the mempool.
type txNotifier interface {
	// TxsAvailable returns a receive-only signalling channel.
	// NOTE(review): presumably fired when the mempool has transactions — the
	// implementation is not visible here.
	TxsAvailable() <-chan struct{}
}
    80  
// evidencePool is the consensus-side interface to the evidence pool.
type evidencePool interface {
	// AddEvidence hands a piece of evidence to the pool and returns the
	// pool's error, if any.
	AddEvidence(types.Evidence) error
}
    85  
// State handles execution of the consensus algorithm.
// It processes votes and proposals, and upon reaching agreement,
// commits blocks to the chain and executes them against the application.
// The internal state machine receives input from peers, the internal validator, and from a timer.
type State struct {
	service.BaseService

	// config details
	config        *cfg.ConsensusConfig
	privValidator types.PrivValidator // for signing votes

	// store blocks and commits
	blockStore sm.BlockStore

	// create and execute blocks
	blockExec *sm.BlockExecutor

	// notify us if txs are available
	txNotifier txNotifier

	// add evidence to the pool
	// when it's detected
	evpool evidencePool

	// internal state; mtx is the lock taken by the exported accessors in this
	// file (GetState, GetRoundState, SetPrivValidator, ...).
	// NOTE(review): stateMtx is not used in the visible section — confirm what it guards.
	mtx      sync.RWMutex
	stateMtx sync.RWMutex
	cstypes.RoundState
	state sm.State // State until height-1.
	// privValidator pubkey, memoized for the duration of one block
	// to avoid extra requests to HSM
	privValidatorPubKey crypto.PubKey

	// state changes may be triggered by: msgs from peers,
	// msgs from ourself, or by timeouts
	peerMsgQueue     chan msgInfo
	internalMsgQueue chan msgInfo
	timeoutTicker    TimeoutTicker

	// information about added votes and block parts is written on this channel
	// so statistics can be computed by the reactor
	statsMsgQueue chan msgInfo

	// we use eventBus to trigger msg broadcasts in the reactor,
	// and to notify external subscribers, eg. through a websocket
	eventBus *types.EventBus

	// a Write-Ahead Log ensures we can recover from any kind of crash
	// and helps us avoid signing conflicting votes
	wal          WAL
	replayMode   bool // so we don't log signing errors during replay
	doWALCatchup bool // determines if we even try to do the catchup

	// for tests where we want to limit the number of transitions the state makes
	nSteps int

	// some functions can be overwritten for testing
	decideProposal func(height int64, round int)
	doPrevote      func(height int64, round int)
	setProposal    func(proposal *types.Proposal) (bool, error)

	// closed when we finish shutting down
	done chan struct{}

	// synchronous pubsub between consensus state and reactor.
	// state only emits EventNewRoundStep and EventVote
	evsw tmevents.EventSwitch

	// for reporting metrics
	metrics *Metrics

	// tracers created in NewState with trace.Consensus and
	// trace.LastBlockTime respectively
	trc          *trace.Tracer
	blockTimeTrc *trace.Tracer

	// prerunTx mirrors the EnablePrerunTx viper flag; when set, NewState calls
	// blockExec.InitPrerun. bt is allocated empty in NewState.
	// NOTE(review): how bt is used is not visible in this section.
	prerunTx bool
	bt       *BlockTransport

	// view-change bookkeeping.
	// NOTE(review): semantics of vcMsg and the height->string map live elsewhere — confirm.
	vcMsg    *ViewChangeMessage
	vcHeight map[int64]string

	// capacity-1 channels created in NewState.
	// NOTE(review): presumably requests to, and results from,
	// preMakeBlockRoutine — confirm.
	preBlockTaskChan chan *preBlockTask
	taskResultChan   chan *preBlockTaskRes
}
   169  
// preBlockTask is the element type of State.preBlockTaskChan. It carries a
// height and a duration.
// NOTE(review): presumably "pre-make the block for this height after this
// delay" — confirm against the consuming routine.
type preBlockTask struct {
	height   int64
	duration time.Duration
}
   175  
// StateOption sets an optional parameter on the State.
// NewState applies the options in order, after the rest of the State has been
// initialised.
type StateOption func(*State)
   178  
// NewState returns a new State.
//
// It allocates the message queues, ticker, event switch, tracers and
// pre-block channels. The WAL starts as nilWAL and metrics as NopMetrics.
// The default decideProposal/doPrevote/setProposal hooks are installed, and
// the State is synchronised to the supplied chain state via updateToState.
// Options are applied last, so they can override any of these defaults.
// The returned State is not started.
func NewState(
	config *cfg.ConsensusConfig,
	state sm.State,
	blockExec *sm.BlockExecutor,
	blockStore sm.BlockStore,
	txNotifier txNotifier,
	evpool evidencePool,
	options ...StateOption,
) *State {
	cs := &State{
		config:           config,
		blockExec:        blockExec,
		blockStore:       blockStore,
		txNotifier:       txNotifier,
		peerMsgQueue:     make(chan msgInfo, msgQueueSize),
		internalMsgQueue: make(chan msgInfo, msgQueueSize),
		timeoutTicker:    NewTimeoutTicker(),
		statsMsgQueue:    make(chan msgInfo, msgQueueSize),
		done:             make(chan struct{}),
		doWALCatchup:     true,
		wal:              nilWAL{},
		evpool:           evpool,
		evsw:             tmevents.NewEventSwitch(),
		metrics:          NopMetrics(),
		trc:              trace.NewTracer(trace.Consensus),
		prerunTx:         viper.GetBool(EnablePrerunTx),
		bt:               &BlockTransport{},
		blockTimeTrc:     trace.NewTracer(trace.LastBlockTime),
		vcHeight:         make(map[int64]string),
		taskResultChan:   make(chan *preBlockTaskRes, 1),
		preBlockTaskChan: make(chan *preBlockTask, 1),
	}
	// set function defaults (may be overwritten before calling Start)
	cs.decideProposal = cs.defaultDecideProposal
	cs.doPrevote = cs.defaultDoPrevote
	cs.setProposal = cs.defaultSetProposal

	// We have no votes, so reconstruct LastCommit from SeenCommit.
	if state.LastBlockHeight > types.GetStartBlockHeight() {
		cs.reconstructLastCommit(state)
	}

	cs.updateToState(state)
	if cs.prerunTx {
		cs.blockExec.InitPrerun()
	}

	// Don't call scheduleRound0 yet.
	// We do that upon Start().
	cs.BaseService = *service.NewBaseService(nil, "State", cs)
	for _, option := range options {
		option(cs)
	}
	return cs
}
   235  
   236  //----------------------------------------
   237  // Public interface
   238  
// SetLogger implements Service.
// The logger is stored on the embedded BaseService and also forwarded to the
// timeout ticker.
func (cs *State) SetLogger(l log.Logger) {
	cs.BaseService.Logger = l
	cs.timeoutTicker.SetLogger(l)
}
   244  
// SetEventBus sets the event bus on both the consensus state and its block
// executor. Until it is called, newStep publishes no events.
func (cs *State) SetEventBus(b *types.EventBus) {
	cs.eventBus = b
	cs.blockExec.SetEventBus(b)
}
   250  
// StateMetrics returns a StateOption that replaces the State's metrics
// (NopMetrics by default) with the given Metrics.
func StateMetrics(metrics *Metrics) StateOption {
	return func(cs *State) { cs.metrics = metrics }
}
   255  
   256  // String returns a string.
   257  func (cs *State) String() string {
   258  	// better not to access shared variables
   259  	return fmt.Sprintf("ConsensusState") //(H:%v R:%v S:%v", cs.Height, cs.Round, cs.Step)
   260  }
   261  
// GetState returns a copy of the chain state, taken under the read lock.
func (cs *State) GetState() sm.State {
	cs.mtx.RLock()
	defer cs.mtx.RUnlock()
	return cs.state.Copy()
}
   268  
// GetLastHeight returns the last height committed, computed as the current
// round-state height minus one (read under the read lock).
// If there were no blocks, returns 0.
// NOTE(review): the "returns 0" case assumes the first consensus height is 1 —
// confirm for chains with a non-zero start block height.
func (cs *State) GetLastHeight() int64 {
	cs.mtx.RLock()
	defer cs.mtx.RUnlock()
	return cs.RoundState.Height - 1
}
   276  
   277  // GetRoundState returns a shallow copy of the internal consensus state.
   278  func (cs *State) GetRoundState() *cstypes.RoundState {
   279  	cs.mtx.RLock()
   280  	rs := cs.RoundState // copy
   281  	cs.mtx.RUnlock()
   282  	return &rs
   283  }
   284  
// GetRoundStateJSON returns a json of RoundState, marshalled using go-amino.
// The read lock is held for the duration of the marshalling.
func (cs *State) GetRoundStateJSON() ([]byte, error) {
	cs.mtx.RLock()
	defer cs.mtx.RUnlock()
	return cdc.MarshalJSON(cs.RoundState)
}
   291  
// GetRoundStateSimpleJSON returns a json of RoundStateSimple, marshalled using go-amino.
// The simplified view is built and marshalled while the read lock is held.
func (cs *State) GetRoundStateSimpleJSON() ([]byte, error) {
	cs.mtx.RLock()
	defer cs.mtx.RUnlock()
	return cdc.MarshalJSON(cs.RoundState.RoundStateSimple())
}
   298  
// GetValidators returns the chain state's last block height together with
// the validator list of a copy of the current validator set. Both are read
// under the read lock.
func (cs *State) GetValidators() (int64, []*types.Validator) {
	cs.mtx.RLock()
	defer cs.mtx.RUnlock()
	return cs.state.LastBlockHeight, cs.state.Validators.Copy().Validators
}
   305  
// SetPrivValidator sets the private validator account for signing votes. It
// immediately requests pubkey and caches it.
// A failure to fetch the pubkey is only logged; the validator is still set
// and the cached key keeps whatever value it had before.
func (cs *State) SetPrivValidator(priv types.PrivValidator) {
	cs.mtx.Lock()
	defer cs.mtx.Unlock()

	cs.privValidator = priv

	if err := cs.updatePrivValidatorPubKey(); err != nil {
		cs.Logger.Error("Can't get private validator pubkey", "err", err)
	}
}
   318  
   319  // SetTimeoutTicker sets the local timer. It may be useful to overwrite for testing.
   320  func (cs *State) SetTimeoutTicker(timeoutTicker TimeoutTicker) {
   321  	cs.mtx.Lock()
   322  	cs.timeoutTicker = timeoutTicker
   323  	cs.mtx.Unlock()
   324  }
   325  
   326  // LoadCommit loads the commit for a given height.
   327  func (cs *State) LoadCommit(height int64) *types.Commit {
   328  	cs.mtx.RLock()
   329  	defer cs.mtx.RUnlock()
   330  	if height == cs.blockStore.Height() {
   331  		return cs.blockStore.LoadSeenCommit(height)
   332  	}
   333  	return cs.blockStore.LoadBlockCommit(height)
   334  }
   335  
   336  // OnStart implements service.Service.
   337  // It loads the latest state via the WAL, and starts the timeout and receive routines.
   338  func (cs *State) OnStart() error {
   339  	if err := cs.evsw.Start(); err != nil {
   340  		cs.Logger.Error("evsw start failed. err: ", err)
   341  		return err
   342  	}
   343  
   344  	// we may set the WAL in testing before calling Start,
   345  	// so only OpenWAL if its still the nilWAL
   346  	if _, ok := cs.wal.(nilWAL); ok {
   347  		walFile := cs.config.WalFile()
   348  		wal, err := cs.OpenWAL(walFile)
   349  		if err != nil {
   350  			cs.Logger.Error("Error loading State wal", "err", err.Error())
   351  			return err
   352  		}
   353  		cs.wal = wal
   354  	}
   355  
   356  	// we need the timeoutRoutine for replay so
   357  	// we don't block on the tick chan.
   358  	// NOTE: we will get a build up of garbage go routines
   359  	// firing on the tockChan until the receiveRoutine is started
   360  	// to deal with them (by that point, at most one will be valid)
   361  	if err := cs.timeoutTicker.Start(); err != nil {
   362  		return err
   363  	}
   364  
   365  	// we may have lost some votes if the process crashed
   366  	// reload from consensus log to catchup
   367  	if cs.doWALCatchup {
   368  		if err := cs.catchupReplay(cs.Height); err != nil {
   369  			// don't try to recover from data corruption error
   370  			if IsDataCorruptionError(err) {
   371  				cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error())
   372  				cs.Logger.Error("Please repair the WAL file before restarting")
   373  				fmt.Println(`You can attempt to repair the WAL as follows:
   374  
   375  ----
   376  WALFILE=~/.tendermint/data/cs.wal/wal
   377  cp $WALFILE ${WALFILE}.bak # backup the file
   378  go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored
   379  rm $WALFILE # remove the corrupt file
   380  go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption
   381  ----`)
   382  
   383  				return err
   384  			}
   385  
   386  			cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err.Error())
   387  			// NOTE: if we ever do return an error here,
   388  			// make sure to stop the timeoutTicker
   389  		}
   390  	}
   391  
   392  	if cs.done == nil {
   393  		cs.done = make(chan struct{})
   394  	}
   395  
   396  	// now start the receiveRoutine
   397  	go cs.receiveRoutine(0)
   398  
   399  	go cs.preMakeBlockRoutine()
   400  
   401  	// schedule the first round!
   402  	// use GetRoundState so we don't race the receiveRoutine for access
   403  	cs.scheduleRound0(cs.GetRoundState())
   404  
   405  	return nil
   406  }
   407  
// OnStop implements service.Service.
// It stops the event switch and the timeout ticker; the WAL is not stopped
// here.
func (cs *State) OnStop() {
	cs.evsw.Stop()
	cs.timeoutTicker.Stop()
	// WAL is stopped in receiveRoutine.
}
   414  
// OnReset resets the event switch, the WAL and the timeout ticker. The WAL
// field is put back to nilWAL, which makes a later OnStart open a fresh WAL.
// It always returns nil; results of the individual Reset calls are not
// inspected.
func (cs *State) OnReset() error {
	cs.evsw.Reset()
	cs.wal.Reset()
	cs.wal = nilWAL{}
	cs.timeoutTicker.Reset()
	return nil
}
   422  
// Wait blocks until the done channel is closed, i.e. until the main routine
// has returned. It returns immediately if done is nil.
// NOTE: be sure to Stop() the event switch and drain
// any event channels or this may deadlock
func (cs *State) Wait() {
	if cs.done != nil {
		<-cs.done
	}
}
   431  
// OpenWAL opens a file to log all consensus messages and timeouts for deterministic accountability.
// It creates the WAL for walFile, attaches a logger tagged with the file
// name, and starts it. On any failure it returns a nil WAL and the error;
// only the creation failure is logged here.
func (cs *State) OpenWAL(walFile string) (WAL, error) {
	wal, err := NewWAL(walFile)
	if err != nil {
		cs.Logger.Error("Failed to open WAL for consensus state", "wal", walFile, "err", err)
		return nil, err
	}
	wal.SetLogger(cs.Logger.With("wal", walFile))
	if err := wal.Start(); err != nil {
		return nil, err
	}
	return wal, nil
}
   445  
   446  //------------------------------------------------------------
   447  // internal functions for managing the state
   448  
// updateRoundStep overwrites the Round and Step of the embedded RoundState.
// It takes no lock and fires no event itself.
func (cs *State) updateRoundStep(round int, step cstypes.RoundStepType) {
	cs.Round = round
	cs.Step = step
}
   453  
// reconstructLastCommit rebuilds cs.LastCommit from the SeenCommit that was
// saved along with the block (which happens even before saving the state).
// It is a no-op when the state is still at the start block height.
// It panics if the seen commit for state.LastBlockHeight is missing, or if
// the vote set rebuilt from it lacks a +2/3 majority.
func (cs *State) reconstructLastCommit(state sm.State) {
	if state.LastBlockHeight == types.GetStartBlockHeight() {
		return
	}
	seenCommit := cs.blockStore.LoadSeenCommit(state.LastBlockHeight)
	if seenCommit == nil {
		panic(fmt.Sprintf("Failed to reconstruct LastCommit: seen commit for height %v not found",
			state.LastBlockHeight))
	}
	lastPrecommits := types.CommitToVoteSet(state.ChainID, seenCommit, state.LastValidators)
	if !lastPrecommits.HasTwoThirdsMajority() {
		panic("Failed to reconstruct LastCommit: Does not have +2/3 maj")
	}
	cs.LastCommit = lastPrecommits
}
   471  
// newStep records a step transition: it writes the current round-state event
// to the WAL, increments nSteps, and — when an event bus is attached —
// publishes the event on the bus and fires EventNewRoundStep on the event
// switch.
func (cs *State) newStep() {
	rs := cs.RoundStateEvent()
	cs.wal.Write(rs)
	cs.nSteps++
	// newStep is called by updateToState in NewState before the eventBus is set!
	if cs.eventBus != nil {
		cs.eventBus.PublishEventNewRoundStep(rs)
		cs.evsw.FireEvent(types.EventNewRoundStep, &cs.RoundState)
	}
}
   482  
   483  // needProofBlock returns true on the first height (so the genesis app hash is signed right away)
   484  // and where the last block (height-1) caused the app hash to change
   485  func (cs *State) needProofBlock(height int64) bool {
   486  	if height == types.GetStartBlockHeight()+1 {
   487  		return true
   488  	}
   489  
   490  	lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1)
   491  	if lastBlockMeta == nil {
   492  		panic(fmt.Sprintf("needProofBlock: last block meta for height %d not found", height-1))
   493  	}
   494  	return !bytes.Equal(cs.state.AppHash, lastBlockMeta.Header.AppHash)
   495  }
   496  
// recordMetrics pushes per-block statistics for block (at height) into
// cs.metrics: validator count and power, missing validators and their power,
// this node's own signing record, byzantine validators and their power, the
// interval since the previous block, transaction counts, block size and the
// committed height.
// For heights above start+1 it panics if the size of block.LastCommit does
// not match the size of the last validator set.
func (cs *State) recordMetrics(height int64, block *types.Block) {
	cs.metrics.Validators.Set(float64(cs.Validators.Size()))
	cs.metrics.ValidatorsPower.Set(float64(cs.Validators.TotalVotingPower()))

	var (
		missingValidators      int
		missingValidatorsPower int64
	)
	// height=0 -> MissingValidators and MissingValidatorsPower are both 0.
	// Remember that the first LastCommit is intentionally empty, so it's not
	// fair to increment missing validators number.
	if height > types.GetStartBlockHeight()+1 {
		// Sanity check that commit size matches validator set size - only applies
		// after first block.
		var (
			commitSize = block.LastCommit.Size()
			valSetLen  = len(cs.LastValidators.Validators)
			address    types.Address
		)
		if commitSize != valSetLen {
			panic(fmt.Sprintf("commit size (%d) doesn't match valset length (%d) at height %d\n\n%v\n\n%v",
				commitSize, valSetLen, block.Height, block.LastCommit.Signatures, cs.LastValidators.Validators))
		}

		// Resolve our own validator address, if we have a cached pubkey, so
		// the loop below can report this node's signing record.
		if cs.privValidator != nil {
			if cs.privValidatorPubKey == nil {
				// Metrics won't be updated, but it's not critical.
				cs.Logger.Error(fmt.Sprintf("recordMetrics: %v", errPubKeyIsNotSet))
			} else {
				address = cs.privValidatorPubKey.Address()
			}
		}

		// Signatures are indexed in the same order as LastValidators (the
		// size check above guarantees the index is in range).
		for i, val := range cs.LastValidators.Validators {
			commitSig := block.LastCommit.Signatures[i]
			if commitSig.Absent() {
				missingValidators++
				missingValidatorsPower += val.VotingPower
			}

			if bytes.Equal(val.Address, address) {
				label := []string{
					"validator_address", val.Address.String(),
				}
				cs.metrics.ValidatorPower.With(label...).Set(float64(val.VotingPower))
				if commitSig.ForBlock() {
					cs.metrics.ValidatorLastSignedHeight.With(label...).Set(float64(height))
				} else {
					cs.metrics.ValidatorMissedBlocks.With(label...).Add(float64(1))
				}
			}

		}
	}
	cs.metrics.MissingValidators.Set(float64(missingValidators))
	cs.metrics.MissingValidatorsPower.Set(float64(missingValidatorsPower))

	cs.metrics.ByzantineValidators.Set(float64(len(block.Evidence.Evidence)))
	byzantineValidatorsPower := int64(0)
	for _, ev := range block.Evidence.Evidence {
		// Only evidence against members of the current validator set adds power.
		if _, val := cs.Validators.GetByAddress(ev.Address()); val != nil {
			byzantineValidatorsPower += val.VotingPower
		}
	}
	cs.metrics.ByzantineValidatorsPower.Set(float64(byzantineValidatorsPower))

	// Block interval is reported only when the previous block's meta is
	// available in the store.
	if height > 1 {
		lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1)
		if lastBlockMeta != nil {
			cs.metrics.BlockIntervalSeconds.Set(
				block.Time.Sub(lastBlockMeta.Header.Time).Seconds(),
			)
		}
	}

	cs.metrics.NumTxs.Set(float64(len(block.Data.Txs)))
	cs.metrics.TotalTxs.Add(float64(len(block.Data.Txs)))
	cs.metrics.BlockSizeBytes.Set(float64(block.FastSize()))
	cs.metrics.CommittedHeight.Set(float64(block.Height))
}
   577  
   578  // updatePrivValidatorPubKey get's the private validator public key and
   579  // memoizes it. This func returns an error if the private validator is not
   580  // responding or responds with an error.
   581  func (cs *State) updatePrivValidatorPubKey() error {
   582  	if cs.privValidator == nil {
   583  		return nil
   584  	}
   585  
   586  	pubKey, err := cs.privValidator.GetPubKey()
   587  	if err != nil {
   588  		return err
   589  	}
   590  	cs.privValidatorPubKey = pubKey
   591  	return nil
   592  }
   593  
// BlockExec returns the block executor held by this State (the one passed to
// NewState). No lock is taken.
func (cs *State) BlockExec() *sm.BlockExecutor {
	return cs.blockExec
}
   597  
   598  //---------------------------------------------------------
   599  
   600  func CompareHRS(h1 int64, r1 int, s1 cstypes.RoundStepType, h2 int64, r2 int, s2 cstypes.RoundStepType, hasVC bool) int {
   601  	if h1 < h2 {
   602  		return -1
   603  	} else if h1 > h2 {
   604  		return 1
   605  	}
   606  	if r1 < r2 {
   607  		return -1
   608  	} else if r1 > r2 {
   609  		return 1
   610  	}
   611  	if hasVC {
   612  		return 1
   613  	}
   614  	if s1 < s2 {
   615  		return -1
   616  	} else if s1 > s2 {
   617  		return 1
   618  	}
   619  	return 0
   620  }