github.com/okex/exchain@v1.8.0/libs/tendermint/consensus/consensus.go (about) 1 package consensus 2 3 import ( 4 "bytes" 5 "fmt" 6 "sync" 7 "time" 8 9 "github.com/okex/exchain/libs/system/trace" 10 cfg "github.com/okex/exchain/libs/tendermint/config" 11 cstypes "github.com/okex/exchain/libs/tendermint/consensus/types" 12 "github.com/okex/exchain/libs/tendermint/crypto" 13 tmevents "github.com/okex/exchain/libs/tendermint/libs/events" 14 "github.com/okex/exchain/libs/tendermint/libs/log" 15 "github.com/okex/exchain/libs/tendermint/libs/service" 16 "github.com/okex/exchain/libs/tendermint/p2p" 17 sm "github.com/okex/exchain/libs/tendermint/state" 18 "github.com/okex/exchain/libs/tendermint/types" 19 "github.com/pkg/errors" 20 "github.com/spf13/viper" 21 ) 22 23 //----------------------------------------------------------------------------- 24 // Errors 25 26 var ( 27 ErrInvalidProposalSignature = errors.New("error invalid proposal signature") 28 ErrInvalidProposalPOLRound = errors.New("error invalid proposal POL round") 29 ErrAddingVote = errors.New("error adding vote") 30 ErrVoteHeightMismatch = errors.New("error vote height mismatch") 31 32 errPubKeyIsNotSet = errors.New("pubkey is not set. Look for \"Can't get private validator pubkey\" errors") 33 34 activeViewChange = false 35 ) 36 37 func SetActiveVC(value bool) { 38 activeViewChange = value 39 } 40 41 func GetActiveVC() bool { 42 return activeViewChange 43 } 44 45 type preBlockTaskRes struct { 46 block *types.Block 47 blockParts *types.PartSet 48 } 49 50 //----------------------------------------------------------------------------- 51 52 const ( 53 msgQueueSize = 1000 54 EnablePrerunTx = "enable-preruntx" 55 ) 56 57 // msgs from the reactor which may update the state 58 type msgInfo struct { 59 Msg Message `json:"msg"` 60 PeerID p2p.ID `json:"peer_key"` 61 } 62 63 // internally generated messages which may update the state 64 type timeoutInfo struct { 65 Duration time.Duration `json:"duration"` 66 Height int64 `json:"height"` 67 Round int `json:"round"` 68 Step cstypes.RoundStepType `json:"step"` 69 ActiveViewChange bool `json:"active-view-change"` 70 } 71 72 func (ti *timeoutInfo) String() string { 73 return fmt.Sprintf("%v ; %d/%d %v", ti.Duration, ti.Height, ti.Round, ti.Step) 74 } 75 76 // interface to the mempool 77 type txNotifier interface { 78 TxsAvailable() <-chan struct{} 79 } 80 81 // interface to the evidence pool 82 type evidencePool interface { 83 AddEvidence(types.Evidence) error 84 } 85 86 // State handles execution of the consensus algorithm. 87 // It processes votes and proposals, and upon reaching agreement, 88 // commits blocks to the chain and executes them against the application. 89 // The internal state machine receives input from peers, the internal validator, and from a timer. 90 type State struct { 91 service.BaseService 92 93 // config details 94 config *cfg.ConsensusConfig 95 privValidator types.PrivValidator // for signing votes 96 97 // store blocks and commits 98 blockStore sm.BlockStore 99 100 // create and execute blocks 101 blockExec *sm.BlockExecutor 102 103 // notify us if txs are available 104 txNotifier txNotifier 105 106 // add evidence to the pool 107 // when it's detected 108 evpool evidencePool 109 110 // internal state 111 mtx sync.RWMutex 112 stateMtx sync.RWMutex 113 cstypes.RoundState 114 state sm.State // State until height-1. 115 // privValidator pubkey, memoized for the duration of one block 116 // to avoid extra requests to HSM 117 privValidatorPubKey crypto.PubKey 118 119 // state changes may be triggered by: msgs from peers, 120 // msgs from ourself, or by timeouts 121 peerMsgQueue chan msgInfo 122 internalMsgQueue chan msgInfo 123 timeoutTicker TimeoutTicker 124 125 // information about about added votes and block parts are written on this channel 126 // so statistics can be computed by reactor 127 statsMsgQueue chan msgInfo 128 129 // we use eventBus to trigger msg broadcasts in the reactor, 130 // and to notify external subscribers, eg. through a websocket 131 eventBus *types.EventBus 132 133 // a Write-Ahead Log ensures we can recover from any kind of crash 134 // and helps us avoid signing conflicting votes 135 wal WAL 136 replayMode bool // so we don't log signing errors during replay 137 doWALCatchup bool // determines if we even try to do the catchup 138 139 // for tests where we want to limit the number of transitions the state makes 140 nSteps int 141 142 // some functions can be overwritten for testing 143 decideProposal func(height int64, round int) 144 doPrevote func(height int64, round int) 145 setProposal func(proposal *types.Proposal) (bool, error) 146 147 // closed when we finish shutting down 148 done chan struct{} 149 150 // synchronous pubsub between consensus state and reactor. 151 // state only emits EventNewRoundStep and EventVote 152 evsw tmevents.EventSwitch 153 154 // for reporting metrics 155 metrics *Metrics 156 157 trc *trace.Tracer 158 blockTimeTrc *trace.Tracer 159 160 prerunTx bool 161 bt *BlockTransport 162 163 vcMsg *ViewChangeMessage 164 vcHeight map[int64]string 165 166 preBlockTaskChan chan *preBlockTask 167 taskResultChan chan *preBlockTaskRes 168 } 169 170 // preBlockSignal 171 type preBlockTask struct { 172 height int64 173 duration time.Duration 174 } 175 176 // StateOption sets an optional parameter on the State. 177 type StateOption func(*State) 178 179 // NewState returns a new State. 180 func NewState( 181 config *cfg.ConsensusConfig, 182 state sm.State, 183 blockExec *sm.BlockExecutor, 184 blockStore sm.BlockStore, 185 txNotifier txNotifier, 186 evpool evidencePool, 187 options ...StateOption, 188 ) *State { 189 cs := &State{ 190 config: config, 191 blockExec: blockExec, 192 blockStore: blockStore, 193 txNotifier: txNotifier, 194 peerMsgQueue: make(chan msgInfo, msgQueueSize), 195 internalMsgQueue: make(chan msgInfo, msgQueueSize), 196 timeoutTicker: NewTimeoutTicker(), 197 statsMsgQueue: make(chan msgInfo, msgQueueSize), 198 done: make(chan struct{}), 199 doWALCatchup: true, 200 wal: nilWAL{}, 201 evpool: evpool, 202 evsw: tmevents.NewEventSwitch(), 203 metrics: NopMetrics(), 204 trc: trace.NewTracer(trace.Consensus), 205 prerunTx: viper.GetBool(EnablePrerunTx), 206 bt: &BlockTransport{}, 207 blockTimeTrc: trace.NewTracer(trace.LastBlockTime), 208 vcHeight: make(map[int64]string), 209 taskResultChan: make(chan *preBlockTaskRes, 1), 210 preBlockTaskChan: make(chan *preBlockTask, 1), 211 } 212 // set function defaults (may be overwritten before calling Start) 213 cs.decideProposal = cs.defaultDecideProposal 214 cs.doPrevote = cs.defaultDoPrevote 215 cs.setProposal = cs.defaultSetProposal 216 217 // We have no votes, so reconstruct LastCommit from SeenCommit. 218 if state.LastBlockHeight > types.GetStartBlockHeight() { 219 cs.reconstructLastCommit(state) 220 } 221 222 cs.updateToState(state) 223 if cs.prerunTx { 224 cs.blockExec.InitPrerun() 225 } 226 227 // Don't call scheduleRound0 yet. 228 // We do that upon Start(). 229 cs.BaseService = *service.NewBaseService(nil, "State", cs) 230 for _, option := range options { 231 option(cs) 232 } 233 return cs 234 } 235 236 //---------------------------------------- 237 // Public interface 238 239 // SetLogger implements Service. 240 func (cs *State) SetLogger(l log.Logger) { 241 cs.BaseService.Logger = l 242 cs.timeoutTicker.SetLogger(l) 243 } 244 245 // SetEventBus sets event bus. 246 func (cs *State) SetEventBus(b *types.EventBus) { 247 cs.eventBus = b 248 cs.blockExec.SetEventBus(b) 249 } 250 251 // StateMetrics sets the metrics. 252 func StateMetrics(metrics *Metrics) StateOption { 253 return func(cs *State) { cs.metrics = metrics } 254 } 255 256 // String returns a string. 257 func (cs *State) String() string { 258 // better not to access shared variables 259 return fmt.Sprintf("ConsensusState") //(H:%v R:%v S:%v", cs.Height, cs.Round, cs.Step) 260 } 261 262 // GetState returns a copy of the chain state. 263 func (cs *State) GetState() sm.State { 264 cs.mtx.RLock() 265 defer cs.mtx.RUnlock() 266 return cs.state.Copy() 267 } 268 269 // GetLastHeight returns the last height committed. 270 // If there were no blocks, returns 0. 271 func (cs *State) GetLastHeight() int64 { 272 cs.mtx.RLock() 273 defer cs.mtx.RUnlock() 274 return cs.RoundState.Height - 1 275 } 276 277 // GetRoundState returns a shallow copy of the internal consensus state. 278 func (cs *State) GetRoundState() *cstypes.RoundState { 279 cs.mtx.RLock() 280 rs := cs.RoundState // copy 281 cs.mtx.RUnlock() 282 return &rs 283 } 284 285 // GetRoundStateJSON returns a json of RoundState, marshalled using go-amino. 286 func (cs *State) GetRoundStateJSON() ([]byte, error) { 287 cs.mtx.RLock() 288 defer cs.mtx.RUnlock() 289 return cdc.MarshalJSON(cs.RoundState) 290 } 291 292 // GetRoundStateSimpleJSON returns a json of RoundStateSimple, marshalled using go-amino. 293 func (cs *State) GetRoundStateSimpleJSON() ([]byte, error) { 294 cs.mtx.RLock() 295 defer cs.mtx.RUnlock() 296 return cdc.MarshalJSON(cs.RoundState.RoundStateSimple()) 297 } 298 299 // GetValidators returns a copy of the current validators. 300 func (cs *State) GetValidators() (int64, []*types.Validator) { 301 cs.mtx.RLock() 302 defer cs.mtx.RUnlock() 303 return cs.state.LastBlockHeight, cs.state.Validators.Copy().Validators 304 } 305 306 // SetPrivValidator sets the private validator account for signing votes. It 307 // immediately requests pubkey and caches it. 308 func (cs *State) SetPrivValidator(priv types.PrivValidator) { 309 cs.mtx.Lock() 310 defer cs.mtx.Unlock() 311 312 cs.privValidator = priv 313 314 if err := cs.updatePrivValidatorPubKey(); err != nil { 315 cs.Logger.Error("Can't get private validator pubkey", "err", err) 316 } 317 } 318 319 // SetTimeoutTicker sets the local timer. It may be useful to overwrite for testing. 320 func (cs *State) SetTimeoutTicker(timeoutTicker TimeoutTicker) { 321 cs.mtx.Lock() 322 cs.timeoutTicker = timeoutTicker 323 cs.mtx.Unlock() 324 } 325 326 // LoadCommit loads the commit for a given height. 327 func (cs *State) LoadCommit(height int64) *types.Commit { 328 cs.mtx.RLock() 329 defer cs.mtx.RUnlock() 330 if height == cs.blockStore.Height() { 331 return cs.blockStore.LoadSeenCommit(height) 332 } 333 return cs.blockStore.LoadBlockCommit(height) 334 } 335 336 // OnStart implements service.Service. 337 // It loads the latest state via the WAL, and starts the timeout and receive routines. 338 func (cs *State) OnStart() error { 339 if err := cs.evsw.Start(); err != nil { 340 cs.Logger.Error("evsw start failed. err: ", err) 341 return err 342 } 343 344 // we may set the WAL in testing before calling Start, 345 // so only OpenWAL if its still the nilWAL 346 if _, ok := cs.wal.(nilWAL); ok { 347 walFile := cs.config.WalFile() 348 wal, err := cs.OpenWAL(walFile) 349 if err != nil { 350 cs.Logger.Error("Error loading State wal", "err", err.Error()) 351 return err 352 } 353 cs.wal = wal 354 } 355 356 // we need the timeoutRoutine for replay so 357 // we don't block on the tick chan. 358 // NOTE: we will get a build up of garbage go routines 359 // firing on the tockChan until the receiveRoutine is started 360 // to deal with them (by that point, at most one will be valid) 361 if err := cs.timeoutTicker.Start(); err != nil { 362 return err 363 } 364 365 // we may have lost some votes if the process crashed 366 // reload from consensus log to catchup 367 if cs.doWALCatchup { 368 if err := cs.catchupReplay(cs.Height); err != nil { 369 // don't try to recover from data corruption error 370 if IsDataCorruptionError(err) { 371 cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error()) 372 cs.Logger.Error("Please repair the WAL file before restarting") 373 fmt.Println(`You can attempt to repair the WAL as follows: 374 375 ---- 376 WALFILE=~/.tendermint/data/cs.wal/wal 377 cp $WALFILE ${WALFILE}.bak # backup the file 378 go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored 379 rm $WALFILE # remove the corrupt file 380 go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption 381 ----`) 382 383 return err 384 } 385 386 cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err.Error()) 387 // NOTE: if we ever do return an error here, 388 // make sure to stop the timeoutTicker 389 } 390 } 391 392 if cs.done == nil { 393 cs.done = make(chan struct{}) 394 } 395 396 // now start the receiveRoutine 397 go cs.receiveRoutine(0) 398 399 go cs.preMakeBlockRoutine() 400 401 // schedule the first round! 402 // use GetRoundState so we don't race the receiveRoutine for access 403 cs.scheduleRound0(cs.GetRoundState()) 404 405 return nil 406 } 407 408 // OnStop implements service.Service. 409 func (cs *State) OnStop() { 410 cs.evsw.Stop() 411 cs.timeoutTicker.Stop() 412 // WAL is stopped in receiveRoutine. 413 } 414 415 func (cs *State) OnReset() error { 416 cs.evsw.Reset() 417 cs.wal.Reset() 418 cs.wal = nilWAL{} 419 cs.timeoutTicker.Reset() 420 return nil 421 } 422 423 // Wait waits for the the main routine to return. 424 // NOTE: be sure to Stop() the event switch and drain 425 // any event channels or this may deadlock 426 func (cs *State) Wait() { 427 if cs.done != nil { 428 <-cs.done 429 } 430 } 431 432 // OpenWAL opens a file to log all consensus messages and timeouts for deterministic accountability 433 func (cs *State) OpenWAL(walFile string) (WAL, error) { 434 wal, err := NewWAL(walFile) 435 if err != nil { 436 cs.Logger.Error("Failed to open WAL for consensus state", "wal", walFile, "err", err) 437 return nil, err 438 } 439 wal.SetLogger(cs.Logger.With("wal", walFile)) 440 if err := wal.Start(); err != nil { 441 return nil, err 442 } 443 return wal, nil 444 } 445 446 //------------------------------------------------------------ 447 // internal functions for managing the state 448 449 func (cs *State) updateRoundStep(round int, step cstypes.RoundStepType) { 450 cs.Round = round 451 cs.Step = step 452 } 453 454 // Reconstruct LastCommit from SeenCommit, which we saved along with the block, 455 // (which happens even before saving the state) 456 func (cs *State) reconstructLastCommit(state sm.State) { 457 if state.LastBlockHeight == types.GetStartBlockHeight() { 458 return 459 } 460 seenCommit := cs.blockStore.LoadSeenCommit(state.LastBlockHeight) 461 if seenCommit == nil { 462 panic(fmt.Sprintf("Failed to reconstruct LastCommit: seen commit for height %v not found", 463 state.LastBlockHeight)) 464 } 465 lastPrecommits := types.CommitToVoteSet(state.ChainID, seenCommit, state.LastValidators) 466 if !lastPrecommits.HasTwoThirdsMajority() { 467 panic("Failed to reconstruct LastCommit: Does not have +2/3 maj") 468 } 469 cs.LastCommit = lastPrecommits 470 } 471 472 func (cs *State) newStep() { 473 rs := cs.RoundStateEvent() 474 cs.wal.Write(rs) 475 cs.nSteps++ 476 // newStep is called by updateToState in NewState before the eventBus is set! 477 if cs.eventBus != nil { 478 cs.eventBus.PublishEventNewRoundStep(rs) 479 cs.evsw.FireEvent(types.EventNewRoundStep, &cs.RoundState) 480 } 481 } 482 483 // needProofBlock returns true on the first height (so the genesis app hash is signed right away) 484 // and where the last block (height-1) caused the app hash to change 485 func (cs *State) needProofBlock(height int64) bool { 486 if height == types.GetStartBlockHeight()+1 { 487 return true 488 } 489 490 lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1) 491 if lastBlockMeta == nil { 492 panic(fmt.Sprintf("needProofBlock: last block meta for height %d not found", height-1)) 493 } 494 return !bytes.Equal(cs.state.AppHash, lastBlockMeta.Header.AppHash) 495 } 496 497 func (cs *State) recordMetrics(height int64, block *types.Block) { 498 cs.metrics.Validators.Set(float64(cs.Validators.Size())) 499 cs.metrics.ValidatorsPower.Set(float64(cs.Validators.TotalVotingPower())) 500 501 var ( 502 missingValidators int 503 missingValidatorsPower int64 504 ) 505 // height=0 -> MissingValidators and MissingValidatorsPower are both 0. 506 // Remember that the first LastCommit is intentionally empty, so it's not 507 // fair to increment missing validators number. 508 if height > types.GetStartBlockHeight()+1 { 509 // Sanity check that commit size matches validator set size - only applies 510 // after first block. 511 var ( 512 commitSize = block.LastCommit.Size() 513 valSetLen = len(cs.LastValidators.Validators) 514 address types.Address 515 ) 516 if commitSize != valSetLen { 517 panic(fmt.Sprintf("commit size (%d) doesn't match valset length (%d) at height %d\n\n%v\n\n%v", 518 commitSize, valSetLen, block.Height, block.LastCommit.Signatures, cs.LastValidators.Validators)) 519 } 520 521 if cs.privValidator != nil { 522 if cs.privValidatorPubKey == nil { 523 // Metrics won't be updated, but it's not critical. 524 cs.Logger.Error(fmt.Sprintf("recordMetrics: %v", errPubKeyIsNotSet)) 525 } else { 526 address = cs.privValidatorPubKey.Address() 527 } 528 } 529 530 for i, val := range cs.LastValidators.Validators { 531 commitSig := block.LastCommit.Signatures[i] 532 if commitSig.Absent() { 533 missingValidators++ 534 missingValidatorsPower += val.VotingPower 535 } 536 537 if bytes.Equal(val.Address, address) { 538 label := []string{ 539 "validator_address", val.Address.String(), 540 } 541 cs.metrics.ValidatorPower.With(label...).Set(float64(val.VotingPower)) 542 if commitSig.ForBlock() { 543 cs.metrics.ValidatorLastSignedHeight.With(label...).Set(float64(height)) 544 } else { 545 cs.metrics.ValidatorMissedBlocks.With(label...).Add(float64(1)) 546 } 547 } 548 549 } 550 } 551 cs.metrics.MissingValidators.Set(float64(missingValidators)) 552 cs.metrics.MissingValidatorsPower.Set(float64(missingValidatorsPower)) 553 554 cs.metrics.ByzantineValidators.Set(float64(len(block.Evidence.Evidence))) 555 byzantineValidatorsPower := int64(0) 556 for _, ev := range block.Evidence.Evidence { 557 if _, val := cs.Validators.GetByAddress(ev.Address()); val != nil { 558 byzantineValidatorsPower += val.VotingPower 559 } 560 } 561 cs.metrics.ByzantineValidatorsPower.Set(float64(byzantineValidatorsPower)) 562 563 if height > 1 { 564 lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1) 565 if lastBlockMeta != nil { 566 cs.metrics.BlockIntervalSeconds.Set( 567 block.Time.Sub(lastBlockMeta.Header.Time).Seconds(), 568 ) 569 } 570 } 571 572 cs.metrics.NumTxs.Set(float64(len(block.Data.Txs))) 573 cs.metrics.TotalTxs.Add(float64(len(block.Data.Txs))) 574 cs.metrics.BlockSizeBytes.Set(float64(block.FastSize())) 575 cs.metrics.CommittedHeight.Set(float64(block.Height)) 576 } 577 578 // updatePrivValidatorPubKey get's the private validator public key and 579 // memoizes it. This func returns an error if the private validator is not 580 // responding or responds with an error. 581 func (cs *State) updatePrivValidatorPubKey() error { 582 if cs.privValidator == nil { 583 return nil 584 } 585 586 pubKey, err := cs.privValidator.GetPubKey() 587 if err != nil { 588 return err 589 } 590 cs.privValidatorPubKey = pubKey 591 return nil 592 } 593 594 func (cs *State) BlockExec() *sm.BlockExecutor { 595 return cs.blockExec 596 } 597 598 //--------------------------------------------------------- 599 600 func CompareHRS(h1 int64, r1 int, s1 cstypes.RoundStepType, h2 int64, r2 int, s2 cstypes.RoundStepType, hasVC bool) int { 601 if h1 < h2 { 602 return -1 603 } else if h1 > h2 { 604 return 1 605 } 606 if r1 < r2 { 607 return -1 608 } else if r1 > r2 { 609 return 1 610 } 611 if hasVC { 612 return 1 613 } 614 if s1 < s2 { 615 return -1 616 } else if s1 > s2 { 617 return 1 618 } 619 return 0 620 }