github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/consensus/hotstuff/eventhandler/event_handler.go (about)

     1  package eventhandler
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog"
    10  
    11  	"github.com/onflow/flow-go/consensus/hotstuff"
    12  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    13  	"github.com/onflow/flow-go/model/flow"
    14  )
    15  
    16  // EventHandler is the main handler for individual events that trigger state transition.
    17  // It exposes API to handle one event at a time synchronously. EventHandler is *not concurrency safe*.
    18  // Please use the EventLoop to ensure that only a single go-routine executes the EventHandler's algorithms.
    19  // EventHandler is implemented in event-driven way, it reacts to incoming events and performs certain actions.
    20  // It doesn't perform any actions on its own. There are 3 main responsibilities of EventHandler, vote, propose,
    21  // timeout. There are specific scenarios that lead to each of those actions.
    22  //   - create vote: voting logic is triggered by OnReceiveProposal, after receiving proposal we have all required information
    23  //     to create a valid vote. Compliance engine makes sure that we receive proposals, whose parents are known.
    24  //     Creating a vote can be triggered ONLY by receiving proposal.
    25  //   - create timeout: creating model.TimeoutObject[TO] is triggered by OnLocalTimeout, after reaching deadline for current round.
    26  //     EventHandler gets notified about it and has to create a model.TimeoutObject and broadcast it to other replicas.
    27  //     Creating a TO can be triggered by reaching round deadline or triggered as part of Bracha broadcast when superminority
    28  //     of replicas have contributed to TC creation and created a partial TC.
    29  //   - create a proposal: proposing logic is more complicated. Creating a proposal is triggered by the EventHandler receiving
    30  //     a QC or TC that induces a view change to a view where the replica is primary. As an edge case, the EventHandler
    31  //     can receive a QC or TC that triggers the view change, but we can't create a proposal in case we are missing parent block the newest QC refers to.
    32  //     In case we already have the QC, but are still missing the respective parent, OnReceiveProposal can trigger the proposing logic
    33  //     as well, but only when receiving proposal for view lower than active view.
    34  //     To summarize, to make a valid proposal for view N we need to have a QC or TC for N-1 and know the proposal with blockID
    35  //     NewestQC.BlockID.
    36  //
    37  // Not concurrency safe.
    38  type EventHandler struct {
    39  	log           zerolog.Logger
    40  	paceMaker     hotstuff.PaceMaker
    41  	blockProducer hotstuff.BlockProducer
    42  	forks         hotstuff.Forks
    43  	persist       hotstuff.Persister
    44  	committee     hotstuff.Replicas
    45  	safetyRules   hotstuff.SafetyRules
    46  	notifier      hotstuff.Consumer
    47  }
    48  
    49  var _ hotstuff.EventHandler = (*EventHandler)(nil)
    50  
    51  // NewEventHandler creates an EventHandler instance with initial components.
    52  func NewEventHandler(
    53  	log zerolog.Logger,
    54  	paceMaker hotstuff.PaceMaker,
    55  	blockProducer hotstuff.BlockProducer,
    56  	forks hotstuff.Forks,
    57  	persist hotstuff.Persister,
    58  	committee hotstuff.Replicas,
    59  	safetyRules hotstuff.SafetyRules,
    60  	notifier hotstuff.Consumer,
    61  ) (*EventHandler, error) {
    62  	e := &EventHandler{
    63  		log:           log.With().Str("hotstuff", "participant").Logger(),
    64  		paceMaker:     paceMaker,
    65  		blockProducer: blockProducer,
    66  		forks:         forks,
    67  		persist:       persist,
    68  		safetyRules:   safetyRules,
    69  		committee:     committee,
    70  		notifier:      notifier,
    71  	}
    72  	return e, nil
    73  }
    74  
    75  // OnReceiveQc processes a valid qc constructed by internal vote aggregator or discovered in TimeoutObject.
    76  // All inputs should be validated before feeding into this function. Assuming trusted data.
    77  // No errors are expected during normal operation.
    78  func (e *EventHandler) OnReceiveQc(qc *flow.QuorumCertificate) error {
    79  	curView := e.paceMaker.CurView()
    80  	log := e.log.With().
    81  		Uint64("cur_view", curView).
    82  		Uint64("qc_view", qc.View).
    83  		Hex("qc_block_id", qc.BlockID[:]).
    84  		Logger()
    85  	log.Debug().Msg("received QC")
    86  	e.notifier.OnReceiveQc(curView, qc)
    87  	defer e.notifier.OnEventProcessed()
    88  
    89  	newViewEvent, err := e.paceMaker.ProcessQC(qc)
    90  	if err != nil {
    91  		return fmt.Errorf("could not process QC: %w", err)
    92  	}
    93  	if newViewEvent == nil {
    94  		log.Debug().Msg("QC didn't trigger view change, nothing to do")
    95  		return nil
    96  	}
    97  
    98  	// current view has changed, go to new view
    99  	log.Debug().Msg("QC triggered view change, starting new view now")
   100  	return e.proposeForNewViewIfPrimary()
   101  }
   102  
   103  // OnReceiveTc processes a valid tc constructed by internal timeout aggregator, discovered in TimeoutObject or
   104  // broadcast over the network.
   105  // All inputs should be validated before feeding into this function. Assuming trusted data.
   106  // No errors are expected during normal operation.
   107  func (e *EventHandler) OnReceiveTc(tc *flow.TimeoutCertificate) error {
   108  	curView := e.paceMaker.CurView()
   109  	log := e.log.With().
   110  		Uint64("cur_view", curView).
   111  		Uint64("tc_view", tc.View).
   112  		Uint64("tc_newest_qc_view", tc.NewestQC.View).
   113  		Hex("tc_newest_qc_block_id", tc.NewestQC.BlockID[:]).
   114  		Logger()
   115  	log.Debug().Msg("received TC")
   116  	e.notifier.OnReceiveTc(curView, tc)
   117  	defer e.notifier.OnEventProcessed()
   118  
   119  	newViewEvent, err := e.paceMaker.ProcessTC(tc)
   120  	if err != nil {
   121  		return fmt.Errorf("could not process TC for view %d: %w", tc.View, err)
   122  	}
   123  	if newViewEvent == nil {
   124  		log.Debug().Msg("TC didn't trigger view change, nothing to do")
   125  		return nil
   126  	}
   127  
   128  	// current view has changed, go to new view
   129  	log.Debug().Msg("TC triggered view change, starting new view now")
   130  	return e.proposeForNewViewIfPrimary()
   131  }
   132  
   133  // OnReceiveProposal processes a block proposal received from another HotStuff
   134  // consensus participant.
   135  // All inputs should be validated before feeding into this function. Assuming trusted data.
   136  // No errors are expected during normal operation.
   137  func (e *EventHandler) OnReceiveProposal(proposal *model.Proposal) error {
   138  	block := proposal.Block
   139  	curView := e.paceMaker.CurView()
   140  	log := e.log.With().
   141  		Uint64("cur_view", curView).
   142  		Uint64("block_view", block.View).
   143  		Hex("block_id", block.BlockID[:]).
   144  		Uint64("qc_view", block.QC.View).
   145  		Hex("proposer_id", block.ProposerID[:]).
   146  		Logger()
   147  	log.Debug().Msg("proposal received from compliance engine")
   148  	e.notifier.OnReceiveProposal(curView, proposal)
   149  	defer e.notifier.OnEventProcessed()
   150  
   151  	// ignore stale proposals
   152  	if block.View < e.forks.FinalizedView() {
   153  		log.Debug().Msg("stale proposal")
   154  		return nil
   155  	}
   156  
   157  	// store the block.
   158  	err := e.forks.AddValidatedBlock(block)
   159  	if err != nil {
   160  		return fmt.Errorf("cannot add proposal to forks (%x): %w", block.BlockID, err)
   161  	}
   162  
   163  	_, err = e.paceMaker.ProcessQC(proposal.Block.QC)
   164  	if err != nil {
   165  		return fmt.Errorf("could not process QC for block %x: %w", block.BlockID, err)
   166  	}
   167  
   168  	_, err = e.paceMaker.ProcessTC(proposal.LastViewTC)
   169  	if err != nil {
   170  		return fmt.Errorf("could not process TC for block %x: %w", block.BlockID, err)
   171  	}
   172  
   173  	// if the block is for the current view, then try voting for this block
   174  	err = e.processBlockForCurrentView(proposal)
   175  	if err != nil {
   176  		return fmt.Errorf("failed processing current block: %w", err)
   177  	}
   178  	log.Debug().Msg("proposal processed from compliance engine")
   179  
   180  	// nothing to do if this proposal is for current view
   181  	if proposal.Block.View == e.paceMaker.CurView() {
   182  		return nil
   183  	}
   184  
   185  	return e.proposeForNewViewIfPrimary()
   186  }
   187  
   188  // TimeoutChannel returns the channel for subscribing the waiting timeout on receiving
   189  // block or votes for the current view.
   190  func (e *EventHandler) TimeoutChannel() <-chan time.Time {
   191  	return e.paceMaker.TimeoutChannel()
   192  }
   193  
   194  // OnLocalTimeout handles a local timeout event by creating a model.TimeoutObject and broadcasting it.
   195  // No errors are expected during normal operation.
   196  func (e *EventHandler) OnLocalTimeout() error {
   197  	curView := e.paceMaker.CurView()
   198  	e.log.Debug().Uint64("cur_view", curView).Msg("timeout received from event loop")
   199  	e.notifier.OnLocalTimeout(curView)
   200  	defer e.notifier.OnEventProcessed()
   201  
   202  	err := e.broadcastTimeoutObjectIfAuthorized()
   203  	if err != nil {
   204  		return fmt.Errorf("unexpected exception while processing timeout in view %d: %w", curView, err)
   205  	}
   206  	return nil
   207  }
   208  
   209  // OnPartialTcCreated handles notification produces by the internal timeout aggregator.
   210  // If the notification is for the current view, a corresponding model.TimeoutObject is broadcast to the consensus committee.
   211  // No errors are expected during normal operation.
   212  func (e *EventHandler) OnPartialTcCreated(partialTC *hotstuff.PartialTcCreated) error {
   213  	curView := e.paceMaker.CurView()
   214  	lastViewTC := partialTC.LastViewTC
   215  	logger := e.log.With().
   216  		Uint64("cur_view", curView).
   217  		Uint64("qc_view", partialTC.NewestQC.View)
   218  	if lastViewTC != nil {
   219  		logger.Uint64("last_view_tc_view", lastViewTC.View)
   220  	}
   221  	log := logger.Logger()
   222  	log.Debug().Msg("constructed partial TC")
   223  
   224  	e.notifier.OnPartialTc(curView, partialTC)
   225  	defer e.notifier.OnEventProcessed()
   226  
   227  	// process QC, this might trigger view change
   228  	_, err := e.paceMaker.ProcessQC(partialTC.NewestQC)
   229  	if err != nil {
   230  		return fmt.Errorf("could not process newest QC: %w", err)
   231  	}
   232  
   233  	// process TC, this might trigger view change
   234  	_, err = e.paceMaker.ProcessTC(lastViewTC)
   235  	if err != nil {
   236  		return fmt.Errorf("could not process TC for view %d: %w", lastViewTC.View, err)
   237  	}
   238  
   239  	// NOTE: in other cases when we have observed a view change we will trigger proposing logic, this is desired logic
   240  	// for handling proposal, QC and TC. However, observing a partial TC means
   241  	// that superminority have timed out and there was at least one honest replica in that set. Honest replicas will never vote
   242  	// after timing out for current view meaning we won't be able to collect supermajority of votes for a proposal made after
   243  	// observing partial TC.
   244  
   245  	// by definition, we are allowed to produce timeout object if we have received partial TC for current view
   246  	if e.paceMaker.CurView() != partialTC.View {
   247  		return nil
   248  	}
   249  
   250  	log.Debug().Msg("partial TC generated for current view, broadcasting timeout")
   251  	err = e.broadcastTimeoutObjectIfAuthorized()
   252  	if err != nil {
   253  		return fmt.Errorf("unexpected exception while processing partial TC in view %d: %w", partialTC.View, err)
   254  	}
   255  	return nil
   256  }
   257  
   258  // Start starts the event handler.
   259  // No errors are expected during normal operation.
   260  // CAUTION: EventHandler is not concurrency safe. The Start method must
   261  // be executed by the same goroutine that also calls the other business logic
   262  // methods, or concurrency safety has to be implemented externally.
   263  func (e *EventHandler) Start(ctx context.Context) error {
   264  	e.notifier.OnStart(e.paceMaker.CurView())
   265  	defer e.notifier.OnEventProcessed()
   266  	e.paceMaker.Start(ctx)
   267  	err := e.proposeForNewViewIfPrimary()
   268  	if err != nil {
   269  		return fmt.Errorf("could not start new view: %w", err)
   270  	}
   271  	return nil
   272  }
   273  
   274  // broadcastTimeoutObjectIfAuthorized attempts to generate a model.TimeoutObject, adds it
   275  // to `timeoutAggregator` and broadcasts it to the consensus commettee. We check, whether
   276  // this node, at the current view, is part of the consensus committee. Otherwise, this
   277  // method is functionally a no-op.
   278  // For example, right after an epoch switchover a consensus node might still be online but
   279  // not part of the _active_ consensus committee anymore. Consequently, it should not broadcast
   280  // timeouts anymore.
   281  // No errors are expected during normal operation.
   282  func (e *EventHandler) broadcastTimeoutObjectIfAuthorized() error {
   283  	curView := e.paceMaker.CurView()
   284  	newestQC := e.paceMaker.NewestQC()
   285  	lastViewTC := e.paceMaker.LastViewTC()
   286  	log := e.log.With().Uint64("cur_view", curView).Logger()
   287  
   288  	if newestQC.View+1 == curView {
   289  		// in case last view has ended with QC and TC, make sure that only QC is included
   290  		// otherwise such timeout is invalid. This case is possible if TC has included QC with the same
   291  		// view as the TC itself, meaning that newestQC.View == lastViewTC.View
   292  		lastViewTC = nil
   293  	}
   294  
   295  	timeout, err := e.safetyRules.ProduceTimeout(curView, newestQC, lastViewTC)
   296  	if err != nil {
   297  		if model.IsNoTimeoutError(err) {
   298  			log.Warn().Err(err).Msgf("not generating timeout as this node is not part of the active committee")
   299  			return nil
   300  		}
   301  		return fmt.Errorf("could not produce timeout: %w", err)
   302  	}
   303  
   304  	// raise a notification to broadcast timeout
   305  	e.notifier.OnOwnTimeout(timeout)
   306  	log.Debug().Msg("broadcast TimeoutObject done")
   307  
   308  	return nil
   309  }
   310  
   311  // proposeForNewViewIfPrimary will only be called when we may be able to propose a block, after processing a new event.
   312  //   - after entering a new view as a result of processing a QC or TC, then we may propose for the newly entered view
   313  //   - after receiving a proposal (but not changing view), if that proposal is referenced by our highest known QC,
   314  //     and the proposal was previously unknown, then we can propose a block in the current view
   315  //
   316  // It reads the current view, and generates a proposal if we are the leader.
   317  // No errors are expected during normal operation.
   318  func (e *EventHandler) proposeForNewViewIfPrimary() error {
   319  	start := time.Now() // track the start time
   320  	curView := e.paceMaker.CurView()
   321  	currentLeader, err := e.committee.LeaderForView(curView)
   322  	if err != nil {
   323  		return fmt.Errorf("failed to determine primary for new view %d: %w", curView, err)
   324  	}
   325  	finalizedView := e.forks.FinalizedView()
   326  	log := e.log.With().
   327  		Uint64("cur_view", curView).
   328  		Uint64("finalized_view", finalizedView).
   329  		Hex("leader_id", currentLeader[:]).Logger()
   330  
   331  	e.notifier.OnCurrentViewDetails(curView, finalizedView, currentLeader)
   332  
   333  	// check that I am the primary for this view and that I haven't already proposed; otherwise there is nothing to do
   334  	if e.committee.Self() != currentLeader {
   335  		return nil
   336  	}
   337  	for _, b := range e.forks.GetBlocksForView(curView) { // on the happy path, this slice is empty
   338  		if b.ProposerID == e.committee.Self() {
   339  			log.Debug().Msg("already proposed for current view")
   340  			return nil
   341  		} else {
   342  			// sanity check: the following code should never be reached, as this node is the current leader, i.e.
   343  			// we should _not_ consider a proposal for this view from any other as valid and store it in forks.
   344  			return fmt.Errorf("this node (%v) is leader for the current view %d, but have a proposal from node %v for this view", currentLeader, curView, b.ProposerID)
   345  		}
   346  	}
   347  
   348  	// attempt to generate proposal:
   349  	newestQC := e.paceMaker.NewestQC()
   350  	lastViewTC := e.paceMaker.LastViewTC()
   351  
   352  	_, found := e.forks.GetBlock(newestQC.BlockID)
   353  	if !found {
   354  		// we don't know anything about block referenced by our newest QC, in this case we can't
   355  		// create a valid proposal since we can't guarantee validity of block payload.
   356  		log.Warn().
   357  			Uint64("qc_view", newestQC.View).
   358  			Hex("block_id", newestQC.BlockID[:]).Msg("haven't synced the latest block yet; can't propose")
   359  		return nil
   360  	}
   361  	log.Debug().Msg("generating proposal as leader")
   362  
   363  	// Sanity checks to make sure that resulting proposal is valid:
   364  	// In its proposal, the leader for view N needs to present evidence that it has legitimately entered view N.
   365  	// As evidence, we include a QC or TC for view N-1, which should always be available as the PaceMaker advances
   366  	// to view N only after observing a QC or TC from view N-1. Moreover, QC and TC are always processed together. As
   367  	// EventHandler is strictly single-threaded without reentrancy, we must have a QC or TC for the prior view (curView-1).
   368  	// Failing one of these sanity checks is a symptom of state corruption or a severe implementation bug.
   369  	if newestQC.View+1 != curView {
   370  		if lastViewTC == nil {
   371  			return fmt.Errorf("possible state corruption, expected lastViewTC to be not nil")
   372  		}
   373  		if lastViewTC.View+1 != curView {
   374  			return fmt.Errorf("possible state corruption, don't have QC(view=%d) and TC(view=%d) for previous view(currentView=%d)",
   375  				newestQC.View, lastViewTC.View, curView)
   376  		}
   377  	} else {
   378  		// In case last view has ended with QC and TC, make sure that only QC is included,
   379  		// otherwise such proposal is invalid. This case is possible if TC has included QC with the same
   380  		// view as the TC itself, meaning that newestQC.View == lastViewTC.View
   381  		lastViewTC = nil
   382  	}
   383  
   384  	flowProposal, err := e.blockProducer.MakeBlockProposal(curView, newestQC, lastViewTC)
   385  	if err != nil {
   386  		return fmt.Errorf("can not make block proposal for curView %v: %w", curView, err)
   387  	}
   388  	proposedBlock := model.BlockFromFlow(flowProposal) // turn the signed flow header into a proposal
   389  
   390  	// determine target publication time
   391  	// CAUTION:
   392  	//  • we must call `TargetPublicationTime` _before_ `AddValidatedBlock`, because `AddValidatedBlock`
   393  	//    may emit a BlockIncorporatedEvent, which changes CruiseControl's state.
   394  	//  • metrics for the PaceMaker/CruiseControl assume that the event handler is the only caller of
   395  	//   `TargetPublicationTime`. Technically, `TargetPublicationTime` records the publication delay
   396  	//    relative to its _latest_ call.
   397  	targetPublicationTime := e.paceMaker.TargetPublicationTime(flowProposal.View, start, flowProposal.ParentID)
   398  
   399  	// we want to store created proposal in forks to make sure that we don't create more proposals for
   400  	// current view. Due to asynchronous nature of our design it's possible that after creating proposal
   401  	// we will be asked to propose again for same view.
   402  	err = e.forks.AddValidatedBlock(proposedBlock)
   403  	if err != nil {
   404  		return fmt.Errorf("could not add newly created proposal (%v): %w", proposedBlock.BlockID, err)
   405  	}
   406  
   407  	log.Debug().
   408  		Uint64("block_view", proposedBlock.View).
   409  		Time("target_publication", targetPublicationTime).
   410  		Hex("block_id", proposedBlock.BlockID[:]).
   411  		Uint64("parent_view", newestQC.View).
   412  		Hex("parent_id", newestQC.BlockID[:]).
   413  		Hex("signer", proposedBlock.ProposerID[:]).
   414  		Msg("forwarding proposal to communicator for broadcasting")
   415  
   416  	// raise a notification with proposal (also triggers broadcast)
   417  	e.notifier.OnOwnProposal(flowProposal, targetPublicationTime)
   418  	return nil
   419  }
   420  
   421  // processBlockForCurrentView processes the block for the current view.
   422  // It is called AFTER the block has been stored or found in Forks
   423  // It checks whether to vote for this block.
   424  // No errors are expected during normal operation.
   425  func (e *EventHandler) processBlockForCurrentView(proposal *model.Proposal) error {
   426  	// sanity check that block is really for the current view:
   427  	curView := e.paceMaker.CurView()
   428  	block := proposal.Block
   429  	if block.View != curView {
   430  		// ignore outdated proposals in case we have moved forward
   431  		return nil
   432  	}
   433  	// leader (node ID) for next view
   434  	nextLeader, err := e.committee.LeaderForView(curView + 1)
   435  	if errors.Is(err, model.ErrViewForUnknownEpoch) {
   436  		// We are attempting process a block in an unknown epoch
   437  		// This should never happen, because:
   438  		// * the compliance layer ensures proposals are passed to the event loop strictly after their parent
   439  		// * the protocol state ensures that, before incorporating the first block of an epoch E,
   440  		//    either E is known or we have triggered epoch fallback mode - in either case the epoch for the
   441  		//    current epoch is known
   442  		return fmt.Errorf("attempting to process a block for current view in unknown epoch")
   443  	}
   444  	if err != nil {
   445  		return fmt.Errorf("failed to determine primary for next view %d: %w", curView+1, err)
   446  	}
   447  
   448  	// safetyRules performs all the checks to decide whether to vote for this block or not.
   449  	err = e.ownVote(proposal, curView, nextLeader)
   450  	if err != nil {
   451  		return fmt.Errorf("unexpected error in voting logic: %w", err)
   452  	}
   453  
   454  	return nil
   455  }
   456  
   457  // ownVote generates and forwards the own vote, if we decide to vote.
   458  // Any errors are potential symptoms of uncovered edge cases or corrupted internal state (fatal).
   459  // No errors are expected during normal operation.
   460  func (e *EventHandler) ownVote(proposal *model.Proposal, curView uint64, nextLeader flow.Identifier) error {
   461  	block := proposal.Block
   462  	log := e.log.With().
   463  		Uint64("block_view", block.View).
   464  		Hex("block_id", block.BlockID[:]).
   465  		Uint64("parent_view", block.QC.View).
   466  		Hex("parent_id", block.QC.BlockID[:]).
   467  		Hex("signer", block.ProposerID[:]).
   468  		Logger()
   469  
   470  	_, found := e.forks.GetBlock(proposal.Block.QC.BlockID)
   471  	if !found {
   472  		// we don't have parent for this proposal, we can't vote since we can't guarantee validity of proposals
   473  		// payload. Strictly speaking this shouldn't ever happen because compliance engine makes sure that we
   474  		// receive proposals with valid parents.
   475  		return fmt.Errorf("won't vote for proposal, no parent block for this proposal")
   476  	}
   477  
   478  	// safetyRules performs all the checks to decide whether to vote for this block or not.
   479  	ownVote, err := e.safetyRules.ProduceVote(proposal, curView)
   480  	if err != nil {
   481  		if !model.IsNoVoteError(err) {
   482  			// unknown error, exit the event loop
   483  			return fmt.Errorf("could not produce vote: %w", err)
   484  		}
   485  		log.Debug().Err(err).Msg("should not vote for this block")
   486  		return nil
   487  	}
   488  
   489  	log.Debug().Msg("forwarding vote to compliance engine")
   490  	// raise a notification to send vote
   491  	e.notifier.OnOwnVote(ownVote.BlockID, ownVote.View, ownVote.SigData, nextLeader)
   492  	return nil
   493  }