github.com/onflow/flow-go@v0.33.17/consensus/hotstuff/eventloop/event_loop.go (about)

     1  package eventloop
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	"github.com/rs/zerolog"
     9  
    10  	"github.com/onflow/flow-go/consensus/hotstuff"
    11  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    12  	"github.com/onflow/flow-go/consensus/hotstuff/tracker"
    13  	"github.com/onflow/flow-go/engine"
    14  	"github.com/onflow/flow-go/model/flow"
    15  	"github.com/onflow/flow-go/module"
    16  	"github.com/onflow/flow-go/module/component"
    17  	"github.com/onflow/flow-go/module/irrecoverable"
    18  	"github.com/onflow/flow-go/module/metrics"
    19  )
    20  
    21  // queuedProposal is a helper structure that is used to transmit proposal in channel
    22  // it contains an attached insertionTime that is used to measure how long we have waited between queening proposal and
    23  // actually processing by `EventHandler`.
    24  type queuedProposal struct {
    25  	proposal      *model.Proposal
    26  	insertionTime time.Time
    27  }
    28  
    29  // EventLoop buffers all incoming events to the hotstuff EventHandler, and feeds EventHandler one event at a time.
    30  type EventLoop struct {
    31  	*component.ComponentManager
    32  	log                      zerolog.Logger
    33  	eventHandler             hotstuff.EventHandler
    34  	metrics                  module.HotstuffMetrics
    35  	mempoolMetrics           module.MempoolMetrics
    36  	proposals                chan queuedProposal
    37  	newestSubmittedTc        *tracker.NewestTCTracker
    38  	newestSubmittedQc        *tracker.NewestQCTracker
    39  	newestSubmittedPartialTc *tracker.NewestPartialTcTracker
    40  	tcSubmittedNotifier      engine.Notifier
    41  	qcSubmittedNotifier      engine.Notifier
    42  	partialTcCreatedNotifier engine.Notifier
    43  	startTime                time.Time
    44  }
    45  
    46  var _ hotstuff.EventLoop = (*EventLoop)(nil)
    47  var _ component.Component = (*EventLoop)(nil)
    48  
    49  // NewEventLoop creates an instance of EventLoop.
    50  func NewEventLoop(
    51  	log zerolog.Logger,
    52  	metrics module.HotstuffMetrics,
    53  	mempoolMetrics module.MempoolMetrics,
    54  	eventHandler hotstuff.EventHandler,
    55  	startTime time.Time,
    56  ) (*EventLoop, error) {
    57  	// we will use a buffered channel to avoid blocking of caller
    58  	// we can't afford to drop messages since it undermines liveness, but we also want to avoid blocking of compliance
    59  	// engine. We assume that we should be able to process proposals faster than compliance engine feeds them, worst case
    60  	// we will fill the buffer and block compliance engine worker but that should happen only if compliance engine receives
    61  	// large number of blocks in short period of time(when catching up for instance).
    62  	proposals := make(chan queuedProposal, 1000)
    63  
    64  	el := &EventLoop{
    65  		log:                      log,
    66  		eventHandler:             eventHandler,
    67  		metrics:                  metrics,
    68  		mempoolMetrics:           mempoolMetrics,
    69  		proposals:                proposals,
    70  		tcSubmittedNotifier:      engine.NewNotifier(),
    71  		qcSubmittedNotifier:      engine.NewNotifier(),
    72  		partialTcCreatedNotifier: engine.NewNotifier(),
    73  		newestSubmittedTc:        tracker.NewNewestTCTracker(),
    74  		newestSubmittedQc:        tracker.NewNewestQCTracker(),
    75  		newestSubmittedPartialTc: tracker.NewNewestPartialTcTracker(),
    76  		startTime:                startTime,
    77  	}
    78  
    79  	componentBuilder := component.NewComponentManagerBuilder()
    80  	componentBuilder.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    81  		ready()
    82  
    83  		// launch when scheduled by el.startTime
    84  		el.log.Info().Msgf("event loop will start at: %v", startTime)
    85  		select {
    86  		case <-ctx.Done():
    87  			return
    88  		case <-time.After(time.Until(startTime)):
    89  			el.log.Info().Msgf("starting event loop")
    90  			err := el.loop(ctx)
    91  			if err != nil {
    92  				el.log.Error().Err(err).Msg("irrecoverable event loop error")
    93  				ctx.Throw(err)
    94  			}
    95  		}
    96  	})
    97  	el.ComponentManager = componentBuilder.Build()
    98  
    99  	return el, nil
   100  }
   101  
   102  // loop executes the core HotStuff logic in a single thread. It picks inputs from the various
   103  // inbound channels and executes the EventHandler's respective method for processing this input.
   104  // During normal operations, the EventHandler is not expected to return any errors, as all inputs
   105  // are assumed to be fully validated (or produced by trusted components within the node). Therefore,
   106  // any error is a symptom of state corruption, bugs or violation of API contracts. In all cases,
   107  // continuing operations is not an option, i.e. we exit the event loop and return an exception.
   108  func (el *EventLoop) loop(ctx context.Context) error {
   109  	err := el.eventHandler.Start(ctx) // must be called by the same go-routine that also executes the business logic!
   110  	if err != nil {
   111  		return fmt.Errorf("could not start event handler: %w", err)
   112  	}
   113  
   114  	shutdownSignaled := ctx.Done()
   115  	timeoutCertificates := el.tcSubmittedNotifier.Channel()
   116  	quorumCertificates := el.qcSubmittedNotifier.Channel()
   117  	partialTCs := el.partialTcCreatedNotifier.Channel()
   118  
   119  	for {
   120  		// Giving timeout events the priority to be processed first.
   121  		// This is to prevent attacks from malicious nodes that attempt
   122  		// to block honest nodes' pacemaker from progressing by sending
   123  		// other events.
   124  		timeoutChannel := el.eventHandler.TimeoutChannel()
   125  
   126  		// the first select makes sure we process timeouts with priority
   127  		select {
   128  
   129  		// if we receive the shutdown signal, exit the loop
   130  		case <-shutdownSignaled:
   131  			return nil
   132  
   133  		// processing timeout or partial TC event are top priority since
   134  		// they allow node to contribute to TC aggregation when replicas can't
   135  		// make progress on happy path
   136  		case <-timeoutChannel:
   137  
   138  			processStart := time.Now()
   139  			err = el.eventHandler.OnLocalTimeout()
   140  			if err != nil {
   141  				return fmt.Errorf("could not process timeout: %w", err)
   142  			}
   143  			// measure how long it takes for a timeout event to be processed
   144  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeLocalTimeout)
   145  
   146  			// At this point, we have received and processed an event from the timeout channel.
   147  			// A timeout also means that we have made progress. A new timeout will have
   148  			// been started and el.eventHandler.TimeoutChannel() will be a NEW channel (for the just-started timeout).
   149  			// Very important to start the for loop from the beginning, to continue the with the new timeout channel!
   150  			continue
   151  
   152  		case <-partialTCs:
   153  
   154  			processStart := time.Now()
   155  			err = el.eventHandler.OnPartialTcCreated(el.newestSubmittedPartialTc.NewestPartialTc())
   156  			if err != nil {
   157  				return fmt.Errorf("could no process partial created TC event: %w", err)
   158  			}
   159  			// measure how long it takes for a partial TC to be processed
   160  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeOnPartialTc)
   161  
   162  			// At this point, we have received and processed partial TC event, it could have resulted in several scenarios:
   163  			// 1. a view change with potential voting or proposal creation
   164  			// 2. a created and broadcast timeout object
   165  			// 3. QC and TC didn't result in view change and no timeout was created since we have already timed out or
   166  			// the partial TC was created for view different from current one.
   167  			continue
   168  
   169  		default:
   170  			// fall through to non-priority events
   171  		}
   172  
   173  		idleStart := time.Now()
   174  
   175  		// select for block headers/QCs here
   176  		select {
   177  
   178  		// same as before
   179  		case <-shutdownSignaled:
   180  			return nil
   181  
   182  		// same as before
   183  		case <-timeoutChannel:
   184  			// measure how long the event loop was idle waiting for an
   185  			// incoming event
   186  			el.metrics.HotStuffIdleDuration(time.Since(idleStart))
   187  
   188  			processStart := time.Now()
   189  			err = el.eventHandler.OnLocalTimeout()
   190  			if err != nil {
   191  				return fmt.Errorf("could not process timeout: %w", err)
   192  			}
   193  			// measure how long it takes for a timeout event to be processed
   194  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeLocalTimeout)
   195  
   196  		// if we have a new proposal, process it
   197  		case queuedItem := <-el.proposals:
   198  			// the wait duration is measured as how long it takes from a block being
   199  			// received to event handler commencing the processing of the block
   200  			el.metrics.HotStuffWaitDuration(time.Since(queuedItem.insertionTime), metrics.HotstuffEventTypeOnProposal)
   201  
   202  			// measure how long the event loop was idle waiting for an
   203  			// incoming event
   204  			el.metrics.HotStuffIdleDuration(time.Since(idleStart))
   205  
   206  			processStart := time.Now()
   207  			proposal := queuedItem.proposal
   208  			err = el.eventHandler.OnReceiveProposal(proposal)
   209  			if err != nil {
   210  				return fmt.Errorf("could not process proposal %v: %w", proposal.Block.BlockID, err)
   211  			}
   212  			// measure how long it takes for a proposal to be processed
   213  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeOnProposal)
   214  
   215  			el.log.Info().
   216  				Dur("dur_ms", time.Since(processStart)).
   217  				Uint64("view", proposal.Block.View).
   218  				Hex("block_id", proposal.Block.BlockID[:]).
   219  				Msg("block proposal has been processed successfully")
   220  
   221  		// if we have a new QC, process it
   222  		case <-quorumCertificates:
   223  			// measure how long the event loop was idle waiting for an
   224  			// incoming event
   225  			el.metrics.HotStuffIdleDuration(time.Since(idleStart))
   226  
   227  			processStart := time.Now()
   228  			err = el.eventHandler.OnReceiveQc(el.newestSubmittedQc.NewestQC())
   229  			if err != nil {
   230  				return fmt.Errorf("could not process QC: %w", err)
   231  			}
   232  			// measure how long it takes for a QC to be processed
   233  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeOnQC)
   234  
   235  			// if we have a new TC, process it
   236  		case <-timeoutCertificates:
   237  			// measure how long the event loop was idle waiting for an
   238  			// incoming event
   239  			el.metrics.HotStuffIdleDuration(time.Since(idleStart))
   240  
   241  			processStart := time.Now()
   242  			err = el.eventHandler.OnReceiveTc(el.newestSubmittedTc.NewestTC())
   243  			if err != nil {
   244  				return fmt.Errorf("could not process TC: %w", err)
   245  			}
   246  			// measure how long it takes for a TC to be processed
   247  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeOnTC)
   248  
   249  		case <-partialTCs:
   250  			// measure how long the event loop was idle waiting for an
   251  			// incoming event
   252  			el.metrics.HotStuffIdleDuration(time.Since(idleStart))
   253  
   254  			processStart := time.Now()
   255  			err = el.eventHandler.OnPartialTcCreated(el.newestSubmittedPartialTc.NewestPartialTc())
   256  			if err != nil {
   257  				return fmt.Errorf("could no process partial created TC event: %w", err)
   258  			}
   259  			// measure how long it takes for a partial TC to be processed
   260  			el.metrics.HotStuffBusyDuration(time.Since(processStart), metrics.HotstuffEventTypeOnPartialTc)
   261  		}
   262  	}
   263  }
   264  
   265  // SubmitProposal pushes the received block to the proposals channel
   266  func (el *EventLoop) SubmitProposal(proposal *model.Proposal) {
   267  	queueItem := queuedProposal{
   268  		proposal:      proposal,
   269  		insertionTime: time.Now(),
   270  	}
   271  	select {
   272  	case el.proposals <- queueItem:
   273  	case <-el.ComponentManager.ShutdownSignal():
   274  		return
   275  	}
   276  	el.mempoolMetrics.MempoolEntries(metrics.HotstuffEventTypeOnProposal, uint(len(el.proposals)))
   277  }
   278  
   279  // onTrustedQC pushes the received QC(which MUST be validated) to the quorumCertificates channel
   280  func (el *EventLoop) onTrustedQC(qc *flow.QuorumCertificate) {
   281  	if el.newestSubmittedQc.Track(qc) {
   282  		el.qcSubmittedNotifier.Notify()
   283  	}
   284  }
   285  
   286  // onTrustedTC pushes the received TC(which MUST be validated) to the timeoutCertificates channel
   287  func (el *EventLoop) onTrustedTC(tc *flow.TimeoutCertificate) {
   288  	if el.newestSubmittedTc.Track(tc) {
   289  		el.tcSubmittedNotifier.Notify()
   290  	} else if el.newestSubmittedQc.Track(tc.NewestQC) {
   291  		el.qcSubmittedNotifier.Notify()
   292  	}
   293  }
   294  
   295  // OnTcConstructedFromTimeouts pushes the received TC to the timeoutCertificates channel
   296  func (el *EventLoop) OnTcConstructedFromTimeouts(tc *flow.TimeoutCertificate) {
   297  	el.onTrustedTC(tc)
   298  }
   299  
   300  // OnPartialTcCreated created a hotstuff.PartialTcCreated payload and pushes it into partialTcCreated buffered channel for
   301  // further processing by EventHandler. Since we use buffered channel this function can block if buffer is full.
   302  func (el *EventLoop) OnPartialTcCreated(view uint64, newestQC *flow.QuorumCertificate, lastViewTC *flow.TimeoutCertificate) {
   303  	event := &hotstuff.PartialTcCreated{
   304  		View:       view,
   305  		NewestQC:   newestQC,
   306  		LastViewTC: lastViewTC,
   307  	}
   308  	if el.newestSubmittedPartialTc.Track(event) {
   309  		el.partialTcCreatedNotifier.Notify()
   310  	}
   311  }
   312  
   313  // OnNewQcDiscovered pushes already validated QCs that were submitted from TimeoutAggregator to the event handler
   314  func (el *EventLoop) OnNewQcDiscovered(qc *flow.QuorumCertificate) {
   315  	el.onTrustedQC(qc)
   316  }
   317  
   318  // OnNewTcDiscovered pushes already validated TCs that were submitted from TimeoutAggregator to the event handler
   319  func (el *EventLoop) OnNewTcDiscovered(tc *flow.TimeoutCertificate) {
   320  	el.onTrustedTC(tc)
   321  }
   322  
   323  // OnQcConstructedFromVotes implements hotstuff.VoteCollectorConsumer and pushes received qc into processing pipeline.
   324  func (el *EventLoop) OnQcConstructedFromVotes(qc *flow.QuorumCertificate) {
   325  	el.onTrustedQC(qc)
   326  }
   327  
   328  // OnTimeoutProcessed implements hotstuff.TimeoutCollectorConsumer and is no-op
   329  func (el *EventLoop) OnTimeoutProcessed(timeout *model.TimeoutObject) {}
   330  
   331  // OnVoteProcessed implements hotstuff.VoteCollectorConsumer and is no-op
   332  func (el *EventLoop) OnVoteProcessed(vote *model.Vote) {}