github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/consensus/message_hub/message_hub.go

package message_hub

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/consensus/hotstuff"
	"github.com/onflow/flow-go/consensus/hotstuff/model"
	"github.com/onflow/flow-go/consensus/hotstuff/notifications"
	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/common/fifoqueue"
	"github.com/onflow/flow-go/engine/consensus"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/irrecoverable"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/logging"
)

// defaultMessageHubRequestsWorkers number of workers to dispatch events for requests
const defaultMessageHubRequestsWorkers = 5

// defaultProposalQueueCapacity number of pending outgoing proposals stored in queue
const defaultProposalQueueCapacity = 3

// defaultVoteQueueCapacity number of pending outgoing votes stored in queue
const defaultVoteQueueCapacity = 20

// defaultTimeoutQueueCapacity number of pending outgoing timeouts stored in queue
const defaultTimeoutQueueCapacity = 3

// packedVote is a helper structure to pack recipientID and vote into one structure to pass through fifoqueue.FifoQueue
type packedVote struct {
	recipientID flow.Identifier
	vote        *messages.BlockVote
}

// MessageHub is a central module for handling incoming and outgoing messages via the consensus channel.
// It performs message routing for incoming messages by matching them by type and sending them to the respective engine.
// For incoming messages, processing looks like this:
//
//	   +-------------------+      +------------+
//	-->| Consensus-Channel |----->| MessageHub |
//	   +-------------------+      +------+-----+
//	                         ------------|------------
//	   +------+---------+    |    +------+-----+     |    +------+------------+
//	   | VoteAggregator |----+    | Compliance |     +----| TimeoutAggregator |
//	   +----------------+         +------------+          +------+------------+
//	          vote                     block                  timeout object
//
// MessageHub acts as a communicator and handles hotstuff.Consumer communication events to send votes, broadcast timeouts
// and proposals. It is responsible for communication between consensus participants.
// It implements the hotstuff.Consumer interface and needs to be subscribed for notifications via pub/sub.
// All communicator events are handled on a worker thread to prevent the sender from blocking.
// For outgoing messages, the processing logic looks like this:
//
//	+-------------------+      +------------+      +----------+      +------------------------+
//	| Consensus-Channel |<-----| MessageHub |<-----| Consumer |<-----|        Hotstuff        |
//	+-------------------+      +------+-----+      +----------+      +------------------------+
//	                                                  pub/sub          vote, timeout, proposal
//
// MessageHub is safe to use in a concurrent environment.
type MessageHub struct {
	*component.ComponentManager
	notifications.NoopConsumer
	log                        zerolog.Logger
	me                         module.Local
	engineMetrics              module.EngineMetrics
	state                      protocol.State
	payloads                   storage.Payloads
	con                        network.Conduit
	pushBlocksCon              network.Conduit
	ownOutboundMessageNotifier engine.Notifier
	ownOutboundVotes           *fifoqueue.FifoQueue // queue for handling outgoing vote transmissions
	ownOutboundProposals       *fifoqueue.FifoQueue // queue for handling outgoing proposal transmissions
	ownOutboundTimeouts        *fifoqueue.FifoQueue // queue for handling outgoing timeout transmissions

	// injected dependencies
	compliance        consensus.Compliance       // handler of incoming block proposals
	hotstuff          module.HotStuff            // used to submit proposals that were previously broadcast
	voteAggregator    hotstuff.VoteAggregator    // handler of incoming votes
	timeoutAggregator hotstuff.TimeoutAggregator // handler of incoming timeouts
}

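// Compile-time checks that MessageHub satisfies the interfaces it is used through:
// it processes inbound messages from the networking layer and consumes HotStuff communicator events.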
var _ network.MessageProcessor = (*MessageHub)(nil)
var _ hotstuff.CommunicatorConsumer = (*MessageHub)(nil)

// NewMessageHub constructs a new instance of the message hub.
// No errors are expected during normal operations.
func NewMessageHub(log zerolog.Logger,
	engineMetrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	compliance consensus.Compliance,
	hotstuff module.HotStuff,
	voteAggregator hotstuff.VoteAggregator,
	timeoutAggregator hotstuff.TimeoutAggregator,
	state protocol.State,
	payloads storage.Payloads,
) (*MessageHub, error) {
	ownOutboundVotes, err := fifoqueue.NewFifoQueue(defaultVoteQueueCapacity)
	if err != nil {
		return nil, fmt.Errorf("could not initialize votes queue: %w", err)
	}
	ownOutboundProposals, err := fifoqueue.NewFifoQueue(defaultProposalQueueCapacity)
	if err != nil {
		return nil, fmt.Errorf("could not initialize blocks queue: %w", err)
	}
	ownOutboundTimeouts, err := fifoqueue.NewFifoQueue(defaultTimeoutQueueCapacity)
	if err != nil {
		return nil, fmt.Errorf("could not initialize timeouts queue: %w", err)
	}
	hub := &MessageHub{
		log:                        log.With().Str("engine", "message_hub").Logger(),
		me:                         me,
		engineMetrics:              engineMetrics,
		state:                      state,
		payloads:                   payloads,
		compliance:                 compliance,
		hotstuff:                   hotstuff,
		voteAggregator:             voteAggregator,
		timeoutAggregator:          timeoutAggregator,
		ownOutboundMessageNotifier: engine.NewNotifier(),
		ownOutboundVotes:           ownOutboundVotes,
		ownOutboundProposals:       ownOutboundProposals,
		ownOutboundTimeouts:        ownOutboundTimeouts,
	}

	// register with the network layer and store the conduit
	hub.con, err = net.Register(channels.ConsensusCommittee, hub)
	if err != nil {
		return nil, fmt.Errorf("could not register core: %w", err)
	}

	// also register for the push-blocks channel and store the conduit
	hub.pushBlocksCon, err = net.Register(channels.PushBlocks, hub)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}

	componentBuilder := component.NewComponentManagerBuilder()
	// This implementation tolerates the networking layer occasionally blocking on send requests.
	// By default we use 5 goroutines here. This is fine, because outbound messages are temporally sparse
	// under normal operations. Hence, the goroutines should mostly be asleep waiting for work.
	for i := 0; i < defaultMessageHubRequestsWorkers; i++ {
		componentBuilder.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
			ready()
			hub.queuedMessagesProcessingLoop(ctx)
		})
	}
	hub.ComponentManager = componentBuilder.Build()
	return hub, nil
}

// queuedMessagesProcessingLoop orchestrates dispatching of previously queued messages
func (h *MessageHub) queuedMessagesProcessingLoop(ctx irrecoverable.SignalerContext) {
	notifier := h.ownOutboundMessageNotifier.Channel()
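	// Note: the notifier coalesces repeated signals, so a single wake-up may correspond to
	// several queued messages; sendOwnMessages drains all queues before blocking again.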
	for {
		select {
		case <-ctx.Done():
			return
		case <-notifier:
			err := h.sendOwnMessages(ctx)
			if err != nil {
				ctx.Throw(fmt.Errorf("internal error processing queued messages: %w", err))
				return
			}
		}
	}
}

// sendOwnMessages dispatches previously queued messages on the worker thread.
// This function is called whenever we have queued messages ready to be dispatched.
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnMessages(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return nil
		default:
		}

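		// Drain the queues in priority order: proposals first, then votes, then timeouts.
		// Each iteration pops and sends at most one message before re-checking the context.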
		msg, ok := h.ownOutboundProposals.Pop()
		if ok {
			block := msg.(*flow.Header)
			err := h.sendOwnProposal(block)
			if err != nil {
				return fmt.Errorf("could not process queued block %v: %w", block.ID(), err)
			}
			continue
		}

		msg, ok = h.ownOutboundVotes.Pop()
		if ok {
			packed := msg.(*packedVote)
			err := h.sendOwnVote(packed)
			if err != nil {
				return fmt.Errorf("could not process queued vote: %w", err)
			}
			continue
		}

		msg, ok = h.ownOutboundTimeouts.Pop()
		if ok {
			err := h.sendOwnTimeout(msg.(*model.TimeoutObject))
			if err != nil {
				return fmt.Errorf("could not process queued timeout: %w", err)
			}
			continue
		}

		// when there are no more messages in the queues, return to the processing
		// loop and wait for the next notification to arrive.
		return nil
	}
}

// sendOwnTimeout propagates the timeout to the consensus committee (excluding myself)
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnTimeout(timeout *model.TimeoutObject) error {
	log := timeout.LogContext(h.log).Logger()
	log.Info().Msg("processing timeout broadcast request from hotstuff")

	// Retrieve all consensus nodes (excluding myself).
	// CAUTION: We must include consensus nodes that are joining, because otherwise
	//          TCs might not be constructed at epoch switchover.
	recipients, err := h.state.Final().Identities(filter.And(
		filter.IsValidCurrentEpochParticipantOrJoining,
		filter.HasRole[flow.Identity](flow.RoleConsensus),
		filter.Not(filter.HasNodeID[flow.Identity](h.me.NodeID())),
	))
	if err != nil {
		return fmt.Errorf("could not get consensus recipients for broadcasting timeout: %w", err)
	}

	// create the timeout message
	msg := &messages.TimeoutObject{
		View:        timeout.View,
		NewestQC:    timeout.NewestQC,
		LastViewTC:  timeout.LastViewTC,
		SigData:     timeout.SigData,
		TimeoutTick: timeout.TimeoutTick,
	}
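	// A failed broadcast is benign from this node's perspective: it is logged (unless the
	// target list is empty) and the error is not propagated to the caller.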
	err = h.con.Publish(msg, recipients.NodeIDs()...)
	if err != nil {
		if !errors.Is(err, network.EmptyTargetList) {
			log.Err(err).Msg("could not broadcast timeout")
		}
		return nil
	}
	log.Info().Msg("consensus timeout was broadcast")
	h.engineMetrics.MessageSent(metrics.EngineConsensusMessageHub, metrics.MessageTimeoutObject)

	return nil
}

// sendOwnVote propagates the vote via unicast to another node that is the next leader
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnVote(packed *packedVote) error {
	log := h.log.With().
		Hex("block_id", packed.vote.BlockID[:]).
		Uint64("block_view", packed.vote.View).
		Hex("recipient_id", packed.recipientID[:]).
		Logger()
	log.Info().Msg("processing vote transmission request from hotstuff")

	// send the vote to the desired recipient
	err := h.con.Unicast(packed.vote, packed.recipientID)
	if err != nil {
		log.Err(err).Msg("could not send vote")
		return nil
	}
	h.engineMetrics.MessageSent(metrics.EngineConsensusMessageHub, metrics.MessageBlockVote)
	log.Info().Msg("block vote transmitted")

	return nil
}

// sendOwnProposal propagates the block proposal to the consensus committee and submits it to the non-consensus network:
//   - broadcast to all other consensus participants (excluding myself)
//   - broadcast to all non-consensus participants
//
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnProposal(header *flow.Header) error {
	// first, check that we are the proposer of the block
	if header.ProposerID != h.me.NodeID() {
		return fmt.Errorf("cannot broadcast proposal with non-local proposer (%x)", header.ProposerID)
	}

	// retrieve the payload for the block
	payload, err := h.payloads.ByBlockID(header.ID())
	if err != nil {
		return fmt.Errorf("could not retrieve payload for proposal: %w", err)
	}

	log := h.log.With().
		Str("chain_id", header.ChainID.String()).
		Uint64("block_height", header.Height).
		Uint64("block_view", header.View).
		Hex("block_id", logging.Entity(header)).
		Hex("parent_id", header.ParentID[:]).
		Hex("payload_hash", header.PayloadHash[:]).
		Int("guarantees_count", len(payload.Guarantees)).
		Int("seals_count", len(payload.Seals)).
		Int("receipts_count", len(payload.Receipts)).
		Time("timestamp", header.Timestamp).
		Hex("signers", header.ParentVoterIndices).
		//Dur("delay", delay).
		Logger()

	log.Debug().Msg("processing proposal broadcast request from hotstuff")

	// Retrieve all consensus nodes (excluding myself).
	// CAUTION: We must also include nodes that are joining, because otherwise new consensus
	//          nodes for the next epoch are left out. As most nodes might be interested in
	//          new proposals, we simply broadcast to all non-ejected nodes (excluding myself).
	// Note: retrieving the identities requires a potentially time-intensive database read.
	//       Therefore, this runs on a worker routine rather than in `OnOwnProposal`, which is
	//       called directly by the consensus core logic.
	allIdentities, err := h.state.AtBlockID(header.ParentID).Identities(filter.And(
		filter.Not(filter.HasParticipationStatus(flow.EpochParticipationStatusEjected)),
		filter.Not(filter.HasNodeID[flow.Identity](h.me.NodeID())),
	))
	if err != nil {
		return fmt.Errorf("could not get identities for broadcasting proposal: %w", err)
	}

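	// Consensus nodes receive the proposal on the consensus channel below; all remaining
	// (non-ejected, non-consensus) nodes receive it via the push-blocks channel in provideProposal.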
	consRecipients := allIdentities.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))

	// NOTE: some fields are not needed for the message
	// - proposer ID is conveyed over the network message
	// - the payload hash is deduced from the payload
	proposal := messages.NewBlockProposal(&flow.Block{
		Header:  header,
		Payload: payload,
	})

	// broadcast the proposal to consensus nodes
	err = h.con.Publish(proposal, consRecipients.NodeIDs()...)
	if err != nil {
		if !errors.Is(err, network.EmptyTargetList) {
			log.Err(err).Msg("could not send proposal message")
		}
		return nil
	}
	log.Info().Msg("block proposal was broadcast")

	// submit proposal to non-consensus nodes
	h.provideProposal(proposal, allIdentities.Filter(filter.Not(filter.HasRole[flow.Identity](flow.RoleConsensus))))
	h.engineMetrics.MessageSent(metrics.EngineConsensusMessageHub, metrics.MessageBlockProposal)

	return nil
}

// provideProposal is used when we want to broadcast a local block to the rest of the
// network (non-consensus nodes).
func (h *MessageHub) provideProposal(proposal *messages.BlockProposal, recipients flow.IdentityList) {
	header := proposal.Block.Header
	blockID := header.ID()
	log := h.log.With().
		Uint64("block_view", header.View).
		Hex("block_id", blockID[:]).
		Hex("parent_id", header.ParentID[:]).
		Logger()
	log.Info().Msg("block proposal submitted for propagation")

	// submit the block to the targets
	err := h.pushBlocksCon.Publish(proposal, recipients.NodeIDs()...)
	if err != nil {
		h.log.Err(err).Msg("failed to broadcast block")
		return
	}

	log.Info().Msg("block proposal propagated to non-consensus nodes")
}

// OnOwnVote propagates the vote to the relevant recipient(s):
//   - [common case] the vote is queued and later sent by a worker via unicast to the node that is the next leader
//   - [special case] this node is the next leader: the vote is directly forwarded to the node's internal `VoteAggregator`
func (h *MessageHub) OnOwnVote(blockID flow.Identifier, view uint64, sigData []byte, recipientID flow.Identifier) {
	vote := &messages.BlockVote{
		BlockID: blockID,
		View:    view,
		SigData: sigData,
	}

	// special case: I am the next leader
	if recipientID == h.me.NodeID() {
		h.forwardToOwnVoteAggregator(vote, h.me.NodeID()) // forward vote to my own `voteAggregator`
		return
	}

	// common case: someone else is leader
	packed := &packedVote{
		recipientID: recipientID,
		vote:        vote,
	}
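	// Push is non-blocking: it returns false when the queue is full, in which case the vote is
	// dropped and accounted for in metrics instead of blocking the HotStuff caller.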
	if ok := h.ownOutboundVotes.Push(packed); ok {
		h.ownOutboundMessageNotifier.Notify()
	} else {
		h.engineMetrics.OutboundMessageDropped(metrics.EngineConsensusMessageHub, metrics.MessageBlockVote)
	}
}

// OnOwnTimeout forwards the timeout to the node's internal `timeoutAggregator` and queues the timeout for
// subsequent propagation to all consensus participants (excluding this node).
func (h *MessageHub) OnOwnTimeout(timeout *model.TimeoutObject) {
	h.forwardToOwnTimeoutAggregator(timeout) // forward timeout to my own `timeoutAggregator`
	if ok := h.ownOutboundTimeouts.Push(timeout); ok {
		h.ownOutboundMessageNotifier.Notify()
	} else {
		h.engineMetrics.OutboundMessageDropped(metrics.EngineConsensusMessageHub, metrics.MessageTimeoutObject)
	}
}

// OnOwnProposal directly forwards the proposal to the HotStuff core logic (skipping the compliance engine, as we assume our
// own proposals to be correct) and queues the proposal for subsequent propagation to all other consensus participants (excluding this node).
// The proposal is only placed in the queue after the specified delay, or dropped on the shutdown signal.
func (h *MessageHub) OnOwnProposal(proposal *flow.Header, targetPublicationTime time.Time) {
	go func() {
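		// Wait until the target publication time is reached; abort if the component shuts down while waiting.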
		select {
		case <-time.After(time.Until(targetPublicationTime)):
		case <-h.ShutdownSignal():
			return
		}

		hotstuffProposal := model.ProposalFromFlow(proposal)
		// notify vote aggregator that a new block proposal is available, in case we are the next leader
		h.voteAggregator.AddBlock(hotstuffProposal) // non-blocking

		// TODO(active-pacemaker): replace with pub/sub?
		// submit proposal to our own processing pipeline
		h.hotstuff.SubmitProposal(hotstuffProposal) // non-blocking

		if ok := h.ownOutboundProposals.Push(proposal); ok {
			h.ownOutboundMessageNotifier.Notify()
		} else {
			h.engineMetrics.OutboundMessageDropped(metrics.EngineConsensusMessageHub, metrics.MessageBlockProposal)
		}
	}()
}

// Process handles incoming messages from the consensus channel. After matching the message by type, it forwards
// the message to the correct component for handling.
// No errors are expected during normal operations.
func (h *MessageHub) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
	switch msg := message.(type) {
	case *messages.BlockProposal:
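		// Unlike this node's own proposals, proposals received from other nodes are handed to the
		// compliance engine for full validation before they reach HotStuff.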
		h.compliance.OnBlockProposal(flow.Slashable[*messages.BlockProposal]{
			OriginID: originID,
			Message:  msg,
		})
	case *messages.BlockVote:
		h.forwardToOwnVoteAggregator(msg, originID)
	case *messages.TimeoutObject:
		t := &model.TimeoutObject{
			View:        msg.View,
			NewestQC:    msg.NewestQC,
			LastViewTC:  msg.LastViewTC,
			SignerID:    originID,
			SigData:     msg.SigData,
			TimeoutTick: msg.TimeoutTick,
		}
		h.forwardToOwnTimeoutAggregator(t)
	default:
		h.log.Warn().
			Bool(logging.KeySuspicious, true).
			Hex("origin_id", logging.ID(originID)).
			Str("message_type", fmt.Sprintf("%T", message)).
			Str("channel", channel.String()).
			Msgf("delivered unsupported message type")
	}
	return nil
}

// forwardToOwnVoteAggregator converts the vote to a generic `model.Vote`, logs the vote and forwards it to own `voteAggregator`.
// Per API convention, `voteAggregator` is non-blocking, hence, this call returns quickly.
func (h *MessageHub) forwardToOwnVoteAggregator(vote *messages.BlockVote, originID flow.Identifier) {
	h.engineMetrics.MessageReceived(metrics.EngineConsensusMessageHub, metrics.MessageBlockVote)
	v := &model.Vote{
		View:     vote.View,
		BlockID:  vote.BlockID,
		SignerID: originID,
		SigData:  vote.SigData,
	}
	h.log.Info().
		Uint64("block_view", v.View).
		Hex("block_id", v.BlockID[:]).
		Hex("voter", v.SignerID[:]).
		Str("vote_id", v.ID().String()).
		Msg("block vote received, forwarding block vote to hotstuff vote aggregator")
	h.voteAggregator.AddVote(v)
}

// forwardToOwnTimeoutAggregator logs the timeout and forwards it to own `timeoutAggregator`.
// Per API convention, `timeoutAggregator` is non-blocking, hence, this call returns quickly.
func (h *MessageHub) forwardToOwnTimeoutAggregator(t *model.TimeoutObject) {
	h.engineMetrics.MessageReceived(metrics.EngineConsensusMessageHub, metrics.MessageTimeoutObject)
	h.log.Info().
		Hex("origin_id", t.SignerID[:]).
		Uint64("view", t.View).
		Str("timeout_id", t.ID().String()).
		Msg("timeout received, forwarding timeout to hotstuff timeout aggregator")
	h.timeoutAggregator.AddTimeout(t)
}