github.com/onflow/flow-go@v0.33.17/engine/collection/message_hub/message_hub.go

     1  package message_hub
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog"
    10  
    11  	"github.com/onflow/flow-go/consensus/hotstuff"
    12  	"github.com/onflow/flow-go/consensus/hotstuff/model"
    13  	"github.com/onflow/flow-go/consensus/hotstuff/notifications"
    14  	"github.com/onflow/flow-go/engine"
    15  	"github.com/onflow/flow-go/engine/collection"
    16  	"github.com/onflow/flow-go/engine/common/fifoqueue"
    17  	"github.com/onflow/flow-go/model/cluster"
    18  	"github.com/onflow/flow-go/model/flow"
    19  	"github.com/onflow/flow-go/model/flow/filter"
    20  	"github.com/onflow/flow-go/model/messages"
    21  	"github.com/onflow/flow-go/module"
    22  	"github.com/onflow/flow-go/module/component"
    23  	"github.com/onflow/flow-go/module/irrecoverable"
    24  	"github.com/onflow/flow-go/module/metrics"
    25  	"github.com/onflow/flow-go/network"
    26  	"github.com/onflow/flow-go/network/channels"
    27  	clusterkv "github.com/onflow/flow-go/state/cluster"
    28  	"github.com/onflow/flow-go/state/protocol"
    29  	"github.com/onflow/flow-go/storage"
    30  	"github.com/onflow/flow-go/utils/logging"
    31  )
    32  
    33  // defaultMessageHubRequestsWorkers is the number of worker routines that dispatch queued outbound messages
    34  const defaultMessageHubRequestsWorkers = 5
    35  
    36  // defaultProposalQueueCapacity is the capacity of the queue for pending outgoing proposals
    37  const defaultProposalQueueCapacity = 3
    38  
    39  // defaultVoteQueueCapacity is the capacity of the queue for pending outgoing votes
    40  const defaultVoteQueueCapacity = 20
    41  
    42  // defaultTimeoutQueueCapacity is the capacity of the queue for pending outgoing timeouts
    43  const defaultTimeoutQueueCapacity = 3
    44  
    45  // packedVote is a helper structure that packs a recipientID and a vote together, so they can be passed through fifoqueue.FifoQueue as a single item
    46  type packedVote struct {
    47  	recipientID flow.Identifier
    48  	vote        *messages.ClusterBlockVote
    49  }
    50  
    51  // MessageHub is a central module for handling incoming and outgoing messages via the cluster consensus channel.
    52  // It routes incoming messages by matching them by type and forwarding them to the respective component.
    53  // For incoming messages, processing looks like this:
    54  //
    55  //	   +-------------------+      +------------+
    56  //	-->|  Cluster-Channel  |----->| MessageHub |
    57  //	   +-------------------+      +------+-----+
    58  //	                         ------------|------------
    59  //	   +------+---------+    |    +------+-----+     |    +------+------------+
    60  //	   | VoteAggregator |----+    | Compliance |     +----| TimeoutAggregator |
    61  //	   +----------------+         +------------+          +------+------------+
    62  //	          vote                     block                  timeout object
    63  //
    64  // MessageHub also acts as a communicator: it handles HotStuff communication events to send votes, and to broadcast
    65  // timeouts and proposals. It is responsible for communication between cluster consensus participants.
    66  // It implements the hotstuff.CommunicatorConsumer interface and needs to be subscribed for notifications via pub/sub.
    67  // All communicator events are handled on worker threads to prevent the sender from blocking.
    68  // For outgoing messages, the processing logic looks like this:
    69  //
    70  //	+-------------------+      +------------+      +----------+      +------------------------+
    71  //	|  Cluster-Channel  |<-----| MessageHub |<-----| Consumer |<-----|        Hotstuff        |
    72  //	+-------------------+      +------+-----+      +----------+      +------------------------+
    73  //	                                                  pub/sub          vote, timeout, proposal
    74  //
    75  // MessageHub is safe to use in a concurrent environment.
    76  type MessageHub struct {
    77  	*component.ComponentManager
    78  	notifications.NoopConsumer
    79  	log                        zerolog.Logger
    80  	me                         module.Local
    81  	engineMetrics              module.EngineMetrics
    82  	state                      protocol.State
    83  	payloads                   storage.ClusterPayloads
    84  	con                        network.Conduit
    85  	ownOutboundMessageNotifier engine.Notifier
    86  	ownOutboundVotes           *fifoqueue.FifoQueue // queue for handling outgoing vote transmissions
    87  	ownOutboundProposals       *fifoqueue.FifoQueue // queue for handling outgoing proposal transmissions
    88  	ownOutboundTimeouts        *fifoqueue.FifoQueue // queue for handling outgoing timeout transmissions
    89  	clusterIdentityFilter      flow.IdentityFilter
    90  
    91  	// injected dependencies
    92  	compliance        collection.Compliance      // handler of incoming block proposals
    93  	hotstuff          module.HotStuff            // used to submit proposals that were previously broadcast
    94  	voteAggregator    hotstuff.VoteAggregator    // handler of incoming votes
    95  	timeoutAggregator hotstuff.TimeoutAggregator // handler of incoming timeouts
    96  }
    97  
    98  var _ network.MessageProcessor = (*MessageHub)(nil)
    99  var _ hotstuff.CommunicatorConsumer = (*MessageHub)(nil)
   100  
   101  // NewMessageHub constructs a new instance of the message hub.
   102  // No errors are expected during normal operations.
   103  func NewMessageHub(log zerolog.Logger,
   104  	engineMetrics module.EngineMetrics,
   105  	net network.EngineRegistry,
   106  	me module.Local,
   107  	compliance collection.Compliance,
   108  	hotstuff module.HotStuff,
   109  	voteAggregator hotstuff.VoteAggregator,
   110  	timeoutAggregator hotstuff.TimeoutAggregator,
   111  	state protocol.State,
   112  	clusterState clusterkv.State,
   113  	payloads storage.ClusterPayloads,
   114  ) (*MessageHub, error) {
   115  	// find my cluster for the current epoch
   116  	// TODO this should flow from cluster state as source of truth
   117  	clusters, err := state.Final().Epochs().Current().Clustering()
   118  	if err != nil {
   119  		return nil, fmt.Errorf("could not get clusters: %w", err)
   120  	}
   121  	currentCluster, _, found := clusters.ByNodeID(me.NodeID())
   122  	if !found {
   123  		return nil, fmt.Errorf("could not find cluster for self")
   124  	}
   125  
   126  	ownOutboundVotes, err := fifoqueue.NewFifoQueue(defaultVoteQueueCapacity)
   127  	if err != nil {
   128  		return nil, fmt.Errorf("could not initialize votes queue: %w", err)
   129  	}
   130  	ownOutboundProposals, err := fifoqueue.NewFifoQueue(defaultProposalQueueCapacity)
   131  	if err != nil {
   132  		return nil, fmt.Errorf("could not initialize proposals queue: %w", err)
   133  	}
   134  	ownOutboundTimeouts, err := fifoqueue.NewFifoQueue(defaultTimeoutQueueCapacity)
   135  	if err != nil {
   136  		return nil, fmt.Errorf("could not initialize timeouts queue: %w", err)
   137  	}
   138  	hub := &MessageHub{
   139  		log:                        log.With().Str("engine", "cluster_message_hub").Logger(),
   140  		me:                         me,
   141  		engineMetrics:              engineMetrics,
   142  		state:                      state,
   143  		payloads:                   payloads,
   144  		compliance:                 compliance,
   145  		hotstuff:                   hotstuff,
   146  		voteAggregator:             voteAggregator,
   147  		timeoutAggregator:          timeoutAggregator,
   148  		ownOutboundMessageNotifier: engine.NewNotifier(),
   149  		ownOutboundVotes:           ownOutboundVotes,
   150  		ownOutboundProposals:       ownOutboundProposals,
   151  		ownOutboundTimeouts:        ownOutboundTimeouts,
   152  		clusterIdentityFilter: filter.And(
   153  			filter.In(currentCluster),
   154  			filter.Not(filter.HasNodeID(me.NodeID())),
   155  		),
   156  	}
   157  
   158  	// register network conduit
   159  	chainID, err := clusterState.Params().ChainID()
   160  	if err != nil {
   161  		return nil, fmt.Errorf("could not get chain ID: %w", err)
   162  	}
   163  	conduit, err := net.Register(channels.ConsensusCluster(chainID), hub)
   164  	if err != nil {
   165  		return nil, fmt.Errorf("could not register engine: %w", err)
   166  	}
   167  	hub.con = conduit
   168  
   169  	componentBuilder := component.NewComponentManagerBuilder()
   170  	// This implementation tolerates the networking layer occasionally blocking on send requests.
   171  	// By default we use 5 goroutines here (defaultMessageHubRequestsWorkers). This is fine, because outbound
   172  	// messages are temporally sparse under normal operations. Hence, the goroutines should mostly be asleep, waiting for work.
   173  	for i := 0; i < defaultMessageHubRequestsWorkers; i++ {
   174  		componentBuilder.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
   175  			ready()
   176  			hub.queuedMessagesProcessingLoop(ctx)
   177  		})
   178  	}
   179  	hub.ComponentManager = componentBuilder.Build()
   180  	return hub, nil
   181  }
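
        // Illustrative wiring sketch (assumptions, not part of this file): the variable names below are
        // placeholders, and the exact pub/sub hook for subscribing the hub to HotStuff's communicator
        // notifications depends on the node builder.
        //
        //	hub, err := NewMessageHub(log, engineMetrics, net, me, comp, hot, voteAggregator, timeoutAggregator, state, clusterState, payloads)
        //	if err != nil {
        //		// handle construction error
        //	}
        //	notificationsDistributor.AddCommunicatorConsumer(hub) // assumed subscription hook for hotstuff.CommunicatorConsumer
        //	hub.Start(signalerCtx)                                // starts the worker routines via component.ComponentManager
        //	<-hub.Ready()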
   182  
   183  // queuedMessagesProcessingLoop orchestrates dispatching of previously queued messages
   184  func (h *MessageHub) queuedMessagesProcessingLoop(ctx irrecoverable.SignalerContext) {
   185  	notifier := h.ownOutboundMessageNotifier.Channel()
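        	// Note: engine.Notifier delivers at most one pending signal no matter how many Notify() calls occurred,
        	// so each wakeup of this loop must fully drain the queues, which sendOwnMessages does.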
   186  	for {
   187  		select {
   188  		case <-ctx.Done():
   189  			return
   190  		case <-notifier:
   191  			err := h.sendOwnMessages(ctx)
   192  			if err != nil {
   193  				ctx.Throw(fmt.Errorf("internal error processing queued messages: %w", err))
   194  				return
   195  			}
   196  		}
   197  	}
   198  }
   199  
   200  // sendOwnMessages dispatches previously queued messages on the worker thread.
   201  // It is called whenever queued messages are ready to be dispatched.
   202  // No errors are expected during normal operations.
   203  func (h *MessageHub) sendOwnMessages(ctx context.Context) error {
   204  	for {
   205  		select {
   206  		case <-ctx.Done():
   207  			return nil
   208  		default:
   209  		}
   210  
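        		// Pop and send queued messages in priority order: proposals first, then votes, then timeouts.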
   211  		msg, ok := h.ownOutboundProposals.Pop()
   212  		if ok {
   213  			block := msg.(*flow.Header)
   214  			err := h.sendOwnProposal(block)
   215  			if err != nil {
   216  				return fmt.Errorf("could not process queued block %v: %w", block.ID(), err)
   217  			}
   218  			continue
   219  		}
   220  
   221  		msg, ok = h.ownOutboundVotes.Pop()
   222  		if ok {
   223  			packed := msg.(*packedVote)
   224  			err := h.sendOwnVote(packed)
   225  			if err != nil {
   226  				return fmt.Errorf("could not process queued vote: %w", err)
   227  			}
   228  			continue
   229  		}
   230  
   231  		msg, ok = h.ownOutboundTimeouts.Pop()
   232  		if ok {
   233  			err := h.sendOwnTimeout(msg.(*model.TimeoutObject))
   234  			if err != nil {
   235  				return fmt.Errorf("could not process queued timeout: %w", err)
   236  			}
   237  			continue
   238  		}
   239  
   240  		// when there are no more messages in the queue, return so the processing loop
   241  		// can wait for the next notification to arrive.
   242  		return nil
   243  	}
   244  }
   245  
   246  // sendOwnTimeout propagates the timeout to the cluster consensus committee (excluding myself)
   247  // No errors are expected during normal operations.
   248  func (h *MessageHub) sendOwnTimeout(timeout *model.TimeoutObject) error {
   249  	log := timeout.LogContext(h.log).Logger()
   250  	log.Info().Msg("processing timeout broadcast request from hotstuff")
   251  
   252  	// Retrieve all collection nodes in our cluster (excluding myself).
   253  	recipients, err := h.state.Final().Identities(h.clusterIdentityFilter)
   254  	if err != nil {
   255  		return fmt.Errorf("could not get cluster members for broadcasting timeout: %w", err)
   256  	}
   257  	// create the timeout message
   258  	msg := &messages.ClusterTimeoutObject{
   259  		View:        timeout.View,
   260  		NewestQC:    timeout.NewestQC,
   261  		LastViewTC:  timeout.LastViewTC,
   262  		SigData:     timeout.SigData,
   263  		TimeoutTick: timeout.TimeoutTick,
   264  	}
   265  
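        	// Broadcast the timeout to the recipients selected above (all other cluster members).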
   266  	err = h.con.Publish(msg, recipients.NodeIDs()...)
   267  	if err != nil {
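        		// An empty recipient list is tolerated (it can occur legitimately, e.g. in a single-node cluster); only unexpected errors are logged.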
   268  		if !errors.Is(err, network.EmptyTargetList) {
   269  			log.Err(err).Msg("could not broadcast timeout")
   270  		}
   271  		return nil
   272  	}
   273  	log.Info().Msg("cluster timeout was broadcast")
   274  	h.engineMetrics.MessageSent(metrics.EngineCollectionMessageHub, metrics.MessageTimeoutObject)
   275  
   276  	return nil
   277  }
   278  
   279  // sendOwnVote propagates the vote via unicast to another node that is the next leader
   280  // No errors are expected during normal operations.
   281  func (h *MessageHub) sendOwnVote(packed *packedVote) error {
   282  	log := h.log.With().
   283  		Hex("collection_id", packed.vote.BlockID[:]).
   284  		Uint64("collection_view", packed.vote.View).
   285  		Hex("recipient_id", packed.recipientID[:]).
   286  		Logger()
   287  	log.Info().Msg("processing vote transmission request from hotstuff")
   288  
   289  	// send the vote to the desired recipient
   290  	err := h.con.Unicast(packed.vote, packed.recipientID)
   291  	if err != nil {
   292  		log.Err(err).Msg("could not send vote")
   293  		return nil
   294  	}
   295  	log.Info().Msg("collection vote transmitted")
   296  	h.engineMetrics.MessageSent(metrics.EngineCollectionMessageHub, metrics.MessageBlockVote)
   297  
   298  	return nil
   299  }
   300  
   301  // sendOwnProposal propagates the block proposal to the cluster consensus committee by broadcasting it to all other cluster participants
   302  // No errors are expected during normal operations.
   303  func (h *MessageHub) sendOwnProposal(header *flow.Header) error {
   304  	// first, check that we are the proposer of the block
   305  	if header.ProposerID != h.me.NodeID() {
   306  		return fmt.Errorf("cannot broadcast proposal with non-local proposer (%x)", header.ProposerID)
   307  	}
   308  
   309  	// retrieve the payload for the block
   310  	payload, err := h.payloads.ByBlockID(header.ID())
   311  	if err != nil {
   312  		return fmt.Errorf("could not retrieve payload for proposal: %w", err)
   313  	}
   314  
   315  	log := h.log.With().
   316  		Str("chain_id", header.ChainID.String()).
   317  		Uint64("block_height", header.Height).
   318  		Uint64("block_view", header.View).
   319  		Hex("block_id", logging.ID(header.ID())).
   320  		Hex("parent_id", header.ParentID[:]).
   321  		Hex("ref_block", payload.ReferenceBlockID[:]).
   322  		Int("transaction_count", payload.Collection.Len()).
   323  		Hex("parent_signer_indices", header.ParentVoterIndices).
   324  		Logger()
   325  
   326  	log.Debug().Msg("processing cluster broadcast request from hotstuff")
   327  
   328  	// retrieve all collection nodes in our cluster
   329  	recipients, err := h.state.Final().Identities(h.clusterIdentityFilter)
   330  	if err != nil {
   331  		return fmt.Errorf("could not get cluster members for broadcasting collection proposal: %w", err)
   332  	}
   333  
   334  	// create the proposal message for the collection
   335  	proposal := messages.NewClusterBlockProposal(&cluster.Block{
   336  		Header:  header,
   337  		Payload: payload,
   338  	})
   339  
   340  	// broadcast the proposal to all other cluster participants
   341  	err = h.con.Publish(proposal, recipients.NodeIDs()...)
   342  	if err != nil {
   343  		if !errors.Is(err, network.EmptyTargetList) {
   344  			log.Err(err).Msg("could not send proposal message")
   345  		}
   346  		return nil
   347  	}
   348  	log.Info().Msg("cluster proposal was broadcast")
   349  	h.engineMetrics.MessageSent(metrics.EngineCollectionMessageHub, metrics.MessageBlockProposal)
   350  
   351  	return nil
   352  }
   353  
   354  // OnOwnVote propagates the vote to relevant recipient(s):
   355  //   - [common case] the vote is queued and later sent by a worker via unicast to the next leader
   356  //   - [special case] this node is the next leader: vote is directly forwarded to the node's internal `VoteAggregator`
   357  func (h *MessageHub) OnOwnVote(blockID flow.Identifier, view uint64, sigData []byte, recipientID flow.Identifier) {
   358  	vote := &messages.ClusterBlockVote{
   359  		BlockID: blockID,
   360  		View:    view,
   361  		SigData: sigData,
   362  	}
   363  
   364  	// special case: I am the next leader
   365  	if recipientID == h.me.NodeID() {
   366  		h.forwardToOwnVoteAggregator(vote, h.me.NodeID()) // forward vote to my own `voteAggregator`
   367  		return
   368  	}
   369  
   370  	// common case: someone else is leader
   371  	packed := &packedVote{
   372  		recipientID: recipientID,
   373  		vote:        vote,
   374  	}
   375  	if ok := h.ownOutboundVotes.Push(packed); ok {
   376  		h.ownOutboundMessageNotifier.Notify()
   377  	} else {
   378  		h.engineMetrics.OutboundMessageDropped(metrics.EngineCollectionMessageHub, metrics.MessageBlockVote)
   379  	}
   380  }
   381  
   382  // OnOwnTimeout forwards the timeout to the node's internal `timeoutAggregator` and queues the timeout for
   383  // subsequent propagation to all cluster consensus participants (excluding this node).
   384  func (h *MessageHub) OnOwnTimeout(timeout *model.TimeoutObject) {
   385  	h.forwardToOwnTimeoutAggregator(timeout) // forward timeout to my own `timeoutAggregator`
   386  	if ok := h.ownOutboundTimeouts.Push(timeout); ok {
   387  		h.ownOutboundMessageNotifier.Notify()
   388  	} else {
   389  		h.engineMetrics.OutboundMessageDropped(metrics.EngineCollectionMessageHub, metrics.MessageTimeoutObject)
   390  	}
   391  }
   392  
   393  // OnOwnProposal directly forwards the proposal to HotStuff core logic (skipping the compliance engine, as we assume our
   394  // own proposals to be correct) and queues the proposal for subsequent propagation to all other cluster consensus participants (excluding this node).
   395  // The proposal is only placed in the queue after the specified delay (or dropped on shutdown signal).
   396  func (h *MessageHub) OnOwnProposal(proposal *flow.Header, targetPublicationTime time.Time) {
   397  	go func() {
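        		// Wait until the target publication time; if the hub shuts down first, drop the proposal without queueing it.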
   398  		select {
   399  		case <-time.After(time.Until(targetPublicationTime)):
   400  		case <-h.ShutdownSignal():
   401  			return
   402  		}
   403  
   404  		hotstuffProposal := model.ProposalFromFlow(proposal)
   405  		// notify the vote aggregator that a new block proposal is available, in case we are the next leader
   406  		h.voteAggregator.AddBlock(hotstuffProposal) // non-blocking
   407  
   408  		// TODO(active-pacemaker): replace with pub/sub?
   409  		// submit proposal to our own processing pipeline
   410  		h.hotstuff.SubmitProposal(hotstuffProposal) // non-blocking
   411  
   412  		if ok := h.ownOutboundProposals.Push(proposal); ok {
   413  			h.ownOutboundMessageNotifier.Notify()
   414  		} else {
   415  			h.engineMetrics.OutboundMessageDropped(metrics.EngineCollectionMessageHub, metrics.MessageBlockProposal)
   416  		}
   417  	}()
   418  }
   419  
   420  // Process handles incoming messages from the cluster consensus channel. After matching the message by type, it sends
   421  // the message to the correct component for handling.
   422  // No errors are expected during normal operations.
   423  func (h *MessageHub) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
   424  	switch msg := message.(type) {
   425  	case *messages.ClusterBlockProposal:
   426  		h.compliance.OnClusterBlockProposal(flow.Slashable[*messages.ClusterBlockProposal]{
   427  			OriginID: originID,
   428  			Message:  msg,
   429  		})
   430  	case *messages.ClusterBlockVote:
   431  		h.forwardToOwnVoteAggregator(msg, originID)
   432  	case *messages.ClusterTimeoutObject:
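        		// Convert the network-level timeout message into a hotstuff model.TimeoutObject, using the message origin as the signer ID.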
   433  		t := &model.TimeoutObject{
   434  			View:        msg.View,
   435  			NewestQC:    msg.NewestQC,
   436  			LastViewTC:  msg.LastViewTC,
   437  			SignerID:    originID,
   438  			SigData:     msg.SigData,
   439  			TimeoutTick: msg.TimeoutTick,
   440  		}
   441  		h.forwardToOwnTimeoutAggregator(t)
   442  	default:
   443  		h.log.Warn().
   444  			Bool(logging.KeySuspicious, true).
   445  			Hex("origin_id", logging.ID(originID)).
   446  			Str("message_type", fmt.Sprintf("%T", message)).
   447  			Str("channel", channel.String()).
   448  			Msg("delivered unsupported message type")
   449  	}
   450  	return nil
   451  }
   452  
   453  // forwardToOwnVoteAggregator converts the vote to a generic `model.Vote`, logs it and forwards it to our own `voteAggregator`.
   454  // Per API convention, `voteAggregator` is non-blocking, hence this call returns quickly.
   455  func (h *MessageHub) forwardToOwnVoteAggregator(vote *messages.ClusterBlockVote, originID flow.Identifier) {
   456  	h.engineMetrics.MessageReceived(metrics.EngineCollectionMessageHub, metrics.MessageBlockVote)
   457  	v := &model.Vote{
   458  		View:     vote.View,
   459  		BlockID:  vote.BlockID,
   460  		SignerID: originID,
   461  		SigData:  vote.SigData,
   462  	}
   463  	h.log.Info().
   464  		Uint64("block_view", v.View).
   465  		Hex("block_id", v.BlockID[:]).
   466  		Hex("voter", v.SignerID[:]).
   467  		Str("vote_id", v.ID().String()).
   468  		Msg("block vote received, forwarding block vote to hotstuff vote aggregator")
   469  	h.voteAggregator.AddVote(v)
   470  }
   471  
   472  // forwardToOwnTimeoutAggregator logs the timeout and forwards it to our own `timeoutAggregator`.
   473  // Per API convention, `timeoutAggregator` is non-blocking, hence this call returns quickly.
   474  func (h *MessageHub) forwardToOwnTimeoutAggregator(t *model.TimeoutObject) {
   475  	h.engineMetrics.MessageReceived(metrics.EngineCollectionMessageHub, metrics.MessageTimeoutObject)
   476  	h.log.Info().
   477  		Hex("origin_id", t.SignerID[:]).
   478  		Uint64("view", t.View).
   479  		Str("timeout_id", t.ID().String()).
   480  		Msg("timeout received, forwarding timeout to hotstuff timeout aggregator")
   481  	h.timeoutAggregator.AddTimeout(t)
   482  }