github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/consensus/message_hub/message_hub.go

package message_hub

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/consensus/hotstuff"
	"github.com/onflow/flow-go/consensus/hotstuff/model"
	"github.com/onflow/flow-go/consensus/hotstuff/notifications"
	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/common/fifoqueue"
	"github.com/onflow/flow-go/engine/consensus"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/component"
	"github.com/onflow/flow-go/module/irrecoverable"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/protocol"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/logging"
)

// defaultMessageHubRequestsWorkers number of workers to dispatch events for requests
const defaultMessageHubRequestsWorkers = 5

// defaultProposalQueueCapacity number of pending outgoing proposals stored in queue
const defaultProposalQueueCapacity = 3

// defaultVoteQueueCapacity number of pending outgoing votes stored in queue
const defaultVoteQueueCapacity = 20

// defaultTimeoutQueueCapacity number of pending outgoing timeouts stored in queue
const defaultTimeoutQueueCapacity = 3

// packedVote is a helper structure to pack recipientID and vote into one structure to pass through fifoqueue.FifoQueue
type packedVote struct {
	recipientID flow.Identifier
	vote        *messages.BlockVote
}

// MessageHub is a central module for handling incoming and outgoing messages via the consensus channel.
// It performs message routing for incoming messages by matching them by type and sending them to the respective engine.
// For incoming messages, processing looks like this:
//
//	   +-------------------+      +------------+
//	-->| Consensus-Channel |----->| MessageHub |
//	   +-------------------+      +------+-----+
//	                         ------------|------------
//	   +------+---------+    |    +------+-----+    |    +------+------------+
//	   | VoteAggregator |----+    | Compliance |    +----| TimeoutAggregator |
//	   +----------------+         +------------+         +-------------------+
//	          vote                     block                  timeout object
//
// MessageHub acts as a communicator and handles hotstuff.Consumer communication events to send votes, broadcast timeouts
// and proposals. It is responsible for communication between consensus participants.
// It implements the hotstuff.Consumer interface and needs to be subscribed for notifications via pub/sub.
// All communicator events are handled on a worker thread to prevent the sender from blocking.
// For outgoing messages, the processing logic looks like this:
//
//	+-------------------+      +------------+      +----------+      +------------------------+
//	| Consensus-Channel |<-----| MessageHub |<-----| Consumer |<-----|        Hotstuff        |
//	+-------------------+      +------+-----+      +----------+      +------------------------+
//	                                         pub/sub        vote, timeout, proposal
//
// MessageHub is safe to use in a concurrent environment.
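//
// The sketch below illustrates how a node might wire the hub up at startup. It is only a rough
// outline: `hotstuffNotifier` and its `AddConsumer` method are placeholders for the actual pub/sub
// registration, and the surrounding variables are assumed to exist in the caller's scope.
//
//	hub, err := NewMessageHub(log, engineMetrics, net, me, compliance, hot, voteAggregator, timeoutAggregator, state, payloads)
//	if err != nil {
//		// handle construction error
//	}
//	hotstuffNotifier.AddConsumer(hub)          // subscribe the hub for CommunicatorConsumer events (pub/sub)
//	signalerCtx, _ := irrecoverable.WithSignaler(ctx)
//	hub.Start(signalerCtx)                     // launches the outbound-message workers
//	<-hub.Ready()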
type MessageHub struct {
	*component.ComponentManager
	notifications.NoopConsumer
	log                        zerolog.Logger
	me                         module.Local
	engineMetrics              module.EngineMetrics
	state                      protocol.State
	payloads                   storage.Payloads
	con                        network.Conduit
	pushBlocksCon              network.Conduit
	ownOutboundMessageNotifier engine.Notifier
	ownOutboundVotes           *fifoqueue.FifoQueue // queue for handling outgoing vote transmissions
	ownOutboundProposals       *fifoqueue.FifoQueue // queue for handling outgoing proposal transmissions
	ownOutboundTimeouts        *fifoqueue.FifoQueue // queue for handling outgoing timeout transmissions

	// injected dependencies
	compliance        consensus.Compliance       // handler of incoming block proposals
	hotstuff          module.HotStuff            // used to submit proposals that were previously broadcast
	voteAggregator    hotstuff.VoteAggregator    // handler of incoming votes
	timeoutAggregator hotstuff.TimeoutAggregator // handler of incoming timeouts
}

var _ network.MessageProcessor = (*MessageHub)(nil)
var _ hotstuff.CommunicatorConsumer = (*MessageHub)(nil)

// NewMessageHub constructs new instance of message hub
// No errors are expected during normal operations.
func NewMessageHub(log zerolog.Logger,
	engineMetrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	compliance consensus.Compliance,
	hotstuff module.HotStuff,
	voteAggregator hotstuff.VoteAggregator,
	timeoutAggregator hotstuff.TimeoutAggregator,
	state protocol.State,
	payloads storage.Payloads,
) (*MessageHub, error) {
	ownOutboundVotes, err := fifoqueue.NewFifoQueue(defaultVoteQueueCapacity)
	if err != nil {
		return nil, fmt.Errorf("could not initialize votes queue")
	}
	ownOutboundProposals, err := fifoqueue.NewFifoQueue(defaultProposalQueueCapacity)
	if err != nil {
		return nil, fmt.Errorf("could not initialize blocks queue")
	}
	ownOutboundTimeouts, err := fifoqueue.NewFifoQueue(defaultTimeoutQueueCapacity)
	if err != nil {
		return nil, fmt.Errorf("could not initialize timeouts queue")
	}
	hub := &MessageHub{
		log:                        log.With().Str("engine", "message_hub").Logger(),
		me:                         me,
		engineMetrics:              engineMetrics,
		state:                      state,
		payloads:                   payloads,
		compliance:                 compliance,
		hotstuff:                   hotstuff,
		voteAggregator:             voteAggregator,
		timeoutAggregator:          timeoutAggregator,
		ownOutboundMessageNotifier: engine.NewNotifier(),
		ownOutboundVotes:           ownOutboundVotes,
		ownOutboundProposals:       ownOutboundProposals,
		ownOutboundTimeouts:        ownOutboundTimeouts,
	}

	// register with the network layer and store the conduit
	hub.con, err = net.Register(channels.ConsensusCommittee, hub)
	if err != nil {
		return nil, fmt.Errorf("could not register core: %w", err)
	}

	// register with the network layer and store the conduit
	hub.pushBlocksCon, err = net.Register(channels.PushBlocks, hub)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}

	componentBuilder := component.NewComponentManagerBuilder()
	// This implementation tolerates if the networking layer sometimes blocks on send requests.
	// We use by default 5 go-routines here. This is fine, because outbound messages are temporally sparse
	// under normal operations. Hence, the go-routines should mostly be asleep waiting for work.
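	// Each of the workers started below blocks on the shared `ownOutboundMessageNotifier` channel and,
	// once signalled, drains the proposal, vote and timeout queues (see `queuedMessagesProcessingLoop`
	// and `sendOwnMessages`).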
	for i := 0; i < defaultMessageHubRequestsWorkers; i++ {
		componentBuilder.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
			ready()
			hub.queuedMessagesProcessingLoop(ctx)
		})
	}
	hub.ComponentManager = componentBuilder.Build()
	return hub, nil
}

// queuedMessagesProcessingLoop orchestrates dispatching of previously queued messages
func (h *MessageHub) queuedMessagesProcessingLoop(ctx irrecoverable.SignalerContext) {
	notifier := h.ownOutboundMessageNotifier.Channel()
	for {
		select {
		case <-ctx.Done():
			return
		case <-notifier:
			err := h.sendOwnMessages(ctx)
			if err != nil {
				ctx.Throw(fmt.Errorf("internal error processing queued messages: %w", err))
				return
			}
		}
	}
}

// sendOwnMessages dispatches previously queued messages on the worker thread.
// This function is called whenever we have queued messages ready to be dispatched.
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnMessages(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return nil
		default:
		}

		msg, ok := h.ownOutboundProposals.Pop()
		if ok {
			block := msg.(*flow.Header)
			err := h.sendOwnProposal(block)
			if err != nil {
				return fmt.Errorf("could not process queued block %v: %w", block.ID(), err)
			}
			continue
		}

		msg, ok = h.ownOutboundVotes.Pop()
		if ok {
			packed := msg.(*packedVote)
			err := h.sendOwnVote(packed)
			if err != nil {
				return fmt.Errorf("could not process queued vote: %w", err)
			}
			continue
		}

		msg, ok = h.ownOutboundTimeouts.Pop()
		if ok {
			err := h.sendOwnTimeout(msg.(*model.TimeoutObject))
			if err != nil {
				return fmt.Errorf("could not process queued timeout: %w", err)
			}
			continue
		}

		// when there are no more messages in the queue, return to the processing loop to wait
		// for the next incoming message to arrive.
		return nil
	}
}

// sendOwnTimeout propagates the timeout to the consensus committee (excluding myself)
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnTimeout(timeout *model.TimeoutObject) error {
	log := timeout.LogContext(h.log).Logger()
	log.Info().Msg("processing timeout broadcast request from hotstuff")

	// Retrieve all consensus nodes (excluding myself).
	// CAUTION: We must include consensus nodes that are joining, because otherwise
	// TCs might not be constructed at epoch switchover.
	recipients, err := h.state.Final().Identities(filter.And(
		filter.IsValidCurrentEpochParticipantOrJoining,
		filter.HasRole[flow.Identity](flow.RoleConsensus),
		filter.Not(filter.HasNodeID[flow.Identity](h.me.NodeID())),
	))
	if err != nil {
		return fmt.Errorf("could not get consensus recipients for broadcasting timeout: %w", err)
	}

	// create the timeout message
	msg := &messages.TimeoutObject{
		View:        timeout.View,
		NewestQC:    timeout.NewestQC,
		LastViewTC:  timeout.LastViewTC,
		SigData:     timeout.SigData,
		TimeoutTick: timeout.TimeoutTick,
	}
	err = h.con.Publish(msg, recipients.NodeIDs()...)
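	// Publish errors are not propagated: an `EmptyTargetList` (no other consensus nodes to address)
	// is silently tolerated, while any other networking error is merely logged below.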
	if err != nil {
		if !errors.Is(err, network.EmptyTargetList) {
			log.Err(err).Msg("could not broadcast timeout")
		}
		return nil
	}
	log.Info().Msg("consensus timeout was broadcast")
	h.engineMetrics.MessageSent(metrics.EngineConsensusMessageHub, metrics.MessageTimeoutObject)

	return nil
}

// sendOwnVote propagates the vote via unicast to another node that is the next leader
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnVote(packed *packedVote) error {
	log := h.log.With().
		Hex("block_id", packed.vote.BlockID[:]).
		Uint64("block_view", packed.vote.View).
		Hex("recipient_id", packed.recipientID[:]).
		Logger()
	log.Info().Msg("processing vote transmission request from hotstuff")

	// send the vote to the desired recipient
	err := h.con.Unicast(packed.vote, packed.recipientID)
	if err != nil {
		log.Err(err).Msg("could not send vote")
		return nil
	}
	h.engineMetrics.MessageSent(metrics.EngineConsensusMessageHub, metrics.MessageBlockVote)
	log.Info().Msg("block vote transmitted")

	return nil
}

// sendOwnProposal propagates the block proposal to the consensus committee and submits it to the non-consensus network:
//   - broadcast to all other consensus participants (excluding myself)
//   - broadcast to all non-consensus participants
//
// No errors are expected during normal operations.
func (h *MessageHub) sendOwnProposal(header *flow.Header) error {
	// first, check that we are the proposer of the block
	if header.ProposerID != h.me.NodeID() {
		return fmt.Errorf("cannot broadcast proposal with non-local proposer (%x)", header.ProposerID)
	}

	// retrieve the payload for the block
	payload, err := h.payloads.ByBlockID(header.ID())
	if err != nil {
		return fmt.Errorf("could not retrieve payload for proposal: %w", err)
	}

	log := h.log.With().
		Str("chain_id", header.ChainID.String()).
		Uint64("block_height", header.Height).
		Uint64("block_view", header.View).
		Hex("block_id", logging.Entity(header)).
		Hex("parent_id", header.ParentID[:]).
		Hex("payload_hash", header.PayloadHash[:]).
		Int("guarantees_count", len(payload.Guarantees)).
		Int("seals_count", len(payload.Seals)).
		Int("receipts_count", len(payload.Receipts)).
		Time("timestamp", header.Timestamp).
		Hex("signers", header.ParentVoterIndices).
		//Dur("delay", delay).
		Logger()

	log.Debug().Msg("processing proposal broadcast request from hotstuff")

	// Retrieve all consensus nodes (excluding myself).
	// CAUTION: We must also include nodes that are joining, because otherwise new consensus
	// nodes for the next epoch are left out. As most nodes might be interested in
	// new proposals, we simply broadcast to all non-ejected nodes (excluding myself).
	// Note: retrieving the final state requires a time-intensive database read.
	// Therefore, we execute this in a separate routine, because
	// `OnOwnTimeout` is directly called by the consensus core logic.
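	// We query the identities as of the proposal's parent block, so the recipient set reflects
	// the protocol state the new block builds on.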
	allIdentities, err := h.state.AtBlockID(header.ParentID).Identities(filter.And(
		filter.Not(filter.HasParticipationStatus(flow.EpochParticipationStatusEjected)),
		filter.Not(filter.HasNodeID[flow.Identity](h.me.NodeID())),
	))
	if err != nil {
		return fmt.Errorf("could not get identities for broadcasting proposal: %w", err)
	}

	consRecipients := allIdentities.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))

	// NOTE: some fields are not needed for the message
	// - proposer ID is conveyed over the network message
	// - the payload hash is deduced from the payload
	proposal := messages.NewBlockProposal(&flow.Block{
		Header:  header,
		Payload: payload,
	})

	// broadcast the proposal to consensus nodes
	err = h.con.Publish(proposal, consRecipients.NodeIDs()...)
	if err != nil {
		if !errors.Is(err, network.EmptyTargetList) {
			log.Err(err).Msg("could not send proposal message")
		}
		return nil
	}
	log.Info().Msg("block proposal was broadcast")

	// submit proposal to non-consensus nodes
	h.provideProposal(proposal, allIdentities.Filter(filter.Not(filter.HasRole[flow.Identity](flow.RoleConsensus))))
	h.engineMetrics.MessageSent(metrics.EngineConsensusMessageHub, metrics.MessageBlockProposal)

	return nil
}

// provideProposal is used when we want to broadcast a local block to the rest of the
// network (non-consensus nodes).
func (h *MessageHub) provideProposal(proposal *messages.BlockProposal, recipients flow.IdentityList) {
	header := proposal.Block.Header
	blockID := header.ID()
	log := h.log.With().
		Uint64("block_view", header.View).
		Hex("block_id", blockID[:]).
		Hex("parent_id", header.ParentID[:]).
		Logger()
	log.Info().Msg("block proposal submitted for propagation")

	// submit the block to the targets
	err := h.pushBlocksCon.Publish(proposal, recipients.NodeIDs()...)
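	// Distribution to non-consensus nodes is best effort: on error we only log and return, since the
	// proposal has already been broadcast to the consensus committee by the caller.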
	if err != nil {
		h.log.Err(err).Msg("failed to broadcast block")
		return
	}

	log.Info().Msg("block proposal propagated to non-consensus nodes")
}

// OnOwnVote propagates the vote to the relevant recipient(s):
//   - [common case] the vote is queued and later sent by a worker via unicast to the node that is the next leader
//   - [special case] this node is the next leader: the vote is directly forwarded to the node's internal `VoteAggregator`
func (h *MessageHub) OnOwnVote(blockID flow.Identifier, view uint64, sigData []byte, recipientID flow.Identifier) {
	vote := &messages.BlockVote{
		BlockID: blockID,
		View:    view,
		SigData: sigData,
	}

	// special case: I am the next leader
	if recipientID == h.me.NodeID() {
		h.forwardToOwnVoteAggregator(vote, h.me.NodeID()) // forward vote to my own `voteAggregator`
		return
	}

	// common case: someone else is leader
	packed := &packedVote{
		recipientID: recipientID,
		vote:        vote,
	}
	if ok := h.ownOutboundVotes.Push(packed); ok {
		h.ownOutboundMessageNotifier.Notify()
	} else {
		h.engineMetrics.OutboundMessageDropped(metrics.EngineConsensusMessageHub, metrics.MessageBlockVote)
	}
}

// OnOwnTimeout forwards the timeout to the node's internal `timeoutAggregator` and queues the timeout for
// subsequent propagation to all consensus participants (excluding this node).
func (h *MessageHub) OnOwnTimeout(timeout *model.TimeoutObject) {
	h.forwardToOwnTimeoutAggregator(timeout) // forward timeout to my own `timeoutAggregator`
	if ok := h.ownOutboundTimeouts.Push(timeout); ok {
		h.ownOutboundMessageNotifier.Notify()
	} else {
		h.engineMetrics.OutboundMessageDropped(metrics.EngineConsensusMessageHub, metrics.MessageTimeoutObject)
	}
}

// OnOwnProposal directly forwards the proposal to the HotStuff core logic (skipping the compliance engine, as we assume our
// own proposals to be correct) and queues the proposal for subsequent propagation to all consensus participants (including this node).
// The proposal will only be placed in the queue after the specified delay (or dropped on shutdown signal).
func (h *MessageHub) OnOwnProposal(proposal *flow.Header, targetPublicationTime time.Time) {
	go func() {
		select {
		case <-time.After(time.Until(targetPublicationTime)):
		case <-h.ShutdownSignal():
			return
		}

		hotstuffProposal := model.ProposalFromFlow(proposal)
		// notify the vote aggregator that a new block proposal is available, in case we are the next leader
		h.voteAggregator.AddBlock(hotstuffProposal) // non-blocking

		// TODO(active-pacemaker): replace with pub/sub?
		// submit proposal to our own processing pipeline
		h.hotstuff.SubmitProposal(hotstuffProposal) // non-blocking

		if ok := h.ownOutboundProposals.Push(proposal); ok {
			h.ownOutboundMessageNotifier.Notify()
		} else {
			h.engineMetrics.OutboundMessageDropped(metrics.EngineConsensusMessageHub, metrics.MessageBlockProposal)
		}
	}()
}

// Process handles incoming messages from the consensus channel. After matching the message by type, it is sent to the correct
// component for handling.
// No errors are expected during normal operations.
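// Expected message types and their handlers:
//   - *messages.BlockProposal -> compliance engine (`OnBlockProposal`)
//   - *messages.BlockVote     -> `voteAggregator` (via `forwardToOwnVoteAggregator`)
//   - *messages.TimeoutObject -> `timeoutAggregator` (via `forwardToOwnTimeoutAggregator`)
//
// Any other message type is logged as suspicious and dropped.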
func (h *MessageHub) Process(channel channels.Channel, originID flow.Identifier, message interface{}) error {
	switch msg := message.(type) {
	case *messages.BlockProposal:
		h.compliance.OnBlockProposal(flow.Slashable[*messages.BlockProposal]{
			OriginID: originID,
			Message:  msg,
		})
	case *messages.BlockVote:
		h.forwardToOwnVoteAggregator(msg, originID)
	case *messages.TimeoutObject:
		t := &model.TimeoutObject{
			View:        msg.View,
			NewestQC:    msg.NewestQC,
			LastViewTC:  msg.LastViewTC,
			SignerID:    originID,
			SigData:     msg.SigData,
			TimeoutTick: msg.TimeoutTick,
		}
		h.forwardToOwnTimeoutAggregator(t)
	default:
		h.log.Warn().
			Bool(logging.KeySuspicious, true).
			Hex("origin_id", logging.ID(originID)).
			Str("message_type", fmt.Sprintf("%T", message)).
			Str("channel", channel.String()).
			Msgf("delivered unsupported message type")
	}
	return nil
}

// forwardToOwnVoteAggregator converts the vote to a generic `model.Vote`, logs the vote and forwards it to own `voteAggregator`.
// Per API convention, `voteAggregator` is non-blocking, hence this call returns quickly.
func (h *MessageHub) forwardToOwnVoteAggregator(vote *messages.BlockVote, originID flow.Identifier) {
	h.engineMetrics.MessageReceived(metrics.EngineConsensusMessageHub, metrics.MessageBlockVote)
	v := &model.Vote{
		View:     vote.View,
		BlockID:  vote.BlockID,
		SignerID: originID,
		SigData:  vote.SigData,
	}
	h.log.Info().
		Uint64("block_view", v.View).
		Hex("block_id", v.BlockID[:]).
		Hex("voter", v.SignerID[:]).
		Str("vote_id", v.ID().String()).
		Msg("block vote received, forwarding block vote to hotstuff vote aggregator")
	h.voteAggregator.AddVote(v)
}

// forwardToOwnTimeoutAggregator logs the timeout and forwards it to own `timeoutAggregator`.
// Per API convention, `timeoutAggregator` is non-blocking, hence this call returns quickly.
func (h *MessageHub) forwardToOwnTimeoutAggregator(t *model.TimeoutObject) {
	h.engineMetrics.MessageReceived(metrics.EngineConsensusMessageHub, metrics.MessageTimeoutObject)
	h.log.Info().
		Hex("origin_id", t.SignerID[:]).
		Uint64("view", t.View).
		Str("timeout_id", t.ID().String()).
		Msg("timeout received, forwarding timeout to hotstuff timeout aggregator")
	h.timeoutAggregator.AddTimeout(t)
}
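
// The snippet below illustrates how the networking layer delivers an incoming vote to the hub via
// `Process`. It is a sketch only: `hub`, `originID`, `blockID`, `view` and `sigData` are assumed to
// exist in the caller's scope and are not defined in this package.
//
//	vote := &messages.BlockVote{BlockID: blockID, View: view, SigData: sigData}
//	err := hub.Process(channels.ConsensusCommittee, originID, vote)
//	if err != nil {
//		// Process returns no errors during normal operations
//	}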