github.com/koko1123/flow-go-1@v0.29.6/engine/collection/compliance/engine.go (about) 1 package compliance 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "time" 8 9 "github.com/rs/zerolog" 10 11 "github.com/koko1123/flow-go-1/consensus/hotstuff/model" 12 "github.com/koko1123/flow-go-1/engine" 13 "github.com/koko1123/flow-go-1/engine/common/fifoqueue" 14 "github.com/koko1123/flow-go-1/engine/consensus/sealing/counters" 15 "github.com/koko1123/flow-go-1/model/cluster" 16 "github.com/koko1123/flow-go-1/model/events" 17 "github.com/koko1123/flow-go-1/model/flow" 18 "github.com/koko1123/flow-go-1/model/flow/filter" 19 "github.com/koko1123/flow-go-1/model/messages" 20 "github.com/koko1123/flow-go-1/module" 21 "github.com/koko1123/flow-go-1/module/irrecoverable" 22 "github.com/koko1123/flow-go-1/module/lifecycle" 23 "github.com/koko1123/flow-go-1/module/metrics" 24 "github.com/koko1123/flow-go-1/network" 25 "github.com/koko1123/flow-go-1/network/channels" 26 "github.com/koko1123/flow-go-1/state/protocol" 27 "github.com/koko1123/flow-go-1/storage" 28 "github.com/koko1123/flow-go-1/utils/logging" 29 ) 30 31 // defaultBlockQueueCapacity maximum capacity of block proposals queue 32 const defaultBlockQueueCapacity = 10000 33 34 // defaultVoteQueueCapacity maximum capacity of block votes queue 35 const defaultVoteQueueCapacity = 1000 36 37 // Engine is a wrapper struct for `Core` which implements cluster consensus algorithm. 38 // Engine is responsible for handling incoming messages, queueing for processing, broadcasting proposals. 39 type Engine struct { 40 unit *engine.Unit 41 lm *lifecycle.LifecycleManager 42 log zerolog.Logger 43 metrics module.EngineMetrics 44 me module.Local 45 headers storage.Headers 46 payloads storage.ClusterPayloads 47 state protocol.State 48 core *Core 49 pendingBlocks engine.MessageStore 50 pendingVotes engine.MessageStore 51 messageHandler *engine.MessageHandler 52 finalizedView counters.StrictMonotonousCounter 53 finalizationEventsNotifier engine.Notifier 54 con network.Conduit 55 stopHotstuff context.CancelFunc 56 cluster flow.IdentityList // consensus participants in our cluster 57 } 58 59 func NewEngine( 60 log zerolog.Logger, 61 net network.Network, 62 me module.Local, 63 state protocol.State, 64 payloads storage.ClusterPayloads, 65 core *Core, 66 ) (*Engine, error) { 67 engineLog := log.With().Str("cluster_compliance", "engine").Logger() 68 69 // find my cluster for the current epoch 70 // TODO this should flow from cluster state as source of truth 71 clusters, err := state.Final().Epochs().Current().Clustering() 72 if err != nil { 73 return nil, fmt.Errorf("could not get clusters: %w", err) 74 } 75 currentCluster, _, found := clusters.ByNodeID(me.NodeID()) 76 if !found { 77 return nil, fmt.Errorf("could not find cluster for self") 78 } 79 80 // FIFO queue for block proposals 81 blocksQueue, err := fifoqueue.NewFifoQueue( 82 defaultBlockQueueCapacity, 83 fifoqueue.WithLengthObserver(func(len int) { 84 core.mempoolMetrics.MempoolEntries(metrics.ResourceClusterBlockProposalQueue, uint(len)) 85 }), 86 ) 87 if err != nil { 88 return nil, fmt.Errorf("failed to create queue for inbound receipts: %w", err) 89 } 90 pendingBlocks := &engine.FifoMessageStore{ 91 FifoQueue: blocksQueue, 92 } 93 94 // FIFO queue for block votes 95 votesQueue, err := fifoqueue.NewFifoQueue( 96 defaultVoteQueueCapacity, 97 fifoqueue.WithLengthObserver(func(len int) { core.mempoolMetrics.MempoolEntries(metrics.ResourceClusterBlockVoteQueue, uint(len)) }), 98 ) 99 if err != nil { 100 return nil, fmt.Errorf("failed to create queue for inbound approvals: %w", err) 101 } 102 pendingVotes := &engine.FifoMessageStore{FifoQueue: votesQueue} 103 104 // define message queueing behaviour 105 handler := engine.NewMessageHandler( 106 engineLog, 107 engine.NewNotifier(), 108 engine.Pattern{ 109 Match: func(msg *engine.Message) bool { 110 _, ok := msg.Payload.(*messages.ClusterBlockProposal) 111 if ok { 112 core.metrics.MessageReceived(metrics.EngineClusterCompliance, metrics.MessageClusterBlockProposal) 113 } 114 return ok 115 }, 116 Store: pendingBlocks, 117 }, 118 engine.Pattern{ 119 Match: func(msg *engine.Message) bool { 120 _, ok := msg.Payload.(*events.SyncedClusterBlock) 121 if ok { 122 core.metrics.MessageReceived(metrics.EngineClusterCompliance, metrics.MessageSyncedClusterBlock) 123 } 124 return ok 125 }, 126 Map: func(msg *engine.Message) (*engine.Message, bool) { 127 syncedBlock := msg.Payload.(*events.SyncedClusterBlock) 128 msg = &engine.Message{ 129 OriginID: msg.OriginID, 130 Payload: &messages.ClusterBlockProposal{ 131 Block: syncedBlock.Block, 132 }, 133 } 134 return msg, true 135 }, 136 Store: pendingBlocks, 137 }, 138 engine.Pattern{ 139 Match: func(msg *engine.Message) bool { 140 _, ok := msg.Payload.(*messages.ClusterBlockVote) 141 if ok { 142 core.metrics.MessageReceived(metrics.EngineClusterCompliance, metrics.MessageClusterBlockVote) 143 } 144 return ok 145 }, 146 Store: pendingVotes, 147 }, 148 ) 149 150 eng := &Engine{ 151 unit: engine.NewUnit(), 152 lm: lifecycle.NewLifecycleManager(), 153 log: engineLog, 154 metrics: core.metrics, 155 me: me, 156 headers: core.headers, 157 payloads: payloads, 158 state: state, 159 core: core, 160 pendingBlocks: pendingBlocks, 161 pendingVotes: pendingVotes, 162 messageHandler: handler, 163 finalizationEventsNotifier: engine.NewNotifier(), 164 con: nil, 165 cluster: currentCluster, 166 } 167 168 chainID, err := core.state.Params().ChainID() 169 if err != nil { 170 return nil, fmt.Errorf("could not get chain ID: %w", err) 171 } 172 173 // register network conduit 174 conduit, err := net.Register(channels.ConsensusCluster(chainID), eng) 175 if err != nil { 176 return nil, fmt.Errorf("could not register engine: %w", err) 177 } 178 eng.con = conduit 179 180 return eng, nil 181 } 182 183 // WithConsensus adds the consensus algorithm to the engine. This must be 184 // called before the engine can start. 185 func (e *Engine) WithConsensus(hot module.HotStuff) *Engine { 186 e.core.hotstuff = hot 187 return e 188 } 189 190 // WithSync adds the block requester to the engine. This must be 191 // called before the engine can start. 192 func (e *Engine) WithSync(sync module.BlockRequester) *Engine { 193 e.core.sync = sync 194 return e 195 } 196 197 // Ready returns a ready channel that is closed once the engine has fully 198 // started. For consensus engine, this is true once the underlying consensus 199 // algorithm has started. 200 func (e *Engine) Ready() <-chan struct{} { 201 if e.core.hotstuff == nil { 202 panic("must initialize compliance engine with hotstuff engine") 203 } 204 e.lm.OnStart(func() { 205 e.unit.Launch(e.loop) 206 e.unit.Launch(e.finalizationProcessingLoop) 207 208 ctx, cancel := context.WithCancel(context.Background()) 209 signalerCtx, hotstuffErrChan := irrecoverable.WithSignaler(ctx) 210 e.stopHotstuff = cancel 211 212 // TODO: this workaround for handling fatal HotStuff errors is required only 213 // because this engine and epochmgr do not use the Component pattern yet 214 e.unit.Launch(func() { 215 e.handleHotStuffError(hotstuffErrChan) 216 }) 217 218 e.core.hotstuff.Start(signalerCtx) 219 // wait for request handler to startup 220 <-e.core.hotstuff.Ready() 221 }) 222 return e.lm.Started() 223 } 224 225 // Done returns a done channel that is closed once the engine has fully stopped. 226 // For the consensus engine, we wait for hotstuff to finish. 227 func (e *Engine) Done() <-chan struct{} { 228 e.lm.OnStop(func() { 229 e.log.Info().Msg("shutting down hotstuff eventloop") 230 e.stopHotstuff() 231 <-e.core.hotstuff.Done() 232 e.log.Info().Msg("all components have been shut down") 233 <-e.unit.Done() 234 }) 235 return e.lm.Stopped() 236 } 237 238 // SubmitLocal submits an event originating on the local node. 239 func (e *Engine) SubmitLocal(event interface{}) { 240 err := e.ProcessLocal(event) 241 if err != nil { 242 e.log.Fatal().Err(err).Msg("internal error processing event") 243 } 244 } 245 246 // Submit submits the given event from the node with the given origin ID 247 // for processing in a non-blocking manner. It returns instantly and logs 248 // a potential processing error internally when done. 249 func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) { 250 err := e.Process(channel, originID, event) 251 if err != nil { 252 e.log.Fatal().Err(err).Msg("internal error processing event") 253 } 254 } 255 256 // ProcessLocal processes an event originating on the local node. 257 func (e *Engine) ProcessLocal(event interface{}) error { 258 return e.messageHandler.Process(e.me.NodeID(), event) 259 } 260 261 // Process processes the given event from the node with the given origin ID in 262 // a blocking manner. It returns the potential processing error when done. 263 func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error { 264 err := e.messageHandler.Process(originID, event) 265 if err != nil { 266 if engine.IsIncompatibleInputTypeError(err) { 267 e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel) 268 return nil 269 } 270 return fmt.Errorf("unexpected error while processing engine message: %w", err) 271 } 272 return nil 273 } 274 275 func (e *Engine) loop() { 276 for { 277 select { 278 case <-e.unit.Quit(): 279 return 280 case <-e.messageHandler.GetNotifier(): 281 err := e.processAvailableMessages() 282 if err != nil { 283 e.log.Fatal().Err(err).Msg("internal error processing queued message") 284 } 285 } 286 } 287 } 288 289 func (e *Engine) processAvailableMessages() error { 290 291 for { 292 msg, ok := e.pendingBlocks.Get() 293 if ok { 294 err := e.core.OnBlockProposal(msg.OriginID, msg.Payload.(*messages.ClusterBlockProposal)) 295 if err != nil { 296 return fmt.Errorf("could not handle block proposal: %w", err) 297 } 298 continue 299 } 300 301 msg, ok = e.pendingVotes.Get() 302 if ok { 303 err := e.core.OnBlockVote(msg.OriginID, msg.Payload.(*messages.ClusterBlockVote)) 304 if err != nil { 305 return fmt.Errorf("could not handle block vote: %w", err) 306 } 307 continue 308 } 309 310 // when there is no more messages in the queue, back to the loop to wait 311 // for the next incoming message to arrive. 312 return nil 313 } 314 } 315 316 // SendVote will send a vote to the desired node. 317 func (e *Engine) SendVote(blockID flow.Identifier, view uint64, sigData []byte, recipientID flow.Identifier) error { 318 319 log := e.log.With(). 320 Hex("collection_id", blockID[:]). 321 Uint64("collection_view", view). 322 Hex("recipient_id", recipientID[:]). 323 Logger() 324 log.Info().Msg("processing vote transmission request from hotstuff") 325 326 // build the vote message 327 vote := &messages.ClusterBlockVote{ 328 BlockID: blockID, 329 View: view, 330 SigData: sigData, 331 } 332 333 // TODO: this is a hot-fix to mitigate the effects of the following Unicast call blocking occasionally 334 e.unit.Launch(func() { 335 // send the vote the desired recipient 336 err := e.con.Unicast(vote, recipientID) 337 if err != nil { 338 log.Warn().Err(err).Msg("could not send vote") 339 return 340 } 341 e.metrics.MessageSent(metrics.EngineClusterCompliance, metrics.MessageClusterBlockVote) 342 log.Info().Msg("collection vote transmitted") 343 }) 344 345 return nil 346 } 347 348 // BroadcastProposalWithDelay submits a cluster block proposal (effectively a proposal 349 // for the next collection) to all the collection nodes in our cluster. 350 func (e *Engine) BroadcastProposalWithDelay(header *flow.Header, delay time.Duration) error { 351 352 // first, check that we are the proposer of the block 353 if header.ProposerID != e.me.NodeID() { 354 return fmt.Errorf("cannot broadcast proposal with non-local proposer (%x)", header.ProposerID) 355 } 356 357 // get the parent of the block 358 parent, err := e.headers.ByBlockID(header.ParentID) 359 if err != nil { 360 return fmt.Errorf("could not retrieve proposal parent: %w", err) 361 } 362 363 // fill in the fields that can't be populated by HotStuff 364 //TODO clean this up - currently we set these fields in builder, then lose 365 // them in HotStuff, then need to set them again here 366 header.ChainID = parent.ChainID 367 header.Height = parent.Height + 1 368 369 // retrieve the payload for the block 370 payload, err := e.payloads.ByBlockID(header.ID()) 371 if err != nil { 372 return fmt.Errorf("could not get payload for block: %w", err) 373 } 374 375 log := e.log.With(). 376 Str("chain_id", header.ChainID.String()). 377 Uint64("block_height", header.Height). 378 Uint64("block_view", header.View). 379 Hex("block_id", logging.ID(header.ID())). 380 Hex("parent_id", header.ParentID[:]). 381 Hex("ref_block", payload.ReferenceBlockID[:]). 382 Int("transaction_count", payload.Collection.Len()). 383 Hex("parent_signer_indices", header.ParentVoterIndices). 384 Dur("delay", delay). 385 Logger() 386 387 log.Debug().Msg("processing cluster broadcast request from hotstuff") 388 389 // retrieve all collection nodes in our cluster 390 recipients, err := e.state.Final().Identities(filter.And( 391 filter.In(e.cluster), 392 filter.Not(filter.HasNodeID(e.me.NodeID())), 393 )) 394 if err != nil { 395 return fmt.Errorf("could not get cluster members: %w", err) 396 } 397 398 e.unit.LaunchAfter(delay, func() { 399 400 go e.core.hotstuff.SubmitProposal(header, parent.View) 401 402 // create the proposal message for the collection 403 block := &cluster.Block{ 404 Header: header, 405 Payload: payload, 406 } 407 msg := messages.NewClusterBlockProposal(block) 408 409 err := e.con.Publish(msg, recipients.NodeIDs()...) 410 if errors.Is(err, network.EmptyTargetList) { 411 return 412 } 413 if err != nil { 414 log.Error().Err(err).Msg("could not broadcast proposal") 415 return 416 } 417 418 log.Info().Msg("cluster proposal proposed") 419 420 e.metrics.MessageSent(metrics.EngineClusterCompliance, metrics.MessageClusterBlockProposal) 421 e.core.collectionMetrics.ClusterBlockProposed(block) 422 }) 423 424 return nil 425 } 426 427 // BroadcastProposal will propagate a block proposal to all non-local consensus nodes. 428 // Note the header has incomplete fields, because it was converted from a hotstuff. 429 func (e *Engine) BroadcastProposal(header *flow.Header) error { 430 return e.BroadcastProposalWithDelay(header, 0) 431 } 432 433 // OnFinalizedBlock implements the `OnFinalizedBlock` callback from the `hotstuff.FinalizationConsumer` 434 // 435 // (1) Informs sealing.Core about finalization of respective block. 436 // 437 // CAUTION: the input to this callback is treated as trusted; precautions should be taken that messages 438 // from external nodes cannot be considered as inputs to this function 439 func (e *Engine) OnFinalizedBlock(block *model.Block) { 440 if e.finalizedView.Set(block.View) { 441 e.finalizationEventsNotifier.Notify() 442 } 443 } 444 445 // finalizationProcessingLoop is a separate goroutine that performs processing of finalization events 446 func (e *Engine) finalizationProcessingLoop() { 447 finalizationNotifier := e.finalizationEventsNotifier.Channel() 448 for { 449 select { 450 case <-e.unit.Quit(): 451 return 452 case <-finalizationNotifier: 453 e.core.ProcessFinalizedView(e.finalizedView.Value()) 454 } 455 } 456 } 457 458 // handleHotStuffError accepts the error channel from the HotStuff component and 459 // crashes the node if any error is detected. 460 // 461 // TODO: this function should be removed in favour of refactoring this engine and 462 // the epochmgr engine to use the Component pattern, so that irrecoverable errors 463 // can be bubbled all the way to the node scaffold 464 func (e *Engine) handleHotStuffError(hotstuffErrs <-chan error) { 465 for { 466 select { 467 case <-e.unit.Quit(): 468 return 469 case err := <-hotstuffErrs: 470 if err != nil { 471 e.log.Fatal().Err(err).Msg("encountered fatal error in HotStuff") 472 } 473 } 474 } 475 }