github.com/koko1123/flow-go-1@v0.29.6/engine/collection/compliance/engine.go

package compliance

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/rs/zerolog"

	"github.com/koko1123/flow-go-1/consensus/hotstuff/model"
	"github.com/koko1123/flow-go-1/engine"
	"github.com/koko1123/flow-go-1/engine/common/fifoqueue"
	"github.com/koko1123/flow-go-1/engine/consensus/sealing/counters"
	"github.com/koko1123/flow-go-1/model/cluster"
	"github.com/koko1123/flow-go-1/model/events"
	"github.com/koko1123/flow-go-1/model/flow"
	"github.com/koko1123/flow-go-1/model/flow/filter"
	"github.com/koko1123/flow-go-1/model/messages"
	"github.com/koko1123/flow-go-1/module"
	"github.com/koko1123/flow-go-1/module/irrecoverable"
	"github.com/koko1123/flow-go-1/module/lifecycle"
	"github.com/koko1123/flow-go-1/module/metrics"
	"github.com/koko1123/flow-go-1/network"
	"github.com/koko1123/flow-go-1/network/channels"
	"github.com/koko1123/flow-go-1/state/protocol"
	"github.com/koko1123/flow-go-1/storage"
	"github.com/koko1123/flow-go-1/utils/logging"
)

// defaultBlockQueueCapacity is the maximum capacity of the inbound queue for block proposals
const defaultBlockQueueCapacity = 10000

// defaultVoteQueueCapacity is the maximum capacity of the inbound queue for block votes
const defaultVoteQueueCapacity = 1000

// Engine is a wrapper struct for `Core`, which implements the cluster consensus algorithm.
// Engine is responsible for handling incoming messages, queueing them for processing, and broadcasting proposals.
type Engine struct {
	unit                       *engine.Unit
	lm                         *lifecycle.LifecycleManager
	log                        zerolog.Logger
	metrics                    module.EngineMetrics
	me                         module.Local
	headers                    storage.Headers
	payloads                   storage.ClusterPayloads
	state                      protocol.State
	core                       *Core
	pendingBlocks              engine.MessageStore
	pendingVotes               engine.MessageStore
	messageHandler             *engine.MessageHandler
	finalizedView              counters.StrictMonotonousCounter
	finalizationEventsNotifier engine.Notifier
	con                        network.Conduit
	stopHotstuff               context.CancelFunc
	cluster                    flow.IdentityList // consensus participants in our cluster
}

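// NewEngine instantiates the cluster compliance engine: it looks up the node's
// cluster for the current epoch, sets up the bounded inbound FIFO queues and the
// message handler for proposals and votes, and registers the engine on the
// cluster consensus channel.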
func NewEngine(
	log zerolog.Logger,
	net network.Network,
	me module.Local,
	state protocol.State,
	payloads storage.ClusterPayloads,
	core *Core,
) (*Engine, error) {
	engineLog := log.With().Str("cluster_compliance", "engine").Logger()

	// find my cluster for the current epoch
	// TODO this should flow from cluster state as source of truth
	clusters, err := state.Final().Epochs().Current().Clustering()
	if err != nil {
		return nil, fmt.Errorf("could not get clusters: %w", err)
	}
	currentCluster, _, found := clusters.ByNodeID(me.NodeID())
	if !found {
		return nil, fmt.Errorf("could not find cluster for self")
	}

	// FIFO queue for block proposals
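	// The queue is bounded by defaultBlockQueueCapacity; per the FifoQueue
	// semantics, messages arriving while the queue is full are rejected rather
	// than growing memory without bound.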
	blocksQueue, err := fifoqueue.NewFifoQueue(
		defaultBlockQueueCapacity,
		fifoqueue.WithLengthObserver(func(len int) {
			core.mempoolMetrics.MempoolEntries(metrics.ResourceClusterBlockProposalQueue, uint(len))
		}),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create queue for inbound block proposals: %w", err)
	}
	pendingBlocks := &engine.FifoMessageStore{
		FifoQueue: blocksQueue,
	}

	// FIFO queue for block votes
	votesQueue, err := fifoqueue.NewFifoQueue(
		defaultVoteQueueCapacity,
		fifoqueue.WithLengthObserver(func(len int) { core.mempoolMetrics.MempoolEntries(metrics.ResourceClusterBlockVoteQueue, uint(len)) }),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create queue for inbound votes: %w", err)
	}
	pendingVotes := &engine.FifoMessageStore{FifoQueue: votesQueue}

	// define message queueing behaviour
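	// Three patterns are matched below: cluster block proposals and synced
	// cluster blocks (re-wrapped as proposals) are stored in pendingBlocks,
	// while cluster block votes are stored in pendingVotes. Each match also
	// records a received-message metric for the corresponding message type.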
	handler := engine.NewMessageHandler(
		engineLog,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockProposal)
				if ok {
					core.metrics.MessageReceived(metrics.EngineClusterCompliance, metrics.MessageClusterBlockProposal)
				}
				return ok
			},
			Store: pendingBlocks,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*events.SyncedClusterBlock)
				if ok {
					core.metrics.MessageReceived(metrics.EngineClusterCompliance, metrics.MessageSyncedClusterBlock)
				}
				return ok
			},
			Map: func(msg *engine.Message) (*engine.Message, bool) {
				syncedBlock := msg.Payload.(*events.SyncedClusterBlock)
				msg = &engine.Message{
					OriginID: msg.OriginID,
					Payload: &messages.ClusterBlockProposal{
						Block: syncedBlock.Block,
					},
				}
				return msg, true
			},
			Store: pendingBlocks,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockVote)
				if ok {
					core.metrics.MessageReceived(metrics.EngineClusterCompliance, metrics.MessageClusterBlockVote)
				}
				return ok
			},
			Store: pendingVotes,
		},
	)

	eng := &Engine{
		unit:                       engine.NewUnit(),
		lm:                         lifecycle.NewLifecycleManager(),
		log:                        engineLog,
		metrics:                    core.metrics,
		me:                         me,
		headers:                    core.headers,
		payloads:                   payloads,
		state:                      state,
		core:                       core,
		pendingBlocks:              pendingBlocks,
		pendingVotes:               pendingVotes,
		messageHandler:             handler,
		finalizationEventsNotifier: engine.NewNotifier(),
		con:                        nil,
		cluster:                    currentCluster,
	}

	chainID, err := core.state.Params().ChainID()
	if err != nil {
		return nil, fmt.Errorf("could not get chain ID: %w", err)
	}

	// register network conduit
	conduit, err := net.Register(channels.ConsensusCluster(chainID), eng)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	eng.con = conduit

	return eng, nil
}

// WithConsensus adds the consensus algorithm to the engine. This must be
// called before the engine can start.
func (e *Engine) WithConsensus(hot module.HotStuff) *Engine {
	e.core.hotstuff = hot
	return e
}

// WithSync adds the block requester to the engine. This must be
// called before the engine can start.
func (e *Engine) WithSync(sync module.BlockRequester) *Engine {
	e.core.sync = sync
	return e
}
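
// A typical wiring sketch (illustrative only, not taken from call sites in this
// repository): both setters above must be called before the engine is started.
//
//	eng, err := NewEngine(log, net, me, state, payloads, core)
//	if err != nil {
//		// handle error
//	}
//	eng = eng.WithConsensus(hot).WithSync(sync)
//	<-eng.Ready()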

// Ready returns a ready channel that is closed once the engine has fully
// started. For the consensus engine, this is true once the underlying consensus
// algorithm has started.
func (e *Engine) Ready() <-chan struct{} {
	if e.core.hotstuff == nil {
		panic("must initialize compliance engine with hotstuff engine")
	}
	e.lm.OnStart(func() {
		e.unit.Launch(e.loop)
		e.unit.Launch(e.finalizationProcessingLoop)

		ctx, cancel := context.WithCancel(context.Background())
		signalerCtx, hotstuffErrChan := irrecoverable.WithSignaler(ctx)
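		// hotstuffErrChan surfaces irrecoverable errors thrown inside HotStuff;
		// handleHotStuffError (launched below) crashes the node if one arrives.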
		e.stopHotstuff = cancel

		// TODO: this workaround for handling fatal HotStuff errors is required only
		//  because this engine and epochmgr do not use the Component pattern yet
		e.unit.Launch(func() {
			e.handleHotStuffError(hotstuffErrChan)
		})

		e.core.hotstuff.Start(signalerCtx)
		// wait for the HotStuff event loop to start up
		<-e.core.hotstuff.Ready()
	})
	return e.lm.Started()
}

// Done returns a done channel that is closed once the engine has fully stopped.
// For the consensus engine, we wait for hotstuff to finish.
func (e *Engine) Done() <-chan struct{} {
	e.lm.OnStop(func() {
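		// Stop the HotStuff event loop first and wait for it to wind down
		// before shutting down the engine's own worker routines.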
		e.log.Info().Msg("shutting down hotstuff eventloop")
		e.stopHotstuff()
		<-e.core.hotstuff.Done()
		e.log.Info().Msg("all components have been shut down")
		<-e.unit.Done()
	})
	return e.lm.Stopped()
}

// SubmitLocal submits an event originating on the local node.
func (e *Engine) SubmitLocal(event interface{}) {
	err := e.ProcessLocal(event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := e.Process(channel, originID, event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (e *Engine) ProcessLocal(event interface{}) error {
	return e.messageHandler.Process(e.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := e.messageHandler.Process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}

func (e *Engine) loop() {
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-e.messageHandler.GetNotifier():
			err := e.processAvailableMessages()
			if err != nil {
				e.log.Fatal().Err(err).Msg("internal error processing queued message")
			}
		}
	}
}

func (e *Engine) processAvailableMessages() error {

	for {
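		// Drain the block-proposal queue first: pending proposals are always
		// processed with priority over pending votes.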
		msg, ok := e.pendingBlocks.Get()
		if ok {
			err := e.core.OnBlockProposal(msg.OriginID, msg.Payload.(*messages.ClusterBlockProposal))
			if err != nil {
				return fmt.Errorf("could not handle block proposal: %w", err)
			}
			continue
		}

		msg, ok = e.pendingVotes.Get()
		if ok {
			err := e.core.OnBlockVote(msg.OriginID, msg.Payload.(*messages.ClusterBlockVote))
			if err != nil {
				return fmt.Errorf("could not handle block vote: %w", err)
			}
			continue
		}

		// when there are no more messages in the queues, return to the event
		// loop to wait for the next incoming message to arrive.
		return nil
	}
}

// SendVote will send a vote to the desired node.
func (e *Engine) SendVote(blockID flow.Identifier, view uint64, sigData []byte, recipientID flow.Identifier) error {

	log := e.log.With().
		Hex("collection_id", blockID[:]).
		Uint64("collection_view", view).
		Hex("recipient_id", recipientID[:]).
		Logger()
	log.Info().Msg("processing vote transmission request from hotstuff")

	// build the vote message
	vote := &messages.ClusterBlockVote{
		BlockID: blockID,
		View:    view,
		SigData: sigData,
	}

	// TODO: this is a hot-fix to mitigate the effects of the following Unicast call blocking occasionally
	e.unit.Launch(func() {
		// send the vote to the desired recipient
		err := e.con.Unicast(vote, recipientID)
		if err != nil {
			log.Warn().Err(err).Msg("could not send vote")
			return
		}
		e.metrics.MessageSent(metrics.EngineClusterCompliance, metrics.MessageClusterBlockVote)
		log.Info().Msg("collection vote transmitted")
	})

	return nil
}

// BroadcastProposalWithDelay submits a cluster block proposal (effectively a proposal
// for the next collection) to all the collection nodes in our cluster.
func (e *Engine) BroadcastProposalWithDelay(header *flow.Header, delay time.Duration) error {

	// first, check that we are the proposer of the block
	if header.ProposerID != e.me.NodeID() {
		return fmt.Errorf("cannot broadcast proposal with non-local proposer (%x)", header.ProposerID)
	}

	// get the parent of the block
	parent, err := e.headers.ByBlockID(header.ParentID)
	if err != nil {
		return fmt.Errorf("could not retrieve proposal parent: %w", err)
	}

	// fill in the fields that can't be populated by HotStuff
	// TODO clean this up - currently we set these fields in builder, then lose
	// them in HotStuff, then need to set them again here
	header.ChainID = parent.ChainID
	header.Height = parent.Height + 1

	// retrieve the payload for the block
	payload, err := e.payloads.ByBlockID(header.ID())
	if err != nil {
		return fmt.Errorf("could not get payload for block: %w", err)
	}

	log := e.log.With().
		Str("chain_id", header.ChainID.String()).
		Uint64("block_height", header.Height).
		Uint64("block_view", header.View).
		Hex("block_id", logging.ID(header.ID())).
		Hex("parent_id", header.ParentID[:]).
		Hex("ref_block", payload.ReferenceBlockID[:]).
		Int("transaction_count", payload.Collection.Len()).
		Hex("parent_signer_indices", header.ParentVoterIndices).
		Dur("delay", delay).
		Logger()

	log.Debug().Msg("processing cluster broadcast request from hotstuff")

	// retrieve all other collection nodes in our cluster
	recipients, err := e.state.Final().Identities(filter.And(
		filter.In(e.cluster),
		filter.Not(filter.HasNodeID(e.me.NodeID())),
	))
	if err != nil {
		return fmt.Errorf("could not get cluster members: %w", err)
	}

	e.unit.LaunchAfter(delay, func() {

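		// feed the proposal to our own HotStuff instance in parallel with
		// broadcasting it to the other members of the cluster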
		go e.core.hotstuff.SubmitProposal(header, parent.View)

		// create the proposal message for the collection
		block := &cluster.Block{
			Header:  header,
			Payload: payload,
		}
		msg := messages.NewClusterBlockProposal(block)

		err := e.con.Publish(msg, recipients.NodeIDs()...)
		if errors.Is(err, network.EmptyTargetList) {
			return
		}
		if err != nil {
			log.Error().Err(err).Msg("could not broadcast proposal")
			return
		}

		log.Info().Msg("cluster proposal proposed")

		e.metrics.MessageSent(metrics.EngineClusterCompliance, metrics.MessageClusterBlockProposal)
		e.core.collectionMetrics.ClusterBlockProposed(block)
	})

	return nil
}

// BroadcastProposal will propagate a block proposal to all non-local collection
// nodes in our cluster. Note that the header has incomplete fields, because it
// was converted from a HotStuff block.
func (e *Engine) BroadcastProposal(header *flow.Header) error {
	return e.BroadcastProposalWithDelay(header, 0)
}

// OnFinalizedBlock implements the `OnFinalizedBlock` callback from the `hotstuff.FinalizationConsumer`
//
// (1) Informs compliance.Core about finalization of the respective block.
//
// CAUTION: the input to this callback is treated as trusted; precautions should be taken to ensure
// that messages from external nodes are never passed to this function.
func (e *Engine) OnFinalizedBlock(block *model.Block) {
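	// Set only updates the strictly monotonous counter when the new view is
	// larger than the current value, so duplicate or out-of-order finalization
	// events do not trigger redundant notifications.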
	if e.finalizedView.Set(block.View) {
		e.finalizationEventsNotifier.Notify()
	}
}

// finalizationProcessingLoop runs in a separate goroutine and processes finalization events.
func (e *Engine) finalizationProcessingLoop() {
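	// The notifier merely signals that some block was finalized; the loop then
	// reads the latest finalized view from the counter, so intermediate
	// finalization events may effectively be coalesced without loss of information.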
	finalizationNotifier := e.finalizationEventsNotifier.Channel()
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-finalizationNotifier:
			e.core.ProcessFinalizedView(e.finalizedView.Value())
		}
	}
}

// handleHotStuffError accepts the error channel from the HotStuff component and
// crashes the node if any error is detected.
//
// TODO: this function should be removed in favour of refactoring this engine and
// the epochmgr engine to use the Component pattern, so that irrecoverable errors
// can be bubbled all the way to the node scaffold
func (e *Engine) handleHotStuffError(hotstuffErrs <-chan error) {
	for {
		select {
		case <-e.unit.Quit():
			return
		case err := <-hotstuffErrs:
			if err != nil {
				e.log.Fatal().Err(err).Msg("encountered fatal error in HotStuff")
			}
		}
	}
}