github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/collection/synchronization/engine.go

package synchronization

import (
	"errors"
	"fmt"
	"time"

	"github.com/hashicorp/go-multierror"
	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/collection"
	"github.com/onflow/flow-go/engine/common/fifoqueue"
	commonsync "github.com/onflow/flow-go/engine/common/synchronization"
	"github.com/onflow/flow-go/model/chainsync"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	synccore "github.com/onflow/flow-go/module/chainsync"
	"github.com/onflow/flow-go/module/lifecycle"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/cluster"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/rand"
)

// defaultSyncResponseQueueCapacity is the maximum capacity of the sync-response queue
const defaultSyncResponseQueueCapacity = 500

// defaultBlockResponseQueueCapacity is the maximum capacity of the block-response queue
const defaultBlockResponseQueueCapacity = 500

// Engine is the synchronization engine, responsible for synchronizing chain state.
type Engine struct {
	unit         *engine.Unit
	lm           *lifecycle.LifecycleManager
	log          zerolog.Logger
	metrics      module.EngineMetrics
	me           module.Local
	participants flow.IdentitySkeletonList
	con          network.Conduit
	comp         collection.Compliance // compliance layer engine

	pollInterval time.Duration
	scanInterval time.Duration
	core         module.SyncCore
	state        cluster.State

	requestHandler *RequestHandlerEngine // component responsible for handling requests

	pendingSyncResponses   engine.MessageStore    // message store for *messages.SyncResponse
	pendingBlockResponses  engine.MessageStore    // message store for *messages.ClusterBlockResponse
	responseMessageHandler *engine.MessageHandler // message handler responsible for response processing
}

// New creates a new cluster chain synchronization engine.
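//
// A hypothetical wiring sketch (the variable names below are illustrative and
// not defined in this package):
//
//	eng, err := New(log, metrics, net, me, participants, clusterState, clusterBlocks, compliance, syncCore)
//	if err != nil {
//		return fmt.Errorf("could not create cluster sync engine: %w", err)
//	}
//	<-eng.Ready() // starts the check and response-processing loops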
func New(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	participants flow.IdentitySkeletonList,
	state cluster.State,
	blocks storage.ClusterBlocks,
	comp collection.Compliance,
	core module.SyncCore,
	opts ...commonsync.OptionFunc,
) (*Engine, error) {

	opt := commonsync.DefaultConfig()
	for _, f := range opts {
		f(opt)
	}

	if comp == nil {
		return nil, fmt.Errorf("must initialize synchronization engine with compliance engine")
	}

	// initialize the synchronization engine with its dependencies
	e := &Engine{
		unit:         engine.NewUnit(),
		lm:           lifecycle.NewLifecycleManager(),
		log:          log.With().Str("engine", "cluster_synchronization").Logger(),
		metrics:      metrics,
		me:           me,
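		// exclude this node from the participant list; requests are only ever
		// multicast to the other cluster members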
		participants: participants.Filter(filter.Not(filter.HasNodeID[flow.IdentitySkeleton](me.NodeID()))),
		comp:         comp,
		core:         core,
		pollInterval: opt.PollInterval,
		scanInterval: opt.ScanInterval,
		state:        state,
	}

	err := e.setupResponseMessageHandler()
	if err != nil {
		return nil, fmt.Errorf("could not setup message handler: %w", err)
	}
	chainID := state.Params().ChainID()

	// register the engine on the cluster-specific sync channel and store the conduit
	con, err := net.Register(channels.SyncCluster(chainID), e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	e.requestHandler = NewRequestHandlerEngine(log, metrics, con, me, blocks, core, state)

	return e, nil
}

// setupResponseMessageHandler initializes the inbound queues and the MessageHandler for UNTRUSTED responses.
func (e *Engine) setupResponseMessageHandler() error {
	syncResponseQueue, err := fifoqueue.NewFifoQueue(defaultSyncResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for sync responses: %w", err)
	}

	e.pendingSyncResponses = &engine.FifoMessageStore{
		FifoQueue: syncResponseQueue,
	}

	blockResponseQueue, err := fifoqueue.NewFifoQueue(defaultBlockResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for block responses: %w", err)
	}

	e.pendingBlockResponses = &engine.FifoMessageStore{
		FifoQueue: blockResponseQueue,
	}

	// define message queueing behaviour
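	// each Pattern routes one payload type into its queue; the metrics call
	// inside Match records receipt as a side effect of a successful match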
	e.responseMessageHandler = engine.NewMessageHandler(
		e.log,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.SyncResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
				}
				return ok
			},
			Store: e.pendingSyncResponses,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
				}
				return ok
			},
			Store: e.pendingBlockResponses,
		},
	)

	return nil
}

// Ready returns a ready channel that is closed once the engine has fully started.
func (e *Engine) Ready() <-chan struct{} {
	e.lm.OnStart(func() {
		e.unit.Launch(e.checkLoop)
		e.unit.Launch(e.responseProcessingLoop)
		// wait for the request handler to start up
		<-e.requestHandler.Ready()
	})
	return e.lm.Started()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	e.lm.OnStop(func() {
		// signal the request handler to shut down
		requestHandlerDone := e.requestHandler.Done()
		// wait for the request sending and response processing routines to exit
		<-e.unit.Done()
		// wait for the request handler shutdown to complete
		<-requestHandlerDone
	})
	return e.lm.Stopped()
}

// SubmitLocal submits an event originating on the local node.
func (e *Engine) SubmitLocal(event interface{}) {
	err := e.ProcessLocal(event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly; any
// processing error is logged as fatal internally.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := e.Process(channel, originID, event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (e *Engine) ProcessLocal(event interface{}) error {
	return e.process(e.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := e.process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}
// process processes events for the synchronization engine.
// Error returns:
//   - IncompatibleInputTypeError if input has unexpected type
//   - All other errors are potential symptoms of internal state corruption or bugs (fatal).
func (e *Engine) process(originID flow.Identifier, event interface{}) error {
	switch event.(type) {
	case *messages.RangeRequest, *messages.BatchRequest, *messages.SyncRequest:
		return e.requestHandler.process(originID, event)
	case *messages.SyncResponse, *messages.ClusterBlockResponse:
		return e.responseMessageHandler.Process(originID, event)
	default:
		return fmt.Errorf("received input with type %T from %x: %w", event, originID[:], engine.IncompatibleInputTypeError)
	}
}

// responseProcessingLoop runs in a separate goroutine and processes queued responses.
func (e *Engine) responseProcessingLoop() {
	notifier := e.responseMessageHandler.GetNotifier()
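	// the notifier coalesces signals, so a single wake-up may cover several
	// queued messages; processAvailableResponses drains every queue in response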
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-notifier:
			e.processAvailableResponses()
		}
	}
}

// processAvailableResponses drains the pending message stores, driving events
// from the networking layer into the engine's business logic.
func (e *Engine) processAvailableResponses() {
	for {
		select {
		case <-e.unit.Quit():
			return
		default:
		}

		msg, ok := e.pendingSyncResponses.Get()
		if ok {
			e.onSyncResponse(msg.OriginID, msg.Payload.(*messages.SyncResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
			continue
		}

		msg, ok = e.pendingBlockResponses.Get()
		if ok {
			e.onBlockResponse(msg.OriginID, msg.Payload.(*messages.ClusterBlockResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
			continue
		}

		// all queues are empty; return to the outer loop and wait for the
		// next incoming message to arrive
		return
	}
}

// onSyncResponse processes a synchronization response.
func (e *Engine) onSyncResponse(originID flow.Identifier, res *messages.SyncResponse) {
	final, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}
	e.core.HandleHeight(final, res.Height)
}

// onBlockResponse processes a response containing a specifically requested block.
func (e *Engine) onBlockResponse(originID flow.Identifier, res *messages.ClusterBlockResponse) {
	// process the blocks one by one
	for _, block := range res.Blocks {
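		// take a copy of the header so that the pointer passed to HandleBlock
		// does not alias the loop variable, which is overwritten each iteration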
		header := block.Header
		if !e.core.HandleBlock(&header) {
			continue
		}
		synced := flow.Slashable[*messages.ClusterBlockProposal]{
			OriginID: originID,
			Message: &messages.ClusterBlockProposal{
				Block: block,
			},
		}
		// forward the block to the compliance engine for validation and processing
		e.comp.OnSyncedClusterBlock(synced)
	}
}

// checkLoop will regularly scan for items that need requesting.
func (e *Engine) checkLoop() {
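	// a receive-only channel with no sender never fires; when polling is
	// disabled (pollInterval <= 0), the poll case below therefore never triggers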
	pollChan := make(<-chan time.Time)
	if e.pollInterval > 0 {
		poll := time.NewTicker(e.pollInterval)
		pollChan = poll.C
		defer poll.Stop()
	}
	scan := time.NewTicker(e.scanInterval)

CheckLoop:
	for {
		// check the quit channel first: a select with several ready cases picks
		// between them pseudo-randomly, so this guarantees a prompt shutdown
		select {
		case <-e.unit.Quit():
			break CheckLoop
		default:
		}

		select {
		case <-e.unit.Quit():
			break CheckLoop
		case <-pollChan:
			e.pollHeight()
		case <-scan.C:
			final, err := e.state.Final().Head()
			if err != nil {
				e.log.Fatal().Err(err).Msg("could not get last finalized header")
				continue
			}
			ranges, batches := e.core.ScanPending(final)
			e.sendRequests(ranges, batches)
		}
	}

	// stop the scan ticker before exiting
	scan.Stop()
}

// pollHeight sends a synchronization request to synccore.DefaultPollNodes
// randomly selected peers.
func (e *Engine) pollHeight() {
	head, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}

	nonce, err := rand.Uint64()
	if err != nil {
		// TODO: this error should be returned by pollHeight()
		// It is logged for now, since the only possible cause is a failure of
		// the system entropy generation. Such a failure also surfaces in other
		// components, where it is handled properly and crashes the module.
		e.log.Error().Err(err).Msg("nonce generation failed during pollHeight")
		return
	}

	// send the request for synchronization
	req := &messages.SyncRequest{
		Nonce:  nonce,
		Height: head.Height,
	}
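	// an EmptyTargetList error is tolerated: with no known peers there is
	// simply no one to poll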
	err = e.con.Multicast(req, synccore.DefaultPollNodes, e.participants.NodeIDs()...)
	if err != nil && !errors.Is(err, network.EmptyTargetList) {
		e.log.Warn().Err(err).Msg("sending sync request to poll heights failed")
		return
	}
	e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageSyncRequest)
}

// sendRequests sends a request for each range and batch to a random subset of
// the cluster participants.
func (e *Engine) sendRequests(ranges []chainsync.Range, batches []chainsync.Batch) {
	var errs *multierror.Error
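	// errs starts out nil: multierror.Append allocates on the first append, and
	// ErrorOrNil at the end returns nil when no request failed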

	for _, ran := range ranges {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// It is logged for now, since the only possible cause is a failure of
			// the system entropy generation. Such a failure also surfaces in other
			// components, where it is handled properly and crashes the module.
			e.log.Error().Err(err).Msg("nonce generation failed during range request")
			return
		}
		req := &messages.RangeRequest{
			Nonce:      nonce,
			FromHeight: ran.From,
			ToHeight:   ran.To,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit range request: %w", err))
			continue
		}
		e.log.Debug().
			Uint64("range_from", req.FromHeight).
			Uint64("range_to", req.ToHeight).
			Uint64("range_nonce", req.Nonce).
			Msg("range requested")
		e.core.RangeRequested(ran)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageRangeRequest)
	}

	for _, batch := range batches {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// It is logged for now, since the only possible cause is a failure of
			// the system entropy generation. Such a failure also surfaces in other
			// components, where it is handled properly and crashes the module.
			e.log.Error().Err(err).Msg("nonce generation failed during batch request")
			return
		}
		req := &messages.BatchRequest{
			Nonce:    nonce,
			BlockIDs: batch.BlockIDs,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit batch request: %w", err))
			continue
		}
		e.core.BatchRequested(batch)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBatchRequest)
	}

	if err := errs.ErrorOrNil(); err != nil {
		e.log.Warn().Err(err).Msg("sending range and batch requests failed")
	}
}