github.com/onflow/flow-go@v0.33.17/engine/collection/synchronization/engine.go

// (c) 2019 Dapper Labs - ALL RIGHTS RESERVED

package synchronization

import (
	"errors"
	"fmt"
	"time"

	"github.com/hashicorp/go-multierror"
	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine"
	"github.com/onflow/flow-go/engine/collection"
	"github.com/onflow/flow-go/engine/common/fifoqueue"
	commonsync "github.com/onflow/flow-go/engine/common/synchronization"
	"github.com/onflow/flow-go/model/chainsync"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/flow/filter"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	synccore "github.com/onflow/flow-go/module/chainsync"
	"github.com/onflow/flow-go/module/lifecycle"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/cluster"
	"github.com/onflow/flow-go/storage"
	"github.com/onflow/flow-go/utils/rand"
)

// defaultSyncResponseQueueCapacity is the maximum capacity of the sync response queue
const defaultSyncResponseQueueCapacity = 500

// defaultBlockResponseQueueCapacity is the maximum capacity of the block response queue
const defaultBlockResponseQueueCapacity = 500
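
// Illustrative sketch, not part of the original file: queues built from these
// capacities drop new messages rather than block producers once full. Assuming
// the Push/Pop API of fifoqueue as used elsewhere in flow-go, the intended
// behavior is roughly:
//
//	q, _ := fifoqueue.NewFifoQueue(defaultSyncResponseQueueCapacity)
//	ok := q.Push(msg)   // false once 500 messages are pending: msg is dropped
//	item, ok := q.Pop() // consumers drain in FIFO order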

// Engine is the synchronization engine, responsible for synchronizing chain state.
type Engine struct {
	unit         *engine.Unit
	lm           *lifecycle.LifecycleManager
	log          zerolog.Logger
	metrics      module.EngineMetrics
	me           module.Local
	participants flow.IdentityList
	con          network.Conduit
	comp         collection.Compliance // compliance layer engine

	pollInterval time.Duration
	scanInterval time.Duration
	core         module.SyncCore
	state        cluster.State

	requestHandler *RequestHandlerEngine // component responsible for handling requests

	pendingSyncResponses   engine.MessageStore    // message store for *messages.SyncResponse
	pendingBlockResponses  engine.MessageStore    // message store for *messages.ClusterBlockResponse
	responseMessageHandler *engine.MessageHandler // message handler responsible for response processing
}

// New creates a new cluster chain synchronization engine.
func New(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	participants flow.IdentityList,
	state cluster.State,
	blocks storage.ClusterBlocks,
	comp collection.Compliance,
	core module.SyncCore,
	opts ...commonsync.OptionFunc,
) (*Engine, error) {

	opt := commonsync.DefaultConfig()
	for _, f := range opts {
		f(opt)
	}

	if comp == nil {
		return nil, fmt.Errorf("must initialize synchronization engine with a compliance engine")
	}

	// initialize the synchronization engine with its dependencies
	e := &Engine{
		unit:         engine.NewUnit(),
		lm:           lifecycle.NewLifecycleManager(),
		log:          log.With().Str("engine", "cluster_synchronization").Logger(),
		metrics:      metrics,
		me:           me,
		participants: participants.Filter(filter.Not(filter.HasNodeID(me.NodeID()))),
		comp:         comp,
		core:         core,
		pollInterval: opt.PollInterval,
		scanInterval: opt.ScanInterval,
		state:        state,
	}

	err := e.setupResponseMessageHandler()
	if err != nil {
		return nil, fmt.Errorf("could not setup message handler: %w", err)
	}

	chainID, err := state.Params().ChainID()
	if err != nil {
		return nil, fmt.Errorf("could not get chain ID: %w", err)
	}

	// register the engine with the network layer and store the conduit
	con, err := net.Register(channels.SyncCluster(chainID), e)
	if err != nil {
		return nil, fmt.Errorf("could not register engine: %w", err)
	}
	e.con = con

	e.requestHandler = NewRequestHandlerEngine(log, metrics, con, me, blocks, core, state)

	return e, nil
}
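
// buildClusterSyncEngine is an illustrative sketch, not part of the original
// file: it shows one plausible way a collection node could wire this engine
// together with its dependencies. The function name and the fail-fast wrapping
// are assumptions for illustration; only New itself is from this file.
func buildClusterSyncEngine(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	net network.EngineRegistry,
	me module.Local,
	participants flow.IdentityList,
	state cluster.State,
	blocks storage.ClusterBlocks,
	comp collection.Compliance,
	core module.SyncCore,
) (*Engine, error) {
	// with no commonsync.OptionFunc values passed, the poll/scan intervals
	// from commonsync.DefaultConfig() apply
	eng, err := New(log, metrics, net, me, participants, state, blocks, comp, core)
	if err != nil {
		return nil, fmt.Errorf("could not create cluster sync engine: %w", err)
	}
	return eng, nil
}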

// setupResponseMessageHandler initializes the inbound queues and the MessageHandler for UNTRUSTED responses.
func (e *Engine) setupResponseMessageHandler() error {
	syncResponseQueue, err := fifoqueue.NewFifoQueue(defaultSyncResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for sync responses: %w", err)
	}

	e.pendingSyncResponses = &engine.FifoMessageStore{
		FifoQueue: syncResponseQueue,
	}

	blockResponseQueue, err := fifoqueue.NewFifoQueue(defaultBlockResponseQueueCapacity)
	if err != nil {
		return fmt.Errorf("failed to create queue for block responses: %w", err)
	}

	e.pendingBlockResponses = &engine.FifoMessageStore{
		FifoQueue: blockResponseQueue,
	}

	// define message queueing behaviour
	e.responseMessageHandler = engine.NewMessageHandler(
		e.log,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.SyncResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
				}
				return ok
			},
			Store: e.pendingSyncResponses,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.ClusterBlockResponse)
				if ok {
					e.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
				}
				return ok
			},
			Store: e.pendingBlockResponses,
		},
	)

	return nil
}
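
// enqueueSyncResponse is an illustrative sketch, not part of the original
// file: it shows the path an untrusted response takes through the handler
// configured above. Process matches the payload type against the patterns,
// stores the message in the corresponding FIFO queue, and wakes the
// responseProcessingLoop via the notifier. The method name is hypothetical.
func (e *Engine) enqueueSyncResponse(originID flow.Identifier, res *messages.SyncResponse) error {
	// identical to what Process does for messages arriving from the network
	return e.responseMessageHandler.Process(originID, res)
}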

// Ready returns a ready channel that is closed once the engine has fully started.
func (e *Engine) Ready() <-chan struct{} {
	e.lm.OnStart(func() {
		e.unit.Launch(e.checkLoop)
		e.unit.Launch(e.responseProcessingLoop)
		// wait for the request handler to start up
		<-e.requestHandler.Ready()
	})
	return e.lm.Started()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (e *Engine) Done() <-chan struct{} {
	e.lm.OnStop(func() {
		// signal the request handler to shut down
		requestHandlerDone := e.requestHandler.Done()
		// wait for the request sending and response processing routines to exit
		<-e.unit.Done()
		// wait for the request handler shutdown to complete
		<-requestHandlerDone
	})
	return e.lm.Stopped()
}
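
// runEngineLifecycle is an illustrative sketch, not part of the original
// file: it shows the expected start/stop sequence for this engine. Blocking
// on Ready before serving and on Done during shutdown mirrors how flow-go
// node builders typically drive ReadyDoneAware components; the function
// itself is hypothetical.
func runEngineLifecycle(e *Engine, serve func()) {
	<-e.Ready() // the loops and the request handler are running past this point
	serve()     // the engine may now receive and process traffic
	<-e.Done()  // returns only after all internal goroutines have exited
}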

// SubmitLocal submits an event originating on the local node.
func (e *Engine) SubmitLocal(event interface{}) {
	err := e.ProcessLocal(event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID
// for processing in a non-blocking manner. It returns instantly and logs
// a potential processing error internally when done.
func (e *Engine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := e.Process(channel, originID, event)
	if err != nil {
		e.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (e *Engine) ProcessLocal(event interface{}) error {
	return e.process(e.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (e *Engine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := e.process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			e.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}

// process processes events for the synchronization engine.
// Error returns:
//   - IncompatibleInputTypeError if input has unexpected type
//   - All other errors are potential symptoms of internal state corruption or bugs (fatal).
func (e *Engine) process(originID flow.Identifier, event interface{}) error {
	switch event.(type) {
	case *messages.RangeRequest, *messages.BatchRequest, *messages.SyncRequest:
		return e.requestHandler.process(originID, event)
	case *messages.SyncResponse, *messages.ClusterBlockResponse:
		return e.responseMessageHandler.Process(originID, event)
	default:
		return fmt.Errorf("received input with type %T from %x: %w", event, originID[:], engine.IncompatibleInputTypeError)
	}
}

// responseProcessingLoop runs in a separate goroutine and processes queued responses.
func (e *Engine) responseProcessingLoop() {
	notifier := e.responseMessageHandler.GetNotifier()
	for {
		select {
		case <-e.unit.Quit():
			return
		case <-notifier:
			e.processAvailableResponses()
		}
	}
}
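
// notifyAndDrainSketch is an illustrative, self-contained sketch, not part of
// the original file: it reproduces the notify-and-drain pattern used above. A
// signal channel of capacity one coalesces any number of notifications into a
// single wake-up, and the consumer fully drains the backlog before sleeping
// again, so no message is left behind. All names here are hypothetical.
func notifyAndDrainSketch(quit <-chan struct{}, popAndHandle func() bool) chan<- struct{} {
	notify := make(chan struct{}, 1) // buffered: a pending signal is never lost
	go func() {
		for {
			select {
			case <-quit:
				return
			case <-notify:
				// drain until popAndHandle reports an empty queue
				for popAndHandle() {
				}
			}
		}
	}()
	return notify
}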

// processAvailableResponses processes pending events, driving them from the networking layer to the business logic.
func (e *Engine) processAvailableResponses() {
	for {
		select {
		case <-e.unit.Quit():
			return
		default:
		}

		msg, ok := e.pendingSyncResponses.Get()
		if ok {
			e.onSyncResponse(msg.OriginID, msg.Payload.(*messages.SyncResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)
			continue
		}

		msg, ok = e.pendingBlockResponses.Get()
		if ok {
			e.onBlockResponse(msg.OriginID, msg.Payload.(*messages.ClusterBlockResponse))
			e.metrics.MessageHandled(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)
			continue
		}

		// when there are no more messages in the queue, go back to the outer
		// loop and wait for the next incoming message to arrive.
		return
	}
}

// onSyncResponse processes a synchronization response.
func (e *Engine) onSyncResponse(originID flow.Identifier, res *messages.SyncResponse) {
	final, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}
	e.core.HandleHeight(final, res.Height)
}

// onBlockResponse processes a response containing a specifically requested block.
func (e *Engine) onBlockResponse(originID flow.Identifier, res *messages.ClusterBlockResponse) {
	// process the blocks one by one
	for _, block := range res.Blocks {
		header := block.Header
		if !e.core.HandleBlock(&header) {
			continue
		}
		synced := flow.Slashable[*messages.ClusterBlockProposal]{
			OriginID: originID,
			Message: &messages.ClusterBlockProposal{
				Block: block,
			},
		}
		// forward the block to the compliance engine for validation and processing
		e.comp.OnSyncedClusterBlock(synced)
	}
}
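
// attributeOrigin is an illustrative sketch, not part of the original file:
// flow.Slashable pairs a message with the ID of the node that delivered it,
// as in onBlockResponse above, so that downstream validation failures can be
// attributed back to a specific sender. The generic helper is hypothetical.
func attributeOrigin[T any](originID flow.Identifier, msg T) flow.Slashable[T] {
	return flow.Slashable[T]{
		OriginID: originID, // the peer to hold accountable if msg proves invalid
		Message:  msg,
	}
}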

// checkLoop will regularly scan for items that need requesting.
func (e *Engine) checkLoop() {
	pollChan := make(<-chan time.Time)
	if e.pollInterval > 0 {
		poll := time.NewTicker(e.pollInterval)
		pollChan = poll.C
		defer poll.Stop()
	}
	scan := time.NewTicker(e.scanInterval)

CheckLoop:
	for {
		// give the quit channel priority when selecting
		select {
		case <-e.unit.Quit():
			break CheckLoop
		default:
		}

		select {
		case <-e.unit.Quit():
			break CheckLoop
		case <-pollChan:
			e.pollHeight()
		case <-scan.C:
			final, err := e.state.Final().Head()
			if err != nil {
				// not fatal here: log the failure and retry on the next scan tick
				e.log.Error().Err(err).Msg("could not get last finalized header")
				continue
			}
			ranges, batches := e.core.ScanPending(final)
			e.sendRequests(ranges, batches)
		}
	}

	// some minor cleanup
	scan.Stop()
}
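
// optionalTicker is an illustrative, self-contained sketch, not part of the
// original file: it captures the trick checkLoop uses for pollChan. A
// receive-only channel created with no sender can never fire, so when the
// interval is zero the corresponding select case is simply never taken and no
// ticker goroutine is started. The helper name is hypothetical.
func optionalTicker(interval time.Duration) (<-chan time.Time, func()) {
	if interval <= 0 {
		// nothing ever sends on this channel, so receives block forever
		return make(<-chan time.Time), func() {}
	}
	t := time.NewTicker(interval)
	return t.C, t.Stop
}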

// pollHeight will send a synchronization request to DefaultPollNodes randomly
// selected peers to learn about their latest finalized height.
func (e *Engine) pollHeight() {
	head, err := e.state.Final().Head()
	if err != nil {
		e.log.Error().Err(err).Msg("could not get last finalized header")
		return
	}

	nonce, err := rand.Uint64()
	if err != nil {
		// TODO: this error should be returned by pollHeight()
		// it is logged for now since the only possible error relates to a failure
		// of the system entropy generation. Such an error will cause failures in other
		// components where it is handled properly, and will lead to crashing the module.
		e.log.Error().Err(err).Msg("nonce generation failed during pollHeight")
		return
	}

	// send the request for synchronization
	req := &messages.SyncRequest{
		Nonce:  nonce,
		Height: head.Height,
	}
	err = e.con.Multicast(req, synccore.DefaultPollNodes, e.participants.NodeIDs()...)
	if err != nil && !errors.Is(err, network.EmptyTargetList) {
		e.log.Warn().Err(err).Msg("sending sync request to poll heights failed")
		return
	}
	e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageSyncRequest)
}
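
// multicastBestEffort is an illustrative sketch, not part of the original
// file: it isolates the send pattern used by pollHeight. An empty target list
// is benign (with no reachable peers there is nobody to poll), so only other
// failures are surfaced to the caller. The method name, and the assumption
// that the subset size parameter is a uint, are mine.
func (e *Engine) multicastBestEffort(event interface{}, num uint) error {
	err := e.con.Multicast(event, num, e.participants.NodeIDs()...)
	if errors.Is(err, network.EmptyTargetList) {
		return nil // nobody to send to; not an error worth reporting
	}
	return err
}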

// sendRequests sends a request for each range and batch to a random subset of
// the cluster participants.
func (e *Engine) sendRequests(ranges []chainsync.Range, batches []chainsync.Batch) {
	var errs *multierror.Error

	for _, ran := range ranges {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// it is logged for now since the only possible error relates to a failure
			// of the system entropy generation. Such an error will cause failures in other
			// components where it is handled properly, and will lead to crashing the module.
			e.log.Error().Err(err).Msg("nonce generation failed during range request")
			return
		}
		req := &messages.RangeRequest{
			Nonce:      nonce,
			FromHeight: ran.From,
			ToHeight:   ran.To,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit range request: %w", err))
			continue
		}
		e.log.Debug().
			Uint64("range_from", req.FromHeight).
			Uint64("range_to", req.ToHeight).
			Uint64("range_nonce", req.Nonce).
			Msg("range requested")
		e.core.RangeRequested(ran)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageRangeRequest)
	}

	for _, batch := range batches {
		nonce, err := rand.Uint64()
		if err != nil {
			// TODO: this error should be returned by sendRequests
			// it is logged for now since the only possible error relates to a failure
			// of the system entropy generation. Such an error will cause failures in other
			// components where it is handled properly, and will lead to crashing the module.
			e.log.Error().Err(err).Msg("nonce generation failed during batch request")
			return
		}
		req := &messages.BatchRequest{
			Nonce:    nonce,
			BlockIDs: batch.BlockIDs,
		}
		err = e.con.Multicast(req, synccore.DefaultBlockRequestNodes, e.participants.NodeIDs()...)
		if err != nil {
			errs = multierror.Append(errs, fmt.Errorf("could not submit batch request: %w", err))
			continue
		}
		e.core.BatchRequested(batch)
		e.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBatchRequest)
	}

	if err := errs.ErrorOrNil(); err != nil {
		e.log.Warn().Err(err).Msg("sending range and batch requests failed")
	}
}
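
// collectSendErrors is an illustrative, self-contained sketch, not part of
// the original file: it distills the multierror pattern used in sendRequests.
// Each send is attempted regardless of earlier failures, and all errors are
// surfaced together at the end; ErrorOrNil returns nil when every send
// succeeded. All names here are hypothetical.
func collectSendErrors(sends []func() error) error {
	var errs *multierror.Error
	for _, send := range sends {
		if err := send(); err != nil {
			// record the failure and keep going with the remaining sends
			errs = multierror.Append(errs, err)
		}
	}
	return errs.ErrorOrNil()
}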