github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/collection/synchronization/request_handler.go

package synchronization

import (
	"errors"
	"fmt"

	"github.com/rs/zerolog"

	"github.com/onflow/flow-go/engine"
	commonsync "github.com/onflow/flow-go/engine/common/synchronization"
	"github.com/onflow/flow-go/model/flow"
	"github.com/onflow/flow-go/model/messages"
	"github.com/onflow/flow-go/module"
	"github.com/onflow/flow-go/module/chainsync"
	"github.com/onflow/flow-go/module/lifecycle"
	"github.com/onflow/flow-go/module/metrics"
	"github.com/onflow/flow-go/network"
	"github.com/onflow/flow-go/network/channels"
	"github.com/onflow/flow-go/state/cluster"
	"github.com/onflow/flow-go/storage"
)

// defaultSyncRequestQueueCapacity is the maximum capacity of the sync requests queue
const defaultSyncRequestQueueCapacity = 500

// defaultRangeRequestQueueCapacity is the maximum capacity of the range requests queue
const defaultRangeRequestQueueCapacity = 500

// defaultBatchRequestQueueCapacity is the maximum capacity of the batch requests queue
const defaultBatchRequestQueueCapacity = 500

// defaultEngineRequestsWorkers is the number of workers that dispatch events for requests
const defaultEngineRequestsWorkers = 8

type RequestHandlerEngine struct {
	unit *engine.Unit
	lm   *lifecycle.LifecycleManager

	me      module.Local
	log     zerolog.Logger
	metrics module.EngineMetrics

	blocks storage.ClusterBlocks
	core   module.SyncCore
	state  cluster.State
	con    network.Conduit // used for sending responses to requesters

	pendingSyncRequests   engine.MessageStore    // message store for *messages.SyncRequest
	pendingBatchRequests  engine.MessageStore    // message store for *messages.BatchRequest
	pendingRangeRequests  engine.MessageStore    // message store for *messages.RangeRequest
	requestMessageHandler *engine.MessageHandler // message handler responsible for request processing
}

func NewRequestHandlerEngine(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	con network.Conduit,
	me module.Local,
	blocks storage.ClusterBlocks,
	core module.SyncCore,
	state cluster.State,
) *RequestHandlerEngine {
	r := &RequestHandlerEngine{
		unit:    engine.NewUnit(),
		lm:      lifecycle.NewLifecycleManager(),
		me:      me,
		log:     log.With().Str("engine", "cluster_synchronization").Logger(),
		metrics: metrics,
		blocks:  blocks,
		core:    core,
		state:   state,
		con:     con,
	}

	r.setupRequestMessageHandler()

	return r
}
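
// startClusterSyncRequestHandler is a minimal usage sketch (not part of the
// production wiring, which lives in the node builder): it shows how the engine
// is constructed from its dependencies and started. All parameter values are
// assumed to be supplied by the caller.
func startClusterSyncRequestHandler(
	log zerolog.Logger,
	metrics module.EngineMetrics,
	con network.Conduit,
	me module.Local,
	blocks storage.ClusterBlocks,
	core module.SyncCore,
	state cluster.State,
) *RequestHandlerEngine {
	r := NewRequestHandlerEngine(log, metrics, con, me, blocks, core, state)
	// Ready launches defaultEngineRequestsWorkers request-processing workers
	// and closes the returned channel once they have been started.
	<-r.Ready()
	return r
}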

// SubmitLocal submits an event originating on the local node.
func (r *RequestHandlerEngine) SubmitLocal(event interface{}) {
	err := r.ProcessLocal(event)
	if err != nil {
		r.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// Submit submits the given event from the node with the given origin ID for
// processing. Any processing error is considered an internal error and is
// logged as fatal.
func (r *RequestHandlerEngine) Submit(channel channels.Channel, originID flow.Identifier, event interface{}) {
	err := r.Process(channel, originID, event)
	if err != nil {
		r.log.Fatal().Err(err).Msg("internal error processing event")
	}
}

// ProcessLocal processes an event originating on the local node.
func (r *RequestHandlerEngine) ProcessLocal(event interface{}) error {
	return r.process(r.me.NodeID(), event)
}

// Process processes the given event from the node with the given origin ID in
// a blocking manner. It returns the potential processing error when done.
func (r *RequestHandlerEngine) Process(channel channels.Channel, originID flow.Identifier, event interface{}) error {
	err := r.process(originID, event)
	if err != nil {
		if engine.IsIncompatibleInputTypeError(err) {
			r.log.Warn().Msgf("%v delivered unsupported message %T through %v", originID, event, channel)
			return nil
		}
		return fmt.Errorf("unexpected error while processing engine message: %w", err)
	}
	return nil
}

// process processes events for the synchronization request handler engine.
// Error returns:
//   - IncompatibleInputTypeError if input has unexpected type
//   - All other errors are potential symptoms of internal state corruption or bugs (fatal).
func (r *RequestHandlerEngine) process(originID flow.Identifier, event interface{}) error {
	return r.requestMessageHandler.Process(originID, event)
}

// setupRequestMessageHandler initializes the inbound queues and the MessageHandler for UNTRUSTED requests.
func (r *RequestHandlerEngine) setupRequestMessageHandler() {
	// RequestHeap deduplicates requests by keeping only one sync request for each requester.
	r.pendingSyncRequests = commonsync.NewRequestHeap(defaultSyncRequestQueueCapacity)
	r.pendingRangeRequests = commonsync.NewRequestHeap(defaultRangeRequestQueueCapacity)
	r.pendingBatchRequests = commonsync.NewRequestHeap(defaultBatchRequestQueueCapacity)

	// define message queueing behaviour
	r.requestMessageHandler = engine.NewMessageHandler(
		r.log,
		engine.NewNotifier(),
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.SyncRequest)
				if ok {
					r.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageSyncRequest)
				}
				return ok
			},
			Store: r.pendingSyncRequests,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.RangeRequest)
				if ok {
					r.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageRangeRequest)
				}
				return ok
			},
			Store: r.pendingRangeRequests,
		},
		engine.Pattern{
			Match: func(msg *engine.Message) bool {
				_, ok := msg.Payload.(*messages.BatchRequest)
				if ok {
					r.metrics.MessageReceived(metrics.EngineClusterSynchronization, metrics.MessageBatchRequest)
				}
				return ok
			},
			Store: r.pendingBatchRequests,
		},
	)
}
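
// exampleEnqueueSyncRequest is a hypothetical sketch (not part of the engine's
// API) illustrating the flow set up above: MessageHandler.Process matches the
// payload type against the registered patterns, records the receive metric,
// stores the message in the matching request heap, and signals the notifier so
// one of the worker loops picks it up asynchronously. The nonce and height
// values are made up for illustration.
func exampleEnqueueSyncRequest(r *RequestHandlerEngine, originID flow.Identifier) error {
	req := &messages.SyncRequest{Nonce: 42, Height: 100}
	// Equivalent to the engine receiving req from the networking layer.
	return r.requestMessageHandler.Process(originID, req)
}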

// onSyncRequest processes an incoming synchronization request; if we have a
// higher height, we inform the other node of it, so they can organize their
// block downloads. If we have a lower height, we add the difference to our own
// download queue.
func (r *RequestHandlerEngine) onSyncRequest(originID flow.Identifier, req *messages.SyncRequest) error {
	final, err := r.state.Final().Head()
	if err != nil {
		return fmt.Errorf("could not get last finalized header: %w", err)
	}

	// queue any missing heights as needed
	r.core.HandleHeight(final, req.Height)

	// don't bother sending a response if we're within tolerance or if we're
	// behind the requester
	if r.core.WithinTolerance(final, req.Height) || req.Height > final.Height {
		return nil
	}
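
	// Worked example (hypothetical numbers, assuming a configured tolerance of
	// 10 blocks): with final.Height=100, a request at height 95 is within
	// tolerance, and a request at height 120 means the requester is ahead of
	// us; in both cases no response is sent. A request at height 50 falls
	// through to the response below.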

	// if we're sufficiently ahead of the requester, send a response
	res := &messages.SyncResponse{
		Height: final.Height,
		Nonce:  req.Nonce,
	}
	err = r.con.Unicast(res, originID)
	if err != nil {
		r.log.Warn().Err(err).Msg("sending sync response failed")
		return nil
	}
	r.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageSyncResponse)

	return nil
}

// onRangeRequest processes a request for a range of blocks by height.
func (r *RequestHandlerEngine) onRangeRequest(originID flow.Identifier, req *messages.RangeRequest) error {
	r.log.Debug().Str("origin_id", originID.String()).Msg("received new range request")
	// get the latest final state to know if we can fulfill the request
	head, err := r.state.Final().Head()
	if err != nil {
		return fmt.Errorf("could not get last finalized header: %w", err)
	}

	// if we don't have anything to send, we can bail right away
	if head.Height < req.FromHeight || req.FromHeight > req.ToHeight {
		return nil
	}

	// enforce client-side max request size
	var maxSize uint
	// TODO: clean up this logic
	if core, ok := r.core.(*chainsync.Core); ok {
		maxSize = core.Config.MaxSize
	} else {
		maxSize = chainsync.DefaultConfig().MaxSize
	}
	maxHeight := req.FromHeight + uint64(maxSize)
	if maxHeight < req.ToHeight {
		r.log.Warn().
			Uint64("from", req.FromHeight).
			Uint64("to", req.ToHeight).
			Uint64("size", (req.ToHeight-req.FromHeight)+1).
			Uint("max_size", maxSize).
			Msg("range request is too large")

		req.ToHeight = maxHeight
	}
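
	// Example (hypothetical numbers): with FromHeight=100, ToHeight=300 and
	// MaxSize=64, maxHeight is 164 < 300, so ToHeight is clamped to 164 and at
	// most the 65 blocks in [100, 164] are returned below.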

	// get all of the blocks, one by one
	blocks := make([]messages.UntrustedClusterBlock, 0, req.ToHeight-req.FromHeight+1)
	for height := req.FromHeight; height <= req.ToHeight; height++ {
		block, err := r.blocks.ByHeight(height)
		if errors.Is(err, storage.ErrNotFound) {
			r.log.Error().Uint64("height", height).Msg("skipping unknown heights")
			break
		}
		if err != nil {
			return fmt.Errorf("could not get block for height (%d): %w", height, err)
		}
		blocks = append(blocks, messages.UntrustedClusterBlockFromInternal(block))
	}

	// if there are no blocks to send, skip network message
	if len(blocks) == 0 {
		r.log.Debug().Msg("skipping empty range response")
		return nil
	}

	// send the response
	res := &messages.ClusterBlockResponse{
		Nonce:  req.Nonce,
		Blocks: blocks,
	}
	err = r.con.Unicast(res, originID)
	if err != nil {
		r.log.Warn().Err(err).Hex("origin_id", originID[:]).Msg("sending range response failed")
		return nil
	}
	r.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)

	return nil
}

// onBatchRequest processes a request for a batch of specific blocks by block ID.
func (r *RequestHandlerEngine) onBatchRequest(originID flow.Identifier, req *messages.BatchRequest) error {
	r.log.Debug().Str("origin_id", originID.String()).Msg("received new batch request")
	// we should bail and send nothing on empty request
	if len(req.BlockIDs) == 0 {
		return nil
	}

	// TODO: clean up this logic
	var maxSize uint
	if core, ok := r.core.(*chainsync.Core); ok {
		maxSize = core.Config.MaxSize
	} else {
		maxSize = chainsync.DefaultConfig().MaxSize
	}

	if len(req.BlockIDs) > int(maxSize) {
		r.log.Warn().
			Int("size", len(req.BlockIDs)).
			Uint("max_size", maxSize).
			Msg("batch request is too large")
	}

	// deduplicate the block IDs in the batch request
	blockIDs := make(map[flow.Identifier]struct{})
	for _, blockID := range req.BlockIDs {
		blockIDs[blockID] = struct{}{}

		// enforce client-side max request size
		if len(blockIDs) == int(maxSize) {
			break
		}
	}
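
	// Example (hypothetical): a request listing the same block ID ten times
	// contributes a single map entry; with maxSize=64, only the first 64
	// unique IDs in the request are kept and the rest are dropped.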

	// try to get all the blocks by ID
	blocks := make([]messages.UntrustedClusterBlock, 0, len(blockIDs))
	for blockID := range blockIDs {
		block, err := r.blocks.ByID(blockID)
		if errors.Is(err, storage.ErrNotFound) {
			r.log.Debug().Hex("block_id", blockID[:]).Msg("skipping unknown block")
			continue
		}
		if err != nil {
			return fmt.Errorf("could not get block by ID (%s): %w", blockID, err)
		}
		blocks = append(blocks, messages.UntrustedClusterBlockFromInternal(block))
	}

	// if there are no blocks to send, skip network message
	if len(blocks) == 0 {
		r.log.Debug().Msg("skipping empty batch response")
		return nil
	}

	// send the response
	res := &messages.ClusterBlockResponse{
		Nonce:  req.Nonce,
		Blocks: blocks,
	}
	err := r.con.Unicast(res, originID)
	if err != nil {
		r.log.Warn().Err(err).Hex("origin_id", originID[:]).Msg("sending batch response failed")
		return nil
	}
	r.metrics.MessageSent(metrics.EngineClusterSynchronization, metrics.MessageBlockResponse)

	return nil
}

// processAvailableRequests processes any pending requests, driving events from
// the networking layer to the business logic. Requests are drained in priority
// order: sync requests first, then range requests, then batch requests.
func (r *RequestHandlerEngine) processAvailableRequests() error {
	for {
		select {
		case <-r.unit.Quit():
			return nil
		default:
		}

		msg, ok := r.pendingSyncRequests.Get()
		if ok {
			err := r.onSyncRequest(msg.OriginID, msg.Payload.(*messages.SyncRequest))
			if err != nil {
				return fmt.Errorf("processing sync request failed: %w", err)
			}
			continue
		}

		msg, ok = r.pendingRangeRequests.Get()
		if ok {
			err := r.onRangeRequest(msg.OriginID, msg.Payload.(*messages.RangeRequest))
			if err != nil {
				return fmt.Errorf("processing range request failed: %w", err)
			}
			continue
		}

		msg, ok = r.pendingBatchRequests.Get()
		if ok {
			err := r.onBatchRequest(msg.OriginID, msg.Payload.(*messages.BatchRequest))
			if err != nil {
				return fmt.Errorf("processing batch request failed: %w", err)
			}
			continue
		}

		// when there are no more messages in the queue, return so the worker
		// loop can wait for the next incoming message to arrive.
		return nil
	}
}

// requestProcessingLoop runs in a worker goroutine and processes queued
// requests until the engine shuts down.
func (r *RequestHandlerEngine) requestProcessingLoop() {
	notifier := r.requestMessageHandler.GetNotifier()
	for {
		select {
		case <-r.unit.Quit():
			return
		case <-notifier:
			err := r.processAvailableRequests()
			if err != nil {
				r.log.Fatal().Err(err).Msg("internal error processing queued requests")
			}
		}
	}
}

// Ready returns a ready channel that is closed once the engine has fully started.
func (r *RequestHandlerEngine) Ready() <-chan struct{} {
	r.lm.OnStart(func() {
		for i := 0; i < defaultEngineRequestsWorkers; i++ {
			r.unit.Launch(r.requestProcessingLoop)
		}
	})
	return r.lm.Started()
}

// Done returns a done channel that is closed once the engine has fully stopped.
func (r *RequestHandlerEngine) Done() <-chan struct{} {
	r.lm.OnStop(func() {
		// wait for all request processing workers to exit
		<-r.unit.Done()
	})
	return r.lm.Stopped()
}