github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/handler/handler.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package handler
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"sync"
    11  	"sync/atomic"
    12  	"time"
    13  
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"go.opentelemetry.io/otel/attribute"
    16  	"go.opentelemetry.io/otel/trace"
    17  	"go.uber.org/zap"
    18  	"golang.org/x/sync/errgroup"
    19  
    20  	"github.com/MetalBlockchain/metalgo/api/health"
    21  	"github.com/MetalBlockchain/metalgo/ids"
    22  	"github.com/MetalBlockchain/metalgo/message"
    23  	"github.com/MetalBlockchain/metalgo/network/p2p"
    24  	"github.com/MetalBlockchain/metalgo/snow"
    25  	"github.com/MetalBlockchain/metalgo/snow/engine/common"
    26  	"github.com/MetalBlockchain/metalgo/snow/networking/tracker"
    27  	"github.com/MetalBlockchain/metalgo/snow/validators"
    28  	"github.com/MetalBlockchain/metalgo/subnets"
    29  	"github.com/MetalBlockchain/metalgo/utils/logging"
    30  	"github.com/MetalBlockchain/metalgo/utils/set"
    31  	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
    32  
    33  	p2ppb "github.com/MetalBlockchain/metalgo/proto/pb/p2p"
    34  	commontracker "github.com/MetalBlockchain/metalgo/snow/engine/common/tracker"
    35  )
    36  
    37  const (
    38  	numDispatchersToClose = 3
    39  	// If a consensus message takes longer than this to process, the handler
    40  	// will log a warning.
    41  	syncProcessingTimeWarnLimit = 30 * time.Second
    42  )
    43  
    44  var (
    45  	_ Handler = (*handler)(nil)
    46  
    47  	errMissingEngine  = errors.New("missing engine")
    48  	errNoStartingGear = errors.New("failed to select starting gear")
    49  )
    50  
    51  type Handler interface {
    52  	common.Timer
    53  	health.Checker
    54  
    55  	Context() *snow.ConsensusContext
    56  	// ShouldHandle returns true if the node with the given ID is allowed to send
    57  	// messages to this chain. If the node is not allowed to send messages to
    58  	// this chain, the message should be dropped.
    59  	ShouldHandle(nodeID ids.NodeID) bool
    60  
    61  	SetEngineManager(engineManager *EngineManager)
    62  	GetEngineManager() *EngineManager
    63  
    64  	SetOnStopped(onStopped func())
    65  	Start(ctx context.Context, recoverPanic bool)
    66  	Push(ctx context.Context, msg Message)
    67  	Len() int
    68  
    69  	Stop(ctx context.Context)
    70  	StopWithError(ctx context.Context, err error)
    71  	// AwaitStopped returns an error if the call would block and [ctx] is done.
    72  	// Even if [ctx] is done when passed into this function, this function will
    73  	// return a nil error if it will not block.
    74  	AwaitStopped(ctx context.Context) (time.Duration, error)
    75  }
    76  
    77  // handler passes incoming messages from the network to the consensus engine.
    78  // (Actually, it receives the incoming messages from a ChainRouter, but same difference.)
    79  type handler struct {
    80  	metrics *metrics
    81  
    82  	// Useful for faking time in tests
    83  	clock mockable.Clock
    84  
    85  	ctx *snow.ConsensusContext
    86  	// TODO: consider using peerTracker instead of validators
    87  	// since peerTracker is already tracking validators
    88  	validators validators.Manager
    89  	// Receives messages from the VM
    90  	msgFromVMChan   <-chan common.Message
    91  	preemptTimeouts chan struct{}
    92  	gossipFrequency time.Duration
    93  
    94  	engineManager *EngineManager
    95  
    96  	// onStopped is called in a goroutine when this handler finishes shutting
    97  	// down. If it is nil then it is skipped.
    98  	onStopped func()
    99  
   100  	// Tracks cpu/disk usage caused by each peer.
   101  	resourceTracker tracker.ResourceTracker
   102  
   103  	// Holds messages that [engine] hasn't processed yet.
   104  	// [unprocessedMsgsCond.L] must be held while accessing [syncMessageQueue].
   105  	syncMessageQueue MessageQueue
   106  	// Holds messages that [engine] hasn't processed yet.
   107  	// [unprocessedAsyncMsgsCond.L] must be held while accessing [asyncMessageQueue].
   108  	asyncMessageQueue MessageQueue
   109  	// Worker pool for handling asynchronous consensus messages
   110  	asyncMessagePool errgroup.Group
   111  	timeouts         chan struct{}
   112  
   113  	closeOnce            sync.Once
   114  	startClosingTime     time.Time
   115  	totalClosingTime     time.Duration
   116  	closingChan          chan struct{}
   117  	numDispatchersClosed atomic.Uint32
   118  	// Closed when this handler and [engine] are done shutting down
   119  	closed chan struct{}
   120  
   121  	subnetConnector validators.SubnetConnector
   122  
   123  	subnet subnets.Subnet
   124  
   125  	// Tracks the peers that are currently connected to this subnet
   126  	peerTracker commontracker.Peers
   127  	p2pTracker  *p2p.PeerTracker
   128  }
   129  
   130  // Initialize this consensus handler
   131  // [engine] must be initialized before initializing this handler
   132  func New(
   133  	ctx *snow.ConsensusContext,
   134  	validators validators.Manager,
   135  	msgFromVMChan <-chan common.Message,
   136  	gossipFrequency time.Duration,
   137  	threadPoolSize int,
   138  	resourceTracker tracker.ResourceTracker,
   139  	subnetConnector validators.SubnetConnector,
   140  	subnet subnets.Subnet,
   141  	peerTracker commontracker.Peers,
   142  	p2pTracker *p2p.PeerTracker,
   143  	reg prometheus.Registerer,
   144  ) (Handler, error) {
   145  	h := &handler{
   146  		ctx:             ctx,
   147  		validators:      validators,
   148  		msgFromVMChan:   msgFromVMChan,
   149  		preemptTimeouts: subnet.OnBootstrapCompleted(),
   150  		gossipFrequency: gossipFrequency,
   151  		timeouts:        make(chan struct{}, 1),
   152  		closingChan:     make(chan struct{}),
   153  		closed:          make(chan struct{}),
   154  		resourceTracker: resourceTracker,
   155  		subnetConnector: subnetConnector,
   156  		subnet:          subnet,
   157  		peerTracker:     peerTracker,
   158  		p2pTracker:      p2pTracker,
   159  	}
   160  	h.asyncMessagePool.SetLimit(threadPoolSize)
   161  
   162  	var err error
   163  
   164  	h.metrics, err = newMetrics(reg)
   165  	if err != nil {
   166  		return nil, fmt.Errorf("initializing handler metrics errored with: %w", err)
   167  	}
   168  	cpuTracker := resourceTracker.CPUTracker()
   169  	h.syncMessageQueue, err = NewMessageQueue(
   170  		h.ctx.Log,
   171  		h.ctx.SubnetID,
   172  		h.validators,
   173  		cpuTracker,
   174  		"sync",
   175  		reg,
   176  	)
   177  	if err != nil {
   178  		return nil, fmt.Errorf("initializing sync message queue errored with: %w", err)
   179  	}
   180  	h.asyncMessageQueue, err = NewMessageQueue(
   181  		h.ctx.Log,
   182  		h.ctx.SubnetID,
   183  		h.validators,
   184  		cpuTracker,
   185  		"async",
   186  		reg,
   187  	)
   188  	if err != nil {
   189  		return nil, fmt.Errorf("initializing async message queue errored with: %w", err)
   190  	}
   191  	return h, nil
   192  }
   193  
   194  func (h *handler) Context() *snow.ConsensusContext {
   195  	return h.ctx
   196  }
   197  
   198  func (h *handler) ShouldHandle(nodeID ids.NodeID) bool {
   199  	_, ok := h.validators.GetValidator(h.ctx.SubnetID, nodeID)
   200  	return h.subnet.IsAllowed(nodeID, ok)
   201  }
   202  
   203  func (h *handler) SetEngineManager(engineManager *EngineManager) {
   204  	h.engineManager = engineManager
   205  }
   206  
   207  func (h *handler) GetEngineManager() *EngineManager {
   208  	return h.engineManager
   209  }
   210  
   211  func (h *handler) SetOnStopped(onStopped func()) {
   212  	h.onStopped = onStopped
   213  }
   214  
   215  func (h *handler) selectStartingGear(ctx context.Context) (common.Engine, error) {
   216  	state := h.ctx.State.Get()
   217  	engines := h.engineManager.Get(state.Type)
   218  	if engines == nil {
   219  		return nil, errNoStartingGear
   220  	}
   221  	if engines.StateSyncer == nil {
   222  		return engines.Bootstrapper, nil
   223  	}
   224  
   225  	stateSyncEnabled, err := engines.StateSyncer.IsEnabled(ctx)
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  
   230  	if !stateSyncEnabled {
   231  		return engines.Bootstrapper, nil
   232  	}
   233  
   234  	// drop bootstrap state from previous runs before starting state sync
   235  	return engines.StateSyncer, engines.Bootstrapper.Clear(ctx)
   236  }
   237  
   238  func (h *handler) Start(ctx context.Context, recoverPanic bool) {
   239  	gear, err := h.selectStartingGear(ctx)
   240  	if err != nil {
   241  		h.ctx.Log.Error("chain failed to select starting gear",
   242  			zap.Error(err),
   243  		)
   244  		h.shutdown(ctx, h.clock.Time())
   245  		return
   246  	}
   247  
   248  	h.ctx.Lock.Lock()
   249  	err = gear.Start(ctx, 0)
   250  	h.ctx.Lock.Unlock()
   251  	if err != nil {
   252  		h.ctx.Log.Error("chain failed to start",
   253  			zap.Error(err),
   254  		)
   255  		h.shutdown(ctx, h.clock.Time())
   256  		return
   257  	}
   258  
   259  	detachedCtx := context.WithoutCancel(ctx)
   260  	dispatchSync := func() {
   261  		h.dispatchSync(detachedCtx)
   262  	}
   263  	dispatchAsync := func() {
   264  		h.dispatchAsync(detachedCtx)
   265  	}
   266  	dispatchChans := func() {
   267  		h.dispatchChans(detachedCtx)
   268  	}
   269  	if recoverPanic {
   270  		go h.ctx.Log.RecoverAndExit(dispatchSync, func() {
   271  			h.ctx.Log.Error("chain was shutdown due to a panic in the sync dispatcher")
   272  		})
   273  		go h.ctx.Log.RecoverAndExit(dispatchAsync, func() {
   274  			h.ctx.Log.Error("chain was shutdown due to a panic in the async dispatcher")
   275  		})
   276  		go h.ctx.Log.RecoverAndExit(dispatchChans, func() {
   277  			h.ctx.Log.Error("chain was shutdown due to a panic in the chan dispatcher")
   278  		})
   279  	} else {
   280  		go h.ctx.Log.RecoverAndPanic(dispatchSync)
   281  		go h.ctx.Log.RecoverAndPanic(dispatchAsync)
   282  		go h.ctx.Log.RecoverAndPanic(dispatchChans)
   283  	}
   284  }
   285  
   286  // Push the message onto the handler's queue
   287  func (h *handler) Push(ctx context.Context, msg Message) {
   288  	switch msg.Op() {
   289  	case message.AppRequestOp, message.AppErrorOp, message.AppResponseOp, message.AppGossipOp,
   290  		message.CrossChainAppRequestOp, message.CrossChainAppErrorOp, message.CrossChainAppResponseOp:
   291  		h.asyncMessageQueue.Push(ctx, msg)
   292  	default:
   293  		h.syncMessageQueue.Push(ctx, msg)
   294  	}
   295  }
   296  
   297  func (h *handler) Len() int {
   298  	return h.syncMessageQueue.Len() + h.asyncMessageQueue.Len()
   299  }
   300  
   301  func (h *handler) RegisterTimeout(d time.Duration) {
   302  	go func() {
   303  		timer := time.NewTimer(d)
   304  		defer timer.Stop()
   305  
   306  		select {
   307  		case <-timer.C:
   308  		case <-h.preemptTimeouts:
   309  		}
   310  
   311  		// If there is already a timeout ready to fire - just drop the
   312  		// additional timeout. This ensures that all goroutines that are spawned
   313  		// here are able to close if the chain is shutdown.
   314  		select {
   315  		case h.timeouts <- struct{}{}:
   316  		default:
   317  		}
   318  	}()
   319  }
   320  
   321  // Note: It is possible for Stop to be called before/concurrently with Start.
   322  //
   323  // Invariant: Stop must never block.
   324  func (h *handler) Stop(ctx context.Context) {
   325  	h.closeOnce.Do(func() {
   326  		h.startClosingTime = h.clock.Time()
   327  
   328  		// Must hold the locks here to ensure there's no race condition in where
   329  		// we check the value of [h.closing] after the call to [Signal].
   330  		h.syncMessageQueue.Shutdown()
   331  		h.asyncMessageQueue.Shutdown()
   332  		close(h.closingChan)
   333  
   334  		// TODO: switch this to use a [context.Context] with a cancel function.
   335  		//
   336  		// Don't process any more bootstrap messages. If a dispatcher is
   337  		// processing a bootstrap message, stop. We do this because if we
   338  		// didn't, and the engine was in the middle of executing state
   339  		// transitions during bootstrapping, we wouldn't be able to grab
   340  		// [h.ctx.Lock] until the engine finished executing state transitions,
   341  		// which may take a long time. As a result, the router would time out on
   342  		// shutting down this chain.
   343  		state := h.ctx.State.Get()
   344  		bootstrapper, ok := h.engineManager.Get(state.Type).Get(snow.Bootstrapping)
   345  		if !ok {
   346  			h.ctx.Log.Error("bootstrapping engine doesn't exist",
   347  				zap.Stringer("type", state.Type),
   348  			)
   349  			return
   350  		}
   351  		bootstrapper.Halt(ctx)
   352  	})
   353  }
   354  
   355  func (h *handler) StopWithError(ctx context.Context, err error) {
   356  	h.ctx.Log.Fatal("shutting down chain",
   357  		zap.String("reason", "received an unexpected error"),
   358  		zap.Error(err),
   359  	)
   360  	h.Stop(ctx)
   361  }
   362  
   363  func (h *handler) AwaitStopped(ctx context.Context) (time.Duration, error) {
   364  	select {
   365  	case <-h.closed:
   366  		return h.totalClosingTime, nil
   367  	default:
   368  	}
   369  
   370  	select {
   371  	case <-ctx.Done():
   372  		return 0, ctx.Err()
   373  	case <-h.closed:
   374  		return h.totalClosingTime, nil
   375  	}
   376  }
   377  
   378  func (h *handler) dispatchSync(ctx context.Context) {
   379  	defer h.closeDispatcher(ctx)
   380  
   381  	// Handle sync messages from the router
   382  	for {
   383  		// Get the next message we should process. If the handler is shutting
   384  		// down, we may fail to pop a message.
   385  		ctx, msg, ok := h.popUnexpiredMsg(h.syncMessageQueue)
   386  		if !ok {
   387  			return
   388  		}
   389  
   390  		// If there is an error handling the message, shut down the chain
   391  		if err := h.handleSyncMsg(ctx, msg); err != nil {
   392  			h.StopWithError(ctx, fmt.Errorf(
   393  				"%w while processing sync message: %s from %s",
   394  				err,
   395  				msg.Op(),
   396  				msg.NodeID(),
   397  			))
   398  			return
   399  		}
   400  	}
   401  }
   402  
   403  func (h *handler) dispatchAsync(ctx context.Context) {
   404  	defer func() {
   405  		// We never return an error in any of our functions, so it is safe to
   406  		// drop any error here.
   407  		_ = h.asyncMessagePool.Wait()
   408  		h.closeDispatcher(ctx)
   409  	}()
   410  
   411  	// Handle async messages from the router
   412  	for {
   413  		// Get the next message we should process. If the handler is shutting
   414  		// down, we may fail to pop a message.
   415  		ctx, msg, ok := h.popUnexpiredMsg(h.asyncMessageQueue)
   416  		if !ok {
   417  			return
   418  		}
   419  
   420  		h.handleAsyncMsg(ctx, msg)
   421  	}
   422  }
   423  
   424  func (h *handler) dispatchChans(ctx context.Context) {
   425  	gossiper := time.NewTicker(h.gossipFrequency)
   426  	defer func() {
   427  		gossiper.Stop()
   428  		h.closeDispatcher(ctx)
   429  	}()
   430  
   431  	// Handle messages generated by the handler and the VM
   432  	for {
   433  		var msg message.InboundMessage
   434  		select {
   435  		case <-h.closingChan:
   436  			return
   437  
   438  		case vmMSG := <-h.msgFromVMChan:
   439  			msg = message.InternalVMMessage(h.ctx.NodeID, uint32(vmMSG))
   440  
   441  		case <-gossiper.C:
   442  			msg = message.InternalGossipRequest(h.ctx.NodeID)
   443  
   444  		case <-h.timeouts:
   445  			msg = message.InternalTimeout(h.ctx.NodeID)
   446  		}
   447  
   448  		if err := h.handleChanMsg(msg); err != nil {
   449  			h.StopWithError(ctx, fmt.Errorf(
   450  				"%w while processing chan message: %s",
   451  				err,
   452  				msg.Op(),
   453  			))
   454  			return
   455  		}
   456  	}
   457  }
   458  
   459  // Any returned error is treated as fatal
   460  func (h *handler) handleSyncMsg(ctx context.Context, msg Message) error {
   461  	var (
   462  		nodeID    = msg.NodeID()
   463  		op        = msg.Op().String()
   464  		body      = msg.Message()
   465  		startTime = h.clock.Time()
   466  		// Check if the chain is in normal operation at the start of message
   467  		// execution (may change during execution)
   468  		isNormalOp = h.ctx.State.Get().State == snow.NormalOp
   469  	)
   470  	if h.ctx.Log.Enabled(logging.Verbo) {
   471  		h.ctx.Log.Verbo("forwarding sync message to consensus",
   472  			zap.Stringer("nodeID", nodeID),
   473  			zap.String("messageOp", op),
   474  			zap.Stringer("message", body),
   475  		)
   476  	} else {
   477  		h.ctx.Log.Debug("forwarding sync message to consensus",
   478  			zap.Stringer("nodeID", nodeID),
   479  			zap.String("messageOp", op),
   480  		)
   481  	}
   482  	h.resourceTracker.StartProcessing(nodeID, startTime)
   483  	h.ctx.Lock.Lock()
   484  	lockAcquiredTime := h.clock.Time()
   485  	defer func() {
   486  		h.ctx.Lock.Unlock()
   487  
   488  		var (
   489  			endTime      = h.clock.Time()
   490  			lockingTime  = lockAcquiredTime.Sub(startTime)
   491  			handlingTime = endTime.Sub(lockAcquiredTime)
   492  		)
   493  		h.resourceTracker.StopProcessing(nodeID, endTime)
   494  		h.metrics.lockingTime.Add(float64(lockingTime))
   495  		labels := prometheus.Labels{
   496  			opLabel: op,
   497  		}
   498  		h.metrics.messages.With(labels).Inc()
   499  		h.metrics.messageHandlingTime.With(labels).Add(float64(handlingTime))
   500  
   501  		msg.OnFinishedHandling()
   502  		h.ctx.Log.Debug("finished handling sync message",
   503  			zap.String("messageOp", op),
   504  		)
   505  		if lockingTime+handlingTime > syncProcessingTimeWarnLimit && isNormalOp {
   506  			h.ctx.Log.Warn("handling sync message took longer than expected",
   507  				zap.Duration("lockingTime", lockingTime),
   508  				zap.Duration("handlingTime", handlingTime),
   509  				zap.Stringer("nodeID", nodeID),
   510  				zap.String("messageOp", op),
   511  				zap.Stringer("message", body),
   512  			)
   513  		}
   514  	}()
   515  
   516  	// We will attempt to pass the message to the requested type for the state
   517  	// we are currently in.
   518  	currentState := h.ctx.State.Get()
   519  	if msg.EngineType == p2ppb.EngineType_ENGINE_TYPE_SNOWMAN &&
   520  		currentState.Type == p2ppb.EngineType_ENGINE_TYPE_AVALANCHE {
   521  		// The peer is requesting an engine type that hasn't been initialized
   522  		// yet. This means we know that this isn't a response, so we can safely
   523  		// drop the message.
   524  		h.ctx.Log.Debug("dropping sync message",
   525  			zap.String("reason", "uninitialized engine type"),
   526  			zap.String("messageOp", op),
   527  			zap.Stringer("currentEngineType", currentState.Type),
   528  			zap.Stringer("requestedEngineType", msg.EngineType),
   529  		)
   530  		return nil
   531  	}
   532  
   533  	var engineType p2ppb.EngineType
   534  	switch msg.EngineType {
   535  	case p2ppb.EngineType_ENGINE_TYPE_AVALANCHE, p2ppb.EngineType_ENGINE_TYPE_SNOWMAN:
   536  		// The peer is requesting an engine type that has been initialized, so
   537  		// we should attempt to honor the request.
   538  		engineType = msg.EngineType
   539  	default:
   540  		// Note: [msg.EngineType] may have been provided by the peer as an
   541  		// invalid option. I.E. not one of AVALANCHE, SNOWMAN, or UNSPECIFIED.
   542  		// In this case, we treat the value the same way as UNSPECIFIED.
   543  		//
   544  		// If the peer didn't request a specific engine type, we default to the
   545  		// current engine.
   546  		engineType = currentState.Type
   547  	}
   548  
   549  	engine, ok := h.engineManager.Get(engineType).Get(currentState.State)
   550  	if !ok {
   551  		// This should only happen if the peer is not following the protocol.
   552  		// This can happen if the chain only has a Snowman engine and the peer
   553  		// requested an Avalanche engine handle the message.
   554  		h.ctx.Log.Debug("dropping sync message",
   555  			zap.String("reason", "uninitialized engine state"),
   556  			zap.String("messageOp", op),
   557  			zap.Stringer("currentEngineType", currentState.Type),
   558  			zap.Stringer("requestedEngineType", msg.EngineType),
   559  			zap.Stringer("engineState", currentState.State),
   560  		)
   561  		return nil
   562  	}
   563  
   564  	// Invariant: Response messages can never be dropped here. This is because
   565  	//            the timeout has already been cleared. This means the engine
   566  	//            should be invoked with a failure message if parsing of the
   567  	//            response fails.
   568  	switch msg := body.(type) {
   569  	// State messages should always be sent to the snowman engine
   570  	case *p2ppb.GetStateSummaryFrontier:
   571  		return engine.GetStateSummaryFrontier(ctx, nodeID, msg.RequestId)
   572  
   573  	case *p2ppb.StateSummaryFrontier:
   574  		return engine.StateSummaryFrontier(ctx, nodeID, msg.RequestId, msg.Summary)
   575  
   576  	case *message.GetStateSummaryFrontierFailed:
   577  		return engine.GetStateSummaryFrontierFailed(ctx, nodeID, msg.RequestID)
   578  
   579  	case *p2ppb.GetAcceptedStateSummary:
   580  		return engine.GetAcceptedStateSummary(
   581  			ctx,
   582  			nodeID,
   583  			msg.RequestId,
   584  			set.Of(msg.Heights...),
   585  		)
   586  
   587  	case *p2ppb.AcceptedStateSummary:
   588  		summaryIDs, err := getIDs(msg.SummaryIds)
   589  		if err != nil {
   590  			h.ctx.Log.Debug("message with invalid field",
   591  				zap.Stringer("nodeID", nodeID),
   592  				zap.Stringer("messageOp", message.AcceptedStateSummaryOp),
   593  				zap.Uint32("requestID", msg.RequestId),
   594  				zap.String("field", "SummaryIDs"),
   595  				zap.Error(err),
   596  			)
   597  			return engine.GetAcceptedStateSummaryFailed(ctx, nodeID, msg.RequestId)
   598  		}
   599  
   600  		return engine.AcceptedStateSummary(ctx, nodeID, msg.RequestId, summaryIDs)
   601  
   602  	case *message.GetAcceptedStateSummaryFailed:
   603  		return engine.GetAcceptedStateSummaryFailed(ctx, nodeID, msg.RequestID)
   604  
   605  	// Bootstrapping messages may be forwarded to either avalanche or snowman
   606  	// engines, depending on the EngineType field
   607  	case *p2ppb.GetAcceptedFrontier:
   608  		return engine.GetAcceptedFrontier(ctx, nodeID, msg.RequestId)
   609  
   610  	case *p2ppb.AcceptedFrontier:
   611  		containerID, err := ids.ToID(msg.ContainerId)
   612  		if err != nil {
   613  			h.ctx.Log.Debug("message with invalid field",
   614  				zap.Stringer("nodeID", nodeID),
   615  				zap.Stringer("messageOp", message.AcceptedFrontierOp),
   616  				zap.Uint32("requestID", msg.RequestId),
   617  				zap.String("field", "ContainerID"),
   618  				zap.Error(err),
   619  			)
   620  			return engine.GetAcceptedFrontierFailed(ctx, nodeID, msg.RequestId)
   621  		}
   622  
   623  		return engine.AcceptedFrontier(ctx, nodeID, msg.RequestId, containerID)
   624  
   625  	case *message.GetAcceptedFrontierFailed:
   626  		return engine.GetAcceptedFrontierFailed(ctx, nodeID, msg.RequestID)
   627  
   628  	case *p2ppb.GetAccepted:
   629  		containerIDs, err := getIDs(msg.ContainerIds)
   630  		if err != nil {
   631  			h.ctx.Log.Debug("message with invalid field",
   632  				zap.Stringer("nodeID", nodeID),
   633  				zap.Stringer("messageOp", message.GetAcceptedOp),
   634  				zap.Uint32("requestID", msg.RequestId),
   635  				zap.String("field", "ContainerIDs"),
   636  				zap.Error(err),
   637  			)
   638  			return nil
   639  		}
   640  
   641  		return engine.GetAccepted(ctx, nodeID, msg.RequestId, containerIDs)
   642  
   643  	case *p2ppb.Accepted:
   644  		containerIDs, err := getIDs(msg.ContainerIds)
   645  		if err != nil {
   646  			h.ctx.Log.Debug("message with invalid field",
   647  				zap.Stringer("nodeID", nodeID),
   648  				zap.Stringer("messageOp", message.AcceptedOp),
   649  				zap.Uint32("requestID", msg.RequestId),
   650  				zap.String("field", "ContainerIDs"),
   651  				zap.Error(err),
   652  			)
   653  			return engine.GetAcceptedFailed(ctx, nodeID, msg.RequestId)
   654  		}
   655  
   656  		return engine.Accepted(ctx, nodeID, msg.RequestId, containerIDs)
   657  
   658  	case *message.GetAcceptedFailed:
   659  		return engine.GetAcceptedFailed(ctx, nodeID, msg.RequestID)
   660  
   661  	case *p2ppb.GetAncestors:
   662  		containerID, err := ids.ToID(msg.ContainerId)
   663  		if err != nil {
   664  			h.ctx.Log.Debug("dropping message with invalid field",
   665  				zap.Stringer("nodeID", nodeID),
   666  				zap.Stringer("messageOp", message.GetAncestorsOp),
   667  				zap.Uint32("requestID", msg.RequestId),
   668  				zap.String("field", "ContainerID"),
   669  				zap.Error(err),
   670  			)
   671  			return nil
   672  		}
   673  
   674  		return engine.GetAncestors(ctx, nodeID, msg.RequestId, containerID)
   675  
   676  	case *message.GetAncestorsFailed:
   677  		return engine.GetAncestorsFailed(ctx, nodeID, msg.RequestID)
   678  
   679  	case *p2ppb.Ancestors:
   680  		return engine.Ancestors(ctx, nodeID, msg.RequestId, msg.Containers)
   681  
   682  	case *p2ppb.Get:
   683  		containerID, err := ids.ToID(msg.ContainerId)
   684  		if err != nil {
   685  			h.ctx.Log.Debug("dropping message with invalid field",
   686  				zap.Stringer("nodeID", nodeID),
   687  				zap.Stringer("messageOp", message.GetOp),
   688  				zap.Uint32("requestID", msg.RequestId),
   689  				zap.String("field", "ContainerID"),
   690  				zap.Error(err),
   691  			)
   692  			return nil
   693  		}
   694  
   695  		return engine.Get(ctx, nodeID, msg.RequestId, containerID)
   696  
   697  	case *message.GetFailed:
   698  		return engine.GetFailed(ctx, nodeID, msg.RequestID)
   699  
   700  	case *p2ppb.Put:
   701  		return engine.Put(ctx, nodeID, msg.RequestId, msg.Container)
   702  
   703  	case *p2ppb.PushQuery:
   704  		return engine.PushQuery(ctx, nodeID, msg.RequestId, msg.Container, msg.RequestedHeight)
   705  
   706  	case *p2ppb.PullQuery:
   707  		containerID, err := ids.ToID(msg.ContainerId)
   708  		if err != nil {
   709  			h.ctx.Log.Debug("dropping message with invalid field",
   710  				zap.Stringer("nodeID", nodeID),
   711  				zap.Stringer("messageOp", message.PullQueryOp),
   712  				zap.Uint32("requestID", msg.RequestId),
   713  				zap.String("field", "ContainerID"),
   714  				zap.Error(err),
   715  			)
   716  			return nil
   717  		}
   718  
   719  		return engine.PullQuery(ctx, nodeID, msg.RequestId, containerID, msg.RequestedHeight)
   720  
   721  	case *p2ppb.Chits:
   722  		preferredID, err := ids.ToID(msg.PreferredId)
   723  		if err != nil {
   724  			h.ctx.Log.Debug("message with invalid field",
   725  				zap.Stringer("nodeID", nodeID),
   726  				zap.Stringer("messageOp", message.ChitsOp),
   727  				zap.Uint32("requestID", msg.RequestId),
   728  				zap.String("field", "PreferredID"),
   729  				zap.Error(err),
   730  			)
   731  			return engine.QueryFailed(ctx, nodeID, msg.RequestId)
   732  		}
   733  
   734  		preferredIDAtHeight, err := ids.ToID(msg.PreferredIdAtHeight)
   735  		if err != nil {
   736  			h.ctx.Log.Debug("message with invalid field",
   737  				zap.Stringer("nodeID", nodeID),
   738  				zap.Stringer("messageOp", message.ChitsOp),
   739  				zap.Uint32("requestID", msg.RequestId),
   740  				zap.String("field", "PreferredIDAtHeight"),
   741  				zap.Error(err),
   742  			)
   743  			return engine.QueryFailed(ctx, nodeID, msg.RequestId)
   744  		}
   745  
   746  		acceptedID, err := ids.ToID(msg.AcceptedId)
   747  		if err != nil {
   748  			h.ctx.Log.Debug("message with invalid field",
   749  				zap.Stringer("nodeID", nodeID),
   750  				zap.Stringer("messageOp", message.ChitsOp),
   751  				zap.Uint32("requestID", msg.RequestId),
   752  				zap.String("field", "AcceptedID"),
   753  				zap.Error(err),
   754  			)
   755  			return engine.QueryFailed(ctx, nodeID, msg.RequestId)
   756  		}
   757  
   758  		return engine.Chits(ctx, nodeID, msg.RequestId, preferredID, preferredIDAtHeight, acceptedID)
   759  
   760  	case *message.QueryFailed:
   761  		return engine.QueryFailed(ctx, nodeID, msg.RequestID)
   762  
   763  	// Connection messages can be sent to the currently executing engine
   764  	case *message.Connected:
   765  		err := h.peerTracker.Connected(ctx, nodeID, msg.NodeVersion)
   766  		if err != nil {
   767  			return err
   768  		}
   769  		h.p2pTracker.Connected(nodeID, msg.NodeVersion)
   770  		return engine.Connected(ctx, nodeID, msg.NodeVersion)
   771  
   772  	case *message.ConnectedSubnet:
   773  		return h.subnetConnector.ConnectedSubnet(ctx, nodeID, msg.SubnetID)
   774  
   775  	case *message.Disconnected:
   776  		err := h.peerTracker.Disconnected(ctx, nodeID)
   777  		if err != nil {
   778  			return err
   779  		}
   780  		h.p2pTracker.Disconnected(nodeID)
   781  		return engine.Disconnected(ctx, nodeID)
   782  
   783  	default:
   784  		return fmt.Errorf(
   785  			"attempt to submit unhandled sync msg %s from %s",
   786  			op, nodeID,
   787  		)
   788  	}
   789  }
   790  
   791  func (h *handler) handleAsyncMsg(ctx context.Context, msg Message) {
   792  	h.asyncMessagePool.Go(func() error {
   793  		if err := h.executeAsyncMsg(ctx, msg); err != nil {
   794  			h.StopWithError(ctx, fmt.Errorf(
   795  				"%w while processing async message: %s from %s",
   796  				err,
   797  				msg.Op(),
   798  				msg.NodeID(),
   799  			))
   800  		}
   801  		return nil
   802  	})
   803  }
   804  
   805  // Any returned error is treated as fatal
   806  func (h *handler) executeAsyncMsg(ctx context.Context, msg Message) error {
   807  	var (
   808  		nodeID    = msg.NodeID()
   809  		op        = msg.Op().String()
   810  		body      = msg.Message()
   811  		startTime = h.clock.Time()
   812  	)
   813  	if h.ctx.Log.Enabled(logging.Verbo) {
   814  		h.ctx.Log.Verbo("forwarding async message to consensus",
   815  			zap.Stringer("nodeID", nodeID),
   816  			zap.String("messageOp", op),
   817  			zap.Stringer("message", body),
   818  		)
   819  	} else {
   820  		h.ctx.Log.Debug("forwarding async message to consensus",
   821  			zap.Stringer("nodeID", nodeID),
   822  			zap.String("messageOp", op),
   823  		)
   824  	}
   825  	h.resourceTracker.StartProcessing(nodeID, startTime)
   826  	defer func() {
   827  		var (
   828  			endTime      = h.clock.Time()
   829  			handlingTime = endTime.Sub(startTime)
   830  		)
   831  		h.resourceTracker.StopProcessing(nodeID, endTime)
   832  		labels := prometheus.Labels{
   833  			opLabel: op,
   834  		}
   835  		h.metrics.messages.With(labels).Inc()
   836  		h.metrics.messageHandlingTime.With(labels).Add(float64(handlingTime))
   837  
   838  		msg.OnFinishedHandling()
   839  		h.ctx.Log.Debug("finished handling async message",
   840  			zap.String("messageOp", op),
   841  		)
   842  	}()
   843  
   844  	state := h.ctx.State.Get()
   845  	engine, ok := h.engineManager.Get(state.Type).Get(state.State)
   846  	if !ok {
   847  		return fmt.Errorf(
   848  			"%w %s running %s",
   849  			errMissingEngine,
   850  			state.State,
   851  			state.Type,
   852  		)
   853  	}
   854  
   855  	switch m := body.(type) {
   856  	case *p2ppb.AppRequest:
   857  		return engine.AppRequest(
   858  			ctx,
   859  			nodeID,
   860  			m.RequestId,
   861  			msg.Expiration(),
   862  			m.AppBytes,
   863  		)
   864  
   865  	case *p2ppb.AppResponse:
   866  		return engine.AppResponse(ctx, nodeID, m.RequestId, m.AppBytes)
   867  
   868  	case *p2ppb.AppError:
   869  		err := &common.AppError{
   870  			Code:    m.ErrorCode,
   871  			Message: m.ErrorMessage,
   872  		}
   873  
   874  		return engine.AppRequestFailed(
   875  			ctx,
   876  			nodeID,
   877  			m.RequestId,
   878  			err,
   879  		)
   880  
   881  	case *p2ppb.AppGossip:
   882  		return engine.AppGossip(ctx, nodeID, m.AppBytes)
   883  
   884  	case *message.CrossChainAppRequest:
   885  		return engine.CrossChainAppRequest(
   886  			ctx,
   887  			m.SourceChainID,
   888  			m.RequestID,
   889  			msg.Expiration(),
   890  			m.Message,
   891  		)
   892  
   893  	case *message.CrossChainAppResponse:
   894  		return engine.CrossChainAppResponse(
   895  			ctx,
   896  			m.SourceChainID,
   897  			m.RequestID,
   898  			m.Message,
   899  		)
   900  
   901  	case *message.CrossChainAppRequestFailed:
   902  		err := &common.AppError{
   903  			Code:    m.ErrorCode,
   904  			Message: m.ErrorMessage,
   905  		}
   906  
   907  		return engine.CrossChainAppRequestFailed(
   908  			ctx,
   909  			m.SourceChainID,
   910  			m.RequestID,
   911  			err,
   912  		)
   913  
   914  	default:
   915  		return fmt.Errorf(
   916  			"attempt to submit unhandled async msg %s from %s",
   917  			op, nodeID,
   918  		)
   919  	}
   920  }
   921  
   922  // Any returned error is treated as fatal
   923  func (h *handler) handleChanMsg(msg message.InboundMessage) error {
   924  	var (
   925  		op        = msg.Op().String()
   926  		body      = msg.Message()
   927  		startTime = h.clock.Time()
   928  		// Check if the chain is in normal operation at the start of message
   929  		// execution (may change during execution)
   930  		isNormalOp = h.ctx.State.Get().State == snow.NormalOp
   931  	)
   932  	if h.ctx.Log.Enabled(logging.Verbo) {
   933  		h.ctx.Log.Verbo("forwarding chan message to consensus",
   934  			zap.String("messageOp", op),
   935  			zap.Stringer("message", body),
   936  		)
   937  	} else {
   938  		h.ctx.Log.Debug("forwarding chan message to consensus",
   939  			zap.String("messageOp", op),
   940  		)
   941  	}
   942  	h.ctx.Lock.Lock()
   943  	lockAcquiredTime := h.clock.Time()
   944  	defer func() {
   945  		h.ctx.Lock.Unlock()
   946  
   947  		var (
   948  			endTime      = h.clock.Time()
   949  			lockingTime  = lockAcquiredTime.Sub(startTime)
   950  			handlingTime = endTime.Sub(lockAcquiredTime)
   951  		)
   952  		h.metrics.lockingTime.Add(float64(lockingTime))
   953  		labels := prometheus.Labels{
   954  			opLabel: op,
   955  		}
   956  		h.metrics.messages.With(labels).Inc()
   957  		h.metrics.messageHandlingTime.With(labels).Add(float64(handlingTime))
   958  
   959  		msg.OnFinishedHandling()
   960  		h.ctx.Log.Debug("finished handling chan message",
   961  			zap.String("messageOp", op),
   962  		)
   963  		if lockingTime+handlingTime > syncProcessingTimeWarnLimit && isNormalOp {
   964  			h.ctx.Log.Warn("handling chan message took longer than expected",
   965  				zap.Duration("lockingTime", lockingTime),
   966  				zap.Duration("handlingTime", handlingTime),
   967  				zap.String("messageOp", op),
   968  				zap.Stringer("message", body),
   969  			)
   970  		}
   971  	}()
   972  
   973  	state := h.ctx.State.Get()
   974  	engine, ok := h.engineManager.Get(state.Type).Get(state.State)
   975  	if !ok {
   976  		return fmt.Errorf(
   977  			"%w %s running %s",
   978  			errMissingEngine,
   979  			state.State,
   980  			state.Type,
   981  		)
   982  	}
   983  
   984  	switch msg := body.(type) {
   985  	case *message.VMMessage:
   986  		return engine.Notify(context.TODO(), common.Message(msg.Notification))
   987  
   988  	case *message.GossipRequest:
   989  		return engine.Gossip(context.TODO())
   990  
   991  	case *message.Timeout:
   992  		return engine.Timeout(context.TODO())
   993  
   994  	default:
   995  		return fmt.Errorf(
   996  			"attempt to submit unhandled chan msg %s",
   997  			op,
   998  		)
   999  	}
  1000  }
  1001  
  1002  func (h *handler) popUnexpiredMsg(queue MessageQueue) (context.Context, Message, bool) {
  1003  	for {
  1004  		// Get the next message we should process. If the handler is shutting
  1005  		// down, we may fail to pop a message.
  1006  		ctx, msg, ok := queue.Pop()
  1007  		if !ok {
  1008  			return nil, Message{}, false
  1009  		}
  1010  
  1011  		// If this message's deadline has passed, don't process it.
  1012  		if expiration := msg.Expiration(); h.clock.Time().After(expiration) {
  1013  			op := msg.Op().String()
  1014  			h.ctx.Log.Debug("dropping message",
  1015  				zap.String("reason", "timeout"),
  1016  				zap.Stringer("nodeID", msg.NodeID()),
  1017  				zap.String("messageOp", op),
  1018  			)
  1019  			span := trace.SpanFromContext(ctx)
  1020  			span.AddEvent("dropping message", trace.WithAttributes(
  1021  				attribute.String("reason", "timeout"),
  1022  			))
  1023  			h.metrics.expired.With(prometheus.Labels{
  1024  				opLabel: op,
  1025  			}).Inc()
  1026  			msg.OnFinishedHandling()
  1027  			continue
  1028  		}
  1029  
  1030  		return ctx, msg, true
  1031  	}
  1032  }
  1033  
  1034  // Invariant: if closeDispatcher is called, Stop has already been called.
  1035  func (h *handler) closeDispatcher(ctx context.Context) {
  1036  	if h.numDispatchersClosed.Add(1) < numDispatchersToClose {
  1037  		return
  1038  	}
  1039  
  1040  	h.shutdown(ctx, h.startClosingTime)
  1041  }
  1042  
  1043  // Note: shutdown is only called after all message dispatchers have exited or if
  1044  // no message dispatchers ever started.
  1045  func (h *handler) shutdown(ctx context.Context, startClosingTime time.Time) {
  1046  	defer func() {
  1047  		if h.onStopped != nil {
  1048  			go h.onStopped()
  1049  		}
  1050  
  1051  		h.totalClosingTime = h.clock.Time().Sub(startClosingTime)
  1052  		close(h.closed)
  1053  	}()
  1054  
  1055  	state := h.ctx.State.Get()
  1056  	engine, ok := h.engineManager.Get(state.Type).Get(state.State)
  1057  	if !ok {
  1058  		h.ctx.Log.Error("failed fetching current engine during shutdown",
  1059  			zap.Stringer("type", state.Type),
  1060  			zap.Stringer("state", state.State),
  1061  		)
  1062  		return
  1063  	}
  1064  
  1065  	if err := engine.Shutdown(ctx); err != nil {
  1066  		h.ctx.Log.Error("failed while shutting down the chain",
  1067  			zap.Error(err),
  1068  		)
  1069  	}
  1070  }