github.com/MetalBlockchain/metalgo@v1.11.9/snow/networking/router/chain_router.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package router
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"strings"
    11  	"sync"
    12  	"time"
    13  
    14  	"github.com/prometheus/client_golang/prometheus"
    15  	"go.uber.org/zap"
    16  
    17  	"github.com/MetalBlockchain/metalgo/ids"
    18  	"github.com/MetalBlockchain/metalgo/message"
    19  	"github.com/MetalBlockchain/metalgo/proto/pb/p2p"
    20  	"github.com/MetalBlockchain/metalgo/snow/networking/benchlist"
    21  	"github.com/MetalBlockchain/metalgo/snow/networking/handler"
    22  	"github.com/MetalBlockchain/metalgo/snow/networking/timeout"
    23  	"github.com/MetalBlockchain/metalgo/utils/constants"
    24  	"github.com/MetalBlockchain/metalgo/utils/linked"
    25  	"github.com/MetalBlockchain/metalgo/utils/logging"
    26  	"github.com/MetalBlockchain/metalgo/utils/set"
    27  	"github.com/MetalBlockchain/metalgo/utils/timer/mockable"
    28  	"github.com/MetalBlockchain/metalgo/version"
    29  )
    30  
var (
	// Sentinel errors attached to the debug log entries that the router emits
	// when it drops a message, a request, or a notification.
	errUnknownChain  = errors.New("received message for unknown chain")
	errUnallowedNode = errors.New("received message from non-allowed node")
	errClosing       = errors.New("router is closing")

	// Compile-time assertions that *ChainRouter satisfies the Router and
	// benchlist.Benchable interfaces.
	_ Router              = (*ChainRouter)(nil)
	_ benchlist.Benchable = (*ChainRouter)(nil)
)
    39  
// requestEntry is the bookkeeping record the router stores for each
// outstanding request until it is cleared by a response or a failure message.
type requestEntry struct {
	// When this request was registered. Used to compute the response latency
	// and, in HealthCheck, the age of the longest running request.
	time time.Time
	// The type of request that was made
	op message.Op
	// The engine type of the request that was made
	engineType p2p.EngineType
}
    48  
// peer is the router's record of a node it knows about, including the local
// node itself.
type peer struct {
	// The application version recorded for this peer when it was first added.
	version *version.Application
	// The subnets that this peer is currently tracking
	trackedSubnets set.Set[ids.ID]
	// The subnets that this peer actually has a connection to.
	// This is a subset of trackedSubnets.
	connectedSubnets set.Set[ids.ID]
}
    57  
// ChainRouter routes incoming messages from the validator network
// to the consensus engines that the messages are intended for.
// Note that consensus engines are uniquely identified by the ID of the chain
// that they are working on.
// Invariant: P-chain must be registered before processing any messages
type ChainRouter struct {
	// clock supplies the timestamps used for request latency and health
	// checks. lock is acquired by the router's methods before they touch the
	// mutable state below. closing is set by Shutdown; once set, new requests,
	// messages and notifications are dropped. chainHandlers maps a chain ID to
	// the handler that messages for that chain are pushed to.
	clock         mockable.Clock
	log           logging.Logger
	lock          sync.Mutex
	closing       bool
	chainHandlers map[ids.ID]handler.Handler

	// It is only safe to call [RegisterResponse] with the router lock held. Any
	// other calls to the timeout manager with the router lock held could cause
	// a deadlock because the timeout manager will call Benched and Unbenched.
	timeoutManager timeout.Manager

	// closeTimeout bounds how long the router waits for chains to stop.
	// myNodeID is the ID of the local node. peers holds every node currently
	// known to the router, keyed by node ID, including the local node.
	closeTimeout time.Duration
	myNodeID     ids.NodeID
	peers        map[ids.NodeID]*peer
	// node ID --> chains that node is benched on
	// invariant: if a node is benched on any chain, it is treated as disconnected on all chains
	benched                map[ids.NodeID]set.Set[ids.ID]
	criticalChains         set.Set[ids.ID]
	sybilProtectionEnabled bool
	onFatal                func(exitCode int)
	metrics                *routerMetrics
	// Parameters for doing health checks
	healthConfig HealthConfig
	// Outstanding requests keyed by their unique request ID. HealthCheck reads
	// the oldest entry to report the longest running request.
	timedRequests *linked.Hashmap[ids.RequestID, requestEntry]
}
    90  
    91  // Initialize the router.
    92  //
    93  // When this router receives an incoming message, it cancels the timeout in
    94  // [timeouts] associated with the request that caused the incoming message, if
    95  // applicable.
    96  func (cr *ChainRouter) Initialize(
    97  	nodeID ids.NodeID,
    98  	log logging.Logger,
    99  	timeoutManager timeout.Manager,
   100  	closeTimeout time.Duration,
   101  	criticalChains set.Set[ids.ID],
   102  	sybilProtectionEnabled bool,
   103  	trackedSubnets set.Set[ids.ID],
   104  	onFatal func(exitCode int),
   105  	healthConfig HealthConfig,
   106  	reg prometheus.Registerer,
   107  ) error {
   108  	cr.log = log
   109  	cr.chainHandlers = make(map[ids.ID]handler.Handler)
   110  	cr.timeoutManager = timeoutManager
   111  	cr.closeTimeout = closeTimeout
   112  	cr.benched = make(map[ids.NodeID]set.Set[ids.ID])
   113  	cr.criticalChains = criticalChains
   114  	cr.sybilProtectionEnabled = sybilProtectionEnabled
   115  	cr.onFatal = onFatal
   116  	cr.timedRequests = linked.NewHashmap[ids.RequestID, requestEntry]()
   117  	cr.peers = make(map[ids.NodeID]*peer)
   118  	cr.healthConfig = healthConfig
   119  
   120  	// Mark myself as connected
   121  	cr.myNodeID = nodeID
   122  	myself := &peer{
   123  		version: version.CurrentApp,
   124  	}
   125  	myself.trackedSubnets.Union(trackedSubnets)
   126  	myself.trackedSubnets.Add(constants.PrimaryNetworkID)
   127  	cr.peers[nodeID] = myself
   128  
   129  	// Register metrics
   130  	rMetrics, err := newRouterMetrics(reg)
   131  	if err != nil {
   132  		return err
   133  	}
   134  	cr.metrics = rMetrics
   135  	return nil
   136  }
   137  
   138  // RegisterRequest marks that we should expect to receive a reply for a request
   139  // issued by [requestingChainID] from the given node's [respondingChainID] and
   140  // the reply should have the given requestID.
   141  //
   142  // The type of message we expect is [op].
   143  //
   144  // Every registered request must be cleared either by receiving a valid reply
   145  // and passing it to the appropriate chain or by a timeout.
   146  // This method registers a timeout that calls such methods if we don't get a
   147  // reply in time.
   148  func (cr *ChainRouter) RegisterRequest(
   149  	ctx context.Context,
   150  	nodeID ids.NodeID,
   151  	requestingChainID ids.ID,
   152  	respondingChainID ids.ID,
   153  	requestID uint32,
   154  	op message.Op,
   155  	timeoutMsg message.InboundMessage,
   156  	engineType p2p.EngineType,
   157  ) {
   158  	cr.lock.Lock()
   159  	if cr.closing {
   160  		cr.log.Debug("dropping request",
   161  			zap.Stringer("nodeID", nodeID),
   162  			zap.Stringer("requestingChainID", requestingChainID),
   163  			zap.Stringer("respondingChainID", respondingChainID),
   164  			zap.Uint32("requestID", requestID),
   165  			zap.Stringer("messageOp", op),
   166  			zap.Error(errClosing),
   167  		)
   168  		cr.lock.Unlock()
   169  		return
   170  	}
   171  	// When we receive a response message type (Chits, Put, Accepted, etc.)
   172  	// we validate that we actually sent the corresponding request.
   173  	// Give this request a unique ID so we can do that validation.
   174  	//
   175  	// For cross-chain messages, the responding chain is the source of the
   176  	// response which is sent to the requester which is the destination,
   177  	// which is why we flip the two in request id generation.
   178  	uniqueRequestID := ids.RequestID{
   179  		NodeID:             nodeID,
   180  		SourceChainID:      respondingChainID,
   181  		DestinationChainID: requestingChainID,
   182  		RequestID:          requestID,
   183  		Op:                 byte(op),
   184  	}
   185  	// Add to the set of unfulfilled requests
   186  	cr.timedRequests.Put(uniqueRequestID, requestEntry{
   187  		time:       cr.clock.Time(),
   188  		op:         op,
   189  		engineType: engineType,
   190  	})
   191  	cr.metrics.outstandingRequests.Set(float64(cr.timedRequests.Len()))
   192  	cr.lock.Unlock()
   193  
   194  	// Determine whether we should include the latency of this request in our
   195  	// measurements.
   196  	// - Don't measure messages from ourself since these don't go over the
   197  	//   network.
   198  	// - Don't measure Puts because an adversary can cause us to issue a Get
   199  	//   request to them and not respond, causing a timeout, skewing latency
   200  	//   measurements.
   201  	shouldMeasureLatency := nodeID != cr.myNodeID && op != message.PutOp
   202  
   203  	// Register a timeout to fire if we don't get a reply in time.
   204  	cr.timeoutManager.RegisterRequest(
   205  		nodeID,
   206  		respondingChainID,
   207  		shouldMeasureLatency,
   208  		uniqueRequestID,
   209  		func() {
   210  			cr.HandleInbound(ctx, timeoutMsg)
   211  		},
   212  	)
   213  }
   214  
// HandleInbound routes [msg] to the handler of the chain it is addressed to.
//
// The message is dropped, and msg.OnFinishedHandling is called here, when:
//   - its chain ID, source chain ID or request ID cannot be extracted,
//   - the router is closing,
//   - no handler is registered for the destination chain,
//   - the destination chain reports that it should not handle [nodeID],
//   - the destination chain is currently executing (unrequested ops and
//     responses only; failure messages are still delivered), or
//   - the message is a response or failure for which no outstanding request
//     is recorded.
//
// Otherwise the message is pushed to the chain's handler. For failures and
// responses the matching outstanding request is cleared first and its engine
// type is attached to the pushed message.
func (cr *ChainRouter) HandleInbound(ctx context.Context, msg message.InboundMessage) {
	nodeID := msg.NodeID()
	op := msg.Op()

	// Field extraction happens before the router lock is taken.
	m := msg.Message()
	destinationChainID, err := message.GetChainID(m)
	if err != nil {
		cr.log.Debug("dropping message with invalid field",
			zap.Stringer("nodeID", nodeID),
			zap.Stringer("messageOp", op),
			zap.String("field", "ChainID"),
			zap.Error(err),
		)

		msg.OnFinishedHandling()
		return
	}

	sourceChainID, err := message.GetSourceChainID(m)
	if err != nil {
		cr.log.Debug("dropping message with invalid field",
			zap.Stringer("nodeID", nodeID),
			zap.Stringer("messageOp", op),
			zap.String("field", "SourceChainID"),
			zap.Error(err),
		)

		msg.OnFinishedHandling()
		return
	}

	requestID, ok := message.GetRequestID(m)
	if !ok {
		cr.log.Debug("dropping message with invalid field",
			zap.Stringer("nodeID", nodeID),
			zap.Stringer("messageOp", op),
			zap.String("field", "RequestID"),
		)

		msg.OnFinishedHandling()
		return
	}

	// Everything below reads or mutates router state and runs under the lock.
	cr.lock.Lock()
	defer cr.lock.Unlock()

	if cr.closing {
		cr.log.Debug("dropping message",
			zap.Stringer("messageOp", op),
			zap.Stringer("nodeID", nodeID),
			zap.Stringer("chainID", destinationChainID),
			zap.Error(errClosing),
		)
		msg.OnFinishedHandling()
		return
	}

	// Get the chain, if it exists
	chain, exists := cr.chainHandlers[destinationChainID]
	if !exists {
		cr.log.Debug("dropping message",
			zap.Stringer("messageOp", op),
			zap.Stringer("nodeID", nodeID),
			zap.Stringer("chainID", destinationChainID),
			zap.Error(errUnknownChain),
		)
		msg.OnFinishedHandling()
		return
	}

	if !chain.ShouldHandle(nodeID) {
		cr.log.Debug("dropping message",
			zap.Stringer("messageOp", op),
			zap.Stringer("nodeID", nodeID),
			zap.Stringer("chainID", destinationChainID),
			zap.Error(errUnallowedNode),
		)
		msg.OnFinishedHandling()
		return
	}

	chainCtx := chain.Context()
	// Case 1: ops that are not replies to a request we registered. These are
	// forwarded without consulting the outstanding request set.
	if message.UnrequestedOps.Contains(op) {
		if chainCtx.Executing.Get() {
			cr.log.Debug("dropping message and skipping queue",
				zap.String("reason", "the chain is currently executing"),
				zap.Stringer("messageOp", op),
			)
			cr.metrics.droppedRequests.Inc()
			msg.OnFinishedHandling()
			return
		}

		// Note: engineType is not guaranteed to be one of the explicitly named
		// enum values. If it was not specified it defaults to UNSPECIFIED.
		engineType, _ := message.GetEngineType(m)
		chain.Push(
			ctx,
			handler.Message{
				InboundMessage: msg,
				EngineType:     engineType,
			},
		)
		return
	}

	// Case 2: failure messages. The outstanding request is keyed by the op of
	// the response that was expected, not by the failure op itself. Unlike the
	// other two cases, this one does not check whether the chain is executing.
	if expectedResponse, isFailed := message.FailedToResponseOps[op]; isFailed {
		// Create the request ID of the request we sent that this message is in
		// response to.
		uniqueRequestID, req := cr.clearRequest(expectedResponse, nodeID, sourceChainID, destinationChainID, requestID)
		if req == nil {
			// This was a duplicated response.
			msg.OnFinishedHandling()
			return
		}

		// Tell the timeout manager we are no longer expecting a response
		cr.timeoutManager.RemoveRequest(uniqueRequestID)

		// Pass the failure to the chain
		chain.Push(
			ctx,
			handler.Message{
				InboundMessage: msg,
				EngineType:     req.engineType,
			},
		)
		return
	}

	// Case 3: responses. The executing check comes before clearRequest, so a
	// response dropped here leaves its outstanding request in place.
	if chainCtx.Executing.Get() {
		cr.log.Debug("dropping message and skipping queue",
			zap.String("reason", "the chain is currently executing"),
			zap.Stringer("messageOp", op),
		)
		cr.metrics.droppedRequests.Inc()
		msg.OnFinishedHandling()
		return
	}

	uniqueRequestID, req := cr.clearRequest(op, nodeID, sourceChainID, destinationChainID, requestID)
	if req == nil {
		// We didn't request this message.
		msg.OnFinishedHandling()
		return
	}

	// Calculate how long it took [nodeID] to reply
	latency := cr.clock.Time().Sub(req.time)

	// Tell the timeout manager we got a response
	cr.timeoutManager.RegisterResponse(nodeID, destinationChainID, uniqueRequestID, req.op, latency)

	// Pass the response to the chain
	chain.Push(
		ctx,
		handler.Message{
			InboundMessage: msg,
			EngineType:     req.engineType,
		},
	)
}
   377  
   378  // Shutdown shuts down this router
   379  func (cr *ChainRouter) Shutdown(ctx context.Context) {
   380  	cr.log.Info("shutting down chain router")
   381  	cr.lock.Lock()
   382  	prevChains := cr.chainHandlers
   383  	cr.chainHandlers = map[ids.ID]handler.Handler{}
   384  	cr.closing = true
   385  	cr.lock.Unlock()
   386  
   387  	for _, chain := range prevChains {
   388  		chain.Stop(ctx)
   389  	}
   390  
   391  	ctx, cancel := context.WithTimeout(ctx, cr.closeTimeout)
   392  	defer cancel()
   393  
   394  	for _, chain := range prevChains {
   395  		shutdownDuration, err := chain.AwaitStopped(ctx)
   396  
   397  		chainLog := chain.Context().Log
   398  		if err != nil {
   399  			chainLog.Warn("timed out while shutting down",
   400  				zap.Error(err),
   401  			)
   402  		} else {
   403  			chainLog.Info("chain shutdown",
   404  				zap.Duration("shutdownDuration", shutdownDuration),
   405  			)
   406  		}
   407  	}
   408  }
   409  
   410  // AddChain registers the specified chain so that incoming
   411  // messages can be routed to it
   412  func (cr *ChainRouter) AddChain(ctx context.Context, chain handler.Handler) {
   413  	cr.lock.Lock()
   414  	defer cr.lock.Unlock()
   415  
   416  	chainID := chain.Context().ChainID
   417  	if cr.closing {
   418  		cr.log.Debug("dropping add chain request",
   419  			zap.Stringer("chainID", chainID),
   420  			zap.Error(errClosing),
   421  		)
   422  		return
   423  	}
   424  	cr.log.Debug("registering chain with chain router",
   425  		zap.Stringer("chainID", chainID),
   426  	)
   427  	chain.SetOnStopped(func() {
   428  		cr.removeChain(ctx, chainID)
   429  	})
   430  	cr.chainHandlers[chainID] = chain
   431  
   432  	// Notify connected validators
   433  	subnetID := chain.Context().SubnetID
   434  	for validatorID, peer := range cr.peers {
   435  		// If this validator is benched on any chain, treat them as disconnected
   436  		// on all chains
   437  		_, benched := cr.benched[validatorID]
   438  		if benched {
   439  			continue
   440  		}
   441  
   442  		// If this peer isn't running this chain, then we shouldn't mark them as
   443  		// connected
   444  		if !peer.trackedSubnets.Contains(subnetID) && cr.sybilProtectionEnabled {
   445  			continue
   446  		}
   447  
   448  		msg := message.InternalConnected(validatorID, peer.version)
   449  		chain.Push(ctx,
   450  			handler.Message{
   451  				InboundMessage: msg,
   452  				EngineType:     p2p.EngineType_ENGINE_TYPE_UNSPECIFIED,
   453  			},
   454  		)
   455  	}
   456  
   457  	// When we register the P-chain, we mark ourselves as connected on all of
   458  	// the subnets that we have tracked.
   459  	if chainID != constants.PlatformChainID {
   460  		return
   461  	}
   462  
   463  	// If we have currently benched ourselves, we will mark ourselves as
   464  	// connected when we unbench. So skip connecting now.
   465  	// This is not "theoretically" possible, but keeping this here prevents us
   466  	// from keeping an invariant that we never bench ourselves.
   467  	if _, benched := cr.benched[cr.myNodeID]; benched {
   468  		return
   469  	}
   470  
   471  	myself := cr.peers[cr.myNodeID]
   472  	for subnetID := range myself.trackedSubnets {
   473  		cr.connectedSubnet(myself, cr.myNodeID, subnetID)
   474  	}
   475  }
   476  
   477  // Connected routes an incoming notification that a validator was just connected
   478  func (cr *ChainRouter) Connected(nodeID ids.NodeID, nodeVersion *version.Application, subnetID ids.ID) {
   479  	cr.lock.Lock()
   480  	defer cr.lock.Unlock()
   481  
   482  	if cr.closing {
   483  		cr.log.Debug("dropping connected message",
   484  			zap.Stringer("nodeID", nodeID),
   485  			zap.Error(errClosing),
   486  		)
   487  		return
   488  	}
   489  
   490  	connectedPeer, exists := cr.peers[nodeID]
   491  	if !exists {
   492  		connectedPeer = &peer{
   493  			version: nodeVersion,
   494  		}
   495  		cr.peers[nodeID] = connectedPeer
   496  	}
   497  	connectedPeer.trackedSubnets.Add(subnetID)
   498  
   499  	// If this validator is benched on any chain, treat them as disconnected on all chains
   500  	if _, benched := cr.benched[nodeID]; benched {
   501  		return
   502  	}
   503  
   504  	msg := message.InternalConnected(nodeID, nodeVersion)
   505  
   506  	// TODO: fire up an event when validator state changes i.e when they leave
   507  	// set, disconnect. we cannot put a subnet-only validator check here since
   508  	// Disconnected would not be handled properly.
   509  	//
   510  	// When sybil protection is disabled, we only want this clause to happen
   511  	// once. Therefore, we only update the chains during the connection of the
   512  	// primary network, which is guaranteed to happen for every peer.
   513  	if cr.sybilProtectionEnabled || subnetID == constants.PrimaryNetworkID {
   514  		for _, chain := range cr.chainHandlers {
   515  			// If sybil protection is disabled, send a Connected message to
   516  			// every chain when connecting to the primary network.
   517  			if subnetID == chain.Context().SubnetID || !cr.sybilProtectionEnabled {
   518  				chain.Push(
   519  					context.TODO(),
   520  					handler.Message{
   521  						InboundMessage: msg,
   522  						EngineType:     p2p.EngineType_ENGINE_TYPE_UNSPECIFIED,
   523  					},
   524  				)
   525  			}
   526  		}
   527  	}
   528  
   529  	cr.connectedSubnet(connectedPeer, nodeID, subnetID)
   530  }
   531  
   532  // Disconnected routes an incoming notification that a validator was connected
   533  func (cr *ChainRouter) Disconnected(nodeID ids.NodeID) {
   534  	cr.lock.Lock()
   535  	defer cr.lock.Unlock()
   536  
   537  	if cr.closing {
   538  		cr.log.Debug("dropping disconnected message",
   539  			zap.Stringer("nodeID", nodeID),
   540  			zap.Error(errClosing),
   541  		)
   542  		return
   543  	}
   544  
   545  	peer := cr.peers[nodeID]
   546  	delete(cr.peers, nodeID)
   547  	if _, benched := cr.benched[nodeID]; benched {
   548  		return
   549  	}
   550  
   551  	msg := message.InternalDisconnected(nodeID)
   552  
   553  	// TODO: fire up an event when validator state changes i.e when they leave
   554  	// set, disconnect. we cannot put a subnet-only validator check here since
   555  	// if a validator connects then it leaves validator-set, it would not be
   556  	// disconnected properly.
   557  	for _, chain := range cr.chainHandlers {
   558  		if peer.trackedSubnets.Contains(chain.Context().SubnetID) || !cr.sybilProtectionEnabled {
   559  			chain.Push(
   560  				context.TODO(),
   561  				handler.Message{
   562  					InboundMessage: msg,
   563  					EngineType:     p2p.EngineType_ENGINE_TYPE_UNSPECIFIED,
   564  				})
   565  		}
   566  	}
   567  }
   568  
   569  // Benched routes an incoming notification that a validator was benched
   570  func (cr *ChainRouter) Benched(chainID ids.ID, nodeID ids.NodeID) {
   571  	cr.lock.Lock()
   572  	defer cr.lock.Unlock()
   573  
   574  	if cr.closing {
   575  		cr.log.Debug("dropping benched message",
   576  			zap.Stringer("nodeID", nodeID),
   577  			zap.Stringer("chainID", chainID),
   578  			zap.Error(errClosing),
   579  		)
   580  		return
   581  	}
   582  
   583  	benchedChains, exists := cr.benched[nodeID]
   584  	benchedChains.Add(chainID)
   585  	cr.benched[nodeID] = benchedChains
   586  	peer, hasPeer := cr.peers[nodeID]
   587  	if exists || !hasPeer {
   588  		// If the set already existed, then the node was previously benched.
   589  		return
   590  	}
   591  
   592  	// This will disconnect the node from all subnets when issued to P-chain.
   593  	// Even if there is no chain in the subnet.
   594  	msg := message.InternalDisconnected(nodeID)
   595  
   596  	for _, chain := range cr.chainHandlers {
   597  		if peer.trackedSubnets.Contains(chain.Context().SubnetID) || !cr.sybilProtectionEnabled {
   598  			chain.Push(
   599  				context.TODO(),
   600  				handler.Message{
   601  					InboundMessage: msg,
   602  					EngineType:     p2p.EngineType_ENGINE_TYPE_UNSPECIFIED,
   603  				})
   604  		}
   605  	}
   606  
   607  	peer.connectedSubnets.Clear()
   608  }
   609  
   610  // Unbenched routes an incoming notification that a validator was just unbenched
   611  func (cr *ChainRouter) Unbenched(chainID ids.ID, nodeID ids.NodeID) {
   612  	cr.lock.Lock()
   613  	defer cr.lock.Unlock()
   614  
   615  	if cr.closing {
   616  		cr.log.Debug("dropping unbenched message",
   617  			zap.Stringer("nodeID", nodeID),
   618  			zap.Stringer("chainID", chainID),
   619  			zap.Error(errClosing),
   620  		)
   621  		return
   622  	}
   623  
   624  	benchedChains := cr.benched[nodeID]
   625  	benchedChains.Remove(chainID)
   626  	if benchedChains.Len() != 0 {
   627  		cr.benched[nodeID] = benchedChains
   628  		return // This node is still benched
   629  	}
   630  
   631  	delete(cr.benched, nodeID)
   632  
   633  	peer, found := cr.peers[nodeID]
   634  	if !found {
   635  		return
   636  	}
   637  
   638  	msg := message.InternalConnected(nodeID, peer.version)
   639  
   640  	for _, chain := range cr.chainHandlers {
   641  		if peer.trackedSubnets.Contains(chain.Context().SubnetID) || !cr.sybilProtectionEnabled {
   642  			chain.Push(
   643  				context.TODO(),
   644  				handler.Message{
   645  					InboundMessage: msg,
   646  					EngineType:     p2p.EngineType_ENGINE_TYPE_UNSPECIFIED,
   647  				})
   648  		}
   649  	}
   650  
   651  	// This will unbench the node from all its subnets.
   652  	// We handle this case separately because the node may have been benched on
   653  	// a subnet that has no chains.
   654  	for subnetID := range peer.trackedSubnets {
   655  		cr.connectedSubnet(peer, nodeID, subnetID)
   656  	}
   657  }
   658  
   659  // HealthCheck returns results of router health checks. Returns:
   660  // 1) Information about health check results
   661  // 2) An error if the health check reports unhealthy
   662  func (cr *ChainRouter) HealthCheck(context.Context) (interface{}, error) {
   663  	cr.lock.Lock()
   664  	defer cr.lock.Unlock()
   665  
   666  	numOutstandingReqs := cr.timedRequests.Len()
   667  	isOutstandingReqs := numOutstandingReqs <= cr.healthConfig.MaxOutstandingRequests
   668  	healthy := isOutstandingReqs
   669  	details := map[string]interface{}{
   670  		"outstandingRequests": numOutstandingReqs,
   671  	}
   672  
   673  	// check for long running requests
   674  	now := cr.clock.Time()
   675  	processingRequest := now
   676  	if _, longestRunning, exists := cr.timedRequests.Oldest(); exists {
   677  		processingRequest = longestRunning.time
   678  	}
   679  	timeReqRunning := now.Sub(processingRequest)
   680  	isOutstanding := timeReqRunning <= cr.healthConfig.MaxOutstandingDuration
   681  	healthy = healthy && isOutstanding
   682  	details["longestRunningRequest"] = timeReqRunning.String()
   683  	cr.metrics.longestRunningRequest.Set(float64(timeReqRunning))
   684  
   685  	if !healthy {
   686  		var errorReasons []string
   687  		if !isOutstandingReqs {
   688  			errorReasons = append(errorReasons, fmt.Sprintf("number of outstanding requests %d > %d", numOutstandingReqs, cr.healthConfig.MaxOutstandingRequests))
   689  		}
   690  		if !isOutstanding {
   691  			errorReasons = append(errorReasons, fmt.Sprintf("time for outstanding requests %s > %s", timeReqRunning, cr.healthConfig.MaxOutstandingDuration))
   692  		}
   693  		// The router is not healthy
   694  		return details, fmt.Errorf("the router is not healthy reason: %s", strings.Join(errorReasons, ", "))
   695  	}
   696  	return details, nil
   697  }
   698  
   699  // RemoveChain removes the specified chain so that incoming
   700  // messages can't be routed to it
   701  func (cr *ChainRouter) removeChain(ctx context.Context, chainID ids.ID) {
   702  	cr.lock.Lock()
   703  	chain, exists := cr.chainHandlers[chainID]
   704  	if !exists {
   705  		cr.log.Debug("can't remove unknown chain",
   706  			zap.Stringer("chainID", chainID),
   707  		)
   708  		cr.lock.Unlock()
   709  		return
   710  	}
   711  	delete(cr.chainHandlers, chainID)
   712  	cr.lock.Unlock()
   713  
   714  	chain.Stop(ctx)
   715  
   716  	ctx, cancel := context.WithTimeout(ctx, cr.closeTimeout)
   717  	shutdownDuration, err := chain.AwaitStopped(ctx)
   718  	cancel()
   719  
   720  	chainLog := chain.Context().Log
   721  	if err != nil {
   722  		chainLog.Warn("timed out while shutting down",
   723  			zap.Error(err),
   724  		)
   725  	} else {
   726  		chainLog.Info("chain shutdown",
   727  			zap.Duration("shutdownDuration", shutdownDuration),
   728  		)
   729  	}
   730  
   731  	if cr.onFatal != nil && cr.criticalChains.Contains(chainID) {
   732  		go cr.onFatal(1)
   733  	}
   734  }
   735  
   736  func (cr *ChainRouter) clearRequest(
   737  	op message.Op,
   738  	nodeID ids.NodeID,
   739  	sourceChainID ids.ID,
   740  	destinationChainID ids.ID,
   741  	requestID uint32,
   742  ) (ids.RequestID, *requestEntry) {
   743  	// Create the request ID of the request we sent that this message is (allegedly) in response to.
   744  	uniqueRequestID := ids.RequestID{
   745  		NodeID:             nodeID,
   746  		SourceChainID:      sourceChainID,
   747  		DestinationChainID: destinationChainID,
   748  		RequestID:          requestID,
   749  		Op:                 byte(op),
   750  	}
   751  	// Mark that an outstanding request has been fulfilled
   752  	request, exists := cr.timedRequests.Get(uniqueRequestID)
   753  	if !exists {
   754  		return uniqueRequestID, nil
   755  	}
   756  
   757  	cr.timedRequests.Delete(uniqueRequestID)
   758  	cr.metrics.outstandingRequests.Set(float64(cr.timedRequests.Len()))
   759  	return uniqueRequestID, &request
   760  }
   761  
   762  // connectedSubnet pushes an InternalSubnetConnected message with [nodeID] and
   763  // [subnetID] to the P-chain. This should be called when a node is either first
   764  // connecting to [subnetID] or when a node that was already connected is
   765  // unbenched on [subnetID]. This is a noop if [subnetID] is the Primary Network
   766  // or if the peer is already marked as connected to the subnet.
   767  // Invariant: should be called after *message.Connected is pushed to the P-chain
   768  // Invariant: should be called after the P-chain was provided in [AddChain]
   769  func (cr *ChainRouter) connectedSubnet(peer *peer, nodeID ids.NodeID, subnetID ids.ID) {
   770  	// if connected to primary network, we can skip this
   771  	// because Connected has its own internal message
   772  	if subnetID == constants.PrimaryNetworkID {
   773  		return
   774  	}
   775  
   776  	// peer already connected to this subnet
   777  	if peer.connectedSubnets.Contains(subnetID) {
   778  		return
   779  	}
   780  
   781  	msg := message.InternalConnectedSubnet(nodeID, subnetID)
   782  	// We only push this message to the P-chain because it is the only chain
   783  	// that cares about the connectivity of all subnets. Others chains learn
   784  	// about the connectivity of their own subnet when they receive a
   785  	// *message.Connected.
   786  	platformChain, ok := cr.chainHandlers[constants.PlatformChainID]
   787  	if !ok {
   788  		cr.log.Error("trying to issue InternalConnectedSubnet message, but platform chain is not registered",
   789  			zap.Stringer("nodeID", nodeID),
   790  			zap.Stringer("subnetID", subnetID),
   791  		)
   792  		return
   793  	}
   794  	platformChain.Push(
   795  		context.TODO(),
   796  		handler.Message{
   797  			InboundMessage: msg,
   798  			EngineType:     p2p.EngineType_ENGINE_TYPE_UNSPECIFIED,
   799  		},
   800  	)
   801  
   802  	peer.connectedSubnets.Add(subnetID)
   803  }