github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/p2p/inspector/validation/control_message_validation_inspector.go

github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/p2p/inspector/validation/control_message_validation_inspector.go (about)

     1  package validation
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/go-playground/validator/v10"
     8  	"github.com/hashicorp/go-multierror"
     9  	pubsub "github.com/libp2p/go-libp2p-pubsub"
    10  	pubsub_pb "github.com/libp2p/go-libp2p-pubsub/pb"
    11  	"github.com/libp2p/go-libp2p/core/peer"
    12  	"github.com/rs/zerolog"
    13  
    14  	"github.com/onflow/flow-go/engine/common/worker"
    15  	"github.com/onflow/flow-go/model/flow"
    16  	"github.com/onflow/flow-go/module"
    17  	"github.com/onflow/flow-go/module/component"
    18  	"github.com/onflow/flow-go/module/irrecoverable"
    19  	"github.com/onflow/flow-go/module/mempool/queue"
    20  	"github.com/onflow/flow-go/module/metrics"
    21  	"github.com/onflow/flow-go/network"
    22  	"github.com/onflow/flow-go/network/channels"
    23  	"github.com/onflow/flow-go/network/p2p"
    24  	p2pconfig "github.com/onflow/flow-go/network/p2p/config"
    25  	"github.com/onflow/flow-go/network/p2p/inspector/internal/cache"
    26  	p2plogging "github.com/onflow/flow-go/network/p2p/logging"
    27  	p2pmsg "github.com/onflow/flow-go/network/p2p/message"
    28  	"github.com/onflow/flow-go/state/protocol"
    29  	"github.com/onflow/flow-go/state/protocol/events"
    30  	"github.com/onflow/flow-go/utils/logging"
    31  	flowrand "github.com/onflow/flow-go/utils/rand"
    32  )
    33  
const (
	// Log messages emitted (at trace level in this file) when inspection is skipped because it is disabled,
	// either for the whole RPC or for one specific message type.
	RPCInspectionDisabledWarning     = "rpc inspection disabled for all control message types, skipping inspection"
	GraftInspectionDisabledWarning   = "rpc graft inspection disabled skipping"
	PruneInspectionDisabledWarning   = "rpc prune inspection disabled skipping"
	IWantInspectionDisabledWarning   = "rpc iwant inspection disabled skipping"
	IHaveInspectionDisabledWarning   = "rpc ihave inspection disabled skipping"
	PublishInspectionDisabledWarning = "rpc publish message inspection disabled skipping"

	// Log messages for skipped truncation.
	// NOTE(review): these are not referenced in the visible part of this file; presumably they are logged by the
	// RPC truncation code path when truncation is disabled - confirm against that code.
	RPCTruncationDisabledWarning            = "rpc truncation disabled for all control message types, skipping truncation"
	GraftTruncationDisabledWarning          = "rpc graft truncation disabled skipping"
	PruneTruncationDisabledWarning          = "rpc prune truncation disabled skipping"
	IHaveTruncationDisabledWarning          = "rpc ihave truncation disabled skipping"
	IHaveMessageIDTruncationDisabledWarning = "ihave message ids truncation disabled skipping"
	IWantTruncationDisabledWarning          = "rpc iwant truncation disabled skipping"
	IWantMessageIDTruncationDisabledWarning = "iwant message ids truncation disabled skipping"

	// rpcInspectorComponentName is the name of the rpc inspector component, as returned by Name.
	rpcInspectorComponentName = "gossipsub_rpc_validation_inspector"
)
    53  
// ControlMsgValidationInspector is an RPC message inspector that inspects the control messages (and publish messages)
// of incoming gossipsub RPCs and performs validation on them. When a validation rule is broken, feedback is given by
// sending an invalid control message notification to the notification consumer.
type ControlMsgValidationInspector struct {
	// Component provides the component lifecycle; it is built in the constructor with one worker per configured
	// inspection queue worker.
	// NOTE(review): events.Noop presumably supplies no-op handlers for the protocol events this type does not
	// handle itself (only ActiveClustersChanged is overridden in the visible code) - confirm.
	component.Component
	events.Noop
	// NOTE(review): ctx is not read or written in the visible part of this file.
	ctx     irrecoverable.SignalerContext
	logger  zerolog.Logger
	sporkID flow.Identifier
	metrics module.GossipSubRpcValidationInspectorMetrics
	// config holds the control message validation configuration.
	config *p2pconfig.RpcValidationInspector
	// workerPool is the pool that queues *InspectRPCRequest items to be processed asynchronously by the component workers.
	workerPool *worker.Pool[*InspectRPCRequest]
	// tracker is a map that associates the hash of a peer's ID with the
	// number of cluster-prefix topic control messages received from that peer. It helps in tracking
	// and managing the rate of incoming control messages from each peer, ensuring that the system
	// stays performant and resilient against potential spam or abuse.
	// The counter is incremented in the following scenarios:
	// 1. The cluster prefix topic is received while the inspector waits for the cluster IDs provider to be set (this can happen during the startup or epoch transitions).
	// 2. The node sends a cluster prefix topic where the cluster prefix does not match any of the active cluster IDs.
	// In such cases, the inspector will allow a configured number of these messages from the corresponding peer.
	// The tracker also stores the active cluster IDs: they are updated in ActiveClustersChanged and read at the
	// start of each asynchronous inspection.
	//
	// idProvider resolves a peer ID to its flow identity (used when checking the sender's identity), and
	// rpcTracker is queried during iWant inspection to check whether an iHave was sent for a requested message ID.
	tracker    *cache.ClusterPrefixedMessagesReceivedTracker
	idProvider module.IdentityProvider
	rpcTracker p2p.RpcControlTracking
	// networkingType indicates public or private network, rpc publish messages are inspected for unstaked senders when running the private network.
	networkingType network.NetworkingType
	// topicOracle is a callback used to retrieve the current subscribed topics of the libp2p node.
	topicOracle func() p2p.TopicProvider
	// notificationConsumer is the consumer that will be notified when a misbehavior is detected upon inspection of an RPC.
	// For each RPC, at most one notification is sent to the consumer.
	// Each notification acts as a penalty to the peer's score.
	notificationConsumer p2p.GossipSubInvCtrlMsgNotifConsumer
}
    87  
// InspectorParams bundles the dependencies and configuration consumed by NewControlMsgValidationInspector.
// Every field is tagged `validate:"required"`; the constructor runs the struct through the validator package and
// fails if validation does not pass.
type InspectorParams struct {
	// Logger the logger used by the inspector.
	Logger zerolog.Logger `validate:"required"`
	// SporkID the current spork ID.
	SporkID flow.Identifier `validate:"required"`
	// Config inspector configuration.
	Config *p2pconfig.RpcValidationInspector `validate:"required"`
	// HeroCacheMetricsFactory the metrics factory used to build the collectors for the inspection queue and
	// the cluster prefixed messages cache.
	HeroCacheMetricsFactory metrics.HeroCacheMetricsFactory `validate:"required"`
	// IdProvider identity provider is used to get the flow identifier for a peer.
	IdProvider module.IdentityProvider `validate:"required"`
	// InspectorMetrics metrics for the validation inspector.
	InspectorMetrics module.GossipSubRpcValidationInspectorMetrics `validate:"required"`
	// RpcTracker tracker used to track iHave RPC's sent and last size.
	RpcTracker p2p.RpcControlTracking `validate:"required"`
	// NetworkingType the networking type of the node.
	NetworkingType network.NetworkingType `validate:"required"`
	// TopicOracle callback used to retrieve the current subscribed topics of the libp2p node.
	// It is set as a callback to avoid circular dependencies between the topic oracle and the inspector.
	TopicOracle func() p2p.TopicProvider `validate:"required"`

	// InvalidControlMessageNotificationConsumer the consumer that will be notified when a misbehavior is detected upon inspection of an RPC.
	// For each RPC, at most one notification is sent to the consumer.
	// Each notification acts as a penalty to the peer's score.
	InvalidControlMessageNotificationConsumer p2p.GossipSubInvCtrlMsgNotifConsumer `validate:"required"`
}
   114  
// Compile-time assertions that *ControlMsgValidationInspector implements the component, gossipsub RPC inspector
// and protocol event consumer interfaces.
var _ component.Component = (*ControlMsgValidationInspector)(nil)
var _ p2p.GossipSubRPCInspector = (*ControlMsgValidationInspector)(nil)
var _ protocol.Consumer = (*ControlMsgValidationInspector)(nil)
   118  
   119  // NewControlMsgValidationInspector returns new ControlMsgValidationInspector
   120  // Args:
   121  //   - *InspectorParams: params used to create the inspector.
   122  //
   123  // Returns:
   124  //   - *ControlMsgValidationInspector: a new control message validation inspector.
   125  //   - error: an error if there is any error while creating the inspector. All errors are irrecoverable and unexpected.
   126  func NewControlMsgValidationInspector(params *InspectorParams) (*ControlMsgValidationInspector, error) {
   127  	err := validator.New().Struct(params)
   128  	if err != nil {
   129  		return nil, fmt.Errorf("inspector params validation failed: %w", err)
   130  	}
   131  	lg := params.Logger.With().Str("component", "gossip_sub_rpc_validation_inspector").Logger()
   132  
   133  	inspectMsgQueueCacheCollector := metrics.GossipSubRPCInspectorQueueMetricFactory(params.HeroCacheMetricsFactory, params.NetworkingType)
   134  	clusterPrefixedCacheCollector := metrics.GossipSubRPCInspectorClusterPrefixedCacheMetricFactory(params.HeroCacheMetricsFactory, params.NetworkingType)
   135  
   136  	clusterPrefixedTracker, err := cache.NewClusterPrefixedMessagesReceivedTracker(params.Logger,
   137  		params.Config.ClusterPrefixedMessage.ControlMsgsReceivedCacheSize,
   138  		clusterPrefixedCacheCollector,
   139  		params.Config.ClusterPrefixedMessage.ControlMsgsReceivedCacheDecay)
   140  	if err != nil {
   141  		return nil, fmt.Errorf("failed to create cluster prefix topics received tracker")
   142  	}
   143  
   144  	if params.Config.PublishMessages.MaxSampleSize < params.Config.PublishMessages.ErrorThreshold {
   145  		return nil, fmt.Errorf("rpc message max sample size must be greater than or equal to rpc message error threshold, got %d and %d respectively",
   146  			params.Config.PublishMessages.MaxSampleSize,
   147  			params.Config.PublishMessages.ErrorThreshold)
   148  	}
   149  
   150  	c := &ControlMsgValidationInspector{
   151  		logger:               lg,
   152  		sporkID:              params.SporkID,
   153  		config:               params.Config,
   154  		tracker:              clusterPrefixedTracker,
   155  		rpcTracker:           params.RpcTracker,
   156  		idProvider:           params.IdProvider,
   157  		metrics:              params.InspectorMetrics,
   158  		networkingType:       params.NetworkingType,
   159  		topicOracle:          params.TopicOracle,
   160  		notificationConsumer: params.InvalidControlMessageNotificationConsumer,
   161  	}
   162  
   163  	store := queue.NewHeroStore(params.Config.InspectionQueue.Size, params.Logger, inspectMsgQueueCacheCollector)
   164  
   165  	pool := worker.NewWorkerPoolBuilder[*InspectRPCRequest](lg, store, c.processInspectRPCReq).Build()
   166  
   167  	c.workerPool = pool
   168  
   169  	builder := component.NewComponentManagerBuilder()
   170  	for i := 0; i < c.config.InspectionQueue.NumberOfWorkers; i++ {
   171  		builder.AddWorker(pool.WorkerLogic())
   172  	}
   173  	c.Component = builder.Build()
   174  	return c, nil
   175  }
   176  
   177  func (c *ControlMsgValidationInspector) Start(parent irrecoverable.SignalerContext) {
   178  	if c.topicOracle == nil {
   179  		parent.Throw(fmt.Errorf("control message validation inspector topic oracle not set"))
   180  	}
   181  	c.Component.Start(parent)
   182  }
   183  
// Name returns the name of the rpc inspector, i.e. the rpcInspectorComponentName constant.
func (c *ControlMsgValidationInspector) Name() string {
	return rpcInspectorComponentName
}
   188  
// ActiveClustersChanged consumes cluster ID update protocol events.
// The provided list of cluster IDs is stored in the cluster prefixed messages tracker, from which it is read again
// at the start of each asynchronous RPC inspection.
// Args:
//   - clusterIDList: the list of currently active cluster IDs.
func (c *ControlMsgValidationInspector) ActiveClustersChanged(clusterIDList flow.ChainIDList) {
	c.tracker.StoreActiveClusterIds(clusterIDList)
}
   193  
   194  // Inspect is called by gossipsub upon reception of a rpc from a remote  node.
   195  // It creates a new InspectRPCRequest for the RPC to be inspected async by the worker pool.
   196  // Args:
   197  //   - from: the sender.
   198  //   - rpc: the control message RPC.
   199  //
   200  // Returns:
   201  //   - error: if a new inspect rpc request cannot be created, all errors returned are considered irrecoverable.
   202  func (c *ControlMsgValidationInspector) Inspect(from peer.ID, rpc *pubsub.RPC) error {
   203  	if c.config.InspectionProcess.Inspect.Disabled {
   204  		c.logger.
   205  			Trace().
   206  			Str("peer_id", p2plogging.PeerId(from)).
   207  			Bool(logging.KeyNetworkingSecurity, true).
   208  			Msg(RPCInspectionDisabledWarning)
   209  		return nil
   210  	}
   211  
   212  	// check peer identity when running private network
   213  	// sanity check: rpc inspection should be disabled on public networks
   214  	if c.networkingType == network.PrivateNetwork && c.config.InspectionProcess.Inspect.RejectUnstakedPeers {
   215  		_, err := c.checkSenderIdentity(from)
   216  		if err != nil {
   217  			c.notificationConsumer.OnInvalidControlMessageNotification(p2p.NewInvalidControlMessageNotification(from, p2pmsg.CtrlMsgRPC, err, 1, p2p.CtrlMsgNonClusterTopicType))
   218  			c.logger.
   219  				Error().
   220  				Err(err).
   221  				Str("peer_id", p2plogging.PeerId(from)).
   222  				Bool(logging.KeyNetworkingSecurity, true).
   223  				Msg("rpc received from unstaked peer")
   224  			c.metrics.OnInvalidControlMessageNotificationSent()
   225  			c.metrics.OnRpcRejectedFromUnknownSender()
   226  			return err
   227  		}
   228  	}
   229  
   230  	// first truncate the rpc to the configured max sample size; if needed
   231  	c.truncateRPC(from, rpc)
   232  
   233  	// second, queue further async inspection
   234  	req, err := NewInspectRPCRequest(from, rpc)
   235  	if err != nil {
   236  		c.logger.Error().
   237  			Err(err).
   238  			Bool(logging.KeyNetworkingSecurity, true).
   239  			Str("peer_id", p2plogging.PeerId(from)).
   240  			Msg("failed to get inspect RPC request")
   241  		return fmt.Errorf("failed to get inspect RPC request: %w", err)
   242  	}
   243  	c.workerPool.Submit(req)
   244  	return nil
   245  }
   246  
   247  // updateMetrics updates the metrics for the received RPC.
   248  // Args:
   249  //   - from: the sender.
   250  //
   251  // - rpc: the control message RPC.
   252  func (c *ControlMsgValidationInspector) updateMetrics(from peer.ID, rpc *pubsub.RPC) {
   253  	includedMessages := len(rpc.GetPublish())
   254  	iHaveCount, iWantCount, graftCount, pruneCount := 0, 0, 0, 0
   255  	ctl := rpc.GetControl()
   256  	if ctl != nil {
   257  		iHaveCount = len(ctl.GetIhave())
   258  		iWantCount = len(ctl.GetIwant())
   259  		graftCount = len(ctl.GetGraft())
   260  		pruneCount = len(ctl.GetPrune())
   261  	}
   262  	c.metrics.OnIncomingRpcReceived(iHaveCount, iWantCount, graftCount, pruneCount, includedMessages)
   263  	if c.logger.GetLevel() > zerolog.TraceLevel {
   264  		return // skip logging if trace level is not enabled
   265  	}
   266  	c.logger.Trace().
   267  		Str("peer_id", p2plogging.PeerId(from)).
   268  		Int("iHaveCount", iHaveCount).
   269  		Int("iWantCount", iWantCount).
   270  		Int("graftCount", graftCount).
   271  		Int("pruneCount", pruneCount).
   272  		Int("included_message_count", includedMessages).
   273  		Msg("received rpc with control messages")
   274  }
   275  
   276  // processInspectRPCReq func used by component workers to perform further inspection of RPC control messages that will validate ensure all control message
   277  // types are valid in the RPC.
   278  // Args:
   279  //   - req: the inspect rpc request.
   280  //
   281  // Returns:
   282  //   - error: no error is expected to be returned from this func as they are logged and distributed in invalid control message notifications.
   283  func (c *ControlMsgValidationInspector) processInspectRPCReq(req *InspectRPCRequest) error {
   284  	c.updateMetrics(req.Peer, req.rpc)
   285  	c.metrics.AsyncProcessingStarted()
   286  	start := time.Now()
   287  	defer func() {
   288  		c.metrics.AsyncProcessingFinished(time.Since(start))
   289  	}()
   290  
   291  	activeClusterIDS := c.tracker.GetActiveClusterIds()
   292  	for _, ctrlMsgType := range p2pmsg.ControlMessageTypes() {
   293  		switch ctrlMsgType {
   294  		case p2pmsg.CtrlMsgGraft:
   295  			err, topicType := c.inspectGraftMessages(req.Peer, req.rpc.GetControl().GetGraft(), activeClusterIDS)
   296  			if err != nil {
   297  				c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgGraft, err, 1, topicType)
   298  				return nil
   299  			}
   300  		case p2pmsg.CtrlMsgPrune:
   301  			err, topicType := c.inspectPruneMessages(req.Peer, req.rpc.GetControl().GetPrune(), activeClusterIDS)
   302  			if err != nil {
   303  				c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgPrune, err, 1, topicType)
   304  				return nil
   305  			}
   306  		case p2pmsg.CtrlMsgIWant:
   307  			err := c.inspectIWantMessages(req.Peer, req.rpc.GetControl().GetIwant())
   308  			if err != nil {
   309  				c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgIWant, err, 1, p2p.CtrlMsgNonClusterTopicType)
   310  				return nil
   311  			}
   312  		case p2pmsg.CtrlMsgIHave:
   313  			err, topicType := c.inspectIHaveMessages(req.Peer, req.rpc.GetControl().GetIhave(), activeClusterIDS)
   314  			if err != nil {
   315  				c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgIHave, err, 1, topicType)
   316  				return nil
   317  			}
   318  		}
   319  	}
   320  
   321  	// inspect rpc publish messages after all control message validation has passed
   322  	err, errCount := c.inspectRpcPublishMessages(req.Peer, req.rpc.GetPublish(), activeClusterIDS)
   323  	if err != nil {
   324  		c.logAndDistributeAsyncInspectErrs(req, p2pmsg.RpcPublishMessage, err, errCount, p2p.CtrlMsgNonClusterTopicType)
   325  		return nil
   326  	}
   327  
   328  	return nil
   329  }
   330  
   331  // checkSenderIdentity checks the identity of the peer with pid and ensures they are not unstaked, or ejected.
   332  // This check is only required on private networks.
   333  // Args:
   334  //   - pid : the peer ID.
   335  //
   336  // Returns:
   337  //   - error: sender is unknown or the identity is ejected.
   338  //
   339  // All errors returned from this function can be considered benign.
   340  func (c *ControlMsgValidationInspector) checkSenderIdentity(pid peer.ID) (*flow.Identity, error) {
   341  	id, ok := c.idProvider.ByPeerID(pid)
   342  	if !ok {
   343  		return nil, NewUnstakedPeerErr(pid)
   344  	}
   345  
   346  	if id.IsEjected() {
   347  		return nil, NewEjectedPeerErr(pid)
   348  	}
   349  
   350  	return id, nil
   351  }
   352  
   353  // inspectGraftMessages performs topic validation on all grafts in the control message using the provided validateTopic func while tracking duplicates.
   354  // Args:
   355  // - from: peer ID of the sender.
   356  // - grafts: the list of grafts to inspect.
   357  // - activeClusterIDS: the list of active cluster ids.
   358  // Returns:
   359  // - DuplicateTopicErr: if there are any duplicate topics in the list of grafts
   360  // - error: if any error occurs while sampling or validating topics, all returned errors are benign and should not cause the node to crash.
   361  // - bool: true if an error is returned and the topic that failed validation was a cluster prefixed topic, false otherwise.
   362  func (c *ControlMsgValidationInspector) inspectGraftMessages(from peer.ID, grafts []*pubsub_pb.ControlGraft, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
   363  	if !c.config.InspectionProcess.Inspect.EnableGraft {
   364  		c.logger.
   365  			Trace().
   366  			Str("peer_id", p2plogging.PeerId(from)).
   367  			Bool(logging.KeyNetworkingSecurity, true).
   368  			Msg(GraftInspectionDisabledWarning)
   369  		return nil, p2p.CtrlMsgNonClusterTopicType
   370  	}
   371  
   372  	duplicateTopicTracker := make(duplicateStrTracker)
   373  	totalDuplicateTopicIds := 0
   374  	totalInvalidTopicIdErrs := 0
   375  	defer func() {
   376  		// regardless of inspection result, update metrics
   377  		c.metrics.OnGraftMessageInspected(totalDuplicateTopicIds, totalInvalidTopicIdErrs)
   378  	}()
   379  
   380  	for _, graft := range grafts {
   381  		topic := channels.Topic(graft.GetTopicID())
   382  		if duplicateTopicTracker.track(topic.String()) > 1 {
   383  			// ideally, a GRAFT message should not have any duplicate topics, hence a topic ID is counted as a duplicate only if it is repeated more than once.
   384  			totalDuplicateTopicIds++
   385  			// check if the total number of duplicates exceeds the configured threshold.
   386  			if totalDuplicateTopicIds > c.config.GraftPrune.DuplicateTopicIdThreshold {
   387  				c.metrics.OnGraftDuplicateTopicIdsExceedThreshold()
   388  				return NewDuplicateTopicIDThresholdExceeded(totalDuplicateTopicIds, len(grafts), c.config.GraftPrune.DuplicateTopicIdThreshold), p2p.CtrlMsgNonClusterTopicType
   389  			}
   390  		}
   391  		err, ctrlMsgType := c.validateTopic(from, topic, activeClusterIDS)
   392  		if err != nil {
   393  			totalInvalidTopicIdErrs++
   394  			c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgGraft)
   395  			if totalInvalidTopicIdErrs > c.config.GraftPrune.InvalidTopicIdThreshold {
   396  				return NewInvalidTopicIDThresholdExceeded(totalInvalidTopicIdErrs, c.config.GraftPrune.InvalidTopicIdThreshold), ctrlMsgType
   397  			}
   398  		}
   399  	}
   400  	return nil, p2p.CtrlMsgNonClusterTopicType
   401  }
   402  
   403  // inspectPruneMessages performs topic validation on all prunes in the control message using the provided validateTopic func while tracking duplicates.
   404  // Args:
   405  // - from: peer ID of the sender.
   406  // - prunes: the list of iHaves to inspect.
   407  // - activeClusterIDS: the list of active cluster ids.
   408  // Returns:
   409  //   - DuplicateTopicErr: if there are any duplicate topics found in the list of iHaves
   410  //     or any duplicate message ids found inside a single iHave.
   411  //   - error: if any error occurs while sampling or validating topics, all returned errors are benign and should not cause the node to crash.
   412  //   - bool: true if an error is returned and the topic that failed validation was a cluster prefixed topic, false otherwise.
   413  func (c *ControlMsgValidationInspector) inspectPruneMessages(from peer.ID, prunes []*pubsub_pb.ControlPrune, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
   414  	if !c.config.InspectionProcess.Inspect.EnablePrune {
   415  		c.logger.
   416  			Trace().
   417  			Str("peer_id", p2plogging.PeerId(from)).
   418  			Bool(logging.KeyNetworkingSecurity, true).
   419  			Msg(PruneInspectionDisabledWarning)
   420  		return nil, p2p.CtrlMsgNonClusterTopicType
   421  	}
   422  	tracker := make(duplicateStrTracker)
   423  	totalDuplicateTopicIds := 0
   424  	totalInvalidTopicIdErrs := 0
   425  	defer func() {
   426  		// regardless of inspection result, update metrics
   427  		c.metrics.OnPruneMessageInspected(totalDuplicateTopicIds, totalInvalidTopicIdErrs)
   428  	}()
   429  	for _, prune := range prunes {
   430  		topic := channels.Topic(prune.GetTopicID())
   431  		if tracker.track(topic.String()) > 1 {
   432  			// ideally, a PRUNE message should not have any duplicate topics, hence a topic ID is counted as a duplicate only if it is repeated more than once.
   433  			totalDuplicateTopicIds++
   434  			// check if the total number of duplicates exceeds the configured threshold.
   435  			if totalDuplicateTopicIds > c.config.GraftPrune.DuplicateTopicIdThreshold {
   436  				c.metrics.OnPruneDuplicateTopicIdsExceedThreshold()
   437  				return NewDuplicateTopicIDThresholdExceeded(totalDuplicateTopicIds, len(prunes), c.config.GraftPrune.DuplicateTopicIdThreshold), p2p.CtrlMsgNonClusterTopicType
   438  			}
   439  		}
   440  		err, ctrlMsgType := c.validateTopic(from, topic, activeClusterIDS)
   441  		if err != nil {
   442  			totalInvalidTopicIdErrs++
   443  			c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgPrune)
   444  			if totalInvalidTopicIdErrs > c.config.GraftPrune.InvalidTopicIdThreshold {
   445  				return NewInvalidTopicIDThresholdExceeded(totalInvalidTopicIdErrs, c.config.GraftPrune.InvalidTopicIdThreshold), ctrlMsgType
   446  			}
   447  		}
   448  	}
   449  	return nil, p2p.CtrlMsgNonClusterTopicType
   450  }
   451  
   452  // inspectIHaveMessages performs topic validation on all ihaves in the control message using the provided validateTopic func while tracking duplicates.
   453  // Args:
   454  // - from: peer ID of the sender.
   455  // - iHaves: the list of iHaves to inspect.
   456  // - activeClusterIDS: the list of active cluster ids.
   457  // Returns:
   458  //   - DuplicateTopicErr: if there are any duplicate topics found in the list of iHaves
   459  //     or any duplicate message ids found inside a single iHave.
   460  //   - error: if any error occurs while sampling or validating topics, all returned errors are benign and should not cause the node to crash.
   461  //   - bool: true if an error is returned and the topic that failed validation was a cluster prefixed topic, false otherwise.
   462  func (c *ControlMsgValidationInspector) inspectIHaveMessages(from peer.ID, ihaves []*pubsub_pb.ControlIHave, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
   463  	if !c.config.InspectionProcess.Inspect.EnableIHave {
   464  		c.logger.
   465  			Trace().
   466  			Str("peer_id", p2plogging.PeerId(from)).
   467  			Bool(logging.KeyNetworkingSecurity, true).
   468  			Msg(IHaveInspectionDisabledWarning)
   469  		return nil, p2p.CtrlMsgNonClusterTopicType
   470  	}
   471  
   472  	if len(ihaves) == 0 {
   473  		return nil, p2p.CtrlMsgNonClusterTopicType
   474  	}
   475  	lg := c.logger.With().
   476  		Str("peer_id", p2plogging.PeerId(from)).
   477  		Int("sample_size", len(ihaves)).
   478  		Int("max_sample_size", c.config.IHave.MessageCountThreshold).
   479  		Logger()
   480  	duplicateTopicTracker := make(duplicateStrTracker)
   481  	duplicateMessageIDTracker := make(duplicateStrTracker)
   482  	totalMessageIds := 0
   483  	totalDuplicateTopicIds := 0
   484  	totalDuplicateMessageIds := 0
   485  	totalInvalidTopicIdErrs := 0
   486  	defer func() {
   487  		// regardless of inspection result, update metrics
   488  		c.metrics.OnIHaveMessagesInspected(totalDuplicateTopicIds, totalDuplicateMessageIds, totalInvalidTopicIdErrs)
   489  	}()
   490  	for _, ihave := range ihaves {
   491  		messageIds := ihave.GetMessageIDs()
   492  		topic := ihave.GetTopicID()
   493  		totalMessageIds += len(messageIds)
   494  
   495  		// first check if the topic is valid, fail fast if it is not
   496  		err, ctrlMsgType := c.validateTopic(from, channels.Topic(topic), activeClusterIDS)
   497  		if err != nil {
   498  			totalInvalidTopicIdErrs++
   499  			c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgIHave)
   500  			if totalInvalidTopicIdErrs > c.config.IHave.InvalidTopicIdThreshold {
   501  				return NewInvalidTopicIDThresholdExceeded(totalInvalidTopicIdErrs, c.config.IHave.InvalidTopicIdThreshold), ctrlMsgType
   502  			}
   503  		}
   504  
   505  		// then track the topic ensuring it is not beyond a duplicate threshold.
   506  		if duplicateTopicTracker.track(topic) > 1 {
   507  			totalDuplicateTopicIds++
   508  			// the topic is duplicated, check if the total number of duplicates exceeds the configured threshold
   509  			if totalDuplicateTopicIds > c.config.IHave.DuplicateTopicIdThreshold {
   510  				c.metrics.OnIHaveDuplicateTopicIdsExceedThreshold()
   511  				return NewDuplicateTopicIDThresholdExceeded(totalDuplicateTopicIds, len(ihaves), c.config.IHave.DuplicateTopicIdThreshold), p2p.CtrlMsgNonClusterTopicType
   512  			}
   513  		}
   514  
   515  		for _, messageID := range messageIds {
   516  			if duplicateMessageIDTracker.track(messageID) > 1 {
   517  				totalDuplicateMessageIds++
   518  				// the message is duplicated, check if the total number of duplicates exceeds the configured threshold
   519  				if totalDuplicateMessageIds > c.config.IHave.DuplicateMessageIdThreshold {
   520  					c.metrics.OnIHaveDuplicateMessageIdsExceedThreshold()
   521  					return NewDuplicateMessageIDErr(messageID, totalDuplicateMessageIds, p2pmsg.CtrlMsgIHave), p2p.CtrlMsgNonClusterTopicType
   522  				}
   523  			}
   524  		}
   525  	}
   526  	lg.Debug().
   527  		Int("total_message_ids", totalMessageIds).
   528  		Int("total_duplicate_topic_ids", totalDuplicateTopicIds).
   529  		Int("total_duplicate_message_ids", totalDuplicateMessageIds).
   530  		Msg("ihave control message validation complete")
   531  	return nil, p2p.CtrlMsgNonClusterTopicType
   532  }
   533  
   534  // inspectIWantMessages inspects RPC iWant control messages. This func will sample the iWants and perform validation on each iWant in the sample.
   535  // Ensuring that the following are true:
   536  // - Each iWant corresponds to an iHave that was sent.
   537  // - Each topic in the iWant sample is a valid topic.
   538  // If the number of iWants that do not have a corresponding iHave exceed the configured threshold an error is returned.
   539  // Args:
   540  // - from: peer ID of the sender.
   541  // - iWant: the list of iWant control messages.
   542  // Returns:
   543  // - DuplicateTopicErr: if there are any duplicate message ids found in any of the iWants.
   544  // - IWantCacheMissThresholdErr: if the rate of cache misses exceeds the configured allowed threshold.
   545  func (c *ControlMsgValidationInspector) inspectIWantMessages(from peer.ID, iWants []*pubsub_pb.ControlIWant) error {
   546  	if !c.config.InspectionProcess.Inspect.EnableIWant {
   547  		c.logger.
   548  			Trace().
   549  			Str("peer_id", p2plogging.PeerId(from)).
   550  			Bool(logging.KeyNetworkingSecurity, true).
   551  			Msg(IWantInspectionDisabledWarning)
   552  		return nil
   553  	}
   554  
   555  	if len(iWants) == 0 {
   556  		return nil
   557  	}
   558  	lastHighest := c.rpcTracker.LastHighestIHaveRPCSize()
   559  	lg := c.logger.With().
   560  		Str("peer_id", p2plogging.PeerId(from)).
   561  		Uint("max_sample_size", c.config.IWant.MessageCountThreshold).
   562  		Int64("last_highest_ihave_rpc_size", lastHighest).
   563  		Logger()
   564  	duplicateMsgIdTracker := make(duplicateStrTracker)
   565  	cacheMisses := 0
   566  	duplicateMessageIds := 0
   567  	defer func() {
   568  		// regardless of inspection result, update metrics
   569  		c.metrics.OnIWantMessagesInspected(duplicateMessageIds, cacheMisses)
   570  	}()
   571  
   572  	lg = lg.With().
   573  		Int("iwant_msg_count", len(iWants)).
   574  		Int("cache_misses_threshold", c.config.IWant.CacheMissThreshold).
   575  		Int("duplicates_threshold", c.config.IWant.DuplicateMsgIdThreshold).Logger()
   576  
   577  	lg.Trace().Msg("validating sample of message ids from iwant control message")
   578  
   579  	totalMessageIds := 0
   580  	for _, iWant := range iWants {
   581  		messageIds := iWant.GetMessageIDs()
   582  		messageIDCount := uint(len(messageIds))
   583  		for _, messageID := range messageIds {
   584  			// check duplicate allowed threshold
   585  			if duplicateMsgIdTracker.track(messageID) > 1 {
   586  				// ideally, an iWant message should not have any duplicate message IDs, hence a message id is considered duplicate when it is repeated more than once.
   587  				duplicateMessageIds++
   588  				if duplicateMessageIds > c.config.IWant.DuplicateMsgIdThreshold {
   589  					c.metrics.OnIWantDuplicateMessageIdsExceedThreshold()
   590  					return NewIWantDuplicateMsgIDThresholdErr(duplicateMessageIds, messageIDCount, c.config.IWant.DuplicateMsgIdThreshold)
   591  				}
   592  			}
   593  			// check cache miss threshold
   594  			if !c.rpcTracker.WasIHaveRPCSent(messageID) {
   595  				cacheMisses++
   596  				if cacheMisses > c.config.IWant.CacheMissThreshold {
   597  					c.metrics.OnIWantCacheMissMessageIdsExceedThreshold()
   598  					return NewIWantCacheMissThresholdErr(cacheMisses, messageIDCount, c.config.IWant.CacheMissThreshold)
   599  				}
   600  			}
   601  			duplicateMsgIdTracker.track(messageID)
   602  			totalMessageIds++
   603  		}
   604  	}
   605  
   606  	lg.Debug().
   607  		Int("total_message_ids", totalMessageIds).
   608  		Int("cache_misses", cacheMisses).
   609  		Int("total_duplicate_message_ids", duplicateMessageIds).
   610  		Msg("iwant control message validation complete")
   611  
   612  	return nil
   613  }
   614  
   615  // inspectRpcPublishMessages inspects a sample of the RPC gossip messages and performs topic validation that ensures the following:
   616  // - Topics are known flow topics.
   617  // - Topics are valid flow topics.
   618  // - Topics are in the nodes subscribe topics list.
   619  // If more than half the topics in the sample contain invalid topics an error will be returned.
   620  // Args:
   621  // - from: peer ID of the sender.
   622  // - messages: rpc publish messages.
   623  // - activeClusterIDS: the list of active cluster ids.
   624  // Returns:
   625  // - InvalidRpcPublishMessagesErr: if the amount of invalid messages exceeds the configured RPCMessageErrorThreshold.
   626  // - int: the number of invalid pubsub messages
   627  func (c *ControlMsgValidationInspector) inspectRpcPublishMessages(from peer.ID, messages []*pubsub_pb.Message, activeClusterIDS flow.ChainIDList) (error, uint64) {
   628  	if !c.config.InspectionProcess.Inspect.EnablePublish {
   629  		c.logger.
   630  			Trace().
   631  			Str("peer_id", p2plogging.PeerId(from)).
   632  			Bool(logging.KeyNetworkingSecurity, true).
   633  			Msg(PublishInspectionDisabledWarning)
   634  		return nil, 0
   635  	}
   636  	totalMessages := len(messages)
   637  	if totalMessages == 0 {
   638  		return nil, 0
   639  	}
   640  
   641  	sampleSize := c.config.PublishMessages.MaxSampleSize
   642  	if sampleSize > totalMessages {
   643  		sampleSize = totalMessages
   644  	}
   645  	c.performSample(p2pmsg.RpcPublishMessage, uint(totalMessages), uint(sampleSize), func(i, j uint) {
   646  		messages[i], messages[j] = messages[j], messages[i]
   647  	})
   648  
   649  	subscribedTopics := c.topicOracle().GetTopics()
   650  	hasSubscription := func(topic string) bool {
   651  		for _, subscribedTopic := range subscribedTopics {
   652  			if topic == subscribedTopic {
   653  				return true
   654  			}
   655  		}
   656  		return false
   657  	}
   658  	var errs *multierror.Error
   659  	invalidTopicIdsCount := 0
   660  	invalidSubscriptionsCount := 0
   661  	invalidSendersCount := 0
   662  	defer func() {
   663  		// regardless of inspection result, update metrics
   664  		errCnt := 0
   665  		if errs != nil {
   666  			errCnt = errs.Len()
   667  		}
   668  		c.metrics.OnPublishMessageInspected(errCnt, invalidTopicIdsCount, invalidSubscriptionsCount, invalidSendersCount)
   669  	}()
   670  
   671  	idCheckCache := make(map[peer.ID]error)
   672  	for _, message := range messages[:sampleSize] {
   673  		topic := channels.Topic(message.GetTopic())
   674  		// The boolean value returned when validating a topic, indicating whether the topic is cluster-prefixed or not, is intentionally ignored.
   675  		// This is because we have already set a threshold for errors allowed on publish messages. Reducing the penalty further based on
   676  		// cluster prefix status is unnecessary when the error threshold is exceeded.
   677  		err, _ := c.validateTopic(from, topic, activeClusterIDS)
   678  		if err != nil {
   679  			// we can skip checking for subscription of topic that failed validation and continue
   680  			invalidTopicIdsCount++
   681  			errs = multierror.Append(errs, err)
   682  			continue
   683  		}
   684  
   685  		if !hasSubscription(topic.String()) {
   686  			invalidSubscriptionsCount++
   687  			errs = multierror.Append(errs, fmt.Errorf("subscription for topic %s not found", topic))
   688  			continue
   689  		}
   690  
   691  		if c.networkingType == network.PrivateNetwork {
   692  			pid, err := peer.IDFromBytes(message.GetFrom())
   693  			if err != nil {
   694  				invalidSendersCount++
   695  				errs = multierror.Append(errs, fmt.Errorf("failed to get peer ID from bytes: %w", err))
   696  				continue
   697  			}
   698  
   699  			if idCheckErr, ok := idCheckCache[pid]; ok {
   700  				if idCheckErr != nil {
   701  					errs = multierror.Append(errs, idCheckErr)
   702  					continue
   703  				}
   704  			}
   705  
   706  			_, idErr := c.checkSenderIdentity(pid)
   707  			if idErr != nil {
   708  				invalidSendersCount++
   709  				errs = multierror.Append(errs, idErr)
   710  				idCheckCache[pid] = idErr
   711  				continue
   712  			}
   713  
   714  			idCheckCache[pid] = nil
   715  		}
   716  	}
   717  	// return an error when we exceed the error threshold
   718  	if errs != nil && errs.Len() > c.config.PublishMessages.ErrorThreshold {
   719  		c.metrics.OnPublishMessagesInspectionErrorExceedsThreshold()
   720  		return NewInvalidRpcPublishMessagesErr(errs.ErrorOrNil(), errs.Len()), uint64(errs.Len())
   721  	}
   722  
   723  	return nil, 0
   724  }
   725  
   726  // truncateRPC truncates the RPC by truncating each control message type using the configured max sample size values.
   727  // Args:
   728  // - from: peer ID of the sender.
   729  // - rpc: the pubsub RPC.
   730  func (c *ControlMsgValidationInspector) truncateRPC(from peer.ID, rpc *pubsub.RPC) {
   731  	if c.config.InspectionProcess.Truncate.Disabled {
   732  		c.logger.
   733  			Trace().
   734  			Str("peer_id", p2plogging.PeerId(from)).
   735  			Bool(logging.KeyNetworkingSecurity, true).
   736  			Msg(RPCTruncationDisabledWarning)
   737  		return
   738  	}
   739  
   740  	for _, ctlMsgType := range p2pmsg.ControlMessageTypes() {
   741  		switch ctlMsgType {
   742  		case p2pmsg.CtrlMsgGraft:
   743  			c.truncateGraftMessages(from, rpc)
   744  		case p2pmsg.CtrlMsgPrune:
   745  			c.truncatePruneMessages(from, rpc)
   746  		case p2pmsg.CtrlMsgIHave:
   747  			c.truncateIHaveMessages(from, rpc)
   748  			c.truncateIHaveMessageIds(from, rpc)
   749  		case p2pmsg.CtrlMsgIWant:
   750  			c.truncateIWantMessages(from, rpc)
   751  			c.truncateIWantMessageIds(from, rpc)
   752  		default:
   753  			// sanity check this should never happen
   754  			c.logAndThrowError(fmt.Errorf("unknown control message type encountered during RPC truncation"))
   755  		}
   756  	}
   757  }
   758  
   759  // truncateGraftMessages truncates the Graft control messages in the RPC. If the total number of Grafts in the RPC exceeds the configured
   760  // GraftPruneMessageMaxSampleSize the list of Grafts will be truncated.
   761  // Args:
   762  //   - rpc: the rpc message to truncate.
   763  func (c *ControlMsgValidationInspector) truncateGraftMessages(from peer.ID, rpc *pubsub.RPC) {
   764  	if !c.config.InspectionProcess.Truncate.EnableGraft {
   765  		c.logger.
   766  			Trace().
   767  			Str("peer_id", p2plogging.PeerId(from)).
   768  			Bool(logging.KeyNetworkingSecurity, true).
   769  			Msg(GraftTruncationDisabledWarning)
   770  		return
   771  	}
   772  
   773  	grafts := rpc.GetControl().GetGraft()
   774  	originalGraftSize := len(grafts)
   775  	if originalGraftSize <= c.config.GraftPrune.MessageCountThreshold {
   776  		return // nothing to truncate
   777  	}
   778  
   779  	// truncate grafts and update metrics
   780  	sampleSize := c.config.GraftPrune.MessageCountThreshold
   781  	c.performSample(p2pmsg.CtrlMsgGraft, uint(originalGraftSize), uint(sampleSize), func(i, j uint) {
   782  		grafts[i], grafts[j] = grafts[j], grafts[i]
   783  	})
   784  	rpc.Control.Graft = grafts[:sampleSize]
   785  	c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgGraft, originalGraftSize-len(rpc.Control.Graft))
   786  }
   787  
   788  // truncatePruneMessages truncates the Prune control messages in the RPC. If the total number of Prunes in the RPC exceeds the configured
   789  // GraftPruneMessageMaxSampleSize the list of Prunes will be truncated.
   790  // Args:
   791  //   - rpc: the rpc message to truncate.
   792  func (c *ControlMsgValidationInspector) truncatePruneMessages(from peer.ID, rpc *pubsub.RPC) {
   793  	if !c.config.InspectionProcess.Truncate.EnablePrune {
   794  		c.logger.
   795  			Trace().
   796  			Str("peer_id", p2plogging.PeerId(from)).
   797  			Bool(logging.KeyNetworkingSecurity, true).
   798  			Msg(PruneTruncationDisabledWarning)
   799  		return
   800  	}
   801  
   802  	prunes := rpc.GetControl().GetPrune()
   803  	originalPruneSize := len(prunes)
   804  	if originalPruneSize <= c.config.GraftPrune.MessageCountThreshold {
   805  		return // nothing to truncate
   806  	}
   807  
   808  	sampleSize := c.config.GraftPrune.MessageCountThreshold
   809  	c.performSample(p2pmsg.CtrlMsgPrune, uint(originalPruneSize), uint(sampleSize), func(i, j uint) {
   810  		prunes[i], prunes[j] = prunes[j], prunes[i]
   811  	})
   812  	rpc.Control.Prune = prunes[:sampleSize]
   813  	c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgPrune, originalPruneSize-len(rpc.Control.Prune))
   814  }
   815  
   816  // truncateIHaveMessages truncates the iHaves control messages in the RPC. If the total number of iHaves in the RPC exceeds the configured
   817  // MessageCountThreshold the list of iHaves will be truncated.
   818  // Args:
   819  //   - rpc: the rpc message to truncate.
   820  func (c *ControlMsgValidationInspector) truncateIHaveMessages(from peer.ID, rpc *pubsub.RPC) {
   821  	if !c.config.InspectionProcess.Truncate.EnableIHave {
   822  		c.logger.
   823  			Trace().
   824  			Str("peer_id", p2plogging.PeerId(from)).
   825  			Bool(logging.KeyNetworkingSecurity, true).
   826  			Msg(IHaveTruncationDisabledWarning)
   827  		return
   828  	}
   829  
   830  	ihaves := rpc.GetControl().GetIhave()
   831  	originalIHaveCount := len(ihaves)
   832  	if originalIHaveCount == 0 {
   833  		return
   834  	}
   835  
   836  	if originalIHaveCount > c.config.IHave.MessageCountThreshold {
   837  		// truncate ihaves and update metrics
   838  		sampleSize := c.config.IHave.MessageCountThreshold
   839  		if sampleSize > originalIHaveCount {
   840  			sampleSize = originalIHaveCount
   841  		}
   842  		c.performSample(p2pmsg.CtrlMsgIHave, uint(originalIHaveCount), uint(sampleSize), func(i, j uint) {
   843  			ihaves[i], ihaves[j] = ihaves[j], ihaves[i]
   844  		})
   845  		rpc.Control.Ihave = ihaves[:sampleSize]
   846  		c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgIHave, originalIHaveCount-len(rpc.Control.Ihave))
   847  	}
   848  }
   849  
   850  // truncateIHaveMessageIds truncates the message ids for each iHave control message in the RPC. If the total number of message ids in a single iHave exceeds the configured
   851  // MessageIdCountThreshold the list of message ids will be truncated. Before message ids are truncated the iHave control messages should have been truncated themselves.
   852  // Args:
   853  //   - rpc: the rpc message to truncate.
   854  func (c *ControlMsgValidationInspector) truncateIHaveMessageIds(from peer.ID, rpc *pubsub.RPC) {
   855  	if !c.config.InspectionProcess.Truncate.EnableIHaveMessageIds {
   856  		c.logger.
   857  			Trace().
   858  			Str("peer_id", p2plogging.PeerId(from)).
   859  			Bool(logging.KeyNetworkingSecurity, true).
   860  			Msg(IHaveMessageIDTruncationDisabledWarning)
   861  		return
   862  	}
   863  
   864  	for _, ihave := range rpc.GetControl().GetIhave() {
   865  		messageIDs := ihave.GetMessageIDs()
   866  		originalMessageIdCount := len(messageIDs)
   867  		if originalMessageIdCount == 0 {
   868  			continue // nothing to truncate; skip
   869  		}
   870  
   871  		if originalMessageIdCount > c.config.IHave.MessageIdCountThreshold {
   872  			sampleSize := c.config.IHave.MessageIdCountThreshold
   873  			if sampleSize > originalMessageIdCount {
   874  				sampleSize = originalMessageIdCount
   875  			}
   876  			c.performSample(p2pmsg.CtrlMsgIHave, uint(originalMessageIdCount), uint(sampleSize), func(i, j uint) {
   877  				messageIDs[i], messageIDs[j] = messageIDs[j], messageIDs[i]
   878  			})
   879  			ihave.MessageIDs = messageIDs[:sampleSize]
   880  			c.metrics.OnIHaveControlMessageIdsTruncated(originalMessageIdCount - len(ihave.MessageIDs))
   881  		}
   882  		c.metrics.OnIHaveMessageIDsReceived(ihave.GetTopicID(), len(ihave.MessageIDs))
   883  	}
   884  }
   885  
   886  // truncateIWantMessages truncates the iWant control messages in the RPC. If the total number of iWants in the RPC exceeds the configured
   887  // MessageCountThreshold the list of iWants will be truncated.
   888  // Args:
   889  //   - rpc: the rpc message to truncate.
   890  func (c *ControlMsgValidationInspector) truncateIWantMessages(from peer.ID, rpc *pubsub.RPC) {
   891  	if !c.config.InspectionProcess.Truncate.EnableIWant {
   892  		c.logger.
   893  			Trace().
   894  			Str("peer_id", p2plogging.PeerId(from)).
   895  			Bool(logging.KeyNetworkingSecurity, true).
   896  			Msg(IWantTruncationDisabledWarning)
   897  		return
   898  	}
   899  
   900  	iWants := rpc.GetControl().GetIwant()
   901  	originalIWantCount := uint(len(iWants))
   902  	if originalIWantCount == 0 {
   903  		return
   904  	}
   905  
   906  	if originalIWantCount > c.config.IWant.MessageCountThreshold {
   907  		// truncate iWants and update metrics
   908  		sampleSize := c.config.IWant.MessageCountThreshold
   909  		if sampleSize > originalIWantCount {
   910  			sampleSize = originalIWantCount
   911  		}
   912  		c.performSample(p2pmsg.CtrlMsgIWant, originalIWantCount, sampleSize, func(i, j uint) {
   913  			iWants[i], iWants[j] = iWants[j], iWants[i]
   914  		})
   915  		rpc.Control.Iwant = iWants[:sampleSize]
   916  		c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgIWant, int(originalIWantCount)-len(rpc.Control.Iwant))
   917  	}
   918  }
   919  
   920  // truncateIWantMessageIds truncates the message ids for each iWant control message in the RPC. If the total number of message ids in a single iWant exceeds the configured
   921  // MessageIdCountThreshold the list of message ids will be truncated. Before message ids are truncated the iWant control messages should have been truncated themselves.
   922  // Args:
   923  //   - rpc: the rpc message to truncate.
   924  func (c *ControlMsgValidationInspector) truncateIWantMessageIds(from peer.ID, rpc *pubsub.RPC) {
   925  	if !c.config.InspectionProcess.Truncate.EnableIWantMessageIds {
   926  		c.logger.
   927  			Trace().
   928  			Str("peer_id", p2plogging.PeerId(from)).
   929  			Bool(logging.KeyNetworkingSecurity, true).
   930  			Msg(IWantMessageIDTruncationDisabledWarning)
   931  		return
   932  	}
   933  
   934  	lastHighest := c.rpcTracker.LastHighestIHaveRPCSize()
   935  	lg := c.logger.With().
   936  		Str("peer_id", p2plogging.PeerId(from)).
   937  		Uint("max_sample_size", c.config.IWant.MessageCountThreshold).
   938  		Int64("last_highest_ihave_rpc_size", lastHighest).
   939  		Logger()
   940  
   941  	sampleSize := int(10 * lastHighest)
   942  	if sampleSize == 0 || sampleSize > c.config.IWant.MessageIdCountThreshold {
   943  		// invalid or 0 sample size is suspicious
   944  		lg.Warn().Str(logging.KeySuspicious, "true").Msg("zero or invalid sample size, using default max sample size")
   945  		sampleSize = c.config.IWant.MessageIdCountThreshold
   946  	}
   947  	for _, iWant := range rpc.GetControl().GetIwant() {
   948  		messageIDs := iWant.GetMessageIDs()
   949  		totalMessageIdCount := len(messageIDs)
   950  		if totalMessageIdCount == 0 {
   951  			continue // nothing to truncate; skip
   952  		}
   953  
   954  		if totalMessageIdCount > sampleSize {
   955  			c.performSample(p2pmsg.CtrlMsgIWant, uint(totalMessageIdCount), uint(sampleSize), func(i, j uint) {
   956  				messageIDs[i], messageIDs[j] = messageIDs[j], messageIDs[i]
   957  			})
   958  			iWant.MessageIDs = messageIDs[:sampleSize]
   959  			c.metrics.OnIWantControlMessageIdsTruncated(totalMessageIdCount - len(iWant.MessageIDs))
   960  		}
   961  		c.metrics.OnIWantMessageIDsReceived(len(iWant.MessageIDs))
   962  	}
   963  }
   964  
   965  // performSample performs sampling on the specified control message that will randomize
   966  // the items in the control message slice up to index sampleSize-1. Any error encountered during sampling is considered
   967  // irrecoverable and will cause the node to crash.
   968  func (c *ControlMsgValidationInspector) performSample(ctrlMsg p2pmsg.ControlMessageType, totalSize, sampleSize uint, swap func(i, j uint)) {
   969  	err := flowrand.Samples(totalSize, sampleSize, swap)
   970  	if err != nil {
   971  		c.logAndThrowError(fmt.Errorf("failed to get random sample of %s control messages: %w", ctrlMsg, err))
   972  	}
   973  }
   974  
   975  // validateTopic ensures the topic is a valid flow topic/channel.
   976  // Expected error returns during normal operations:
   977  //   - channels.InvalidTopicErr: if topic is invalid.
   978  //   - ErrActiveClusterIdsNotSet: if the cluster ID provider is not set.
   979  //   - channels.UnknownClusterIDErr: if the topic contains a cluster ID prefix that is not in the active cluster IDs list.
   980  //
   981  // This func returns an exception in case of unexpected bug or state corruption if cluster prefixed topic validation
   982  // fails due to unexpected error returned when getting the active cluster IDS.
   983  func (c *ControlMsgValidationInspector) validateTopic(from peer.ID, topic channels.Topic, activeClusterIds flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
   984  	channel, ok := channels.ChannelFromTopic(topic)
   985  	if !ok {
   986  		return channels.NewInvalidTopicErr(topic, fmt.Errorf("failed to get channel from topic")), p2p.CtrlMsgNonClusterTopicType
   987  	}
   988  	// handle cluster prefixed topics
   989  	if channels.IsClusterChannel(channel) {
   990  		return c.validateClusterPrefixedTopic(from, topic, activeClusterIds), p2p.CtrlMsgTopicTypeClusterPrefixed
   991  	}
   992  
   993  	// non cluster prefixed topic validation
   994  	err := channels.IsValidNonClusterFlowTopic(topic, c.sporkID)
   995  	if err != nil {
   996  		return err, p2p.CtrlMsgNonClusterTopicType
   997  	}
   998  	return nil, p2p.CtrlMsgNonClusterTopicType
   999  }
  1000  
  1001  // validateClusterPrefixedTopic validates cluster prefixed topics.
  1002  // Expected error returns during normal operations:
  1003  //   - ErrActiveClusterIdsNotSet: if the cluster ID provider is not set.
  1004  //   - channels.InvalidTopicErr: if topic is invalid.
  1005  //   - channels.UnknownClusterIDErr: if the topic contains a cluster ID prefix that is not in the active cluster IDs list.
  1006  //
  1007  // In the case where an ErrActiveClusterIdsNotSet or UnknownClusterIDErr is encountered and the cluster prefixed topic received
  1008  // tracker for the peer is less than or equal to the configured HardThreshold an error will only be logged and not returned.
  1009  // At the point where the hard threshold is crossed the error will be returned and the sender will start to be penalized.
  1010  // Any errors encountered while incrementing or loading the cluster prefixed control message gauge for a peer will result in an irrecoverable error being thrown, these
  1011  // errors are unexpected and irrecoverable indicating a bug.
  1012  func (c *ControlMsgValidationInspector) validateClusterPrefixedTopic(from peer.ID, topic channels.Topic, activeClusterIds flow.ChainIDList) error {
  1013  	lg := c.logger.With().
  1014  		Str("from", p2plogging.PeerId(from)).
  1015  		Logger()
  1016  
  1017  	if len(activeClusterIds) == 0 {
  1018  		// cluster IDs have not been updated yet
  1019  		_, incErr := c.tracker.Inc(from)
  1020  		if incErr != nil {
  1021  			// irrecoverable error encountered
  1022  			c.logAndThrowError(fmt.Errorf("error encountered while incrementing the cluster prefixed control message gauge %s: %w", from, incErr))
  1023  		}
  1024  
  1025  		// if the amount of messages received is below our hard threshold log the error and return nil.
  1026  		if ok := c.checkClusterPrefixHardThreshold(from); ok {
  1027  			lg.Warn().
  1028  				Str("topic", topic.String()).
  1029  				Msg("failed to validate cluster prefixed control message with cluster pre-fixed topic active cluster ids not set")
  1030  			return nil
  1031  		}
  1032  
  1033  		return NewActiveClusterIdsNotSetErr(topic)
  1034  	}
  1035  
  1036  	err := channels.IsValidFlowClusterTopic(topic, activeClusterIds)
  1037  	if err != nil {
  1038  		if channels.IsUnknownClusterIDErr(err) {
  1039  			// unknown cluster ID error could indicate that a node has fallen
  1040  			// behind and needs to catchup increment to topics received cache.
  1041  			_, incErr := c.tracker.Inc(from)
  1042  			if incErr != nil {
  1043  				c.logAndThrowError(fmt.Errorf("error encountered while incrementing the cluster prefixed control message gauge %s: %w", from, err))
  1044  			}
  1045  			// if the amount of messages received is below our hard threshold log the error and return nil.
  1046  			if c.checkClusterPrefixHardThreshold(from) {
  1047  				lg.Warn().
  1048  					Err(err).
  1049  					Str("topic", topic.String()).
  1050  					Msg("processing unknown cluster prefixed topic received below cluster prefixed discard threshold peer may be behind in the protocol")
  1051  				return nil
  1052  			}
  1053  		}
  1054  		return err
  1055  	}
  1056  
  1057  	return nil
  1058  }
  1059  
  1060  // checkClusterPrefixHardThreshold returns true if the cluster prefix received tracker count is less than
  1061  // the configured HardThreshold, false otherwise.
  1062  // If any error is encountered while loading from the tracker this func will throw an error on the signaler context, these errors
  1063  // are unexpected and irrecoverable indicating a bug.
  1064  func (c *ControlMsgValidationInspector) checkClusterPrefixHardThreshold(pid peer.ID) bool {
  1065  	gauge, err := c.tracker.Load(pid)
  1066  	if err != nil {
  1067  		// irrecoverable error encountered
  1068  		c.logAndThrowError(fmt.Errorf("cluster prefixed control message gauge during hard threshold check failed for peer %s: %w", pid, err))
  1069  	}
  1070  	return gauge <= c.config.ClusterPrefixedMessage.HardThreshold
  1071  }
  1072  
  1073  // logAndDistributeErr logs the provided error and attempts to disseminate an invalid control message validation notification for the error.
  1074  // Args:
  1075  //   - req: inspect rpc request that failed validation.
  1076  //   - ctlMsgType: the control message type of the rpc message that caused the error.
  1077  //   - err: the error that occurred.
  1078  //   - count: the number of occurrences of the error.
  1079  //   - isClusterPrefixed: indicates if the errors occurred on a cluster prefixed topic.
  1080  func (c *ControlMsgValidationInspector) logAndDistributeAsyncInspectErrs(req *InspectRPCRequest, ctlMsgType p2pmsg.ControlMessageType, err error, count uint64, topicType p2p.CtrlMsgTopicType) {
  1081  	lg := c.logger.With().
  1082  		Err(err).
  1083  		Str("control_message_type", ctlMsgType.String()).
  1084  		Bool(logging.KeySuspicious, true).
  1085  		Bool(logging.KeyNetworkingSecurity, true).
  1086  		Str("topic_type", topicType.String()).
  1087  		Uint64("error_count", count).
  1088  		Str("peer_id", p2plogging.PeerId(req.Peer)).
  1089  		Logger()
  1090  
  1091  	switch {
  1092  	case IsErrActiveClusterIDsNotSet(err):
  1093  		c.metrics.OnActiveClusterIDsNotSetErr()
  1094  		lg.Warn().Msg("active cluster ids not set")
  1095  	case IsErrUnstakedPeer(err):
  1096  		c.metrics.OnUnstakedPeerInspectionFailed()
  1097  		lg.Warn().Msg("control message received from unstaked peer")
  1098  	default:
  1099  		c.notificationConsumer.OnInvalidControlMessageNotification(p2p.NewInvalidControlMessageNotification(req.Peer, ctlMsgType, err, count, topicType))
  1100  		lg.Error().Msg("rpc control message async inspection failed, notification sent")
  1101  		c.metrics.OnInvalidControlMessageNotificationSent()
  1102  	}
  1103  }
  1104  
  1105  // logAndThrowError logs and throws irrecoverable errors on the context.
  1106  // Args:
  1107  //
  1108  //	err: the error encountered.
  1109  func (c *ControlMsgValidationInspector) logAndThrowError(err error) {
  1110  	c.logger.Error().
  1111  		Err(err).
  1112  		Bool(logging.KeySuspicious, true).
  1113  		Bool(logging.KeyNetworkingSecurity, true).
  1114  		Msg("unexpected irrecoverable error encountered")
  1115  	c.ctx.Throw(err)
  1116  }