github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/network/p2p/inspector/validation/control_message_validation_inspector.go (about) 1 package validation 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/go-playground/validator/v10" 8 "github.com/hashicorp/go-multierror" 9 pubsub "github.com/libp2p/go-libp2p-pubsub" 10 pubsub_pb "github.com/libp2p/go-libp2p-pubsub/pb" 11 "github.com/libp2p/go-libp2p/core/peer" 12 "github.com/rs/zerolog" 13 14 "github.com/onflow/flow-go/engine/common/worker" 15 "github.com/onflow/flow-go/model/flow" 16 "github.com/onflow/flow-go/module" 17 "github.com/onflow/flow-go/module/component" 18 "github.com/onflow/flow-go/module/irrecoverable" 19 "github.com/onflow/flow-go/module/mempool/queue" 20 "github.com/onflow/flow-go/module/metrics" 21 "github.com/onflow/flow-go/network" 22 "github.com/onflow/flow-go/network/channels" 23 "github.com/onflow/flow-go/network/p2p" 24 p2pconfig "github.com/onflow/flow-go/network/p2p/config" 25 "github.com/onflow/flow-go/network/p2p/inspector/internal/cache" 26 p2plogging "github.com/onflow/flow-go/network/p2p/logging" 27 p2pmsg "github.com/onflow/flow-go/network/p2p/message" 28 "github.com/onflow/flow-go/state/protocol" 29 "github.com/onflow/flow-go/state/protocol/events" 30 "github.com/onflow/flow-go/utils/logging" 31 flowrand "github.com/onflow/flow-go/utils/rand" 32 ) 33 34 const ( 35 RPCInspectionDisabledWarning = "rpc inspection disabled for all control message types, skipping inspection" 36 GraftInspectionDisabledWarning = "rpc graft inspection disabled skipping" 37 PruneInspectionDisabledWarning = "rpc prune inspection disabled skipping" 38 IWantInspectionDisabledWarning = "rpc iwant inspection disabled skipping" 39 IHaveInspectionDisabledWarning = "rpc ihave inspection disabled skipping" 40 PublishInspectionDisabledWarning = "rpc publish message inspection disabled skipping" 41 42 RPCTruncationDisabledWarning = "rpc truncation disabled for all control message types, skipping truncation" 43 GraftTruncationDisabledWarning = "rpc graft truncation disabled skipping" 44 PruneTruncationDisabledWarning = "rpc prune truncation disabled skipping" 45 IHaveTruncationDisabledWarning = "rpc ihave truncation disabled skipping" 46 IHaveMessageIDTruncationDisabledWarning = "ihave message ids truncation disabled skipping" 47 IWantTruncationDisabledWarning = "rpc iwant truncation disabled skipping" 48 IWantMessageIDTruncationDisabledWarning = "iwant message ids truncation disabled skipping" 49 50 // rpcInspectorComponentName the rpc inspector component name. 51 rpcInspectorComponentName = "gossipsub_rpc_validation_inspector" 52 ) 53 54 // ControlMsgValidationInspector RPC message inspector that inspects control messages and performs some validation on them, 55 // when some validation rule is broken feedback is given via the Peer scoring notifier. 56 type ControlMsgValidationInspector struct { 57 component.Component 58 events.Noop 59 ctx irrecoverable.SignalerContext 60 logger zerolog.Logger 61 sporkID flow.Identifier 62 metrics module.GossipSubRpcValidationInspectorMetrics 63 // config control message validation configurations. 64 config *p2pconfig.RpcValidationInspector 65 // workerPool queue that stores *InspectRPCRequest that will be processed by component workers. 66 workerPool *worker.Pool[*InspectRPCRequest] 67 // tracker is a map that associates the hash of a peer's ID with the 68 // number of cluster-prefix topic control messages received from that peer. 
It helps in tracking 69 // and managing the rate of incoming control messages from each peer, ensuring that the system 70 // stays performant and resilient against potential spam or abuse. 71 // The counter is incremented in the following scenarios: 72 // 1. The cluster prefix topic is received while the inspector waits for the cluster IDs provider to be set (this can happen during the startup or epoch transitions). 73 // 2. The node sends a cluster prefix topic where the cluster prefix does not match any of the active cluster IDs. 74 // In such cases, the inspector will allow a configured number of these messages from the corresponding peer. 75 tracker *cache.ClusterPrefixedMessagesReceivedTracker 76 idProvider module.IdentityProvider 77 rpcTracker p2p.RpcControlTracking 78 // networkingType indicates public or private network, rpc publish messages are inspected for unstaked senders when running the private network. 79 networkingType network.NetworkingType 80 // topicOracle callback used to retrieve the current subscribed topics of the libp2p node. 81 topicOracle func() p2p.TopicProvider 82 // notificationConsumer the consumer that will be notified when a misbehavior is detected upon inspection of an RPC. 83 // For each RPC, at most one notification is sent to the consumer. 84 // Each notification acts as a penalty to the peer's score. 85 notificationConsumer p2p.GossipSubInvCtrlMsgNotifConsumer 86 } 87 88 type InspectorParams struct { 89 // Logger the logger used by the inspector. 90 Logger zerolog.Logger `validate:"required"` 91 // SporkID the current spork ID. 92 SporkID flow.Identifier `validate:"required"` 93 // Config inspector configuration. 94 Config *p2pconfig.RpcValidationInspector `validate:"required"` 95 // HeroCacheMetricsFactory the metrics factory. 96 HeroCacheMetricsFactory metrics.HeroCacheMetricsFactory `validate:"required"` 97 // IdProvider identity provider is used to get the flow identifier for a peer. 98 IdProvider module.IdentityProvider `validate:"required"` 99 // InspectorMetrics metrics for the validation inspector. 100 InspectorMetrics module.GossipSubRpcValidationInspectorMetrics `validate:"required"` 101 // RpcTracker tracker used to track iHave RPC's sent and last size. 102 RpcTracker p2p.RpcControlTracking `validate:"required"` 103 // NetworkingType the networking type of the node. 104 NetworkingType network.NetworkingType `validate:"required"` 105 // TopicOracle callback used to retrieve the current subscribed topics of the libp2p node. 106 // It is set as a callback to avoid circular dependencies between the topic oracle and the inspector. 107 TopicOracle func() p2p.TopicProvider `validate:"required"` 108 109 // InvalidControlMessageNotificationConsumer the consumer that will be notified when a misbehavior is detected upon inspection of an RPC. 110 // For each RPC, at most one notification is sent to the consumer. 111 // Each notification acts as a penalty to the peer's score. 112 InvalidControlMessageNotificationConsumer p2p.GossipSubInvCtrlMsgNotifConsumer `validate:"required"` 113 } 114 115 var _ component.Component = (*ControlMsgValidationInspector)(nil) 116 var _ p2p.GossipSubRPCInspector = (*ControlMsgValidationInspector)(nil) 117 var _ protocol.Consumer = (*ControlMsgValidationInspector)(nil) 118 119 // NewControlMsgValidationInspector returns new ControlMsgValidationInspector 120 // Args: 121 // - *InspectorParams: params used to create the inspector. 
122 // 123 // Returns: 124 // - *ControlMsgValidationInspector: a new control message validation inspector. 125 // - error: an error if there is any error while creating the inspector. All errors are irrecoverable and unexpected. 126 func NewControlMsgValidationInspector(params *InspectorParams) (*ControlMsgValidationInspector, error) { 127 err := validator.New().Struct(params) 128 if err != nil { 129 return nil, fmt.Errorf("inspector params validation failed: %w", err) 130 } 131 lg := params.Logger.With().Str("component", "gossip_sub_rpc_validation_inspector").Logger() 132 133 inspectMsgQueueCacheCollector := metrics.GossipSubRPCInspectorQueueMetricFactory(params.HeroCacheMetricsFactory, params.NetworkingType) 134 clusterPrefixedCacheCollector := metrics.GossipSubRPCInspectorClusterPrefixedCacheMetricFactory(params.HeroCacheMetricsFactory, params.NetworkingType) 135 136 clusterPrefixedTracker, err := cache.NewClusterPrefixedMessagesReceivedTracker(params.Logger, 137 params.Config.ClusterPrefixedMessage.ControlMsgsReceivedCacheSize, 138 clusterPrefixedCacheCollector, 139 params.Config.ClusterPrefixedMessage.ControlMsgsReceivedCacheDecay) 140 if err != nil { 141 return nil, fmt.Errorf("failed to create cluster prefix topics received tracker: %w", err) 142 } 143 144 if params.Config.PublishMessages.MaxSampleSize < params.Config.PublishMessages.ErrorThreshold { 145 return nil, fmt.Errorf("rpc message max sample size must be greater than or equal to rpc message error threshold, got %d and %d respectively", 146 params.Config.PublishMessages.MaxSampleSize, 147 params.Config.PublishMessages.ErrorThreshold) 148 } 149 150 c := &ControlMsgValidationInspector{ 151 logger: lg, 152 sporkID: params.SporkID, 153 config: params.Config, 154 tracker: clusterPrefixedTracker, 155 rpcTracker: params.RpcTracker, 156 idProvider: params.IdProvider, 157 metrics: params.InspectorMetrics, 158 networkingType: params.NetworkingType, 159 topicOracle: params.TopicOracle, 160 notificationConsumer: params.InvalidControlMessageNotificationConsumer, 161 } 162 163 store := queue.NewHeroStore(params.Config.InspectionQueue.Size, params.Logger, inspectMsgQueueCacheCollector) 164 165 pool := worker.NewWorkerPoolBuilder[*InspectRPCRequest](lg, store, c.processInspectRPCReq).Build() 166 167 c.workerPool = pool 168 169 builder := component.NewComponentManagerBuilder() 170 for i := 0; i < c.config.InspectionQueue.NumberOfWorkers; i++ { 171 builder.AddWorker(pool.WorkerLogic()) 172 } 173 c.Component = builder.Build() 174 return c, nil 175 } 176 177 func (c *ControlMsgValidationInspector) Start(parent irrecoverable.SignalerContext) { 178 if c.topicOracle == nil { 179 parent.Throw(fmt.Errorf("control message validation inspector topic oracle not set")) 180 } 181 c.Component.Start(parent) 182 } 183 184 // Name returns the name of the rpc inspector. 185 func (c *ControlMsgValidationInspector) Name() string { 186 return rpcInspectorComponentName 187 } 188 189 // ActiveClustersChanged consumes cluster ID update protocol events. 190 func (c *ControlMsgValidationInspector) ActiveClustersChanged(clusterIDList flow.ChainIDList) { 191 c.tracker.StoreActiveClusterIds(clusterIDList) 192 } 193 194 // Inspect is called by gossipsub upon reception of an RPC from a remote node. 195 // It creates a new InspectRPCRequest for the RPC to be inspected asynchronously by the worker pool. 196 // Args: 197 // - from: the sender. 198 // - rpc: the control message RPC.
199 // 200 // Returns: 201 // - error: if a new inspect rpc request cannot be created, all errors returned are considered irrecoverable. 202 func (c *ControlMsgValidationInspector) Inspect(from peer.ID, rpc *pubsub.RPC) error { 203 if c.config.InspectionProcess.Inspect.Disabled { 204 c.logger. 205 Trace(). 206 Str("peer_id", p2plogging.PeerId(from)). 207 Bool(logging.KeyNetworkingSecurity, true). 208 Msg(RPCInspectionDisabledWarning) 209 return nil 210 } 211 212 // check the peer identity when running a private network 213 // sanity check: rejection of unstaked peers should be disabled on public networks 214 if c.networkingType == network.PrivateNetwork && c.config.InspectionProcess.Inspect.RejectUnstakedPeers { 215 _, err := c.checkSenderIdentity(from) 216 if err != nil { 217 c.notificationConsumer.OnInvalidControlMessageNotification(p2p.NewInvalidControlMessageNotification(from, p2pmsg.CtrlMsgRPC, err, 1, p2p.CtrlMsgNonClusterTopicType)) 218 c.logger. 219 Error(). 220 Err(err). 221 Str("peer_id", p2plogging.PeerId(from)). 222 Bool(logging.KeyNetworkingSecurity, true). 223 Msg("rpc received from unstaked peer") 224 c.metrics.OnInvalidControlMessageNotificationSent() 225 c.metrics.OnRpcRejectedFromUnknownSender() 226 return err 227 } 228 } 229 230 // first, truncate the rpc to the configured max sample sizes, if needed 231 c.truncateRPC(from, rpc) 232 233 // second, queue further async inspection 234 req, err := NewInspectRPCRequest(from, rpc) 235 if err != nil { 236 c.logger.Error(). 237 Err(err). 238 Bool(logging.KeyNetworkingSecurity, true). 239 Str("peer_id", p2plogging.PeerId(from)). 240 Msg("failed to get inspect RPC request") 241 return fmt.Errorf("failed to get inspect RPC request: %w", err) 242 } 243 c.workerPool.Submit(req) 244 return nil 245 } 246 247 // updateMetrics updates the metrics for the received RPC. 248 // Args: 249 // - from: the sender. 250 // 251 // - rpc: the control message RPC. 252 func (c *ControlMsgValidationInspector) updateMetrics(from peer.ID, rpc *pubsub.RPC) { 253 includedMessages := len(rpc.GetPublish()) 254 iHaveCount, iWantCount, graftCount, pruneCount := 0, 0, 0, 0 255 ctl := rpc.GetControl() 256 if ctl != nil { 257 iHaveCount = len(ctl.GetIhave()) 258 iWantCount = len(ctl.GetIwant()) 259 graftCount = len(ctl.GetGraft()) 260 pruneCount = len(ctl.GetPrune()) 261 } 262 c.metrics.OnIncomingRpcReceived(iHaveCount, iWantCount, graftCount, pruneCount, includedMessages) 263 if c.logger.GetLevel() > zerolog.TraceLevel { 264 return // skip logging if trace level is not enabled 265 } 266 c.logger.Trace(). 267 Str("peer_id", p2plogging.PeerId(from)). 268 Int("iHaveCount", iHaveCount). 269 Int("iWantCount", iWantCount). 270 Int("graftCount", graftCount). 271 Int("pruneCount", pruneCount). 272 Int("included_message_count", includedMessages). 273 Msg("received rpc with control messages") 274 } 275 276 // processInspectRPCReq is used by component workers to perform further inspection of RPC control messages, validating that all control message 277 // types in the RPC are valid. 278 // Args: 279 // - req: the inspect rpc request. 280 // 281 // Returns: 282 // - error: no error is expected to be returned from this func; validation errors are logged and distributed via invalid control message notifications.
283 func (c *ControlMsgValidationInspector) processInspectRPCReq(req *InspectRPCRequest) error { 284 c.updateMetrics(req.Peer, req.rpc) 285 c.metrics.AsyncProcessingStarted() 286 start := time.Now() 287 defer func() { 288 c.metrics.AsyncProcessingFinished(time.Since(start)) 289 }() 290 291 activeClusterIDS := c.tracker.GetActiveClusterIds() 292 for _, ctrlMsgType := range p2pmsg.ControlMessageTypes() { 293 switch ctrlMsgType { 294 case p2pmsg.CtrlMsgGraft: 295 err, topicType := c.inspectGraftMessages(req.Peer, req.rpc.GetControl().GetGraft(), activeClusterIDS) 296 if err != nil { 297 c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgGraft, err, 1, topicType) 298 return nil 299 } 300 case p2pmsg.CtrlMsgPrune: 301 err, topicType := c.inspectPruneMessages(req.Peer, req.rpc.GetControl().GetPrune(), activeClusterIDS) 302 if err != nil { 303 c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgPrune, err, 1, topicType) 304 return nil 305 } 306 case p2pmsg.CtrlMsgIWant: 307 err := c.inspectIWantMessages(req.Peer, req.rpc.GetControl().GetIwant()) 308 if err != nil { 309 c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgIWant, err, 1, p2p.CtrlMsgNonClusterTopicType) 310 return nil 311 } 312 case p2pmsg.CtrlMsgIHave: 313 err, topicType := c.inspectIHaveMessages(req.Peer, req.rpc.GetControl().GetIhave(), activeClusterIDS) 314 if err != nil { 315 c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgIHave, err, 1, topicType) 316 return nil 317 } 318 } 319 } 320 321 // inspect rpc publish messages after all control message validation has passed 322 err, errCount := c.inspectRpcPublishMessages(req.Peer, req.rpc.GetPublish(), activeClusterIDS) 323 if err != nil { 324 c.logAndDistributeAsyncInspectErrs(req, p2pmsg.RpcPublishMessage, err, errCount, p2p.CtrlMsgNonClusterTopicType) 325 return nil 326 } 327 328 return nil 329 } 330 331 // checkSenderIdentity checks the identity of the peer with pid and ensures it is neither unstaked nor ejected. 332 // This check is only required on private networks. 333 // Args: 334 // - pid: the peer ID. 335 // 336 // Returns: 337 // - error: sender is unknown or the identity is ejected. 338 // 339 // All errors returned from this function can be considered benign. 340 func (c *ControlMsgValidationInspector) checkSenderIdentity(pid peer.ID) (*flow.Identity, error) { 341 id, ok := c.idProvider.ByPeerID(pid) 342 if !ok { 343 return nil, NewUnstakedPeerErr(pid) 344 } 345 346 if id.IsEjected() { 347 return nil, NewEjectedPeerErr(pid) 348 } 349 350 return id, nil 351 } 352 353 // inspectGraftMessages performs topic validation on all grafts in the control message using the provided validateTopic func while tracking duplicates. 354 // Args: 355 // - from: peer ID of the sender. 356 // - grafts: the list of grafts to inspect. 357 // - activeClusterIDS: the list of active cluster ids. 358 // Returns: 359 // - DuplicateTopicIDThresholdExceeded: if the number of duplicate topic ids in the list of grafts exceeds the configured threshold. 360 // - error: if any error occurs while sampling or validating topics, all returned errors are benign and should not cause the node to crash. 361 // - p2p.CtrlMsgTopicType: the type of the topic that failed validation (cluster prefixed or not); CtrlMsgNonClusterTopicType when no error is returned. 362 func (c *ControlMsgValidationInspector) inspectGraftMessages(from peer.ID, grafts []*pubsub_pb.ControlGraft, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) { 363 if !c.config.InspectionProcess.Inspect.EnableGraft { 364 c.logger. 365 Trace(). 366 Str("peer_id", p2plogging.PeerId(from)). 
367 Bool(logging.KeyNetworkingSecurity, true). 368 Msg(GraftInspectionDisabledWarning) 369 return nil, p2p.CtrlMsgNonClusterTopicType 370 } 371 372 duplicateTopicTracker := make(duplicateStrTracker) 373 totalDuplicateTopicIds := 0 374 totalInvalidTopicIdErrs := 0 375 defer func() { 376 // regardless of inspection result, update metrics 377 c.metrics.OnGraftMessageInspected(totalDuplicateTopicIds, totalInvalidTopicIdErrs) 378 }() 379 380 for _, graft := range grafts { 381 topic := channels.Topic(graft.GetTopicID()) 382 if duplicateTopicTracker.track(topic.String()) > 1 { 383 // ideally, a GRAFT message should not have any duplicate topics, hence a topic ID is counted as a duplicate only if it is repeated more than once. 384 totalDuplicateTopicIds++ 385 // check if the total number of duplicates exceeds the configured threshold. 386 if totalDuplicateTopicIds > c.config.GraftPrune.DuplicateTopicIdThreshold { 387 c.metrics.OnGraftDuplicateTopicIdsExceedThreshold() 388 return NewDuplicateTopicIDThresholdExceeded(totalDuplicateTopicIds, len(grafts), c.config.GraftPrune.DuplicateTopicIdThreshold), p2p.CtrlMsgNonClusterTopicType 389 } 390 } 391 err, ctrlMsgType := c.validateTopic(from, topic, activeClusterIDS) 392 if err != nil { 393 totalInvalidTopicIdErrs++ 394 c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgGraft) 395 if totalInvalidTopicIdErrs > c.config.GraftPrune.InvalidTopicIdThreshold { 396 return NewInvalidTopicIDThresholdExceeded(totalInvalidTopicIdErrs, c.config.GraftPrune.InvalidTopicIdThreshold), ctrlMsgType 397 } 398 } 399 } 400 return nil, p2p.CtrlMsgNonClusterTopicType 401 } 402 403 // inspectPruneMessages performs topic validation on all prunes in the control message using the provided validateTopic func while tracking duplicates. 404 // Args: 405 // - from: peer ID of the sender. 406 // - prunes: the list of prunes to inspect. 407 // - activeClusterIDS: the list of active cluster ids. 408 // Returns: 409 // - DuplicateTopicIDThresholdExceeded: if the number of duplicate topic ids in the list of prunes 410 // exceeds the configured threshold. 411 // - error: if any error occurs while sampling or validating topics, all returned errors are benign and should not cause the node to crash. 412 // - p2p.CtrlMsgTopicType: the type of the topic that failed validation (cluster prefixed or not); CtrlMsgNonClusterTopicType when no error is returned. 413 func (c *ControlMsgValidationInspector) inspectPruneMessages(from peer.ID, prunes []*pubsub_pb.ControlPrune, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) { 414 if !c.config.InspectionProcess.Inspect.EnablePrune { 415 c.logger. 416 Trace(). 417 Str("peer_id", p2plogging.PeerId(from)). 418 Bool(logging.KeyNetworkingSecurity, true). 419 Msg(PruneInspectionDisabledWarning) 420 return nil, p2p.CtrlMsgNonClusterTopicType 421 } 422 tracker := make(duplicateStrTracker) 423 totalDuplicateTopicIds := 0 424 totalInvalidTopicIdErrs := 0 425 defer func() { 426 // regardless of inspection result, update metrics 427 c.metrics.OnPruneMessageInspected(totalDuplicateTopicIds, totalInvalidTopicIdErrs) 428 }() 429 for _, prune := range prunes { 430 topic := channels.Topic(prune.GetTopicID()) 431 if tracker.track(topic.String()) > 1 { 432 // ideally, a PRUNE message should not have any duplicate topics, hence a topic ID is counted as a duplicate only if it is repeated more than once. 433 totalDuplicateTopicIds++ 434 // check if the total number of duplicates exceeds the configured threshold. 
435 if totalDuplicateTopicIds > c.config.GraftPrune.DuplicateTopicIdThreshold { 436 c.metrics.OnPruneDuplicateTopicIdsExceedThreshold() 437 return NewDuplicateTopicIDThresholdExceeded(totalDuplicateTopicIds, len(prunes), c.config.GraftPrune.DuplicateTopicIdThreshold), p2p.CtrlMsgNonClusterTopicType 438 } 439 } 440 err, ctrlMsgType := c.validateTopic(from, topic, activeClusterIDS) 441 if err != nil { 442 totalInvalidTopicIdErrs++ 443 c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgPrune) 444 if totalInvalidTopicIdErrs > c.config.GraftPrune.InvalidTopicIdThreshold { 445 return NewInvalidTopicIDThresholdExceeded(totalInvalidTopicIdErrs, c.config.GraftPrune.InvalidTopicIdThreshold), ctrlMsgType 446 } 447 } 448 } 449 return nil, p2p.CtrlMsgNonClusterTopicType 450 } 451 452 // inspectIHaveMessages performs topic validation on all ihaves in the control message using the provided validateTopic func while tracking duplicates. 453 // Args: 454 // - from: peer ID of the sender. 455 // - iHaves: the list of iHaves to inspect. 456 // - activeClusterIDS: the list of active cluster ids. 457 // Returns: 458 // - DuplicateTopicErr: if there are any duplicate topics found in the list of iHaves 459 // or any duplicate message ids found inside a single iHave. 460 // - error: if any error occurs while sampling or validating topics, all returned errors are benign and should not cause the node to crash. 461 // - bool: true if an error is returned and the topic that failed validation was a cluster prefixed topic, false otherwise. 462 func (c *ControlMsgValidationInspector) inspectIHaveMessages(from peer.ID, ihaves []*pubsub_pb.ControlIHave, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) { 463 if !c.config.InspectionProcess.Inspect.EnableIHave { 464 c.logger. 465 Trace(). 466 Str("peer_id", p2plogging.PeerId(from)). 467 Bool(logging.KeyNetworkingSecurity, true). 468 Msg(IHaveInspectionDisabledWarning) 469 return nil, p2p.CtrlMsgNonClusterTopicType 470 } 471 472 if len(ihaves) == 0 { 473 return nil, p2p.CtrlMsgNonClusterTopicType 474 } 475 lg := c.logger.With(). 476 Str("peer_id", p2plogging.PeerId(from)). 477 Int("sample_size", len(ihaves)). 478 Int("max_sample_size", c.config.IHave.MessageCountThreshold). 479 Logger() 480 duplicateTopicTracker := make(duplicateStrTracker) 481 duplicateMessageIDTracker := make(duplicateStrTracker) 482 totalMessageIds := 0 483 totalDuplicateTopicIds := 0 484 totalDuplicateMessageIds := 0 485 totalInvalidTopicIdErrs := 0 486 defer func() { 487 // regardless of inspection result, update metrics 488 c.metrics.OnIHaveMessagesInspected(totalDuplicateTopicIds, totalDuplicateMessageIds, totalInvalidTopicIdErrs) 489 }() 490 for _, ihave := range ihaves { 491 messageIds := ihave.GetMessageIDs() 492 topic := ihave.GetTopicID() 493 totalMessageIds += len(messageIds) 494 495 // first check if the topic is valid, fail fast if it is not 496 err, ctrlMsgType := c.validateTopic(from, channels.Topic(topic), activeClusterIDS) 497 if err != nil { 498 totalInvalidTopicIdErrs++ 499 c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgIHave) 500 if totalInvalidTopicIdErrs > c.config.IHave.InvalidTopicIdThreshold { 501 return NewInvalidTopicIDThresholdExceeded(totalInvalidTopicIdErrs, c.config.IHave.InvalidTopicIdThreshold), ctrlMsgType 502 } 503 } 504 505 // then track the topic ensuring it is not beyond a duplicate threshold. 
506 if duplicateTopicTracker.track(topic) > 1 { 507 totalDuplicateTopicIds++ 508 // the topic is duplicated, check if the total number of duplicates exceeds the configured threshold 509 if totalDuplicateTopicIds > c.config.IHave.DuplicateTopicIdThreshold { 510 c.metrics.OnIHaveDuplicateTopicIdsExceedThreshold() 511 return NewDuplicateTopicIDThresholdExceeded(totalDuplicateTopicIds, len(ihaves), c.config.IHave.DuplicateTopicIdThreshold), p2p.CtrlMsgNonClusterTopicType 512 } 513 } 514 515 for _, messageID := range messageIds { 516 if duplicateMessageIDTracker.track(messageID) > 1 { 517 totalDuplicateMessageIds++ 518 // the message is duplicated, check if the total number of duplicates exceeds the configured threshold 519 if totalDuplicateMessageIds > c.config.IHave.DuplicateMessageIdThreshold { 520 c.metrics.OnIHaveDuplicateMessageIdsExceedThreshold() 521 return NewDuplicateMessageIDErr(messageID, totalDuplicateMessageIds, p2pmsg.CtrlMsgIHave), p2p.CtrlMsgNonClusterTopicType 522 } 523 } 524 } 525 } 526 lg.Debug(). 527 Int("total_message_ids", totalMessageIds). 528 Int("total_duplicate_topic_ids", totalDuplicateTopicIds). 529 Int("total_duplicate_message_ids", totalDuplicateMessageIds). 530 Msg("ihave control message validation complete") 531 return nil, p2p.CtrlMsgNonClusterTopicType 532 } 533 534 // inspectIWantMessages inspects RPC iWant control messages. This func will sample the iWants and perform validation on each iWant in the sample. 535 // Ensuring that the following are true: 536 // - Each iWant corresponds to an iHave that was sent. 537 // - Each topic in the iWant sample is a valid topic. 538 // If the number of iWants that do not have a corresponding iHave exceed the configured threshold an error is returned. 539 // Args: 540 // - from: peer ID of the sender. 541 // - iWant: the list of iWant control messages. 542 // Returns: 543 // - DuplicateTopicErr: if there are any duplicate message ids found in any of the iWants. 544 // - IWantCacheMissThresholdErr: if the rate of cache misses exceeds the configured allowed threshold. 545 func (c *ControlMsgValidationInspector) inspectIWantMessages(from peer.ID, iWants []*pubsub_pb.ControlIWant) error { 546 if !c.config.InspectionProcess.Inspect.EnableIWant { 547 c.logger. 548 Trace(). 549 Str("peer_id", p2plogging.PeerId(from)). 550 Bool(logging.KeyNetworkingSecurity, true). 551 Msg(IWantInspectionDisabledWarning) 552 return nil 553 } 554 555 if len(iWants) == 0 { 556 return nil 557 } 558 lastHighest := c.rpcTracker.LastHighestIHaveRPCSize() 559 lg := c.logger.With(). 560 Str("peer_id", p2plogging.PeerId(from)). 561 Uint("max_sample_size", c.config.IWant.MessageCountThreshold). 562 Int64("last_highest_ihave_rpc_size", lastHighest). 563 Logger() 564 duplicateMsgIdTracker := make(duplicateStrTracker) 565 cacheMisses := 0 566 duplicateMessageIds := 0 567 defer func() { 568 // regardless of inspection result, update metrics 569 c.metrics.OnIWantMessagesInspected(duplicateMessageIds, cacheMisses) 570 }() 571 572 lg = lg.With(). 573 Int("iwant_msg_count", len(iWants)). 574 Int("cache_misses_threshold", c.config.IWant.CacheMissThreshold). 
575 Int("duplicates_threshold", c.config.IWant.DuplicateMsgIdThreshold).Logger() 576 577 lg.Trace().Msg("validating sample of message ids from iwant control message") 578 579 totalMessageIds := 0 580 for _, iWant := range iWants { 581 messageIds := iWant.GetMessageIDs() 582 messageIDCount := uint(len(messageIds)) 583 for _, messageID := range messageIds { 584 // check duplicate allowed threshold 585 if duplicateMsgIdTracker.track(messageID) > 1 { 586 // ideally, an iWant message should not have any duplicate message IDs, hence a message id is considered duplicate when it is repeated more than once. 587 duplicateMessageIds++ 588 if duplicateMessageIds > c.config.IWant.DuplicateMsgIdThreshold { 589 c.metrics.OnIWantDuplicateMessageIdsExceedThreshold() 590 return NewIWantDuplicateMsgIDThresholdErr(duplicateMessageIds, messageIDCount, c.config.IWant.DuplicateMsgIdThreshold) 591 } 592 } 593 // check cache miss threshold 594 if !c.rpcTracker.WasIHaveRPCSent(messageID) { 595 cacheMisses++ 596 if cacheMisses > c.config.IWant.CacheMissThreshold { 597 c.metrics.OnIWantCacheMissMessageIdsExceedThreshold() 598 return NewIWantCacheMissThresholdErr(cacheMisses, messageIDCount, c.config.IWant.CacheMissThreshold) 599 } 600 } 601 duplicateMsgIdTracker.track(messageID) 602 totalMessageIds++ 603 } 604 } 605 606 lg.Debug(). 607 Int("total_message_ids", totalMessageIds). 608 Int("cache_misses", cacheMisses). 609 Int("total_duplicate_message_ids", duplicateMessageIds). 610 Msg("iwant control message validation complete") 611 612 return nil 613 } 614 615 // inspectRpcPublishMessages inspects a sample of the RPC gossip messages and performs topic validation that ensures the following: 616 // - Topics are known flow topics. 617 // - Topics are valid flow topics. 618 // - Topics are in the nodes subscribe topics list. 619 // If more than half the topics in the sample contain invalid topics an error will be returned. 620 // Args: 621 // - from: peer ID of the sender. 622 // - messages: rpc publish messages. 623 // - activeClusterIDS: the list of active cluster ids. 624 // Returns: 625 // - InvalidRpcPublishMessagesErr: if the amount of invalid messages exceeds the configured RPCMessageErrorThreshold. 626 // - int: the number of invalid pubsub messages 627 func (c *ControlMsgValidationInspector) inspectRpcPublishMessages(from peer.ID, messages []*pubsub_pb.Message, activeClusterIDS flow.ChainIDList) (error, uint64) { 628 if !c.config.InspectionProcess.Inspect.EnablePublish { 629 c.logger. 630 Trace(). 631 Str("peer_id", p2plogging.PeerId(from)). 632 Bool(logging.KeyNetworkingSecurity, true). 
633 Msg(PublishInspectionDisabledWarning) 634 return nil, 0 635 } 636 totalMessages := len(messages) 637 if totalMessages == 0 { 638 return nil, 0 639 } 640 641 sampleSize := c.config.PublishMessages.MaxSampleSize 642 if sampleSize > totalMessages { 643 sampleSize = totalMessages 644 } 645 c.performSample(p2pmsg.RpcPublishMessage, uint(totalMessages), uint(sampleSize), func(i, j uint) { 646 messages[i], messages[j] = messages[j], messages[i] 647 }) 648 649 subscribedTopics := c.topicOracle().GetTopics() 650 hasSubscription := func(topic string) bool { 651 for _, subscribedTopic := range subscribedTopics { 652 if topic == subscribedTopic { 653 return true 654 } 655 } 656 return false 657 } 658 var errs *multierror.Error 659 invalidTopicIdsCount := 0 660 invalidSubscriptionsCount := 0 661 invalidSendersCount := 0 662 defer func() { 663 // regardless of inspection result, update metrics 664 errCnt := 0 665 if errs != nil { 666 errCnt = errs.Len() 667 } 668 c.metrics.OnPublishMessageInspected(errCnt, invalidTopicIdsCount, invalidSubscriptionsCount, invalidSendersCount) 669 }() 670 671 idCheckCache := make(map[peer.ID]error) 672 for _, message := range messages[:sampleSize] { 673 topic := channels.Topic(message.GetTopic()) 674 // The boolean value returned when validating a topic, indicating whether the topic is cluster-prefixed or not, is intentionally ignored. 675 // This is because we have already set a threshold for errors allowed on publish messages. Reducing the penalty further based on 676 // cluster prefix status is unnecessary when the error threshold is exceeded. 677 err, _ := c.validateTopic(from, topic, activeClusterIDS) 678 if err != nil { 679 // we can skip checking for subscription of topic that failed validation and continue 680 invalidTopicIdsCount++ 681 errs = multierror.Append(errs, err) 682 continue 683 } 684 685 if !hasSubscription(topic.String()) { 686 invalidSubscriptionsCount++ 687 errs = multierror.Append(errs, fmt.Errorf("subscription for topic %s not found", topic)) 688 continue 689 } 690 691 if c.networkingType == network.PrivateNetwork { 692 pid, err := peer.IDFromBytes(message.GetFrom()) 693 if err != nil { 694 invalidSendersCount++ 695 errs = multierror.Append(errs, fmt.Errorf("failed to get peer ID from bytes: %w", err)) 696 continue 697 } 698 699 if idCheckErr, ok := idCheckCache[pid]; ok { 700 if idCheckErr != nil { 701 errs = multierror.Append(errs, idCheckErr) 702 continue 703 } 704 } 705 706 _, idErr := c.checkSenderIdentity(pid) 707 if idErr != nil { 708 invalidSendersCount++ 709 errs = multierror.Append(errs, idErr) 710 idCheckCache[pid] = idErr 711 continue 712 } 713 714 idCheckCache[pid] = nil 715 } 716 } 717 // return an error when we exceed the error threshold 718 if errs != nil && errs.Len() > c.config.PublishMessages.ErrorThreshold { 719 c.metrics.OnPublishMessagesInspectionErrorExceedsThreshold() 720 return NewInvalidRpcPublishMessagesErr(errs.ErrorOrNil(), errs.Len()), uint64(errs.Len()) 721 } 722 723 return nil, 0 724 } 725 726 // truncateRPC truncates the RPC by truncating each control message type using the configured max sample size values. 727 // Args: 728 // - from: peer ID of the sender. 729 // - rpc: the pubsub RPC. 730 func (c *ControlMsgValidationInspector) truncateRPC(from peer.ID, rpc *pubsub.RPC) { 731 if c.config.InspectionProcess.Truncate.Disabled { 732 c.logger. 733 Trace(). 734 Str("peer_id", p2plogging.PeerId(from)). 735 Bool(logging.KeyNetworkingSecurity, true). 
736 Msg(RPCTruncationDisabledWarning) 737 return 738 } 739 740 for _, ctlMsgType := range p2pmsg.ControlMessageTypes() { 741 switch ctlMsgType { 742 case p2pmsg.CtrlMsgGraft: 743 c.truncateGraftMessages(from, rpc) 744 case p2pmsg.CtrlMsgPrune: 745 c.truncatePruneMessages(from, rpc) 746 case p2pmsg.CtrlMsgIHave: 747 c.truncateIHaveMessages(from, rpc) 748 c.truncateIHaveMessageIds(from, rpc) 749 case p2pmsg.CtrlMsgIWant: 750 c.truncateIWantMessages(from, rpc) 751 c.truncateIWantMessageIds(from, rpc) 752 default: 753 // sanity check this should never happen 754 c.logAndThrowError(fmt.Errorf("unknown control message type encountered during RPC truncation")) 755 } 756 } 757 } 758 759 // truncateGraftMessages truncates the Graft control messages in the RPC. If the total number of Grafts in the RPC exceeds the configured 760 // GraftPruneMessageMaxSampleSize the list of Grafts will be truncated. 761 // Args: 762 // - rpc: the rpc message to truncate. 763 func (c *ControlMsgValidationInspector) truncateGraftMessages(from peer.ID, rpc *pubsub.RPC) { 764 if !c.config.InspectionProcess.Truncate.EnableGraft { 765 c.logger. 766 Trace(). 767 Str("peer_id", p2plogging.PeerId(from)). 768 Bool(logging.KeyNetworkingSecurity, true). 769 Msg(GraftTruncationDisabledWarning) 770 return 771 } 772 773 grafts := rpc.GetControl().GetGraft() 774 originalGraftSize := len(grafts) 775 if originalGraftSize <= c.config.GraftPrune.MessageCountThreshold { 776 return // nothing to truncate 777 } 778 779 // truncate grafts and update metrics 780 sampleSize := c.config.GraftPrune.MessageCountThreshold 781 c.performSample(p2pmsg.CtrlMsgGraft, uint(originalGraftSize), uint(sampleSize), func(i, j uint) { 782 grafts[i], grafts[j] = grafts[j], grafts[i] 783 }) 784 rpc.Control.Graft = grafts[:sampleSize] 785 c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgGraft, originalGraftSize-len(rpc.Control.Graft)) 786 } 787 788 // truncatePruneMessages truncates the Prune control messages in the RPC. If the total number of Prunes in the RPC exceeds the configured 789 // GraftPruneMessageMaxSampleSize the list of Prunes will be truncated. 790 // Args: 791 // - rpc: the rpc message to truncate. 792 func (c *ControlMsgValidationInspector) truncatePruneMessages(from peer.ID, rpc *pubsub.RPC) { 793 if !c.config.InspectionProcess.Truncate.EnablePrune { 794 c.logger. 795 Trace(). 796 Str("peer_id", p2plogging.PeerId(from)). 797 Bool(logging.KeyNetworkingSecurity, true). 798 Msg(PruneTruncationDisabledWarning) 799 return 800 } 801 802 prunes := rpc.GetControl().GetPrune() 803 originalPruneSize := len(prunes) 804 if originalPruneSize <= c.config.GraftPrune.MessageCountThreshold { 805 return // nothing to truncate 806 } 807 808 sampleSize := c.config.GraftPrune.MessageCountThreshold 809 c.performSample(p2pmsg.CtrlMsgPrune, uint(originalPruneSize), uint(sampleSize), func(i, j uint) { 810 prunes[i], prunes[j] = prunes[j], prunes[i] 811 }) 812 rpc.Control.Prune = prunes[:sampleSize] 813 c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgPrune, originalPruneSize-len(rpc.Control.Prune)) 814 } 815 816 // truncateIHaveMessages truncates the iHaves control messages in the RPC. If the total number of iHaves in the RPC exceeds the configured 817 // MessageCountThreshold the list of iHaves will be truncated. 818 // Args: 819 // - rpc: the rpc message to truncate. 820 func (c *ControlMsgValidationInspector) truncateIHaveMessages(from peer.ID, rpc *pubsub.RPC) { 821 if !c.config.InspectionProcess.Truncate.EnableIHave { 822 c.logger. 823 Trace(). 
824 Str("peer_id", p2plogging.PeerId(from)). 825 Bool(logging.KeyNetworkingSecurity, true). 826 Msg(IHaveTruncationDisabledWarning) 827 return 828 } 829 830 ihaves := rpc.GetControl().GetIhave() 831 originalIHaveCount := len(ihaves) 832 if originalIHaveCount == 0 { 833 return 834 } 835 836 if originalIHaveCount > c.config.IHave.MessageCountThreshold { 837 // truncate ihaves and update metrics 838 sampleSize := c.config.IHave.MessageCountThreshold 839 if sampleSize > originalIHaveCount { 840 sampleSize = originalIHaveCount 841 } 842 c.performSample(p2pmsg.CtrlMsgIHave, uint(originalIHaveCount), uint(sampleSize), func(i, j uint) { 843 ihaves[i], ihaves[j] = ihaves[j], ihaves[i] 844 }) 845 rpc.Control.Ihave = ihaves[:sampleSize] 846 c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgIHave, originalIHaveCount-len(rpc.Control.Ihave)) 847 } 848 } 849 850 // truncateIHaveMessageIds truncates the message ids for each iHave control message in the RPC. If the total number of message ids in a single iHave exceeds the configured 851 // MessageIdCountThreshold the list of message ids will be truncated. Before message ids are truncated the iHave control messages should have been truncated themselves. 852 // Args: 853 // - rpc: the rpc message to truncate. 854 func (c *ControlMsgValidationInspector) truncateIHaveMessageIds(from peer.ID, rpc *pubsub.RPC) { 855 if !c.config.InspectionProcess.Truncate.EnableIHaveMessageIds { 856 c.logger. 857 Trace(). 858 Str("peer_id", p2plogging.PeerId(from)). 859 Bool(logging.KeyNetworkingSecurity, true). 860 Msg(IHaveMessageIDTruncationDisabledWarning) 861 return 862 } 863 864 for _, ihave := range rpc.GetControl().GetIhave() { 865 messageIDs := ihave.GetMessageIDs() 866 originalMessageIdCount := len(messageIDs) 867 if originalMessageIdCount == 0 { 868 continue // nothing to truncate; skip 869 } 870 871 if originalMessageIdCount > c.config.IHave.MessageIdCountThreshold { 872 sampleSize := c.config.IHave.MessageIdCountThreshold 873 if sampleSize > originalMessageIdCount { 874 sampleSize = originalMessageIdCount 875 } 876 c.performSample(p2pmsg.CtrlMsgIHave, uint(originalMessageIdCount), uint(sampleSize), func(i, j uint) { 877 messageIDs[i], messageIDs[j] = messageIDs[j], messageIDs[i] 878 }) 879 ihave.MessageIDs = messageIDs[:sampleSize] 880 c.metrics.OnIHaveControlMessageIdsTruncated(originalMessageIdCount - len(ihave.MessageIDs)) 881 } 882 c.metrics.OnIHaveMessageIDsReceived(ihave.GetTopicID(), len(ihave.MessageIDs)) 883 } 884 } 885 886 // truncateIWantMessages truncates the iWant control messages in the RPC. If the total number of iWants in the RPC exceeds the configured 887 // MessageCountThreshold the list of iWants will be truncated. 888 // Args: 889 // - rpc: the rpc message to truncate. 890 func (c *ControlMsgValidationInspector) truncateIWantMessages(from peer.ID, rpc *pubsub.RPC) { 891 if !c.config.InspectionProcess.Truncate.EnableIWant { 892 c.logger. 893 Trace(). 894 Str("peer_id", p2plogging.PeerId(from)). 895 Bool(logging.KeyNetworkingSecurity, true). 
896 Msg(IWantTruncationDisabledWarning) 897 return 898 } 899 900 iWants := rpc.GetControl().GetIwant() 901 originalIWantCount := uint(len(iWants)) 902 if originalIWantCount == 0 { 903 return 904 } 905 906 if originalIWantCount > c.config.IWant.MessageCountThreshold { 907 // truncate iWants and update metrics 908 sampleSize := c.config.IWant.MessageCountThreshold 909 if sampleSize > originalIWantCount { 910 sampleSize = originalIWantCount 911 } 912 c.performSample(p2pmsg.CtrlMsgIWant, originalIWantCount, sampleSize, func(i, j uint) { 913 iWants[i], iWants[j] = iWants[j], iWants[i] 914 }) 915 rpc.Control.Iwant = iWants[:sampleSize] 916 c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgIWant, int(originalIWantCount)-len(rpc.Control.Iwant)) 917 } 918 } 919 920 // truncateIWantMessageIds truncates the message ids for each iWant control message in the RPC. If the total number of message ids in a single iWant exceeds the configured 921 // MessageIdCountThreshold the list of message ids will be truncated. Before message ids are truncated the iWant control messages should have been truncated themselves. 922 // Args: 923 // - rpc: the rpc message to truncate. 924 func (c *ControlMsgValidationInspector) truncateIWantMessageIds(from peer.ID, rpc *pubsub.RPC) { 925 if !c.config.InspectionProcess.Truncate.EnableIWantMessageIds { 926 c.logger. 927 Trace(). 928 Str("peer_id", p2plogging.PeerId(from)). 929 Bool(logging.KeyNetworkingSecurity, true). 930 Msg(IWantMessageIDTruncationDisabledWarning) 931 return 932 } 933 934 lastHighest := c.rpcTracker.LastHighestIHaveRPCSize() 935 lg := c.logger.With(). 936 Str("peer_id", p2plogging.PeerId(from)). 937 Uint("max_sample_size", c.config.IWant.MessageCountThreshold). 938 Int64("last_highest_ihave_rpc_size", lastHighest). 939 Logger() 940 941 sampleSize := int(10 * lastHighest) 942 if sampleSize == 0 || sampleSize > c.config.IWant.MessageIdCountThreshold { 943 // invalid or 0 sample size is suspicious 944 lg.Warn().Str(logging.KeySuspicious, "true").Msg("zero or invalid sample size, using default max sample size") 945 sampleSize = c.config.IWant.MessageIdCountThreshold 946 } 947 for _, iWant := range rpc.GetControl().GetIwant() { 948 messageIDs := iWant.GetMessageIDs() 949 totalMessageIdCount := len(messageIDs) 950 if totalMessageIdCount == 0 { 951 continue // nothing to truncate; skip 952 } 953 954 if totalMessageIdCount > sampleSize { 955 c.performSample(p2pmsg.CtrlMsgIWant, uint(totalMessageIdCount), uint(sampleSize), func(i, j uint) { 956 messageIDs[i], messageIDs[j] = messageIDs[j], messageIDs[i] 957 }) 958 iWant.MessageIDs = messageIDs[:sampleSize] 959 c.metrics.OnIWantControlMessageIdsTruncated(totalMessageIdCount - len(iWant.MessageIDs)) 960 } 961 c.metrics.OnIWantMessageIDsReceived(len(iWant.MessageIDs)) 962 } 963 } 964 965 // performSample performs sampling on the specified control message that will randomize 966 // the items in the control message slice up to index sampleSize-1. Any error encountered during sampling is considered 967 // irrecoverable and will cause the node to crash. 968 func (c *ControlMsgValidationInspector) performSample(ctrlMsg p2pmsg.ControlMessageType, totalSize, sampleSize uint, swap func(i, j uint)) { 969 err := flowrand.Samples(totalSize, sampleSize, swap) 970 if err != nil { 971 c.logAndThrowError(fmt.Errorf("failed to get random sample of %s control messages: %w", ctrlMsg, err)) 972 } 973 } 974 975 // validateTopic ensures the topic is a valid flow topic/channel. 
976 // Expected error returns during normal operations: 977 // - channels.InvalidTopicErr: if topic is invalid. 978 // - ErrActiveClusterIdsNotSet: if the cluster ID provider is not set. 979 // - channels.UnknownClusterIDErr: if the topic contains a cluster ID prefix that is not in the active cluster IDs list. 980 // 981 // This func returns an exception in case of unexpected bug or state corruption if cluster prefixed topic validation 982 // fails due to unexpected error returned when getting the active cluster IDS. 983 func (c *ControlMsgValidationInspector) validateTopic(from peer.ID, topic channels.Topic, activeClusterIds flow.ChainIDList) (error, p2p.CtrlMsgTopicType) { 984 channel, ok := channels.ChannelFromTopic(topic) 985 if !ok { 986 return channels.NewInvalidTopicErr(topic, fmt.Errorf("failed to get channel from topic")), p2p.CtrlMsgNonClusterTopicType 987 } 988 // handle cluster prefixed topics 989 if channels.IsClusterChannel(channel) { 990 return c.validateClusterPrefixedTopic(from, topic, activeClusterIds), p2p.CtrlMsgTopicTypeClusterPrefixed 991 } 992 993 // non cluster prefixed topic validation 994 err := channels.IsValidNonClusterFlowTopic(topic, c.sporkID) 995 if err != nil { 996 return err, p2p.CtrlMsgNonClusterTopicType 997 } 998 return nil, p2p.CtrlMsgNonClusterTopicType 999 } 1000 1001 // validateClusterPrefixedTopic validates cluster prefixed topics. 1002 // Expected error returns during normal operations: 1003 // - ErrActiveClusterIdsNotSet: if the cluster ID provider is not set. 1004 // - channels.InvalidTopicErr: if topic is invalid. 1005 // - channels.UnknownClusterIDErr: if the topic contains a cluster ID prefix that is not in the active cluster IDs list. 1006 // 1007 // In the case where an ErrActiveClusterIdsNotSet or UnknownClusterIDErr is encountered and the cluster prefixed topic received 1008 // tracker for the peer is less than or equal to the configured HardThreshold an error will only be logged and not returned. 1009 // At the point where the hard threshold is crossed the error will be returned and the sender will start to be penalized. 1010 // Any errors encountered while incrementing or loading the cluster prefixed control message gauge for a peer will result in an irrecoverable error being thrown, these 1011 // errors are unexpected and irrecoverable indicating a bug. 1012 func (c *ControlMsgValidationInspector) validateClusterPrefixedTopic(from peer.ID, topic channels.Topic, activeClusterIds flow.ChainIDList) error { 1013 lg := c.logger.With(). 1014 Str("from", p2plogging.PeerId(from)). 1015 Logger() 1016 1017 if len(activeClusterIds) == 0 { 1018 // cluster IDs have not been updated yet 1019 _, incErr := c.tracker.Inc(from) 1020 if incErr != nil { 1021 // irrecoverable error encountered 1022 c.logAndThrowError(fmt.Errorf("error encountered while incrementing the cluster prefixed control message gauge %s: %w", from, incErr)) 1023 } 1024 1025 // if the amount of messages received is below our hard threshold log the error and return nil. 1026 if ok := c.checkClusterPrefixHardThreshold(from); ok { 1027 lg.Warn(). 1028 Str("topic", topic.String()). 
1029 Msg("failed to validate cluster prefixed control message with cluster pre-fixed topic active cluster ids not set") 1030 return nil 1031 } 1032 1033 return NewActiveClusterIdsNotSetErr(topic) 1034 } 1035 1036 err := channels.IsValidFlowClusterTopic(topic, activeClusterIds) 1037 if err != nil { 1038 if channels.IsUnknownClusterIDErr(err) { 1039 // unknown cluster ID error could indicate that a node has fallen 1040 // behind and needs to catchup increment to topics received cache. 1041 _, incErr := c.tracker.Inc(from) 1042 if incErr != nil { 1043 c.logAndThrowError(fmt.Errorf("error encountered while incrementing the cluster prefixed control message gauge %s: %w", from, err)) 1044 } 1045 // if the amount of messages received is below our hard threshold log the error and return nil. 1046 if c.checkClusterPrefixHardThreshold(from) { 1047 lg.Warn(). 1048 Err(err). 1049 Str("topic", topic.String()). 1050 Msg("processing unknown cluster prefixed topic received below cluster prefixed discard threshold peer may be behind in the protocol") 1051 return nil 1052 } 1053 } 1054 return err 1055 } 1056 1057 return nil 1058 } 1059 1060 // checkClusterPrefixHardThreshold returns true if the cluster prefix received tracker count is less than 1061 // the configured HardThreshold, false otherwise. 1062 // If any error is encountered while loading from the tracker this func will throw an error on the signaler context, these errors 1063 // are unexpected and irrecoverable indicating a bug. 1064 func (c *ControlMsgValidationInspector) checkClusterPrefixHardThreshold(pid peer.ID) bool { 1065 gauge, err := c.tracker.Load(pid) 1066 if err != nil { 1067 // irrecoverable error encountered 1068 c.logAndThrowError(fmt.Errorf("cluster prefixed control message gauge during hard threshold check failed for peer %s: %w", pid, err)) 1069 } 1070 return gauge <= c.config.ClusterPrefixedMessage.HardThreshold 1071 } 1072 1073 // logAndDistributeErr logs the provided error and attempts to disseminate an invalid control message validation notification for the error. 1074 // Args: 1075 // - req: inspect rpc request that failed validation. 1076 // - ctlMsgType: the control message type of the rpc message that caused the error. 1077 // - err: the error that occurred. 1078 // - count: the number of occurrences of the error. 1079 // - isClusterPrefixed: indicates if the errors occurred on a cluster prefixed topic. 1080 func (c *ControlMsgValidationInspector) logAndDistributeAsyncInspectErrs(req *InspectRPCRequest, ctlMsgType p2pmsg.ControlMessageType, err error, count uint64, topicType p2p.CtrlMsgTopicType) { 1081 lg := c.logger.With(). 1082 Err(err). 1083 Str("control_message_type", ctlMsgType.String()). 1084 Bool(logging.KeySuspicious, true). 1085 Bool(logging.KeyNetworkingSecurity, true). 1086 Str("topic_type", topicType.String()). 1087 Uint64("error_count", count). 1088 Str("peer_id", p2plogging.PeerId(req.Peer)). 
1089 Logger() 1090 1091 switch { 1092 case IsErrActiveClusterIDsNotSet(err): 1093 c.metrics.OnActiveClusterIDsNotSetErr() 1094 lg.Warn().Msg("active cluster ids not set") 1095 case IsErrUnstakedPeer(err): 1096 c.metrics.OnUnstakedPeerInspectionFailed() 1097 lg.Warn().Msg("control message received from unstaked peer") 1098 default: 1099 c.notificationConsumer.OnInvalidControlMessageNotification(p2p.NewInvalidControlMessageNotification(req.Peer, ctlMsgType, err, count, topicType)) 1100 lg.Error().Msg("rpc control message async inspection failed, notification sent") 1101 c.metrics.OnInvalidControlMessageNotificationSent() 1102 } 1103 } 1104 1105 // logAndThrowError logs and throws irrecoverable errors on the context. 1106 // Args: 1107 // 1108 // err: the error encountered. 1109 func (c *ControlMsgValidationInspector) logAndThrowError(err error) { 1110 c.logger.Error(). 1111 Err(err). 1112 Bool(logging.KeySuspicious, true). 1113 Bool(logging.KeyNetworkingSecurity, true). 1114 Msg("unexpected irrecoverable error encountered") 1115 c.ctx.Throw(err) 1116 }
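For reference, below is a minimal, self-contained sketch of the sampling-based truncation pattern used by performSample and the truncate* methods above. It is illustrative only and not part of this file: the helper names (partialShuffle, truncateTopics) are hypothetical, and it uses only the Go standard library instead of flow-go's flowrand package.

package main

import (
	"fmt"
	"math/rand"
)

// partialShuffle moves a uniformly random sample of sampleSize elements into the first
// sampleSize positions via a partial Fisher-Yates shuffle, applied through the swap callback
// (the same shape of API the inspector uses with flowrand.Samples in performSample).
// Callers must ensure sampleSize <= totalSize.
func partialShuffle(totalSize, sampleSize uint, swap func(i, j uint)) {
	for i := uint(0); i < sampleSize; i++ {
		// pick a random index in [i, totalSize) and swap it into position i
		j := i + uint(rand.Intn(int(totalSize-i)))
		swap(i, j)
	}
}

// truncateTopics mirrors the truncateGraftMessages/truncatePruneMessages logic: if the slice
// exceeds the threshold, shuffle a random sample to the front and keep only that prefix.
func truncateTopics(topics []string, threshold int) []string {
	if len(topics) <= threshold {
		return topics // nothing to truncate
	}
	partialShuffle(uint(len(topics)), uint(threshold), func(i, j uint) {
		topics[i], topics[j] = topics[j], topics[i]
	})
	return topics[:threshold]
}

func main() {
	topics := []string{"topic-a", "topic-b", "topic-c", "topic-d", "topic-e"}
	// keep a random sample of at most 3 topics, discarding the rest
	fmt.Println(truncateTopics(topics, 3))
}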