github.com/onflow/flow-go@v0.33.17/network/p2p/inspector/validation/control_message_validation_inspector.go

package validation

import (
    "fmt"
    "time"

    "github.com/go-playground/validator/v10"
    "github.com/hashicorp/go-multierror"
    pubsub "github.com/libp2p/go-libp2p-pubsub"
    pubsub_pb "github.com/libp2p/go-libp2p-pubsub/pb"
    "github.com/libp2p/go-libp2p/core/peer"
    "github.com/rs/zerolog"

    "github.com/onflow/flow-go/engine/common/worker"
    "github.com/onflow/flow-go/model/flow"
    "github.com/onflow/flow-go/module"
    "github.com/onflow/flow-go/module/component"
    "github.com/onflow/flow-go/module/irrecoverable"
    "github.com/onflow/flow-go/module/mempool/queue"
    "github.com/onflow/flow-go/module/metrics"
    "github.com/onflow/flow-go/network"
    "github.com/onflow/flow-go/network/channels"
    "github.com/onflow/flow-go/network/p2p"
    p2pconfig "github.com/onflow/flow-go/network/p2p/config"
    "github.com/onflow/flow-go/network/p2p/inspector/internal/cache"
    p2plogging "github.com/onflow/flow-go/network/p2p/logging"
    p2pmsg "github.com/onflow/flow-go/network/p2p/message"
    "github.com/onflow/flow-go/state/protocol"
    "github.com/onflow/flow-go/state/protocol/events"
    "github.com/onflow/flow-go/utils/logging"
    flowrand "github.com/onflow/flow-go/utils/rand"
)

// ControlMsgValidationInspector is an RPC message inspector that inspects control messages and performs validation on them;
// when a validation rule is broken, feedback is given via the peer scoring notifier.
type ControlMsgValidationInspector struct {
    component.Component
    events.Noop
    ctx     irrecoverable.SignalerContext
    logger  zerolog.Logger
    sporkID flow.Identifier
    metrics module.GossipSubRpcValidationInspectorMetrics
    // config control message validation configurations.
    config *p2pconfig.RpcValidationInspector
    // distributor used to disseminate invalid RPC message notifications.
    distributor p2p.GossipSubInspectorNotifDistributor
    // workerPool queue that stores *InspectRPCRequest that will be processed by component workers.
    workerPool *worker.Pool[*InspectRPCRequest]
    // tracker is a map that associates the hash of a peer's ID with the
    // number of cluster-prefixed topic control messages received from that peer. It helps in tracking
    // and managing the rate of incoming control messages from each peer, ensuring that the system
    // stays performant and resilient against potential spam or abuse.
    // The counter is incremented in the following scenarios:
    // 1. A cluster-prefixed topic is received while the inspector waits for the cluster IDs provider to be set (this can happen during startup or epoch transitions).
    // 2. The node sends a cluster-prefixed topic where the cluster prefix does not match any of the active cluster IDs.
    // In such cases, the inspector will allow a configured number of these messages from the corresponding peer.
    tracker    *cache.ClusterPrefixedMessagesReceivedTracker
    idProvider module.IdentityProvider
    rpcTracker p2p.RpcControlTracking
    // networkingType indicates public or private network; rpc publish messages are inspected for unstaked senders when running on the private network.
    networkingType network.NetworkingType
    // topicOracle callback used to retrieve the current subscribed topics of the libp2p node.
    topicOracle func() p2p.TopicProvider
}

// InspectorParams contains the constructor parameters for a ControlMsgValidationInspector.
type InspectorParams struct {
    // Logger the logger used by the inspector.
    Logger zerolog.Logger `validate:"required"`
    // SporkID the current spork ID.
    SporkID flow.Identifier `validate:"required"`
    // Config inspector configuration.
    Config *p2pconfig.RpcValidationInspector `validate:"required"`
    // Distributor gossipsub inspector notification distributor.
    Distributor p2p.GossipSubInspectorNotifDistributor `validate:"required"`
    // HeroCacheMetricsFactory the metrics factory.
    HeroCacheMetricsFactory metrics.HeroCacheMetricsFactory `validate:"required"`
    // IdProvider identity provider used to get the flow identifier for a peer.
    IdProvider module.IdentityProvider `validate:"required"`
    // InspectorMetrics metrics for the validation inspector.
    InspectorMetrics module.GossipSubRpcValidationInspectorMetrics `validate:"required"`
    // RpcTracker tracker used to track sent iHave RPCs and the last highest iHave RPC size.
    RpcTracker p2p.RpcControlTracking `validate:"required"`
    // NetworkingType the networking type of the node.
    NetworkingType network.NetworkingType `validate:"required"`
    // TopicOracle callback used to retrieve the current subscribed topics of the libp2p node.
    // It is set as a callback to avoid circular dependencies between the topic oracle and the inspector.
    TopicOracle func() p2p.TopicProvider `validate:"required"`
}

var _ component.Component = (*ControlMsgValidationInspector)(nil)
var _ p2p.GossipSubMsgValidationRpcInspector = (*ControlMsgValidationInspector)(nil)
var _ protocol.Consumer = (*ControlMsgValidationInspector)(nil)

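// The sketch below illustrates how a caller might wire up the inspector. It is not part of this
// file's API; the parameter values (logger, sporkID, rpcInspectorCfg, node, and so on) are
// assumptions standing in for whatever the node builder supplies:
//
//    inspector, err := NewControlMsgValidationInspector(&InspectorParams{
//        Logger:                  logger,
//        SporkID:                 sporkID,
//        Config:                  rpcInspectorCfg,
//        Distributor:             distributor,
//        HeroCacheMetricsFactory: heroCacheMetricsFactory,
//        IdProvider:              idProvider,
//        InspectorMetrics:        inspectorMetrics,
//        RpcTracker:              rpcTracker,
//        NetworkingType:          network.PrivateNetwork,
//        TopicOracle:             func() p2p.TopicProvider { return node },
//    })
//    if err != nil {
//        return nil, fmt.Errorf("could not create control message validation inspector: %w", err)
//    }
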
// NewControlMsgValidationInspector returns a new ControlMsgValidationInspector.
// Args:
// - *InspectorParams: params used to create the inspector.
//
// Returns:
// - *ControlMsgValidationInspector: a new control message validation inspector.
// - error: an error if there is any error while creating the inspector. All errors are irrecoverable and unexpected.
func NewControlMsgValidationInspector(params *InspectorParams) (*ControlMsgValidationInspector, error) {
    err := validator.New().Struct(params)
    if err != nil {
        return nil, fmt.Errorf("inspector params validation failed: %w", err)
    }
    lg := params.Logger.With().Str("component", "gossip_sub_rpc_validation_inspector").Logger()

    inspectMsgQueueCacheCollector := metrics.GossipSubRPCInspectorQueueMetricFactory(params.HeroCacheMetricsFactory, params.NetworkingType)
    clusterPrefixedCacheCollector := metrics.GossipSubRPCInspectorClusterPrefixedCacheMetricFactory(params.HeroCacheMetricsFactory, params.NetworkingType)

    clusterPrefixedTracker, err := cache.NewClusterPrefixedMessagesReceivedTracker(params.Logger,
        params.Config.ClusterPrefixedMessage.ControlMsgsReceivedCacheSize,
        clusterPrefixedCacheCollector,
        params.Config.ClusterPrefixedMessage.ControlMsgsReceivedCacheDecay)
    if err != nil {
        return nil, fmt.Errorf("failed to create cluster prefix topics received tracker: %w", err)
    }

    if params.Config.PublishMessages.MaxSampleSize < params.Config.PublishMessages.ErrorThreshold {
        return nil, fmt.Errorf("rpc message max sample size must be greater than or equal to rpc message error threshold, got %d and %d respectively",
            params.Config.PublishMessages.MaxSampleSize,
            params.Config.PublishMessages.ErrorThreshold)
    }

    c := &ControlMsgValidationInspector{
        logger:         lg,
        sporkID:        params.SporkID,
        config:         params.Config,
        distributor:    params.Distributor,
        tracker:        clusterPrefixedTracker,
        rpcTracker:     params.RpcTracker,
        idProvider:     params.IdProvider,
        metrics:        params.InspectorMetrics,
        networkingType: params.NetworkingType,
        topicOracle:    params.TopicOracle,
    }

    store := queue.NewHeroStore(params.Config.InspectionQueue.Size, params.Logger, inspectMsgQueueCacheCollector)

    pool := worker.NewWorkerPoolBuilder[*InspectRPCRequest](lg, store, c.processInspectRPCReq).Build()

    c.workerPool = pool

    builder := component.NewComponentManagerBuilder()
    builder.AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
        c.logger.Debug().Msg("starting rpc inspector distributor")
        c.ctx = ctx
        c.distributor.Start(ctx)
        select {
        case <-ctx.Done():
            c.logger.Debug().Msg("rpc inspector distributor startup aborted; context cancelled")
        case <-c.distributor.Ready():
            c.logger.Debug().Msg("rpc inspector distributor started")
            ready()
        }
        <-ctx.Done()
        c.logger.Debug().Msg("rpc inspector distributor stopped")
        <-c.distributor.Done()
        c.logger.Debug().Msg("rpc inspector distributor shutdown complete")
    })
    for i := 0; i < c.config.InspectionQueue.NumberOfWorkers; i++ {
        builder.AddWorker(pool.WorkerLogic())
    }
    c.Component = builder.Build()
    return c, nil
}

// Start starts the inspector's component. It throws an irrecoverable error on the parent context
// if the topic oracle has not been set.
func (c *ControlMsgValidationInspector) Start(parent irrecoverable.SignalerContext) {
    if c.topicOracle == nil {
        parent.Throw(fmt.Errorf("control message validation inspector topic oracle not set"))
    }
    c.Component.Start(parent)
}

// Name returns the name of the rpc inspector.
func (c *ControlMsgValidationInspector) Name() string {
    return rpcInspectorComponentName
}

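// A minimal lifecycle sketch (illustrative; ctx and cancel are assumed to come from the node's
// irrecoverable context setup): starting the inspector starts the notification distributor plus
// InspectionQueue.NumberOfWorkers queue workers, and Ready only closes once all of them are up:
//
//    inspector.Start(ctx)
//    <-inspector.Ready() // distributor and all inspection workers are running
//    defer func() {
//        cancel() // signal shutdown
//        <-inspector.Done()
//    }()
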
// ActiveClustersChanged consumes cluster ID update protocol events.
func (c *ControlMsgValidationInspector) ActiveClustersChanged(clusterIDList flow.ChainIDList) {
    c.tracker.StoreActiveClusterIds(clusterIDList)
}

// Inspect is called by gossipsub upon reception of an RPC from a remote node.
// It creates a new InspectRPCRequest for the RPC to be inspected asynchronously by the worker pool.
// Args:
// - from: the sender.
// - rpc: the control message RPC.
//
// Returns:
// - error: if a new inspect rpc request cannot be created; all errors returned are considered irrecoverable.
func (c *ControlMsgValidationInspector) Inspect(from peer.ID, rpc *pubsub.RPC) error {
    // first truncate the rpc to the configured max sample size, if needed
    c.truncateRPC(from, rpc)

    // second, queue further async inspection
    req, err := NewInspectRPCRequest(from, rpc)
    if err != nil {
        c.logger.Error().
            Err(err).
            Bool(logging.KeyNetworkingSecurity, true).
            Str("peer_id", p2plogging.PeerId(from)).
            Msg("failed to get inspect RPC request")
        return fmt.Errorf("failed to get inspect RPC request: %w", err)
    }
    c.workerPool.Submit(req)

    return nil
}

// updateMetrics updates the metrics for the received RPC.
// Args:
// - from: the sender.
// - rpc: the control message RPC.
func (c *ControlMsgValidationInspector) updateMetrics(from peer.ID, rpc *pubsub.RPC) {
    includedMessages := len(rpc.GetPublish())
    iHaveCount, iWantCount, graftCount, pruneCount := 0, 0, 0, 0
    ctl := rpc.GetControl()
    if ctl != nil {
        iHaveCount = len(ctl.GetIhave())
        iWantCount = len(ctl.GetIwant())
        graftCount = len(ctl.GetGraft())
        pruneCount = len(ctl.GetPrune())
    }
    c.metrics.OnIncomingRpcReceived(iHaveCount, iWantCount, graftCount, pruneCount, includedMessages)
    if c.logger.GetLevel() > zerolog.TraceLevel {
        return // skip logging if trace level is not enabled
    }
    c.logger.Trace().
        Str("peer_id", p2plogging.PeerId(from)).
        Int("iHaveCount", iHaveCount).
        Int("iWantCount", iWantCount).
        Int("graftCount", graftCount).
        Int("pruneCount", pruneCount).
        Int("included_message_count", includedMessages).
        Msg("received rpc with control messages")
}

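// Inspection is deliberately two-phase: Inspect runs synchronously on gossipsub's hot path and
// only truncates the RPC in place, while the expensive validation runs later on a worker
// goroutine. A rough sketch of the hand-off (names are from this file):
//
//    // gossipsub invokes the registered inspector for every incoming RPC:
//    //   Inspect(from, rpc)        -> truncateRPC(from, rpc)  (synchronous, cheap)
//    //                             -> workerPool.Submit(req)  (enqueue only)
//    // a component worker later dequeues req and runs:
//    //   processInspectRPCReq(req)                            (asynchronous validation)
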
// processInspectRPCReq is the func used by component workers to perform further inspection of RPC control messages,
// ensuring that all control message types in the RPC are valid.
// Args:
// - req: the inspect rpc request.
//
// Returns:
// - error: no error is expected to be returned from this func; validation failures are logged and distributed in invalid control message notifications.
func (c *ControlMsgValidationInspector) processInspectRPCReq(req *InspectRPCRequest) error {
    c.updateMetrics(req.Peer, req.rpc)
    c.metrics.AsyncProcessingStarted()
    start := time.Now()
    defer func() {
        c.metrics.AsyncProcessingFinished(time.Since(start))
    }()

    activeClusterIDS := c.tracker.GetActiveClusterIds()
    for _, ctrlMsgType := range p2pmsg.ControlMessageTypes() {
        switch ctrlMsgType {
        case p2pmsg.CtrlMsgGraft:
            err, topicType := c.inspectGraftMessages(req.Peer, req.rpc.GetControl().GetGraft(), activeClusterIDS)
            if err != nil {
                c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgGraft, err, 1, topicType)
                return nil
            }
        case p2pmsg.CtrlMsgPrune:
            err, topicType := c.inspectPruneMessages(req.Peer, req.rpc.GetControl().GetPrune(), activeClusterIDS)
            if err != nil {
                c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgPrune, err, 1, topicType)
                return nil
            }
        case p2pmsg.CtrlMsgIWant:
            err := c.inspectIWantMessages(req.Peer, req.rpc.GetControl().GetIwant())
            if err != nil {
                c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgIWant, err, 1, p2p.CtrlMsgNonClusterTopicType)
                return nil
            }
        case p2pmsg.CtrlMsgIHave:
            err, topicType := c.inspectIHaveMessages(req.Peer, req.rpc.GetControl().GetIhave(), activeClusterIDS)
            if err != nil {
                c.logAndDistributeAsyncInspectErrs(req, p2pmsg.CtrlMsgIHave, err, 1, topicType)
                return nil
            }
        }
    }

    // inspect rpc publish messages after all control message validation has passed
    err, errCount := c.inspectRpcPublishMessages(req.Peer, req.rpc.GetPublish(), activeClusterIDS)
    if err != nil {
        c.logAndDistributeAsyncInspectErrs(req, p2pmsg.RpcPublishMessage, err, errCount, p2p.CtrlMsgNonClusterTopicType)
        return nil
    }

    return nil
}

// checkPubsubMessageSender checks the sender of the pubsub message to ensure they are not unstaked or ejected.
// This check is only required on private networks.
// Args:
// - message: the pubsub message.
//
// Returns:
// - error: if the peer ID cannot be created from bytes, the sender is unknown, or the identity is ejected.
//
// All errors returned from this function can be considered benign.
func (c *ControlMsgValidationInspector) checkPubsubMessageSender(message *pubsub_pb.Message) error {
    pid, err := peer.IDFromBytes(message.GetFrom())
    if err != nil {
        return fmt.Errorf("failed to get peer ID from bytes: %w", err)
    }
    if id, ok := c.idProvider.ByPeerID(pid); !ok {
        return fmt.Errorf("received rpc publish message from unstaked peer: %s", pid)
    } else if id.Ejected {
        return fmt.Errorf("received rpc publish message from ejected peer: %s", pid)
    }

    return nil
}

// inspectGraftMessages performs topic validation on all grafts in the control message using the validateTopic func while tracking duplicates.
// Args:
// - from: peer ID of the sender.
// - grafts: the list of grafts to inspect.
// - activeClusterIDS: the list of active cluster ids.
// Returns:
// - error: DuplicateTopicErr if the number of duplicate topics in the list of grafts exceeds the configured threshold, or any error encountered while validating topics; all returned errors are benign and should not cause the node to crash.
// - p2p.CtrlMsgTopicType: CtrlMsgTopicTypeClusterPrefixed if an error is returned and the topic that failed validation was a cluster-prefixed topic, CtrlMsgNonClusterTopicType otherwise.
func (c *ControlMsgValidationInspector) inspectGraftMessages(from peer.ID, grafts []*pubsub_pb.ControlGraft, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
    duplicateTopicTracker := make(duplicateStrTracker)
    totalDuplicateTopicIds := 0
    defer func() {
        // regardless of inspection result, update metrics
        c.metrics.OnGraftMessageInspected(totalDuplicateTopicIds)
    }()
    for _, graft := range grafts {
        topic := channels.Topic(graft.GetTopicID())
        if duplicateTopicTracker.track(topic.String()) > 1 {
            // ideally, a GRAFT message should not have any duplicate topics, hence a topic ID is counted as a duplicate only if it is repeated more than once.
            totalDuplicateTopicIds++
            // check if the total number of duplicates exceeds the configured threshold.
            if totalDuplicateTopicIds > c.config.GraftPrune.DuplicateTopicIdThreshold {
                c.metrics.OnGraftDuplicateTopicIdsExceedThreshold()
                return NewDuplicateTopicErr(topic.String(), totalDuplicateTopicIds, p2pmsg.CtrlMsgGraft), p2p.CtrlMsgNonClusterTopicType
            }
        }
        err, ctrlMsgType := c.validateTopic(from, topic, activeClusterIDS)
        if err != nil {
            // TODO: consider adding a threshold for this error similar to the duplicate topic id threshold.
            c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgGraft)
            return err, ctrlMsgType
        }
    }
    return nil, p2p.CtrlMsgNonClusterTopicType
}

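// Worked example for the duplicate accounting above (with an assumed DuplicateTopicIdThreshold of
// 2): for GRAFT topic IDs [A, A, B, A, A], track returns 1, 2, 1, 3, 4 respectively, so a
// duplicate is registered on the 2nd, 4th, and 5th entries. totalDuplicateTopicIds reaches 3 on
// the last entry, exceeding the threshold of 2, and a DuplicateTopicErr is returned.
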
// inspectPruneMessages performs topic validation on all prunes in the control message using the validateTopic func while tracking duplicates.
// Args:
// - from: peer ID of the sender.
// - prunes: the list of prunes to inspect.
// - activeClusterIDS: the list of active cluster ids.
// Returns:
// - error: DuplicateTopicErr if the number of duplicate topics in the list of prunes exceeds the configured threshold, or any error encountered while validating topics; all returned errors are benign and should not cause the node to crash.
// - p2p.CtrlMsgTopicType: CtrlMsgTopicTypeClusterPrefixed if an error is returned and the topic that failed validation was a cluster-prefixed topic, CtrlMsgNonClusterTopicType otherwise.
func (c *ControlMsgValidationInspector) inspectPruneMessages(from peer.ID, prunes []*pubsub_pb.ControlPrune, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
    tracker := make(duplicateStrTracker)
    totalDuplicateTopicIds := 0
    defer func() {
        // regardless of inspection result, update metrics
        c.metrics.OnPruneMessageInspected(totalDuplicateTopicIds)
    }()
    for _, prune := range prunes {
        topic := channels.Topic(prune.GetTopicID())
        if tracker.track(topic.String()) > 1 {
            // ideally, a PRUNE message should not have any duplicate topics, hence a topic ID is counted as a duplicate only if it is repeated more than once.
            totalDuplicateTopicIds++
            // check if the total number of duplicates exceeds the configured threshold.
            if totalDuplicateTopicIds > c.config.GraftPrune.DuplicateTopicIdThreshold {
                c.metrics.OnPruneDuplicateTopicIdsExceedThreshold()
                return NewDuplicateTopicErr(topic.String(), totalDuplicateTopicIds, p2pmsg.CtrlMsgPrune), p2p.CtrlMsgNonClusterTopicType
            }
        }
        err, ctrlMsgType := c.validateTopic(from, topic, activeClusterIDS)
        if err != nil {
            // TODO: consider adding a threshold for this error similar to the duplicate topic id threshold.
            c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgPrune)
            return err, ctrlMsgType
        }
    }
    return nil, p2p.CtrlMsgNonClusterTopicType
}

// inspectIHaveMessages performs topic validation on all ihaves in the control message using the validateTopic func while tracking duplicates.
// Args:
// - from: peer ID of the sender.
// - ihaves: the list of iHaves to inspect.
// - activeClusterIDS: the list of active cluster ids.
// Returns:
// - error: DuplicateTopicErr if the number of duplicate topics in the list of iHaves exceeds the configured threshold, DuplicateMessageIDErr if the number of duplicate message ids across iHaves exceeds the configured threshold, or any error encountered while validating topics; all returned errors are benign and should not cause the node to crash.
// - p2p.CtrlMsgTopicType: CtrlMsgTopicTypeClusterPrefixed if an error is returned and the topic that failed validation was a cluster-prefixed topic, CtrlMsgNonClusterTopicType otherwise.
func (c *ControlMsgValidationInspector) inspectIHaveMessages(from peer.ID, ihaves []*pubsub_pb.ControlIHave, activeClusterIDS flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
    if len(ihaves) == 0 {
        return nil, p2p.CtrlMsgNonClusterTopicType
    }
    lg := c.logger.With().
        Str("peer_id", p2plogging.PeerId(from)).
        Int("sample_size", len(ihaves)).
        Int("max_sample_size", c.config.IHave.MessageCountThreshold).
        Logger()
    duplicateTopicTracker := make(duplicateStrTracker)
    duplicateMessageIDTracker := make(duplicateStrTracker)
    totalMessageIds := 0
    totalDuplicateTopicIds := 0
    totalDuplicateMessageIds := 0
    defer func() {
        // regardless of inspection result, update metrics
        c.metrics.OnIHaveMessagesInspected(totalDuplicateTopicIds, totalDuplicateMessageIds)
    }()
    for _, ihave := range ihaves {
        messageIds := ihave.GetMessageIDs()
        topic := ihave.GetTopicID()
        totalMessageIds += len(messageIds)

        // first check if the topic is valid, fail fast if it is not
        err, ctrlMsgType := c.validateTopic(from, channels.Topic(topic), activeClusterIDS)
        if err != nil {
            // TODO: consider adding a threshold for this error similar to the duplicate topic id threshold.
            c.metrics.OnInvalidTopicIdDetectedForControlMessage(p2pmsg.CtrlMsgIHave)
            return err, ctrlMsgType
        }

        // then track the topic, ensuring it is not beyond the duplicate threshold.
        if duplicateTopicTracker.track(topic) > 1 {
            totalDuplicateTopicIds++
            // the topic is duplicated, check if the total number of duplicates exceeds the configured threshold
            if totalDuplicateTopicIds > c.config.IHave.DuplicateTopicIdThreshold {
                c.metrics.OnIHaveDuplicateTopicIdsExceedThreshold()
                return NewDuplicateTopicErr(topic, totalDuplicateTopicIds, p2pmsg.CtrlMsgIHave), p2p.CtrlMsgNonClusterTopicType
            }
        }

        for _, messageID := range messageIds {
            if duplicateMessageIDTracker.track(messageID) > 1 {
                totalDuplicateMessageIds++
                // the message is duplicated, check if the total number of duplicates exceeds the configured threshold
                if totalDuplicateMessageIds > c.config.IHave.DuplicateMessageIdThreshold {
                    c.metrics.OnIHaveDuplicateMessageIdsExceedThreshold()
                    return NewDuplicateMessageIDErr(messageID, totalDuplicateMessageIds, p2pmsg.CtrlMsgIHave), p2p.CtrlMsgNonClusterTopicType
                }
            }
        }
    }
    lg.Debug().
        Int("total_message_ids", totalMessageIds).
        Int("total_duplicate_topic_ids", totalDuplicateTopicIds).
        Int("total_duplicate_message_ids", totalDuplicateMessageIds).
        Msg("ihave control message validation complete")
    return nil, p2p.CtrlMsgNonClusterTopicType
}

// inspectIWantMessages inspects RPC iWant control messages. This func samples the iWants and performs validation on each iWant in the sample,
// ensuring that each requested message ID corresponds to an iHave that was actually sent.
// If the number of iWant message ids that do not have a corresponding iHave exceeds the configured threshold an error is returned.
// Args:
// - from: peer ID of the sender.
// - iWants: the list of iWant control messages.
// Returns:
// - IWantDuplicateMsgIDThresholdErr: if the number of duplicate message ids exceeds the configured allowed threshold.
// - IWantCacheMissThresholdErr: if the number of cache misses exceeds the configured allowed threshold.
func (c *ControlMsgValidationInspector) inspectIWantMessages(from peer.ID, iWants []*pubsub_pb.ControlIWant) error {
    if len(iWants) == 0 {
        return nil
    }
    lastHighest := c.rpcTracker.LastHighestIHaveRPCSize()
    lg := c.logger.With().
        Str("peer_id", p2plogging.PeerId(from)).
        Uint("max_sample_size", c.config.IWant.MessageCountThreshold).
        Int64("last_highest_ihave_rpc_size", lastHighest).
        Logger()
    duplicateMsgIdTracker := make(duplicateStrTracker)
    cacheMisses := 0
    duplicateMessageIds := 0
    defer func() {
        // regardless of inspection result, update metrics
        c.metrics.OnIWantMessagesInspected(duplicateMessageIds, cacheMisses)
    }()

    lg = lg.With().
        Int("iwant_msg_count", len(iWants)).
        Int("cache_misses_threshold", c.config.IWant.CacheMissThreshold).
        Int("duplicates_threshold", c.config.IWant.DuplicateMsgIdThreshold).Logger()

    lg.Trace().Msg("validating sample of message ids from iwant control message")

    totalMessageIds := 0
    for _, iWant := range iWants {
        messageIds := iWant.GetMessageIDs()
        messageIDCount := uint(len(messageIds))
        for _, messageID := range messageIds {
            // check duplicate allowed threshold; track already increments the counter, so a second
            // occurrence of the same id yields a count greater than 1.
            if duplicateMsgIdTracker.track(messageID) > 1 {
                // ideally, an iWant message should not have any duplicate message IDs, hence a message id is considered duplicate when it is repeated more than once.
                duplicateMessageIds++
                if duplicateMessageIds > c.config.IWant.DuplicateMsgIdThreshold {
                    c.metrics.OnIWantDuplicateMessageIdsExceedThreshold()
                    return NewIWantDuplicateMsgIDThresholdErr(duplicateMessageIds, messageIDCount, c.config.IWant.DuplicateMsgIdThreshold)
                }
            }
            // check cache miss threshold
            if !c.rpcTracker.WasIHaveRPCSent(messageID) {
                cacheMisses++
                if cacheMisses > c.config.IWant.CacheMissThreshold {
                    c.metrics.OnIWantCacheMissMessageIdsExceedThreshold()
                    return NewIWantCacheMissThresholdErr(cacheMisses, messageIDCount, c.config.IWant.CacheMissThreshold)
                }
            }
            totalMessageIds++
        }
    }

    lg.Debug().
        Int("total_message_ids", totalMessageIds).
        Int("cache_misses", cacheMisses).
        Int("total_duplicate_message_ids", duplicateMessageIds).
        Msg("iwant control message validation complete")

    return nil
}

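// Worked example for the cache-miss accounting above (with an assumed CacheMissThreshold of 10):
// if a peer's iWants reference 50 message IDs and WasIHaveRPCSent returns false for 12 of them,
// the 11th miss already exceeds the threshold, so inspection short-circuits with an
// IWantCacheMissThresholdErr instead of scanning the remaining IDs.
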
// inspectRpcPublishMessages inspects a sample of the RPC gossip messages and performs topic validation that ensures the following:
// - Topics are known flow topics.
// - Topics are valid flow topics.
// - Topics are in the node's subscribed topics list.
// If the number of invalid messages in the sample exceeds the configured error threshold an error will be returned.
// Args:
// - from: peer ID of the sender.
// - messages: rpc publish messages.
// - activeClusterIDS: the list of active cluster ids.
// Returns:
// - error: InvalidRpcPublishMessagesErr if the number of invalid messages exceeds the configured ErrorThreshold.
// - uint64: the number of invalid pubsub messages.
func (c *ControlMsgValidationInspector) inspectRpcPublishMessages(from peer.ID, messages []*pubsub_pb.Message, activeClusterIDS flow.ChainIDList) (error, uint64) {
    totalMessages := len(messages)
    if totalMessages == 0 {
        return nil, 0
    }
    sampleSize := c.config.PublishMessages.MaxSampleSize
    if sampleSize > totalMessages {
        sampleSize = totalMessages
    }
    c.performSample(p2pmsg.RpcPublishMessage, uint(totalMessages), uint(sampleSize), func(i, j uint) {
        messages[i], messages[j] = messages[j], messages[i]
    })

    subscribedTopics := c.topicOracle().GetTopics()
    hasSubscription := func(topic string) bool {
        for _, subscribedTopic := range subscribedTopics {
            if topic == subscribedTopic {
                return true
            }
        }
        return false
    }
    var errs *multierror.Error
    invalidTopicIdsCount := 0
    invalidSubscriptionsCount := 0
    invalidSendersCount := 0
    defer func() {
        // regardless of inspection result, update metrics
        errCnt := 0
        if errs != nil {
            errCnt = errs.Len()
        }
        c.metrics.OnPublishMessageInspected(errCnt, invalidTopicIdsCount, invalidSubscriptionsCount, invalidSendersCount)
    }()
    for _, message := range messages[:sampleSize] {
        if c.networkingType == network.PrivateNetwork {
            err := c.checkPubsubMessageSender(message)
            if err != nil {
                invalidSendersCount++
                errs = multierror.Append(errs, err)
                continue
            }
        }
        topic := channels.Topic(message.GetTopic())
        // The topic type returned by validateTopic, indicating whether the topic is cluster-prefixed or not, is intentionally ignored.
        // We have already set a threshold for errors allowed on publish messages; reducing the penalty further based on
        // cluster prefix status is unnecessary when the error threshold is exceeded.
        err, _ := c.validateTopic(from, topic, activeClusterIDS)
        if err != nil {
            // we can skip checking for subscription of a topic that failed validation and continue
            invalidTopicIdsCount++
            errs = multierror.Append(errs, err)
            continue
        }

        if !hasSubscription(topic.String()) {
            invalidSubscriptionsCount++
            errs = multierror.Append(errs, fmt.Errorf("subscription for topic %s not found", topic))
        }
    }

    // return an error when we exceed the error threshold
    if errs != nil && errs.Len() > c.config.PublishMessages.ErrorThreshold {
        c.metrics.OnPublishMessagesInspectionErrorExceedsThreshold()
        return NewInvalidRpcPublishMessagesErr(errs.ErrorOrNil(), errs.Len()), uint64(errs.Len())
    }

    return nil, 0
}

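// Worked example for the publish-message inspection above (with assumed MaxSampleSize = 1000 and
// ErrorThreshold = 500): an RPC carrying 2000 publish messages is partially shuffled so that the
// first 1000 entries form a uniform random sample; each sampled message is checked for a staked,
// non-ejected sender (private networks only), a valid topic, and a local subscription. Only when
// the accumulated error count exceeds 500 is an InvalidRpcPublishMessagesErr returned, carrying
// the multierror detail and the error count.
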
// truncateRPC truncates the RPC by truncating each control message type using the configured max sample size values.
// Args:
// - from: peer ID of the sender.
// - rpc: the pubsub RPC.
func (c *ControlMsgValidationInspector) truncateRPC(from peer.ID, rpc *pubsub.RPC) {
    for _, ctlMsgType := range p2pmsg.ControlMessageTypes() {
        switch ctlMsgType {
        case p2pmsg.CtrlMsgGraft:
            c.truncateGraftMessages(rpc)
        case p2pmsg.CtrlMsgPrune:
            c.truncatePruneMessages(rpc)
        case p2pmsg.CtrlMsgIHave:
            c.truncateIHaveMessages(rpc)
        case p2pmsg.CtrlMsgIWant:
            c.truncateIWantMessages(from, rpc)
        default:
            // sanity check; this should never happen
            c.logAndThrowError(fmt.Errorf("unknown control message type encountered during RPC truncation"))
        }
    }
}

// truncateGraftMessages truncates the GRAFT control messages in the RPC. If the total number of GRAFTs in the RPC exceeds the configured
// GraftPrune.MessageCountThreshold the list of GRAFTs will be truncated.
// Args:
// - rpc: the rpc message to truncate.
func (c *ControlMsgValidationInspector) truncateGraftMessages(rpc *pubsub.RPC) {
    grafts := rpc.GetControl().GetGraft()
    originalGraftSize := len(grafts)
    if originalGraftSize <= c.config.GraftPrune.MessageCountThreshold {
        return // nothing to truncate
    }

    // truncate grafts and update metrics
    sampleSize := c.config.GraftPrune.MessageCountThreshold
    c.performSample(p2pmsg.CtrlMsgGraft, uint(originalGraftSize), uint(sampleSize), func(i, j uint) {
        grafts[i], grafts[j] = grafts[j], grafts[i]
    })
    rpc.Control.Graft = grafts[:sampleSize]
    c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgGraft, originalGraftSize-len(rpc.Control.Graft))
}

// truncatePruneMessages truncates the PRUNE control messages in the RPC. If the total number of PRUNEs in the RPC exceeds the configured
// GraftPrune.MessageCountThreshold the list of PRUNEs will be truncated.
// Args:
// - rpc: the rpc message to truncate.
func (c *ControlMsgValidationInspector) truncatePruneMessages(rpc *pubsub.RPC) {
    prunes := rpc.GetControl().GetPrune()
    originalPruneSize := len(prunes)
    if originalPruneSize <= c.config.GraftPrune.MessageCountThreshold {
        return // nothing to truncate
    }

    sampleSize := c.config.GraftPrune.MessageCountThreshold
    c.performSample(p2pmsg.CtrlMsgPrune, uint(originalPruneSize), uint(sampleSize), func(i, j uint) {
        prunes[i], prunes[j] = prunes[j], prunes[i]
    })
    rpc.Control.Prune = prunes[:sampleSize]
    c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgPrune, originalPruneSize-len(rpc.Control.Prune))
}

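// Worked example for the GRAFT/PRUNE truncation above (with an assumed
// GraftPrune.MessageCountThreshold of 30): an RPC carrying 40 GRAFTs is partially shuffled by
// performSample so that positions 0..29 hold a uniform random sample, the slice is cut to
// grafts[:30], and the metric records the 10 dropped messages:
//
//    grafts := rpc.GetControl().GetGraft() // len 40
//    c.performSample(p2pmsg.CtrlMsgGraft, 40, 30, swap)
//    rpc.Control.Graft = grafts[:30] // 10 grafts dropped
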
// truncateIHaveMessages truncates the iHave control messages in the RPC. If the total number of iHaves in the RPC exceeds the configured
// MessageCountThreshold the list of iHaves will be truncated.
// Args:
// - rpc: the rpc message to truncate.
func (c *ControlMsgValidationInspector) truncateIHaveMessages(rpc *pubsub.RPC) {
    ihaves := rpc.GetControl().GetIhave()
    originalIHaveCount := len(ihaves)
    if originalIHaveCount == 0 {
        return
    }

    if originalIHaveCount > c.config.IHave.MessageCountThreshold {
        // truncate ihaves and update metrics; within this branch the threshold is
        // guaranteed to be smaller than the original count.
        sampleSize := c.config.IHave.MessageCountThreshold
        c.performSample(p2pmsg.CtrlMsgIHave, uint(originalIHaveCount), uint(sampleSize), func(i, j uint) {
            ihaves[i], ihaves[j] = ihaves[j], ihaves[i]
        })
        rpc.Control.Ihave = ihaves[:sampleSize]
        c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgIHave, originalIHaveCount-len(rpc.Control.Ihave))
    }
    c.truncateIHaveMessageIds(rpc)
}

// truncateIHaveMessageIds truncates the message ids for each iHave control message in the RPC. If the total number of message ids in a single iHave exceeds the configured
// MessageIdCountThreshold the list of message ids will be truncated. The iHave control messages themselves should have been truncated before their message ids are truncated.
// Args:
// - rpc: the rpc message to truncate.
func (c *ControlMsgValidationInspector) truncateIHaveMessageIds(rpc *pubsub.RPC) {
    for _, ihave := range rpc.GetControl().GetIhave() {
        messageIDs := ihave.GetMessageIDs()
        originalMessageIdCount := len(messageIDs)
        if originalMessageIdCount == 0 {
            continue // nothing to truncate; skip
        }

        if originalMessageIdCount > c.config.IHave.MessageIdCountThreshold {
            sampleSize := c.config.IHave.MessageIdCountThreshold
            c.performSample(p2pmsg.CtrlMsgIHave, uint(originalMessageIdCount), uint(sampleSize), func(i, j uint) {
                messageIDs[i], messageIDs[j] = messageIDs[j], messageIDs[i]
            })
            ihave.MessageIDs = messageIDs[:sampleSize]
            c.metrics.OnIHaveControlMessageIdsTruncated(originalMessageIdCount - len(ihave.MessageIDs))
        }
        c.metrics.OnIHaveMessageIDsReceived(ihave.GetTopicID(), len(ihave.MessageIDs))
    }
}

// truncateIWantMessages truncates the iWant control messages in the RPC. If the total number of iWants in the RPC exceeds the configured
// MessageCountThreshold the list of iWants will be truncated.
// Args:
// - from: peer ID of the sender.
// - rpc: the rpc message to truncate.
func (c *ControlMsgValidationInspector) truncateIWantMessages(from peer.ID, rpc *pubsub.RPC) {
    iWants := rpc.GetControl().GetIwant()
    originalIWantCount := uint(len(iWants))
    if originalIWantCount == 0 {
        return
    }

    if originalIWantCount > c.config.IWant.MessageCountThreshold {
        // truncate iWants and update metrics
        sampleSize := c.config.IWant.MessageCountThreshold
        c.performSample(p2pmsg.CtrlMsgIWant, originalIWantCount, sampleSize, func(i, j uint) {
            iWants[i], iWants[j] = iWants[j], iWants[i]
        })
        rpc.Control.Iwant = iWants[:sampleSize]
        c.metrics.OnControlMessagesTruncated(p2pmsg.CtrlMsgIWant, int(originalIWantCount)-len(rpc.Control.Iwant))
    }
    c.truncateIWantMessageIds(from, rpc)
}

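// The next function truncates the per-iWant message id lists. Worked example (with an assumed
// IWant.MessageIdCountThreshold of 1000): if the last highest iHave RPC size is 60, the per-iWant
// sample size becomes 600 (10x the last highest size), so an iWant listing 800 message IDs is cut
// to a random 600 of them; if the last highest size is 0 (suspicious), the configured default of
// 1000 is used instead.
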
// truncateIWantMessageIds truncates the message ids for each iWant control message in the RPC. If the total number of message ids in a single iWant exceeds the configured
// MessageIdCountThreshold the list of message ids will be truncated. The iWant control messages themselves should have been truncated before their message ids are truncated.
// Args:
// - from: peer ID of the sender.
// - rpc: the rpc message to truncate.
func (c *ControlMsgValidationInspector) truncateIWantMessageIds(from peer.ID, rpc *pubsub.RPC) {
    lastHighest := c.rpcTracker.LastHighestIHaveRPCSize()
    lg := c.logger.With().
        Str("peer_id", p2plogging.PeerId(from)).
        Uint("max_sample_size", c.config.IWant.MessageCountThreshold).
        Int64("last_highest_ihave_rpc_size", lastHighest).
        Logger()

    sampleSize := int(10 * lastHighest)
    if sampleSize == 0 || sampleSize > c.config.IWant.MessageIdCountThreshold {
        // a zero or out-of-range sample size is suspicious
        lg.Warn().Str(logging.KeySuspicious, "true").Msg("zero or invalid sample size, using default max sample size")
        sampleSize = c.config.IWant.MessageIdCountThreshold
    }
    for _, iWant := range rpc.GetControl().GetIwant() {
        messageIDs := iWant.GetMessageIDs()
        totalMessageIdCount := len(messageIDs)
        if totalMessageIdCount == 0 {
            continue // nothing to truncate; skip
        }

        if totalMessageIdCount > sampleSize {
            c.performSample(p2pmsg.CtrlMsgIWant, uint(totalMessageIdCount), uint(sampleSize), func(i, j uint) {
                messageIDs[i], messageIDs[j] = messageIDs[j], messageIDs[i]
            })
            iWant.MessageIDs = messageIDs[:sampleSize]
            c.metrics.OnIWantControlMessageIdsTruncated(totalMessageIdCount - len(iWant.MessageIDs))
        }
        c.metrics.OnIWantMessageIDsReceived(len(iWant.MessageIDs))
    }
}

// performSample performs sampling on the specified control message, randomizing
// the items in the control message slice up to index sampleSize-1. Any error encountered during sampling is considered
// irrecoverable and will cause the node to crash.
func (c *ControlMsgValidationInspector) performSample(ctrlMsg p2pmsg.ControlMessageType, totalSize, sampleSize uint, swap func(i, j uint)) {
    err := flowrand.Samples(totalSize, sampleSize, swap)
    if err != nil {
        c.logAndThrowError(fmt.Errorf("failed to get random sample of %s control messages: %w", ctrlMsg, err))
    }
}

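// performSample relies on flowrand.Samples moving a uniformly random subset of sampleSize elements
// into the leading positions of the slice through the caller-provided swap callback. A minimal
// sketch of the calling pattern used throughout this file (the slice contents are illustrative):
//
//    ids := []string{"m1", "m2", "m3", "m4", "m5"}
//    c.performSample(p2pmsg.CtrlMsgIHave, uint(len(ids)), 2, func(i, j uint) {
//        ids[i], ids[j] = ids[j], ids[i]
//    })
//    sample := ids[:2] // a uniform random pair drawn from the original five
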
// validateTopic ensures the topic is a valid flow topic/channel.
// Expected error returns during normal operations:
// - channels.InvalidTopicErr: if topic is invalid.
// - ErrActiveClusterIdsNotSet: if the cluster ID provider is not set.
// - channels.UnknownClusterIDErr: if the topic contains a cluster ID prefix that is not in the active cluster IDs list.
//
// This func returns an exception in case of an unexpected bug or state corruption, i.e. if cluster prefixed topic validation
// fails due to an unexpected error returned when getting the active cluster IDs.
func (c *ControlMsgValidationInspector) validateTopic(from peer.ID, topic channels.Topic, activeClusterIds flow.ChainIDList) (error, p2p.CtrlMsgTopicType) {
    channel, ok := channels.ChannelFromTopic(topic)
    if !ok {
        return channels.NewInvalidTopicErr(topic, fmt.Errorf("failed to get channel from topic")), p2p.CtrlMsgNonClusterTopicType
    }
    // handle cluster prefixed topics
    if channels.IsClusterChannel(channel) {
        return c.validateClusterPrefixedTopic(from, topic, activeClusterIds), p2p.CtrlMsgTopicTypeClusterPrefixed
    }

    // non cluster prefixed topic validation
    err := channels.IsValidNonClusterFlowTopic(topic, c.sporkID)
    if err != nil {
        return err, p2p.CtrlMsgNonClusterTopicType
    }
    return nil, p2p.CtrlMsgNonClusterTopicType
}

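// As a rough illustration of the two topic families (the authoritative formats live in the
// channels package, not here): a non-cluster topic has the shape "<channel>/<spork ID>", e.g.
// "push-blocks/<current spork ID>", and is accepted only when the channel is known and the spork
// ID matches c.sporkID; a cluster-prefixed topic embeds a cluster ID in its channel (e.g. a
// "sync-cluster"-prefixed channel) and is routed through validateClusterPrefixedTopic below, where
// that embedded ID is checked against the active cluster IDs.
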
// validateClusterPrefixedTopic validates cluster prefixed topics.
// Expected error returns during normal operations:
// - ErrActiveClusterIdsNotSet: if the cluster ID provider is not set.
// - channels.InvalidTopicErr: if topic is invalid.
// - channels.UnknownClusterIDErr: if the topic contains a cluster ID prefix that is not in the active cluster IDs list.
//
// In the case where an ErrActiveClusterIdsNotSet or UnknownClusterIDErr is encountered and the cluster prefixed topic received
// tracker for the peer is less than or equal to the configured HardThreshold, the error will only be logged and not returned.
// Once the hard threshold is crossed the error will be returned and the sender will start to be penalized.
// Any errors encountered while incrementing or loading the cluster prefixed control message gauge for a peer will result in an irrecoverable error being thrown; these
// errors are unexpected and irrecoverable, indicating a bug.
func (c *ControlMsgValidationInspector) validateClusterPrefixedTopic(from peer.ID, topic channels.Topic, activeClusterIds flow.ChainIDList) error {
    lg := c.logger.With().
        Str("from", p2plogging.PeerId(from)).
        Logger()

    // only staked nodes are expected to participate on cluster prefixed topics
    nodeID, err := c.getFlowIdentifier(from)
    if err != nil {
        return err
    }
    if len(activeClusterIds) == 0 {
        // cluster IDs have not been updated yet
        _, incErr := c.tracker.Inc(nodeID)
        if incErr != nil {
            // irrecoverable error encountered
            c.logAndThrowError(fmt.Errorf("error encountered while incrementing the cluster prefixed control message gauge %s: %w", nodeID, incErr))
        }

        // if the amount of messages received is below our hard threshold log the error and return nil.
        if c.checkClusterPrefixHardThreshold(nodeID) {
            lg.Warn().
                Str("topic", topic.String()).
                Msg("failed to validate cluster prefixed control message: active cluster ids not set")
            return nil
        }

        return NewActiveClusterIdsNotSetErr(topic)
    }

    err = channels.IsValidFlowClusterTopic(topic, activeClusterIds)
    if err != nil {
        if channels.IsUnknownClusterIDErr(err) {
            // an unknown cluster ID error could indicate that a node has fallen
            // behind and needs to catch up; increment the topics received tracker.
            _, incErr := c.tracker.Inc(nodeID)
            if incErr != nil {
                c.logAndThrowError(fmt.Errorf("error encountered while incrementing the cluster prefixed control message gauge %s: %w", nodeID, incErr))
            }
            // if the amount of messages received is below our hard threshold log the error and return nil.
            if c.checkClusterPrefixHardThreshold(nodeID) {
                lg.Warn().
                    Err(err).
                    Str("topic", topic.String()).
                    Msg("processing unknown cluster prefixed topic received below cluster prefixed discard threshold; peer may be behind in the protocol")
                return nil
            }
        }
        return err
    }

    return nil
}

// getFlowIdentifier returns the flow identity identifier for a peer.
// Args:
// - peerID: the peer id of the sender.
//
// The returned error indicates that the peer is unstaked.
func (c *ControlMsgValidationInspector) getFlowIdentifier(peerID peer.ID) (flow.Identifier, error) {
    id, ok := c.idProvider.ByPeerID(peerID)
    if !ok {
        return flow.ZeroID, NewUnstakedPeerErr(fmt.Errorf("failed to get flow identity for peer: %s", peerID))
    }
    return id.ID(), nil
}

// checkClusterPrefixHardThreshold returns true if the cluster prefixed received tracker gauge is less than or equal to
// the configured HardThreshold, false otherwise.
// If any error is encountered while loading from the tracker this func will throw an error on the signaler context; these errors
// are unexpected and irrecoverable, indicating a bug.
func (c *ControlMsgValidationInspector) checkClusterPrefixHardThreshold(nodeID flow.Identifier) bool {
    gauge, err := c.tracker.Load(nodeID)
    if err != nil {
        // irrecoverable error encountered
        c.logAndThrowError(fmt.Errorf("loading the cluster prefixed control message gauge during hard threshold check failed for node %s: %w", nodeID, err))
    }
    return gauge <= c.config.ClusterPrefixedMessage.HardThreshold
}

// logAndDistributeAsyncInspectErrs logs the provided error and attempts to disseminate an invalid control message validation notification for the error.
// Args:
// - req: inspect rpc request that failed validation.
// - ctlMsgType: the control message type of the rpc message that caused the error.
// - err: the error that occurred.
// - count: the number of occurrences of the error.
// - topicType: indicates whether the error occurred on a cluster-prefixed topic.
func (c *ControlMsgValidationInspector) logAndDistributeAsyncInspectErrs(req *InspectRPCRequest, ctlMsgType p2pmsg.ControlMessageType, err error, count uint64, topicType p2p.CtrlMsgTopicType) {
    lg := c.logger.With().
        Err(err).
        Str("control_message_type", ctlMsgType.String()).
        Bool(logging.KeySuspicious, true).
        Bool(logging.KeyNetworkingSecurity, true).
        Str("topic_type", topicType.String()).
        Uint64("error_count", count).
        Str("peer_id", p2plogging.PeerId(req.Peer)).
        Logger()

    switch {
    case IsErrActiveClusterIDsNotSet(err):
        c.metrics.OnActiveClusterIDsNotSetErr()
        lg.Warn().Msg("active cluster ids not set")
    case IsErrUnstakedPeer(err):
        c.metrics.OnUnstakedPeerInspectionFailed()
        lg.Warn().Msg("control message received from unstaked peer")
    default:
        distErr := c.distributor.Distribute(p2p.NewInvalidControlMessageNotification(req.Peer, ctlMsgType, err, count, topicType))
        if distErr != nil {
            lg.Error().
                Err(distErr).
                Msg("failed to distribute invalid control message notification")
            return
        }
        lg.Error().Msg("rpc control message async inspection failed")
        c.metrics.OnInvalidControlMessageNotificationSent()
    }
}

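// Worked example for the hard-threshold handling above (with an assumed
// ClusterPrefixedMessage.HardThreshold of 100): while a peer's decaying cluster-prefixed message
// gauge is at or below 100, an unknown cluster ID only produces a warning log, since the peer may
// simply lag behind an epoch transition. Once the gauge exceeds 100, the UnknownClusterIDErr
// propagates to processInspectRPCReq and logAndDistributeAsyncInspectErrs disseminates an invalid
// control message notification, which feeds the peer scoring notifier. Note that
// ErrActiveClusterIdsNotSet and unstaked-peer errors are deliberately only logged and counted in
// metrics, never distributed.
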
// logAndThrowError logs and throws irrecoverable errors on the signaler context.
// Args:
// - err: the error encountered.
func (c *ControlMsgValidationInspector) logAndThrowError(err error) {
    c.logger.Error().
        Err(err).
        Bool(logging.KeySuspicious, true).
        Bool(logging.KeyNetworkingSecurity, true).
        Msg("unexpected irrecoverable error encountered")
    c.ctx.Throw(err)
}