github.com/onflow/flow-go@v0.33.17/network/p2p/tracer/gossipSubMeshTracer.go (about) 1 package tracer 2 3 import ( 4 "fmt" 5 "strconv" 6 "sync" 7 "time" 8 9 pubsub "github.com/libp2p/go-libp2p-pubsub" 10 "github.com/libp2p/go-libp2p/core/peer" 11 "github.com/libp2p/go-libp2p/core/protocol" 12 "github.com/rs/zerolog" 13 14 "github.com/onflow/flow-go/module" 15 "github.com/onflow/flow-go/module/component" 16 "github.com/onflow/flow-go/module/irrecoverable" 17 "github.com/onflow/flow-go/module/metrics" 18 "github.com/onflow/flow-go/network" 19 "github.com/onflow/flow-go/network/channels" 20 "github.com/onflow/flow-go/network/p2p" 21 p2plogging "github.com/onflow/flow-go/network/p2p/logging" 22 "github.com/onflow/flow-go/network/p2p/tracer/internal" 23 "github.com/onflow/flow-go/utils/logging" 24 ) 25 26 const ( 27 // MeshLogIntervalMsg is the message logged by the tracer every logInterval. 28 MeshLogIntervalMsg = "topic mesh peers of local node since last heartbeat" 29 30 // MeshLogIntervalWarnMsg is the message logged by the tracer every logInterval if there are unknown peers in the mesh. 31 MeshLogIntervalWarnMsg = "unknown peers in topic mesh peers of local node since last heartbeat" 32 33 // defaultLastHighestIHaveRPCSizeResetInterval is the interval that we reset the tracker of max ihave size sent back 34 // to a default. We use ihave message max size to determine the health of requested iwants from remote peers. However, 35 // we don't desire an ihave size anomaly to persist forever, hence, we reset it back to a default every minute. 36 // The choice of the interval to be a minute is in harmony with the GossipSub decay interval. 37 defaultLastHighestIHaveRPCSizeResetInterval = time.Minute 38 ) 39 40 // The GossipSubMeshTracer component in the GossipSub pubsub.RawTracer that is designed to track the local 41 // mesh peers for each topic. By logging the mesh peers and updating the local mesh size metric, the GossipSubMeshTracer 42 // provides insights into the behavior of the topology. 43 // 44 // This component also provides real-time and historical visibility into the topology. 45 // The GossipSubMeshTracer logs the mesh peers of the local node for each topic 46 // at a regular interval, enabling users to monitor the state of the mesh network and take appropriate action. 47 // Additionally, it allows users to configure the logging interval. 48 type GossipSubMeshTracer struct { 49 component.Component 50 51 topicMeshMu sync.RWMutex // to protect topicMeshMap 52 topicMeshMap map[string]map[peer.ID]struct{} // map of local mesh peers by topic. 53 logger zerolog.Logger 54 idProvider module.IdentityProvider 55 loggerInterval time.Duration 56 metrics module.LocalGossipSubRouterMetrics 57 rpcSentTracker *internal.RPCSentTracker 58 } 59 60 var _ p2p.PubSubTracer = (*GossipSubMeshTracer)(nil) 61 62 type GossipSubMeshTracerConfig struct { 63 network.NetworkingType 64 metrics.HeroCacheMetricsFactory 65 Logger zerolog.Logger 66 Metrics module.LocalGossipSubRouterMetrics 67 IDProvider module.IdentityProvider 68 LoggerInterval time.Duration 69 RpcSentTrackerCacheSize uint32 70 RpcSentTrackerWorkerQueueCacheSize uint32 71 RpcSentTrackerNumOfWorkers int 72 } 73 74 // NewGossipSubMeshTracer creates a new *GossipSubMeshTracer. 75 // Args: 76 // - *GossipSubMeshTracerConfig: the mesh tracer config. 77 // Returns: 78 // - *GossipSubMeshTracer: new mesh tracer. 79 func NewGossipSubMeshTracer(config *GossipSubMeshTracerConfig) *GossipSubMeshTracer { 80 lg := config.Logger.With().Str("component", "gossipsub_topology_tracer").Logger() 81 rpcSentTracker := internal.NewRPCSentTracker(&internal.RPCSentTrackerConfig{ 82 Logger: lg, 83 RPCSentCacheSize: config.RpcSentTrackerCacheSize, 84 RPCSentCacheCollector: metrics.GossipSubRPCSentTrackerMetricFactory(config.HeroCacheMetricsFactory, config.NetworkingType), 85 WorkerQueueCacheCollector: metrics.GossipSubRPCSentTrackerQueueMetricFactory(config.HeroCacheMetricsFactory, config.NetworkingType), 86 WorkerQueueCacheSize: config.RpcSentTrackerWorkerQueueCacheSize, 87 NumOfWorkers: config.RpcSentTrackerNumOfWorkers, 88 LastHighestIhavesSentResetInterval: defaultLastHighestIHaveRPCSizeResetInterval, 89 }) 90 g := &GossipSubMeshTracer{ 91 topicMeshMap: make(map[string]map[peer.ID]struct{}), 92 idProvider: config.IDProvider, 93 metrics: config.Metrics, 94 logger: lg, 95 loggerInterval: config.LoggerInterval, 96 rpcSentTracker: rpcSentTracker, 97 } 98 99 g.Component = component.NewComponentManagerBuilder(). 100 AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 101 ready() 102 g.logLoop(ctx) 103 }). 104 AddWorker(func(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 105 ready() 106 lg.Debug().Msg("starting rpc sent tracker") 107 g.rpcSentTracker.Start(ctx) 108 lg.Debug().Msg("rpc sent tracker started") 109 110 <-g.rpcSentTracker.Done() 111 lg.Debug().Msg("rpc sent tracker stopped") 112 }). 113 Build() 114 115 return g 116 } 117 118 // GetLocalMeshPeers returns the local mesh peers for the given topic. 119 // Args: 120 // - topic: the topic. 121 // Returns: 122 // - []peer.ID: the local mesh peers for the given topic. 123 func (t *GossipSubMeshTracer) GetLocalMeshPeers(topic channels.Topic) []peer.ID { 124 t.topicMeshMu.RLock() 125 defer t.topicMeshMu.RUnlock() 126 127 peers := make([]peer.ID, 0, len(t.topicMeshMap[topic.String()])) 128 for p := range t.topicMeshMap[topic.String()] { 129 peers = append(peers, p) 130 } 131 return peers 132 } 133 134 // Graft is called by GossipSub when a peer is added to a topic mesh. The tracer uses this to track the mesh peers. 135 func (t *GossipSubMeshTracer) Graft(p peer.ID, topic string) { 136 t.metrics.OnPeerGraftTopic(topic) 137 t.topicMeshMu.Lock() 138 defer t.topicMeshMu.Unlock() 139 140 lg := t.logger.With().Str("topic", topic).Str("peer_id", p2plogging.PeerId(p)).Logger() 141 142 if _, ok := t.topicMeshMap[topic]; !ok { 143 t.topicMeshMap[topic] = make(map[peer.ID]struct{}) 144 } 145 t.topicMeshMap[topic][p] = struct{}{} 146 meshSize := len(t.topicMeshMap[topic]) 147 148 t.metrics.OnLocalMeshSizeUpdated(topic, meshSize) 149 lg = lg.With().Int("mesh_size", meshSize).Logger() 150 151 id, exists := t.idProvider.ByPeerID(p) 152 if !exists { 153 lg.Warn(). 154 Bool(logging.KeySuspicious, true). 155 Msg("grafted peer not found in identity provider") 156 return 157 } 158 159 lg.Debug().Hex("flow_id", logging.ID(id.NodeID)).Str("role", id.Role.String()).Msg("grafted peer") 160 } 161 162 // Prune is called by GossipSub when a peer is removed from a topic mesh. The tracer uses this to track the mesh peers. 163 func (t *GossipSubMeshTracer) Prune(p peer.ID, topic string) { 164 t.metrics.OnPeerPruneTopic(topic) 165 t.topicMeshMu.Lock() 166 defer t.topicMeshMu.Unlock() 167 168 lg := t.logger.With().Str("topic", topic).Str("peer_id", p2plogging.PeerId(p)).Logger() 169 170 if _, ok := t.topicMeshMap[topic]; !ok { 171 return 172 } 173 delete(t.topicMeshMap[topic], p) 174 175 meshSize := len(t.topicMeshMap[topic]) 176 t.metrics.OnLocalMeshSizeUpdated(topic, meshSize) 177 lg = lg.With().Int("mesh_size", meshSize).Logger() 178 179 id, exists := t.idProvider.ByPeerID(p) 180 if !exists { 181 lg.Warn(). 182 Bool(logging.KeySuspicious, true). 183 Msg("pruned peer not found in identity provider") 184 185 return 186 } 187 188 lg.Debug().Hex("flow_id", logging.ID(id.NodeID)).Str("role", id.Role.String()).Msg("pruned peer") 189 } 190 191 // SendRPC is called by GossipSub when a RPC is sent. Currently, the GossipSubMeshTracer tracks iHave RPC messages that have been sent. 192 // This function can be updated to track other control messages in the future as required. 193 func (t *GossipSubMeshTracer) SendRPC(rpc *pubsub.RPC, p peer.ID) { 194 err := t.rpcSentTracker.Track(rpc) 195 if err != nil { 196 t.logger.Err(err).Bool(logging.KeyNetworkingSecurity, true).Msg("failed to track sent pubsbub rpc") 197 } 198 199 msgCount, ihaveCount, iwantCount, graftCount, pruneCount := 0, 0, 0, 0, 0 200 if rpc.Control != nil { 201 ihaveCount = len(rpc.Control.Ihave) 202 iwantCount = len(rpc.Control.Iwant) 203 graftCount = len(rpc.Control.Graft) 204 pruneCount = len(rpc.Control.Prune) 205 } 206 msgCount = len(rpc.Publish) 207 t.metrics.OnRpcReceived(msgCount, ihaveCount, iwantCount, graftCount, pruneCount) 208 if t.logger.GetLevel() == zerolog.TraceLevel { 209 t.logger.Trace(). 210 Str("remote_peer_id", p2plogging.PeerId(p)). 211 Int("subscription_option_count", len(rpc.Subscriptions)). 212 Int("publish_message_count", msgCount). 213 Int("ihave_size", ihaveCount). 214 Int("iwant_size", iwantCount). 215 Int("graft_size", graftCount). 216 Int("prune_size", pruneCount). 217 Msg("sent pubsub rpc") 218 } 219 220 t.metrics.OnRpcSent(msgCount, ihaveCount, iwantCount, graftCount, pruneCount) 221 } 222 223 // AddPeer is called by GossipSub as a callback when a peer is added to the local node on a protocol, i.e., the local node is connected to the peer on a protocol. 224 // The peer may or may not be subscribed to any topic. 225 func (t *GossipSubMeshTracer) AddPeer(p peer.ID, proto protocol.ID) { 226 if t.logger.GetLevel() == zerolog.TraceLevel { 227 t.logger.Trace(). 228 Str("local_peer_id", p2plogging.PeerId(p)). 229 Str("protocol", string(proto)). 230 Msg("peer added") 231 } 232 t.metrics.OnPeerAddedToProtocol(string(proto)) 233 } 234 235 // RemovePeer is called by GossipSub as a callback when a peer is removed from the local node, 236 // i.e., the local node is no longer connected to the peer. 237 func (t *GossipSubMeshTracer) RemovePeer(p peer.ID) { 238 t.metrics.OnPeerRemovedFromProtocol() 239 if t.logger.GetLevel() == zerolog.TraceLevel { 240 t.logger.Trace(). 241 Str("local_peer_id", p2plogging.PeerId(p)). 242 Msg("peer removed") 243 } 244 } 245 246 // Join is called by GossipSub as a callback when the local node joins a topic. 247 func (t *GossipSubMeshTracer) Join(topic string) { 248 t.metrics.OnLocalPeerJoinedTopic() 249 if t.logger.GetLevel() == zerolog.TraceLevel { 250 t.logger.Trace(). 251 Str("topic", topic). 252 Msg("local peer joined topic") 253 } 254 } 255 256 // Leave is called by GossipSub as a callback when the local node leaves a topic. 257 func (t *GossipSubMeshTracer) Leave(topic string) { 258 t.metrics.OnLocalPeerLeftTopic() 259 if t.logger.GetLevel() == zerolog.TraceLevel { 260 t.logger.Trace(). 261 Str("topic", topic). 262 Msg("local peer left topic") 263 } 264 } 265 266 // ValidateMessage is called by GossipSub as a callback when a message is received by the local node and entered the validation phase. 267 // As the result of the validation, the message may be rejected or passed to the application (i.e., Flow protocol). 268 func (t *GossipSubMeshTracer) ValidateMessage(msg *pubsub.Message) { 269 size := len(msg.Data) 270 t.metrics.OnMessageEnteredValidation(size) 271 272 if t.logger.GetLevel() > zerolog.TraceLevel { 273 return // return fast if we are not logging at trace level 274 } 275 276 lg := t.logger.With().Logger() 277 if msg.Topic != nil { 278 lg = lg.With().Str("topic", *msg.Topic).Logger() 279 } 280 from, err := peer.IDFromBytes(msg.From) 281 if err == nil { 282 lg = lg.With().Str("remote_peer_id", p2plogging.PeerId(from)).Logger() 283 } 284 285 lg.Trace(). 286 Str("received_from", p2plogging.PeerId(msg.ReceivedFrom)). 287 Int("message_size", size). 288 Msg("received pubsub message entered validation phase") 289 } 290 291 // DeliverMessage is called by GossipSub as a callback when the local node delivers a message to all subscribers of the topic. 292 func (t *GossipSubMeshTracer) DeliverMessage(msg *pubsub.Message) { 293 size := len(msg.Data) 294 t.metrics.OnMessageDeliveredToAllSubscribers(size) 295 296 if t.logger.GetLevel() > zerolog.TraceLevel { 297 return // return fast if we are not logging at trace level 298 } 299 300 lg := t.logger.With().Logger() 301 if msg.Topic != nil { 302 lg = lg.With().Str("topic", *msg.Topic).Logger() 303 } 304 from, err := peer.IDFromBytes(msg.From) 305 if err == nil { 306 lg = lg.With().Str("remote_peer_id", p2plogging.PeerId(from)).Logger() 307 } 308 309 lg.Trace(). 310 Str("received_from", p2plogging.PeerId(msg.ReceivedFrom)). 311 Int("message_size", len(msg.Data)). 312 Msg("delivered pubsub message to all subscribers") 313 } 314 315 // RejectMessage is called by GossipSub as a callback when a message is rejected by the local node. 316 // The message may be rejected for a variety of reasons, but the most common reason is that the message is invalid with respect to signature. 317 // Any message that arrives at the local node should contain the peer id of the source (i.e., the peer that created the message), the 318 // networking public key of the source, and the signature of the message. The local node uses this information to verify the message. 319 // If any of the information is missing or invalid, the message is rejected. 320 func (t *GossipSubMeshTracer) RejectMessage(msg *pubsub.Message, reason string) { 321 size := len(msg.Data) 322 t.metrics.OnMessageRejected(size, reason) 323 324 if t.logger.GetLevel() > zerolog.TraceLevel { 325 return // return fast if we are not logging at trace level 326 } 327 328 lg := t.logger.With().Logger() 329 if msg.Topic != nil { 330 lg = lg.With().Str("topic", *msg.Topic).Logger() 331 } 332 from, err := peer.IDFromBytes(msg.From) 333 if err == nil { 334 lg = lg.With().Str("remote_peer_id", p2plogging.PeerId(from)).Logger() 335 } 336 337 lg.Trace(). 338 Str("received_from", p2plogging.PeerId(msg.ReceivedFrom)). 339 Int("message_size", size). 340 Msg("rejected pubsub message") 341 342 } 343 344 // DuplicateMessage is called by GossipSub as a callback when a duplicate message is received by the local node. 345 func (t *GossipSubMeshTracer) DuplicateMessage(msg *pubsub.Message) { 346 size := len(msg.Data) 347 t.metrics.OnMessageDuplicate(size) 348 349 if t.logger.GetLevel() > zerolog.TraceLevel { 350 return // return fast if we are not logging at trace level 351 } 352 353 lg := t.logger.With().Logger() 354 if msg.Topic != nil { 355 lg = lg.With().Str("topic", *msg.Topic).Logger() 356 } 357 from, err := peer.IDFromBytes(msg.From) 358 if err == nil { 359 lg = lg.With().Str("remote_peer_id", p2plogging.PeerId(from)).Logger() 360 } 361 362 lg.Trace(). 363 Str("received_from", p2plogging.PeerId(msg.ReceivedFrom)). 364 Int("message_size", size). 365 Msg("received duplicate pubsub message") 366 } 367 368 // ThrottlePeer is called by GossipSub when a peer is throttled by the local node, i.e., the local node is not accepting any 369 // pubsub message from the peer but may still accept control messages. 370 func (t *GossipSubMeshTracer) ThrottlePeer(p peer.ID) { 371 t.logger.Warn(). 372 Bool(logging.KeyNetworkingSecurity, true). 373 Str("remote_peer_id", p2plogging.PeerId(p)). 374 Msg("throttled peer; no longer accepting pubsub messages from peer, but may still accept control messages") 375 t.metrics.OnPeerThrottled() 376 } 377 378 // RecvRPC is called by GossipSub as a callback when an inbound RPC message is received by the local node, 379 // note that the RPC already passed the RPC inspection, hence its statistics may be different from the RPC inspector metrics, as 380 // the RPC inspector metrics are updated before the RPC inspection, and the RPC may gone through truncation or rejection. 381 // This callback tracks the RPC messages as they are completely received by the local GossipSub router. 382 func (t *GossipSubMeshTracer) RecvRPC(rpc *pubsub.RPC) { 383 msgCount, ihaveCount, iwantCount, graftCount, pruneCount := 0, 0, 0, 0, 0 384 if rpc.Control != nil { 385 ihaveCount = len(rpc.Control.Ihave) 386 iwantCount = len(rpc.Control.Iwant) 387 graftCount = len(rpc.Control.Graft) 388 pruneCount = len(rpc.Control.Prune) 389 } 390 msgCount = len(rpc.Publish) 391 t.metrics.OnRpcReceived(msgCount, ihaveCount, iwantCount, graftCount, pruneCount) 392 if t.logger.GetLevel() == zerolog.TraceLevel { 393 t.logger.Trace(). 394 Int("subscription_option_count", len(rpc.Subscriptions)). 395 Int("publish_message_count", msgCount). 396 Int("ihave_size", ihaveCount). 397 Int("iwant_size", iwantCount). 398 Int("graft_size", graftCount). 399 Int("prune_size", pruneCount). 400 Msg("received pubsub rpc") 401 } 402 } 403 404 // DropRPC is called by GossipSub as a callback when an outbound RPC message is dropped by the local node, typically because the local node 405 // outbound message queue is full; or the RPC is big and the local node cannot fragment it. 406 func (t *GossipSubMeshTracer) DropRPC(rpc *pubsub.RPC, p peer.ID) { 407 msgCount, ihaveCount, iwantCount, graftCount, pruneCount := 0, 0, 0, 0, 0 408 if rpc.Control != nil { 409 ihaveCount = len(rpc.Control.Ihave) 410 iwantCount = len(rpc.Control.Iwant) 411 graftCount = len(rpc.Control.Graft) 412 pruneCount = len(rpc.Control.Prune) 413 } 414 msgCount = len(rpc.Publish) 415 t.metrics.OnRpcReceived(msgCount, ihaveCount, iwantCount, graftCount, pruneCount) 416 if t.logger.GetLevel() == zerolog.TraceLevel { 417 t.logger.Warn(). 418 Bool(logging.KeyNetworkingSecurity, true). 419 Str("remote_peer_id", p2plogging.PeerId(p)). 420 Int("subscription_option_count", len(rpc.Subscriptions)). 421 Int("publish_message_count", msgCount). 422 Int("ihave_size", ihaveCount). 423 Int("iwant_size", iwantCount). 424 Int("graft_size", graftCount). 425 Int("prune_size", pruneCount). 426 Msg("outbound rpc dropped") 427 } 428 t.metrics.OnOutboundRpcDropped() 429 } 430 431 // UndeliverableMessage is called by GossipSub as a callback when a message is dropped by the local node, typically because the local node 432 // outbound message queue is full; or the message is big and the local node cannot fragment it. 433 func (t *GossipSubMeshTracer) UndeliverableMessage(msg *pubsub.Message) { 434 t.logger.Warn(). 435 Bool(logging.KeyNetworkingSecurity, true). 436 Str("topic", *msg.Topic). 437 Str("remote_peer_id", p2plogging.PeerId(msg.ReceivedFrom)). 438 Int("message_size", len(msg.Data)). 439 Msg("undeliverable pubsub message") 440 t.metrics.OnUndeliveredMessage() 441 } 442 443 // WasIHaveRPCSent returns true if an iHave control message for the messageID was sent, otherwise false. 444 func (t *GossipSubMeshTracer) WasIHaveRPCSent(messageID string) bool { 445 return t.rpcSentTracker.WasIHaveRPCSent(messageID) 446 } 447 448 // LastHighestIHaveRPCSize returns the last highest RPC iHave message sent. 449 func (t *GossipSubMeshTracer) LastHighestIHaveRPCSize() int64 { 450 return t.rpcSentTracker.LastHighestIHaveRPCSize() 451 } 452 453 // logLoop logs the mesh peers of the local node for each topic at a regular interval. 454 func (t *GossipSubMeshTracer) logLoop(ctx irrecoverable.SignalerContext) { 455 ticker := time.NewTicker(t.loggerInterval) 456 defer ticker.Stop() 457 458 for { 459 select { 460 case <-ctx.Done(): 461 return 462 default: 463 } 464 465 select { 466 case <-ctx.Done(): 467 return 468 case <-ticker.C: 469 t.logPeers() 470 } 471 } 472 } 473 474 // logPeers logs the mesh peers of the local node for each topic. 475 // Note that based on GossipSub parameters, we expect to have between 6 and 12 peers in the mesh for each topic. 476 // Hence, choosing a heartbeat interval in the order of minutes should be sufficient to log the mesh peers of the local node. 477 // Also, note that the mesh peers are also logged reactively when a peer is added or removed from the mesh. 478 func (t *GossipSubMeshTracer) logPeers() { 479 t.topicMeshMu.RLock() 480 defer t.topicMeshMu.RUnlock() 481 for topic := range t.topicMeshMap { 482 shouldWarn := false // whether we should warn about the mesh state 483 484 topicPeers := zerolog.Dict() 485 486 peerIndex := -1 // index to keep track of peer info in different logging dictionaries. 487 for p := range t.topicMeshMap[topic] { 488 peerIndex++ 489 id, exists := t.idProvider.ByPeerID(p) 490 491 if !exists { 492 shouldWarn = true 493 topicPeers = topicPeers.Str(strconv.Itoa(peerIndex), fmt.Sprintf("pid=%s, flow_id=unknown, role=unknown", p2plogging.PeerId(p))) 494 continue 495 } 496 497 topicPeers = topicPeers.Str(strconv.Itoa(peerIndex), fmt.Sprintf("pid=%s, flow_id=%x, role=%s", p2plogging.PeerId(p), id.NodeID, id.Role.String())) 498 } 499 500 lg := t.logger.With(). 501 Dur("heartbeat_interval", t.loggerInterval). 502 Str("topic", topic). 503 Dict("topic_mesh", topicPeers). 504 Logger() 505 506 if shouldWarn { 507 lg.Warn(). 508 Bool(logging.KeySuspicious, true). 509 Msg(MeshLogIntervalWarnMsg) 510 continue 511 } 512 lg.Debug().Msg(MeshLogIntervalMsg) 513 } 514 }