github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/gossipsub.go (about)

     1  package metrics
     2  
     3  import (
     4  	"github.com/prometheus/client_golang/prometheus"
     5  	"github.com/prometheus/client_golang/prometheus/promauto"
     6  
     7  	"github.com/onflow/flow-go/module"
     8  )
     9  
    10  // LocalGossipSubRouterMetrics encapsulates the metrics collectors for GossipSub router of the local node.
    11  // It gives a lens into the local node's view of the GossipSub protocol.
    12  type LocalGossipSubRouterMetrics struct {
    13  	// localMeshSize is the number of peers in the local mesh of the node on each topic.
    14  	localMeshSize prometheus.GaugeVec
    15  
    16  	// peerAddedOnProtocolCount is the number of peers added to the local gossipsub router on a gossipsub protocol.
    17  	peerAddedOnProtocolCount prometheus.CounterVec
    18  
    19  	// peerRemovedFromProtocolCount is the number of peers removed from the local gossipsub router (i.e., blacklisted or unavailable).
    20  	peerRemovedFromProtocolCount prometheus.Counter
    21  
    22  	// localPeerJoinedTopicCount is the number of times the local node joined (i.e., subscribed) to a topic.
    23  	localPeerJoinedTopicCount prometheus.Counter
    24  
    25  	// localPeerLeftTopicCount is the number of times the local node left (i.e., unsubscribed) from a topic.
    26  	localPeerLeftTopicCount prometheus.Counter
    27  
    28  	// peerGraftTopicCount is the number of peers grafted to a topic on the local mesh of the node, i.e., the local node
    29  	// is directly connected to the peer on the topic, and exchange messages directly.
    30  	peerGraftTopicCount prometheus.CounterVec
    31  
    32  	// peerPruneTopicCount is the number of peers pruned from a topic on the local mesh of the node, i.e., the local node
    33  	// is no longer directly connected to the peer on the topic, and exchange messages indirectly.
    34  	peerPruneTopicCount prometheus.CounterVec
    35  
    36  	// messageEnteredValidationCount is the number of incoming pubsub messages entered internal validation pipeline of gossipsub.
    37  	messageEnteredValidationCount prometheus.Counter
    38  
    39  	// messageDeliveredSize is the size of messages delivered to all subscribers of the topic.
    40  	messageDeliveredSize prometheus.Histogram
    41  
    42  	// messageRejectedSize is the size of inbound messages rejected by the validation pipeline; the rejection reason is also included.
    43  	messageRejectedSize prometheus.HistogramVec
    44  
    45  	// messageDuplicateSize is the size of messages that are duplicates of already received messages.
    46  	messageDuplicateSize prometheus.Histogram
    47  
    48  	// peerThrottledCount is the number of peers that are throttled by the local node, i.e., the local node is not accepting
    49  	// any pubsub message from the peer but may still accept control messages.
    50  	peerThrottledCount prometheus.Counter
    51  
    52  	// rpcRcvCount is the number of rpc messages received and processed by the router (i.e., passed rpc inspection).
    53  	rpcRcvCount prometheus.Counter
    54  
    55  	// iWantRcvCount is the number of iwant messages received by the router on rpcs.
    56  	iWantRcvCount prometheus.Counter
    57  
    58  	// iHaveRcvCount is the number of ihave messages received by the router on rpcs.
    59  	iHaveRcvCount prometheus.Counter
    60  
    61  	// graftRcvCount is the number of graft messages received by the router on rpcs.
    62  	graftRcvCount prometheus.Counter
    63  
    64  	// pruneRcvCount is the number of prune messages received by the router on rpcs.
    65  	pruneRcvCount prometheus.Counter
    66  
    67  	// pubsubMsgRcvCount is the number of pubsub messages received by the router.
    68  	pubsubMsgRcvCount prometheus.Counter
    69  
    70  	// rpcSentCount is the number of rpc messages sent by the router.
    71  	rpcSentCount prometheus.Counter
    72  
    73  	// iWantSentCount is the number of iwant messages sent by the router on rpcs.
    74  	iWantSentCount prometheus.Counter
    75  
    76  	// iHaveSentCount is the number of ihave messages sent by the router on rpcs.
    77  	iHaveSentCount prometheus.Counter
    78  
    79  	// graftSentCount is the number of graft messages sent by the router on rpcs.
    80  	graftSentCount prometheus.Counter
    81  
    82  	// pruneSentCount is the number of prune messages sent by the router on rpcs.
    83  	pruneSentCount prometheus.Counter
    84  
    85  	// pubsubMsgSentCount is the number of pubsub messages sent by the router.
    86  	pubsubMsgSentCount prometheus.Counter
    87  
    88  	// outboundRpcDroppedCount is the number of outbound rpc messages dropped, typically because the outbound message queue is full.
    89  	outboundRpcDroppedCount prometheus.Counter
    90  
    91  	// undeliveredOutboundMessageCount is the number of undelivered messages, i.e., messages that are not delivered to at least one subscriber.
    92  	undeliveredOutboundMessageCount prometheus.Counter
    93  }
    94  
    95  func NewGossipSubLocalMeshMetrics(prefix string) *LocalGossipSubRouterMetrics {
    96  	return &LocalGossipSubRouterMetrics{
    97  		localMeshSize: *promauto.NewGaugeVec(
    98  			prometheus.GaugeOpts{
    99  				Namespace: namespaceNetwork,
   100  				Subsystem: subsystemGossip,
   101  				Name:      prefix + "gossipsub_local_mesh_size",
   102  				Help:      "number of peers in the local mesh of the node",
   103  			},
   104  			[]string{LabelChannel},
   105  		),
   106  		peerAddedOnProtocolCount: *promauto.NewCounterVec(prometheus.CounterOpts{
   107  			Namespace: namespaceNetwork,
   108  			Subsystem: subsystemGossip,
   109  			Name:      prefix + "gossipsub_added_peer_on_protocol_total",
   110  			Help:      "number of peers added to the local gossipsub router on a gossipsub protocol",
   111  		}, []string{LabelProtocol}),
   112  		peerRemovedFromProtocolCount: prometheus.NewCounter(prometheus.CounterOpts{
   113  			Namespace: namespaceNetwork,
   114  			Subsystem: subsystemGossip,
   115  			Name:      prefix + "gossipsub_removed_peer_total",
   116  			Help:      "number of peers removed from the local gossipsub router on a gossipsub protocol due to unavailability or blacklisting",
   117  		}),
   118  		localPeerJoinedTopicCount: prometheus.NewCounter(prometheus.CounterOpts{
   119  			Namespace: namespaceNetwork,
   120  			Subsystem: subsystemGossip,
   121  			Name:      prefix + "gossipsub_joined_topic_total",
   122  			Help:      "number of times the local node joined (i.e., subscribed) to a topic",
   123  		}),
   124  		localPeerLeftTopicCount: prometheus.NewCounter(prometheus.CounterOpts{
   125  			Namespace: namespaceNetwork,
   126  			Subsystem: subsystemGossip,
   127  			Name:      prefix + "gossipsub_left_topic_total",
   128  			Help:      "number of times the local node left (i.e., unsubscribed) from a topic",
   129  		}),
   130  		peerGraftTopicCount: *promauto.NewCounterVec(prometheus.CounterOpts{
   131  			Namespace: namespaceNetwork,
   132  			Subsystem: subsystemGossip,
   133  			Name:      prefix + "gossipsub_graft_topic_total",
   134  			Help:      "number of peers grafted to a topic on the local mesh of the node",
   135  		}, []string{LabelChannel}),
   136  		peerPruneTopicCount: *promauto.NewCounterVec(prometheus.CounterOpts{
   137  			Namespace: namespaceNetwork,
   138  			Subsystem: subsystemGossip,
   139  			Name:      prefix + "gossipsub_prune_topic_total",
   140  			Help:      "number of peers pruned from a topic on the local mesh of the node",
   141  		}, []string{LabelChannel}),
   142  		messageEnteredValidationCount: prometheus.NewCounter(prometheus.CounterOpts{
   143  			Namespace: namespaceNetwork,
   144  			Subsystem: subsystemGossip,
   145  			Name:      prefix + "gossipsub_message_entered_validation_total",
   146  			Help:      "number of messages entered internal validation pipeline of gossipsub",
   147  		}),
   148  		messageDeliveredSize: prometheus.NewHistogram(prometheus.HistogramOpts{
   149  			Namespace: namespaceNetwork,
   150  			Subsystem: subsystemGossip,
   151  			Buckets:   []float64{KiB, 100 * KiB, 1 * MiB},
   152  			Name:      prefix + "gossipsub_message_delivered_size",
   153  			Help:      "size of messages delivered to all subscribers of the topic",
   154  		}),
   155  		messageRejectedSize: *promauto.NewHistogramVec(prometheus.HistogramOpts{
   156  			Namespace: namespaceNetwork,
   157  			Subsystem: subsystemGossip,
   158  			Name:      prefix + "gossipsub_message_rejected_size_bytes",
   159  			Help:      "size of messages rejected by the validation pipeline",
   160  		}, []string{LabelRejectionReason}),
   161  		messageDuplicateSize: prometheus.NewHistogram(prometheus.HistogramOpts{
   162  			Namespace: namespaceNetwork,
   163  			Subsystem: subsystemGossip,
   164  			Buckets:   []float64{KiB, 100 * KiB, 1 * MiB},
   165  			Name:      prefix + "gossipsub_duplicate_message_size_bytes",
   166  			Help:      "size of messages that are duplicates of already received messages",
   167  		}),
   168  		peerThrottledCount: prometheus.NewCounter(prometheus.CounterOpts{
   169  			Namespace: namespaceNetwork,
   170  			Subsystem: subsystemGossip,
   171  			Name:      prefix + "gossipsub_peer_throttled_total",
   172  			Help:      "number of peers that are throttled by the local node, i.e., the local node is not accepting any pubsub message from the peer but may still accept control messages",
   173  		}),
   174  		rpcRcvCount: prometheus.NewCounter(prometheus.CounterOpts{
   175  			Namespace: namespaceNetwork,
   176  			Subsystem: subsystemGossip,
   177  			Name:      prefix + "gossipsub_rpc_received_total",
   178  			Help:      "number of rpc messages received and processed by the router (i.e., passed rpc inspection)",
   179  		}),
   180  		rpcSentCount: prometheus.NewCounter(prometheus.CounterOpts{
   181  			Namespace: namespaceNetwork,
   182  			Subsystem: subsystemGossip,
   183  			Name:      prefix + "gossipsub_rpc_sent_total",
   184  			Help:      "number of rpc messages sent by the router",
   185  		}),
   186  		outboundRpcDroppedCount: prometheus.NewCounter(prometheus.CounterOpts{
   187  			Namespace: namespaceNetwork,
   188  			Subsystem: subsystemGossip,
   189  			Name:      prefix + "gossipsub_rpc_dropped_total",
   190  			Help:      "number of outbound rpc messages dropped, typically because the outbound message queue is full",
   191  		}),
   192  		undeliveredOutboundMessageCount: prometheus.NewCounter(prometheus.CounterOpts{
   193  			Namespace: namespaceNetwork,
   194  			Subsystem: subsystemGossip,
   195  			Name:      prefix + "gossipsub_undelivered_message_total",
   196  			Help:      "number of undelivered messages, i.e., messages that are not delivered to at least one subscriber",
   197  		}),
   198  		iHaveRcvCount: prometheus.NewCounter(prometheus.CounterOpts{
   199  			Namespace: namespaceNetwork,
   200  			Subsystem: subsystemGossip,
   201  			Name:      prefix + "gossipsub_ihave_received_total",
   202  			Help:      "number of ihave messages received by the router on rpcs",
   203  		}),
   204  		iWantRcvCount: prometheus.NewCounter(prometheus.CounterOpts{
   205  			Namespace: namespaceNetwork,
   206  			Subsystem: subsystemGossip,
   207  			Name:      prefix + "gossipsub_iwant_received_total",
   208  			Help:      "number of iwant messages received by the router on rpcs",
   209  		}),
   210  		graftRcvCount: prometheus.NewCounter(prometheus.CounterOpts{
   211  			Namespace: namespaceNetwork,
   212  			Subsystem: subsystemGossip,
   213  			Name:      prefix + "gossipsub_graft_received_total",
   214  			Help:      "number of graft messages received by the router on rpcs",
   215  		}),
   216  		pruneRcvCount: prometheus.NewCounter(prometheus.CounterOpts{
   217  			Namespace: namespaceNetwork,
   218  			Subsystem: subsystemGossip,
   219  			Name:      prefix + "gossipsub_prune_received_total",
   220  			Help:      "number of prune messages received by the router on rpcs",
   221  		}),
   222  		pubsubMsgRcvCount: prometheus.NewCounter(prometheus.CounterOpts{
   223  			Namespace: namespaceNetwork,
   224  			Subsystem: subsystemGossip,
   225  			Name:      prefix + "gossipsub_pubsub_message_received_total",
   226  			Help:      "number of pubsub messages received by the router",
   227  		}),
   228  		iHaveSentCount: prometheus.NewCounter(prometheus.CounterOpts{
   229  			Namespace: namespaceNetwork,
   230  			Subsystem: subsystemGossip,
   231  			Name:      prefix + "gossipsub_ihave_sent_total",
   232  			Help:      "number of ihave messages sent by the router on rpcs",
   233  		}),
   234  		iWantSentCount: prometheus.NewCounter(prometheus.CounterOpts{
   235  			Namespace: namespaceNetwork,
   236  			Subsystem: subsystemGossip,
   237  			Name:      prefix + "gossipsub_iwant_sent_total",
   238  			Help:      "number of iwant messages sent by the router on rpcs",
   239  		}),
   240  		graftSentCount: prometheus.NewCounter(prometheus.CounterOpts{
   241  			Namespace: namespaceNetwork,
   242  			Subsystem: subsystemGossip,
   243  			Name:      prefix + "gossipsub_graft_sent_total",
   244  			Help:      "number of graft messages sent by the router on rpcs",
   245  		}),
   246  		pruneSentCount: prometheus.NewCounter(prometheus.CounterOpts{
   247  			Namespace: namespaceNetwork,
   248  			Subsystem: subsystemGossip,
   249  			Name:      prefix + "gossipsub_prune_sent_total",
   250  			Help:      "number of prune messages sent by the router on rpcs",
   251  		}),
   252  		pubsubMsgSentCount: prometheus.NewCounter(prometheus.CounterOpts{
   253  			Namespace: namespaceNetwork,
   254  			Subsystem: subsystemGossip,
   255  			Name:      prefix + "gossipsub_pubsub_message_sent_total",
   256  			Help:      "number of pubsub messages sent by the router",
   257  		}),
   258  	}
   259  }
   260  
// Compile-time assertion that *LocalGossipSubRouterMetrics implements module.LocalGossipSubRouterMetrics.
var _ module.LocalGossipSubRouterMetrics = (*LocalGossipSubRouterMetrics)(nil)
   262  
   263  // OnLocalMeshSizeUpdated updates the local mesh size metric.
   264  func (g *LocalGossipSubRouterMetrics) OnLocalMeshSizeUpdated(topic string, size int) {
   265  	g.localMeshSize.WithLabelValues(topic).Set(float64(size))
   266  }
   267  
   268  // OnPeerAddedToProtocol is called when the local node receives a stream from a peer on a gossipsub-related protocol.
   269  // Args:
   270  //
   271  //	protocol: the protocol name that the peer is connected to.
   272  func (g *LocalGossipSubRouterMetrics) OnPeerAddedToProtocol(protocol string) {
   273  	g.peerAddedOnProtocolCount.WithLabelValues(protocol).Inc()
   274  }
   275  
   276  // OnPeerRemovedFromProtocol is called when the local considers a remote peer blacklisted or unavailable.
   277  func (g *LocalGossipSubRouterMetrics) OnPeerRemovedFromProtocol() {
   278  	g.peerRemovedFromProtocolCount.Inc()
   279  }
   280  
   281  // OnLocalPeerJoinedTopic is called when the local node subscribes to a gossipsub topic.
   282  // Args:
   283  //
   284  //	topic: the topic that the local peer subscribed to.
   285  func (g *LocalGossipSubRouterMetrics) OnLocalPeerJoinedTopic() {
   286  	g.localPeerJoinedTopicCount.Inc()
   287  }
   288  
   289  // OnLocalPeerLeftTopic is called when the local node unsubscribes from a gossipsub topic.
   290  // Args:
   291  //
   292  //	topic: the topic that the local peer has unsubscribed from.
   293  func (g *LocalGossipSubRouterMetrics) OnLocalPeerLeftTopic() {
   294  	g.localPeerLeftTopicCount.Inc()
   295  }
   296  
   297  // OnPeerGraftTopic is called when the local node receives a GRAFT message from a remote peer on a topic.
   298  // Note: the received GRAFT at this point is considered passed the RPC inspection, and is accepted by the local node.
   299  func (g *LocalGossipSubRouterMetrics) OnPeerGraftTopic(topic string) {
   300  	g.peerGraftTopicCount.WithLabelValues(topic).Inc()
   301  }
   302  
   303  // OnPeerPruneTopic is called when the local node receives a PRUNE message from a remote peer on a topic.
   304  // Note: the received PRUNE at this point is considered passed the RPC inspection, and is accepted by the local node.
   305  func (g *LocalGossipSubRouterMetrics) OnPeerPruneTopic(topic string) {
   306  	g.peerPruneTopicCount.WithLabelValues(topic).Inc()
   307  }
   308  
   309  // OnMessageEnteredValidation is called when a received pubsub message enters the validation pipeline. It is the
   310  // internal validation pipeline of GossipSub protocol. The message may be rejected or accepted by the validation
   311  // pipeline.
   312  func (g *LocalGossipSubRouterMetrics) OnMessageEnteredValidation(int) {
   313  	g.messageEnteredValidationCount.Inc()
   314  }
   315  
   316  // OnMessageRejected is called when a received pubsub message is rejected by the validation pipeline.
   317  // Args:
   318  //
   319  //	reason: the reason for rejection.
   320  //	size: the size of the rejected message.
   321  func (g *LocalGossipSubRouterMetrics) OnMessageRejected(size int, reason string) {
   322  	g.messageRejectedSize.WithLabelValues(reason).Observe(float64(size))
   323  }
   324  
   325  // OnMessageDuplicate is called when a received pubsub message is a duplicate of a previously received message, and
   326  // is dropped.
   327  // Args:
   328  //
   329  //	size: the size of the duplicate message.
   330  func (g *LocalGossipSubRouterMetrics) OnMessageDuplicate(size int) {
   331  	g.messageDuplicateSize.Observe(float64(size))
   332  }
   333  
   334  // OnPeerThrottled is called when a peer is throttled by the local node, i.e., the local node is not accepting any
   335  // pubsub message from the peer but may still accept control messages.
   336  func (g *LocalGossipSubRouterMetrics) OnPeerThrottled() {
   337  	g.peerThrottledCount.Inc()
   338  }
   339  
   340  // OnRpcReceived is called when an RPC message is received by the local node. The received RPC is considered
   341  // passed the RPC inspection, and is accepted by the local node.
   342  func (g *LocalGossipSubRouterMetrics) OnRpcReceived(msgCount int, iHaveCount int, iWantCount int, graftCount int, pruneCount int) {
   343  	g.rpcRcvCount.Inc()
   344  	g.pubsubMsgRcvCount.Add(float64(msgCount))
   345  	g.iHaveRcvCount.Add(float64(iHaveCount))
   346  	g.iWantRcvCount.Add(float64(iWantCount))
   347  	g.graftRcvCount.Add(float64(graftCount))
   348  	g.pruneRcvCount.Add(float64(pruneCount))
   349  }
   350  
   351  // OnRpcSent is called when an RPC message is sent by the local node.
   352  // Note: the sent RPC is considered passed the RPC inspection, and is accepted by the local node.
   353  func (g *LocalGossipSubRouterMetrics) OnRpcSent(msgCount int, iHaveCount int, iWantCount int, graftCount int, pruneCount int) {
   354  	g.rpcSentCount.Inc()
   355  	g.pubsubMsgSentCount.Add(float64(msgCount))
   356  	g.iHaveSentCount.Add(float64(iHaveCount))
   357  	g.iWantSentCount.Add(float64(iWantCount))
   358  	g.graftSentCount.Add(float64(graftCount))
   359  	g.pruneSentCount.Add(float64(pruneCount))
   360  }
   361  
   362  // OnOutboundRpcDropped is called when an outbound RPC message is dropped by the local node, typically because the local node
   363  // outbound message queue is full; or the RPC is big and the local node cannot fragment it.
   364  func (g *LocalGossipSubRouterMetrics) OnOutboundRpcDropped() {
   365  	g.outboundRpcDroppedCount.Inc()
   366  }
   367  
   368  // OnUndeliveredMessage is called when a message is not delivered at least one subscriber of the topic, for example when
   369  // the subscriber is too slow to process the message.
   370  func (g *LocalGossipSubRouterMetrics) OnUndeliveredMessage() {
   371  	g.undeliveredOutboundMessageCount.Inc()
   372  }
   373  
   374  // OnMessageDeliveredToAllSubscribers is called when a message is delivered to all subscribers of the topic.
   375  // Args:
   376  //
   377  //	size: the size of the delivered message.
   378  func (g *LocalGossipSubRouterMetrics) OnMessageDeliveredToAllSubscribers(size int) {
   379  	g.messageDeliveredSize.Observe(float64(size))
   380  }