github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/network.go (about)

     1  package metrics
     2  
     3  import (
     4  	"strconv"
     5  	"time"
     6  
     7  	"github.com/libp2p/go-libp2p/core/peer"
     8  	"github.com/prometheus/client_golang/prometheus"
     9  	"github.com/prometheus/client_golang/prometheus/promauto"
    10  	"github.com/rs/zerolog"
    11  
    12  	"github.com/onflow/flow-go/module"
    13  	logging2 "github.com/onflow/flow-go/network/p2p/logging"
    14  	"github.com/onflow/flow-go/utils/logging"
    15  )
    16  
    17  const (
    18  	_   = iota
    19  	KiB = 1 << (10 * iota)
    20  	MiB
    21  	GiB
    22  )
    23  
// NetworkCollector bundles the Prometheus metrics of the networking layer. It embeds
// the sub-collectors built in NewNetworkCollector and additionally holds the message,
// queue, connection, DNS, routing-table and security metrics declared below.
//
// NOTE(review): OnRateLimitedPeer reads nc.logger, but no logger field is declared
// directly on this struct; presumably it is promoted from one of the embedded
// collectors — confirm.
type NetworkCollector struct {
	// Embedded sub-collectors; each is populated by its own constructor in
	// NewNetworkCollector.
	*UnicastManagerMetrics
	*LibP2PResourceManagerMetrics
	*GossipSubScoreMetrics
	*LocalGossipSubRouterMetrics
	*GossipSubRpcValidationInspectorMetrics
	*GossipSubScoringRegistryMetrics
	*AlspMetrics
	// Message size histograms and duplicate counter, labelled by channel, protocol and message.
	outboundMessageSize          *prometheus.HistogramVec
	inboundMessageSize           *prometheus.HistogramVec
	duplicateMessagesDropped     *prometheus.CounterVec
	// Receive-queue size and time-in-queue, labelled by priority.
	queueSize                    *prometheus.GaugeVec
	queueDuration                *prometheus.HistogramVec
	// In-flight message gauges and accumulated engine processing time, labelled by channel.
	numMessagesProcessing        *prometheus.GaugeVec
	numDirectMessagesSending     *prometheus.GaugeVec
	inboundProcessTime           *prometheus.CounterVec
	// Current number of outbound / inbound connections of this node.
	outboundConnectionCount      prometheus.Gauge
	inboundConnectionCount       prometheus.Gauge
	// DNS resolution metrics: lookup duration (milliseconds) and cache/queue counters.
	dnsLookupDuration            prometheus.Histogram
	dnsCacheMissCount            prometheus.Counter
	dnsCacheHitCount             prometheus.Counter
	dnsCacheInvalidationCount    prometheus.Counter
	dnsLookupRequestDroppedCount prometheus.Counter
	// Size of the DHT routing table.
	routingTableSize             prometheus.Gauge

	// security metrics
	unAuthorizedMessagesCount       *prometheus.CounterVec
	rateLimitedUnicastMessagesCount *prometheus.CounterVec
	violationReportSkippedCount     prometheus.Counter

	// prefix is prepended to every metric name created in NewNetworkCollector. It is
	// empty unless WithNetworkPrefix sets it, in which case it ends with "_".
	prefix string
}
    56  
// Compile-time assertion that *NetworkCollector implements module.NetworkMetrics.
var _ module.NetworkMetrics = (*NetworkCollector)(nil)

// NetworkCollectorOpt is a functional option that configures a NetworkCollector.
// NewNetworkCollector applies all options before it creates any metric.
type NetworkCollectorOpt func(*NetworkCollector)
    60  
    61  func WithNetworkPrefix(prefix string) NetworkCollectorOpt {
    62  	return func(nc *NetworkCollector) {
    63  		if prefix != "" {
    64  			nc.prefix = prefix + "_"
    65  		}
    66  	}
    67  }
    68  
    69  func NewNetworkCollector(logger zerolog.Logger, opts ...NetworkCollectorOpt) *NetworkCollector {
    70  	nc := &NetworkCollector{}
    71  
    72  	for _, opt := range opts {
    73  		opt(nc)
    74  	}
    75  
    76  	nc.UnicastManagerMetrics = NewUnicastManagerMetrics(nc.prefix)
    77  	nc.LibP2PResourceManagerMetrics = NewLibP2PResourceManagerMetrics(logger, nc.prefix)
    78  	nc.LocalGossipSubRouterMetrics = NewGossipSubLocalMeshMetrics(nc.prefix)
    79  	nc.GossipSubScoreMetrics = NewGossipSubScoreMetrics(nc.prefix)
    80  	nc.GossipSubRpcValidationInspectorMetrics = NewGossipSubRPCValidationInspectorMetrics(nc.prefix)
    81  	nc.GossipSubScoringRegistryMetrics = NewGossipSubScoringRegistryMetrics(nc.prefix)
    82  	nc.AlspMetrics = NewAlspMetrics()
    83  
    84  	nc.outboundMessageSize = promauto.NewHistogramVec(
    85  		prometheus.HistogramOpts{
    86  			Namespace: namespaceNetwork,
    87  			Subsystem: subsystemGossip,
    88  			Name:      nc.prefix + "outbound_message_size_bytes",
    89  			Help:      "size of the outbound network message",
    90  			Buckets:   []float64{KiB, 100 * KiB, 1 * MiB},
    91  		}, []string{LabelChannel, LabelProtocol, LabelMessage},
    92  	)
    93  
    94  	nc.inboundMessageSize = promauto.NewHistogramVec(
    95  		prometheus.HistogramOpts{
    96  			Namespace: namespaceNetwork,
    97  			Subsystem: subsystemGossip,
    98  			Name:      nc.prefix + "inbound_message_size_bytes",
    99  			Help:      "size of the inbound network message",
   100  			Buckets:   []float64{KiB, 100 * KiB, 1 * MiB},
   101  		}, []string{LabelChannel, LabelProtocol, LabelMessage},
   102  	)
   103  
   104  	nc.duplicateMessagesDropped = promauto.NewCounterVec(
   105  		prometheus.CounterOpts{
   106  			Namespace: namespaceNetwork,
   107  			Subsystem: subsystemGossip,
   108  			Name:      nc.prefix + "duplicate_messages_dropped",
   109  			Help:      "number of duplicate messages dropped",
   110  		}, []string{LabelChannel, LabelProtocol, LabelMessage},
   111  	)
   112  
   113  	nc.dnsLookupDuration = promauto.NewHistogram(
   114  		prometheus.HistogramOpts{
   115  			Namespace: namespaceNetwork,
   116  			Subsystem: subsystemGossip,
   117  			Name:      nc.prefix + "dns_lookup_duration_ms",
   118  			Buckets:   []float64{1, 10, 100, 500, 1000, 2000},
   119  			Help:      "the time spent on resolving a dns lookup (including cache hits)",
   120  		},
   121  	)
   122  
   123  	nc.dnsCacheMissCount = promauto.NewCounter(
   124  		prometheus.CounterOpts{
   125  			Namespace: namespaceNetwork,
   126  			Subsystem: subsystemGossip,
   127  			Name:      nc.prefix + "dns_cache_miss_total",
   128  			Help:      "the number of dns lookups that miss the cache and made through network",
   129  		},
   130  	)
   131  
   132  	nc.dnsCacheInvalidationCount = promauto.NewCounter(
   133  		prometheus.CounterOpts{
   134  			Namespace: namespaceNetwork,
   135  			Subsystem: subsystemGossip,
   136  			Name:      nc.prefix + "dns_cache_invalidation_total",
   137  			Help:      "the number of times dns cache is invalidated for an entry",
   138  		},
   139  	)
   140  
   141  	nc.dnsCacheHitCount = promauto.NewCounter(
   142  		prometheus.CounterOpts{
   143  			Namespace: namespaceNetwork,
   144  			Subsystem: subsystemGossip,
   145  			Name:      nc.prefix + "dns_cache_hit_total",
   146  			Help:      "the number of dns cache hits",
   147  		},
   148  	)
   149  
   150  	nc.dnsLookupRequestDroppedCount = promauto.NewCounter(
   151  		prometheus.CounterOpts{
   152  			Namespace: namespaceNetwork,
   153  			Subsystem: subsystemGossip,
   154  			Name:      nc.prefix + "dns_lookup_requests_dropped_total",
   155  			Help:      "the number of dns lookup requests dropped",
   156  		},
   157  	)
   158  
   159  	nc.queueSize = promauto.NewGaugeVec(
   160  		prometheus.GaugeOpts{
   161  			Namespace: namespaceNetwork,
   162  			Subsystem: subsystemQueue,
   163  			Name:      nc.prefix + "message_queue_size",
   164  			Help:      "the number of elements in the message receive queue",
   165  		}, []string{LabelPriority},
   166  	)
   167  
   168  	nc.queueDuration = promauto.NewHistogramVec(
   169  		prometheus.HistogramOpts{
   170  			Namespace: namespaceNetwork,
   171  			Subsystem: subsystemQueue,
   172  			Name:      nc.prefix + "message_queue_duration_seconds",
   173  			Help:      "duration [seconds; measured with float64 precision] of how long a message spent in the queue before delivered to an engine.",
   174  			Buckets:   []float64{0.01, 0.1, 0.5, 1, 2, 5}, // 10ms, 100ms, 500ms, 1s, 2s, 5s
   175  		}, []string{LabelPriority},
   176  	)
   177  
   178  	nc.numMessagesProcessing = promauto.NewGaugeVec(
   179  		prometheus.GaugeOpts{
   180  			Namespace: namespaceNetwork,
   181  			Subsystem: subsystemQueue,
   182  			Name:      nc.prefix + "current_messages_processing",
   183  			Help:      "the number of messages currently being processed",
   184  		}, []string{LabelChannel},
   185  	)
   186  
   187  	nc.numDirectMessagesSending = promauto.NewGaugeVec(
   188  		prometheus.GaugeOpts{
   189  			Namespace: namespaceNetwork,
   190  			Subsystem: subsystemGossip,
   191  			Name:      nc.prefix + "direct_messages_in_progress",
   192  			Help:      "the number of direct messages currently in the process of sending",
   193  		}, []string{LabelChannel},
   194  	)
   195  
   196  	nc.inboundProcessTime = promauto.NewCounterVec(
   197  		prometheus.CounterOpts{
   198  			Namespace: namespaceNetwork,
   199  			Subsystem: subsystemQueue,
   200  			Name:      nc.prefix + "engine_message_processing_time_seconds",
   201  			Help:      "duration [seconds; measured with float64 precision] of how long a queue worker blocked for an engine processing message",
   202  		}, []string{LabelChannel},
   203  	)
   204  
   205  	nc.outboundConnectionCount = promauto.NewGauge(
   206  		prometheus.GaugeOpts{
   207  			Namespace: namespaceNetwork,
   208  			Subsystem: subsystemQueue,
   209  			Name:      nc.prefix + "outbound_connection_count",
   210  			Help:      "the number of outbound connections of this node",
   211  		},
   212  	)
   213  
   214  	nc.inboundConnectionCount = promauto.NewGauge(
   215  		prometheus.GaugeOpts{
   216  			Namespace: namespaceNetwork,
   217  			Subsystem: subsystemQueue,
   218  			Name:      nc.prefix + "inbound_connection_count",
   219  			Help:      "the number of inbound connections of this node",
   220  		},
   221  	)
   222  
   223  	nc.routingTableSize = promauto.NewGauge(
   224  		prometheus.GaugeOpts{
   225  			Name:      nc.prefix + "routing_table_size",
   226  			Namespace: namespaceNetwork,
   227  			Subsystem: subsystemDHT,
   228  			Help:      "the size of the DHT routing table",
   229  		},
   230  	)
   231  
   232  	nc.unAuthorizedMessagesCount = promauto.NewCounterVec(
   233  		prometheus.CounterOpts{
   234  			Namespace: namespaceNetwork,
   235  			Subsystem: subsystemAuth,
   236  			Name:      nc.prefix + "unauthorized_messages_count",
   237  			Help:      "number of messages that failed authorization validation",
   238  		}, []string{LabelNodeRole, LabelMessage, LabelChannel, LabelViolationReason},
   239  	)
   240  
   241  	nc.rateLimitedUnicastMessagesCount = promauto.NewCounterVec(
   242  		prometheus.CounterOpts{
   243  			Namespace: namespaceNetwork,
   244  			Subsystem: subsystemRateLimiting,
   245  			Name:      nc.prefix + "rate_limited_unicast_messages_count",
   246  			Help:      "number of messages sent via unicast that have been rate limited",
   247  		}, []string{LabelNodeRole, LabelMessage, LabelChannel, LabelRateLimitReason},
   248  	)
   249  
   250  	nc.violationReportSkippedCount = promauto.NewCounter(
   251  		prometheus.CounterOpts{
   252  			Namespace: namespaceNetwork,
   253  			Subsystem: subsystemSecurity,
   254  			Name:      nc.prefix + "slashing_violation_reports_skipped_count",
   255  			Help:      "number of slashing violations consumer violations that were not reported for misbehavior because the identity of the sender not known",
   256  		},
   257  	)
   258  
   259  	return nc
   260  }
   261  
   262  // OutboundMessageSent collects metrics related to a message sent by the node.
   263  func (nc *NetworkCollector) OutboundMessageSent(sizeBytes int, topic, protocol, messageType string) {
   264  	nc.outboundMessageSize.WithLabelValues(topic, protocol, messageType).Observe(float64(sizeBytes))
   265  }
   266  
   267  // InboundMessageReceived collects metrics related to a message received by the node.
   268  func (nc *NetworkCollector) InboundMessageReceived(sizeBytes int, topic, protocol, messageType string) {
   269  	nc.inboundMessageSize.WithLabelValues(topic, protocol, messageType).Observe(float64(sizeBytes))
   270  }
   271  
   272  // DuplicateInboundMessagesDropped increments the metric tracking the number of duplicate messages dropped by the node.
   273  func (nc *NetworkCollector) DuplicateInboundMessagesDropped(topic, protocol, messageType string) {
   274  	nc.duplicateMessagesDropped.WithLabelValues(topic, protocol, messageType).Add(1)
   275  }
   276  
   277  func (nc *NetworkCollector) MessageAdded(priority int) {
   278  	nc.queueSize.WithLabelValues(strconv.Itoa(priority)).Inc()
   279  }
   280  
   281  func (nc *NetworkCollector) MessageRemoved(priority int) {
   282  	nc.queueSize.WithLabelValues(strconv.Itoa(priority)).Dec()
   283  }
   284  
   285  func (nc *NetworkCollector) QueueDuration(duration time.Duration, priority int) {
   286  	nc.queueDuration.WithLabelValues(strconv.Itoa(priority)).Observe(duration.Seconds())
   287  }
   288  
   289  // MessageProcessingStarted increments the metric tracking the number of messages being processed by the node.
   290  func (nc *NetworkCollector) MessageProcessingStarted(topic string) {
   291  	nc.numMessagesProcessing.WithLabelValues(topic).Inc()
   292  }
   293  
   294  // UnicastMessageSendingStarted increments the metric tracking the number of unicast messages sent by the node.
   295  func (nc *NetworkCollector) UnicastMessageSendingStarted(topic string) {
   296  	nc.numDirectMessagesSending.WithLabelValues(topic).Inc()
   297  }
   298  
   299  // UnicastMessageSendingCompleted decrements the metric tracking the number of unicast messages sent by the node.
   300  func (nc *NetworkCollector) UnicastMessageSendingCompleted(topic string) {
   301  	nc.numDirectMessagesSending.WithLabelValues(topic).Dec()
   302  }
   303  
   304  func (nc *NetworkCollector) RoutingTablePeerAdded() {
   305  	nc.routingTableSize.Inc()
   306  }
   307  
   308  func (nc *NetworkCollector) RoutingTablePeerRemoved() {
   309  	nc.routingTableSize.Dec()
   310  }
   311  
   312  // MessageProcessingFinished tracks the time spent by the node to process a message and decrements the metric tracking
   313  // the number of messages being processed by the node.
   314  func (nc *NetworkCollector) MessageProcessingFinished(topic string, duration time.Duration) {
   315  	nc.numMessagesProcessing.WithLabelValues(topic).Dec()
   316  	nc.inboundProcessTime.WithLabelValues(topic).Add(duration.Seconds())
   317  }
   318  
   319  // OutboundConnections updates the metric tracking the number of outbound connections of this node
   320  func (nc *NetworkCollector) OutboundConnections(connectionCount uint) {
   321  	nc.outboundConnectionCount.Set(float64(connectionCount))
   322  }
   323  
   324  // InboundConnections updates the metric tracking the number of inbound connections of this node
   325  func (nc *NetworkCollector) InboundConnections(connectionCount uint) {
   326  	nc.inboundConnectionCount.Set(float64(connectionCount))
   327  }
   328  
   329  // DNSLookupDuration tracks the time spent to resolve a DNS address.
   330  func (nc *NetworkCollector) DNSLookupDuration(duration time.Duration) {
   331  	nc.dnsLookupDuration.Observe(float64(duration.Milliseconds()))
   332  }
   333  
   334  // OnDNSCacheMiss tracks the total number of dns requests resolved through looking up the network.
   335  func (nc *NetworkCollector) OnDNSCacheMiss() {
   336  	nc.dnsCacheMissCount.Inc()
   337  }
   338  
   339  // OnDNSCacheInvalidated is called whenever dns cache is invalidated for an entry
   340  func (nc *NetworkCollector) OnDNSCacheInvalidated() {
   341  	nc.dnsCacheInvalidationCount.Inc()
   342  }
   343  
   344  // OnDNSCacheHit tracks the total number of dns requests resolved through the cache without
   345  // looking up the network.
   346  func (nc *NetworkCollector) OnDNSCacheHit() {
   347  	nc.dnsCacheHitCount.Inc()
   348  }
   349  
   350  // OnDNSLookupRequestDropped tracks the number of dns lookup requests that are dropped due to a full queue
   351  func (nc *NetworkCollector) OnDNSLookupRequestDropped() {
   352  	nc.dnsLookupRequestDroppedCount.Inc()
   353  }
   354  
   355  // OnUnauthorizedMessage tracks the number of unauthorized messages seen on the network.
   356  func (nc *NetworkCollector) OnUnauthorizedMessage(role, msgType, topic, offense string) {
   357  	nc.unAuthorizedMessagesCount.WithLabelValues(role, msgType, topic, offense).Inc()
   358  }
   359  
   360  // OnRateLimitedPeer tracks the number of rate limited messages seen on the network.
   361  func (nc *NetworkCollector) OnRateLimitedPeer(peerID peer.ID, role, msgType, topic, reason string) {
   362  	nc.logger.Warn().
   363  		Str("peer_id", logging2.PeerId(peerID)).
   364  		Str("role", role).
   365  		Str("message_type", msgType).
   366  		Str("topic", topic).
   367  		Str("reason", reason).
   368  		Bool(logging.KeySuspicious, true).
   369  		Msg("unicast peer rate limited")
   370  	nc.rateLimitedUnicastMessagesCount.WithLabelValues(role, msgType, topic, reason).Inc()
   371  }
   372  
   373  // OnViolationReportSkipped tracks the number of slashing violations consumer violations that were not
   374  // reported for misbehavior when the identity of the sender not known.
   375  func (nc *NetworkCollector) OnViolationReportSkipped() {
   376  	nc.violationReportSkippedCount.Inc()
   377  }