github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/module/metrics/network.go (about) 1 package metrics 2 3 import ( 4 "strconv" 5 "time" 6 7 "github.com/libp2p/go-libp2p/core/peer" 8 "github.com/prometheus/client_golang/prometheus" 9 "github.com/prometheus/client_golang/prometheus/promauto" 10 "github.com/rs/zerolog" 11 12 "github.com/onflow/flow-go/module" 13 logging2 "github.com/onflow/flow-go/network/p2p/logging" 14 "github.com/onflow/flow-go/utils/logging" 15 ) 16 17 const ( 18 _ = iota 19 KiB = 1 << (10 * iota) 20 MiB 21 GiB 22 ) 23 24 type NetworkCollector struct { 25 *UnicastManagerMetrics 26 *LibP2PResourceManagerMetrics 27 *GossipSubScoreMetrics 28 *LocalGossipSubRouterMetrics 29 *GossipSubRpcValidationInspectorMetrics 30 *GossipSubScoringRegistryMetrics 31 *AlspMetrics 32 outboundMessageSize *prometheus.HistogramVec 33 inboundMessageSize *prometheus.HistogramVec 34 duplicateMessagesDropped *prometheus.CounterVec 35 queueSize *prometheus.GaugeVec 36 queueDuration *prometheus.HistogramVec 37 numMessagesProcessing *prometheus.GaugeVec 38 numDirectMessagesSending *prometheus.GaugeVec 39 inboundProcessTime *prometheus.CounterVec 40 outboundConnectionCount prometheus.Gauge 41 inboundConnectionCount prometheus.Gauge 42 dnsLookupDuration prometheus.Histogram 43 dnsCacheMissCount prometheus.Counter 44 dnsCacheHitCount prometheus.Counter 45 dnsCacheInvalidationCount prometheus.Counter 46 dnsLookupRequestDroppedCount prometheus.Counter 47 routingTableSize prometheus.Gauge 48 49 // security metrics 50 unAuthorizedMessagesCount *prometheus.CounterVec 51 rateLimitedUnicastMessagesCount *prometheus.CounterVec 52 violationReportSkippedCount prometheus.Counter 53 54 prefix string 55 } 56 57 var _ module.NetworkMetrics = (*NetworkCollector)(nil) 58 59 type NetworkCollectorOpt func(*NetworkCollector) 60 61 func WithNetworkPrefix(prefix string) NetworkCollectorOpt { 62 return func(nc *NetworkCollector) { 63 if prefix != "" { 64 nc.prefix = prefix + "_" 65 } 66 } 67 } 68 69 func NewNetworkCollector(logger zerolog.Logger, opts ...NetworkCollectorOpt) *NetworkCollector { 70 nc := &NetworkCollector{} 71 72 for _, opt := range opts { 73 opt(nc) 74 } 75 76 nc.UnicastManagerMetrics = NewUnicastManagerMetrics(nc.prefix) 77 nc.LibP2PResourceManagerMetrics = NewLibP2PResourceManagerMetrics(logger, nc.prefix) 78 nc.LocalGossipSubRouterMetrics = NewGossipSubLocalMeshMetrics(nc.prefix) 79 nc.GossipSubScoreMetrics = NewGossipSubScoreMetrics(nc.prefix) 80 nc.GossipSubRpcValidationInspectorMetrics = NewGossipSubRPCValidationInspectorMetrics(nc.prefix) 81 nc.GossipSubScoringRegistryMetrics = NewGossipSubScoringRegistryMetrics(nc.prefix) 82 nc.AlspMetrics = NewAlspMetrics() 83 84 nc.outboundMessageSize = promauto.NewHistogramVec( 85 prometheus.HistogramOpts{ 86 Namespace: namespaceNetwork, 87 Subsystem: subsystemGossip, 88 Name: nc.prefix + "outbound_message_size_bytes", 89 Help: "size of the outbound network message", 90 Buckets: []float64{KiB, 100 * KiB, 1 * MiB}, 91 }, []string{LabelChannel, LabelProtocol, LabelMessage}, 92 ) 93 94 nc.inboundMessageSize = promauto.NewHistogramVec( 95 prometheus.HistogramOpts{ 96 Namespace: namespaceNetwork, 97 Subsystem: subsystemGossip, 98 Name: nc.prefix + "inbound_message_size_bytes", 99 Help: "size of the inbound network message", 100 Buckets: []float64{KiB, 100 * KiB, 1 * MiB}, 101 }, []string{LabelChannel, LabelProtocol, LabelMessage}, 102 ) 103 104 nc.duplicateMessagesDropped = promauto.NewCounterVec( 105 prometheus.CounterOpts{ 106 Namespace: namespaceNetwork, 107 Subsystem: subsystemGossip, 108 Name: nc.prefix + "duplicate_messages_dropped", 109 Help: "number of duplicate messages dropped", 110 }, []string{LabelChannel, LabelProtocol, LabelMessage}, 111 ) 112 113 nc.dnsLookupDuration = promauto.NewHistogram( 114 prometheus.HistogramOpts{ 115 Namespace: namespaceNetwork, 116 Subsystem: subsystemGossip, 117 Name: nc.prefix + "dns_lookup_duration_ms", 118 Buckets: []float64{1, 10, 100, 500, 1000, 2000}, 119 Help: "the time spent on resolving a dns lookup (including cache hits)", 120 }, 121 ) 122 123 nc.dnsCacheMissCount = promauto.NewCounter( 124 prometheus.CounterOpts{ 125 Namespace: namespaceNetwork, 126 Subsystem: subsystemGossip, 127 Name: nc.prefix + "dns_cache_miss_total", 128 Help: "the number of dns lookups that miss the cache and made through network", 129 }, 130 ) 131 132 nc.dnsCacheInvalidationCount = promauto.NewCounter( 133 prometheus.CounterOpts{ 134 Namespace: namespaceNetwork, 135 Subsystem: subsystemGossip, 136 Name: nc.prefix + "dns_cache_invalidation_total", 137 Help: "the number of times dns cache is invalidated for an entry", 138 }, 139 ) 140 141 nc.dnsCacheHitCount = promauto.NewCounter( 142 prometheus.CounterOpts{ 143 Namespace: namespaceNetwork, 144 Subsystem: subsystemGossip, 145 Name: nc.prefix + "dns_cache_hit_total", 146 Help: "the number of dns cache hits", 147 }, 148 ) 149 150 nc.dnsLookupRequestDroppedCount = promauto.NewCounter( 151 prometheus.CounterOpts{ 152 Namespace: namespaceNetwork, 153 Subsystem: subsystemGossip, 154 Name: nc.prefix + "dns_lookup_requests_dropped_total", 155 Help: "the number of dns lookup requests dropped", 156 }, 157 ) 158 159 nc.queueSize = promauto.NewGaugeVec( 160 prometheus.GaugeOpts{ 161 Namespace: namespaceNetwork, 162 Subsystem: subsystemQueue, 163 Name: nc.prefix + "message_queue_size", 164 Help: "the number of elements in the message receive queue", 165 }, []string{LabelPriority}, 166 ) 167 168 nc.queueDuration = promauto.NewHistogramVec( 169 prometheus.HistogramOpts{ 170 Namespace: namespaceNetwork, 171 Subsystem: subsystemQueue, 172 Name: nc.prefix + "message_queue_duration_seconds", 173 Help: "duration [seconds; measured with float64 precision] of how long a message spent in the queue before delivered to an engine.", 174 Buckets: []float64{0.01, 0.1, 0.5, 1, 2, 5}, // 10ms, 100ms, 500ms, 1s, 2s, 5s 175 }, []string{LabelPriority}, 176 ) 177 178 nc.numMessagesProcessing = promauto.NewGaugeVec( 179 prometheus.GaugeOpts{ 180 Namespace: namespaceNetwork, 181 Subsystem: subsystemQueue, 182 Name: nc.prefix + "current_messages_processing", 183 Help: "the number of messages currently being processed", 184 }, []string{LabelChannel}, 185 ) 186 187 nc.numDirectMessagesSending = promauto.NewGaugeVec( 188 prometheus.GaugeOpts{ 189 Namespace: namespaceNetwork, 190 Subsystem: subsystemGossip, 191 Name: nc.prefix + "direct_messages_in_progress", 192 Help: "the number of direct messages currently in the process of sending", 193 }, []string{LabelChannel}, 194 ) 195 196 nc.inboundProcessTime = promauto.NewCounterVec( 197 prometheus.CounterOpts{ 198 Namespace: namespaceNetwork, 199 Subsystem: subsystemQueue, 200 Name: nc.prefix + "engine_message_processing_time_seconds", 201 Help: "duration [seconds; measured with float64 precision] of how long a queue worker blocked for an engine processing message", 202 }, []string{LabelChannel}, 203 ) 204 205 nc.outboundConnectionCount = promauto.NewGauge( 206 prometheus.GaugeOpts{ 207 Namespace: namespaceNetwork, 208 Subsystem: subsystemQueue, 209 Name: nc.prefix + "outbound_connection_count", 210 Help: "the number of outbound connections of this node", 211 }, 212 ) 213 214 nc.inboundConnectionCount = promauto.NewGauge( 215 prometheus.GaugeOpts{ 216 Namespace: namespaceNetwork, 217 Subsystem: subsystemQueue, 218 Name: nc.prefix + "inbound_connection_count", 219 Help: "the number of inbound connections of this node", 220 }, 221 ) 222 223 nc.routingTableSize = promauto.NewGauge( 224 prometheus.GaugeOpts{ 225 Name: nc.prefix + "routing_table_size", 226 Namespace: namespaceNetwork, 227 Subsystem: subsystemDHT, 228 Help: "the size of the DHT routing table", 229 }, 230 ) 231 232 nc.unAuthorizedMessagesCount = promauto.NewCounterVec( 233 prometheus.CounterOpts{ 234 Namespace: namespaceNetwork, 235 Subsystem: subsystemAuth, 236 Name: nc.prefix + "unauthorized_messages_count", 237 Help: "number of messages that failed authorization validation", 238 }, []string{LabelNodeRole, LabelMessage, LabelChannel, LabelViolationReason}, 239 ) 240 241 nc.rateLimitedUnicastMessagesCount = promauto.NewCounterVec( 242 prometheus.CounterOpts{ 243 Namespace: namespaceNetwork, 244 Subsystem: subsystemRateLimiting, 245 Name: nc.prefix + "rate_limited_unicast_messages_count", 246 Help: "number of messages sent via unicast that have been rate limited", 247 }, []string{LabelNodeRole, LabelMessage, LabelChannel, LabelRateLimitReason}, 248 ) 249 250 nc.violationReportSkippedCount = promauto.NewCounter( 251 prometheus.CounterOpts{ 252 Namespace: namespaceNetwork, 253 Subsystem: subsystemSecurity, 254 Name: nc.prefix + "slashing_violation_reports_skipped_count", 255 Help: "number of slashing violations consumer violations that were not reported for misbehavior because the identity of the sender not known", 256 }, 257 ) 258 259 return nc 260 } 261 262 // OutboundMessageSent collects metrics related to a message sent by the node. 263 func (nc *NetworkCollector) OutboundMessageSent(sizeBytes int, topic, protocol, messageType string) { 264 nc.outboundMessageSize.WithLabelValues(topic, protocol, messageType).Observe(float64(sizeBytes)) 265 } 266 267 // InboundMessageReceived collects metrics related to a message received by the node. 268 func (nc *NetworkCollector) InboundMessageReceived(sizeBytes int, topic, protocol, messageType string) { 269 nc.inboundMessageSize.WithLabelValues(topic, protocol, messageType).Observe(float64(sizeBytes)) 270 } 271 272 // DuplicateInboundMessagesDropped increments the metric tracking the number of duplicate messages dropped by the node. 273 func (nc *NetworkCollector) DuplicateInboundMessagesDropped(topic, protocol, messageType string) { 274 nc.duplicateMessagesDropped.WithLabelValues(topic, protocol, messageType).Add(1) 275 } 276 277 func (nc *NetworkCollector) MessageAdded(priority int) { 278 nc.queueSize.WithLabelValues(strconv.Itoa(priority)).Inc() 279 } 280 281 func (nc *NetworkCollector) MessageRemoved(priority int) { 282 nc.queueSize.WithLabelValues(strconv.Itoa(priority)).Dec() 283 } 284 285 func (nc *NetworkCollector) QueueDuration(duration time.Duration, priority int) { 286 nc.queueDuration.WithLabelValues(strconv.Itoa(priority)).Observe(duration.Seconds()) 287 } 288 289 // MessageProcessingStarted increments the metric tracking the number of messages being processed by the node. 290 func (nc *NetworkCollector) MessageProcessingStarted(topic string) { 291 nc.numMessagesProcessing.WithLabelValues(topic).Inc() 292 } 293 294 // UnicastMessageSendingStarted increments the metric tracking the number of unicast messages sent by the node. 295 func (nc *NetworkCollector) UnicastMessageSendingStarted(topic string) { 296 nc.numDirectMessagesSending.WithLabelValues(topic).Inc() 297 } 298 299 // UnicastMessageSendingCompleted decrements the metric tracking the number of unicast messages sent by the node. 300 func (nc *NetworkCollector) UnicastMessageSendingCompleted(topic string) { 301 nc.numDirectMessagesSending.WithLabelValues(topic).Dec() 302 } 303 304 func (nc *NetworkCollector) RoutingTablePeerAdded() { 305 nc.routingTableSize.Inc() 306 } 307 308 func (nc *NetworkCollector) RoutingTablePeerRemoved() { 309 nc.routingTableSize.Dec() 310 } 311 312 // MessageProcessingFinished tracks the time spent by the node to process a message and decrements the metric tracking 313 // the number of messages being processed by the node. 314 func (nc *NetworkCollector) MessageProcessingFinished(topic string, duration time.Duration) { 315 nc.numMessagesProcessing.WithLabelValues(topic).Dec() 316 nc.inboundProcessTime.WithLabelValues(topic).Add(duration.Seconds()) 317 } 318 319 // OutboundConnections updates the metric tracking the number of outbound connections of this node 320 func (nc *NetworkCollector) OutboundConnections(connectionCount uint) { 321 nc.outboundConnectionCount.Set(float64(connectionCount)) 322 } 323 324 // InboundConnections updates the metric tracking the number of inbound connections of this node 325 func (nc *NetworkCollector) InboundConnections(connectionCount uint) { 326 nc.inboundConnectionCount.Set(float64(connectionCount)) 327 } 328 329 // DNSLookupDuration tracks the time spent to resolve a DNS address. 330 func (nc *NetworkCollector) DNSLookupDuration(duration time.Duration) { 331 nc.dnsLookupDuration.Observe(float64(duration.Milliseconds())) 332 } 333 334 // OnDNSCacheMiss tracks the total number of dns requests resolved through looking up the network. 335 func (nc *NetworkCollector) OnDNSCacheMiss() { 336 nc.dnsCacheMissCount.Inc() 337 } 338 339 // OnDNSCacheInvalidated is called whenever dns cache is invalidated for an entry 340 func (nc *NetworkCollector) OnDNSCacheInvalidated() { 341 nc.dnsCacheInvalidationCount.Inc() 342 } 343 344 // OnDNSCacheHit tracks the total number of dns requests resolved through the cache without 345 // looking up the network. 346 func (nc *NetworkCollector) OnDNSCacheHit() { 347 nc.dnsCacheHitCount.Inc() 348 } 349 350 // OnDNSLookupRequestDropped tracks the number of dns lookup requests that are dropped due to a full queue 351 func (nc *NetworkCollector) OnDNSLookupRequestDropped() { 352 nc.dnsLookupRequestDroppedCount.Inc() 353 } 354 355 // OnUnauthorizedMessage tracks the number of unauthorized messages seen on the network. 356 func (nc *NetworkCollector) OnUnauthorizedMessage(role, msgType, topic, offense string) { 357 nc.unAuthorizedMessagesCount.WithLabelValues(role, msgType, topic, offense).Inc() 358 } 359 360 // OnRateLimitedPeer tracks the number of rate limited messages seen on the network. 361 func (nc *NetworkCollector) OnRateLimitedPeer(peerID peer.ID, role, msgType, topic, reason string) { 362 nc.logger.Warn(). 363 Str("peer_id", logging2.PeerId(peerID)). 364 Str("role", role). 365 Str("message_type", msgType). 366 Str("topic", topic). 367 Str("reason", reason). 368 Bool(logging.KeySuspicious, true). 369 Msg("unicast peer rate limited") 370 nc.rateLimitedUnicastMessagesCount.WithLabelValues(role, msgType, topic, reason).Inc() 371 } 372 373 // OnViolationReportSkipped tracks the number of slashing violations consumer violations that were not 374 // reported for misbehavior when the identity of the sender not known. 375 func (nc *NetworkCollector) OnViolationReportSkipped() { 376 nc.violationReportSkippedCount.Inc() 377 }