github.com/koko1123/flow-go-1@v0.29.6/module/metrics/network.go (about)

     1  package metrics
     2  
     3  import (
     4  	"strconv"
     5  	"time"
     6  
     7  	"github.com/prometheus/client_golang/prometheus"
     8  	"github.com/prometheus/client_golang/prometheus/promauto"
     9  	"github.com/rs/zerolog"
    10  
    11  	"github.com/koko1123/flow-go-1/module"
    12  )
    13  
    14  const (
    15  	_   = iota
    16  	KiB = 1 << (10 * iota)
    17  	MiB
    18  	GiB
    19  )
    20  
    21  type NetworkCollector struct {
    22  	*LibP2PResourceManagerMetrics
    23  	*GossipSubMetrics
    24  	outboundMessageSize          *prometheus.HistogramVec
    25  	inboundMessageSize           *prometheus.HistogramVec
    26  	duplicateMessagesDropped     *prometheus.CounterVec
    27  	queueSize                    *prometheus.GaugeVec
    28  	queueDuration                *prometheus.HistogramVec
    29  	numMessagesProcessing        *prometheus.GaugeVec
    30  	numDirectMessagesSending     *prometheus.GaugeVec
    31  	inboundProcessTime           *prometheus.CounterVec
    32  	outboundConnectionCount      prometheus.Gauge
    33  	inboundConnectionCount       prometheus.Gauge
    34  	dnsLookupDuration            prometheus.Histogram
    35  	dnsCacheMissCount            prometheus.Counter
    36  	dnsCacheHitCount             prometheus.Counter
    37  	dnsCacheInvalidationCount    prometheus.Counter
    38  	dnsLookupRequestDroppedCount prometheus.Counter
    39  	routingTableSize             prometheus.Gauge
    40  
    41  	// authorization, rate limiting metrics
    42  	unAuthorizedMessagesCount       *prometheus.CounterVec
    43  	rateLimitedUnicastMessagesCount *prometheus.CounterVec
    44  
    45  	prefix string
    46  }
    47  
    48  var _ module.NetworkMetrics = (*NetworkCollector)(nil)
    49  
    50  type NetworkCollectorOpt func(*NetworkCollector)
    51  
    52  func WithNetworkPrefix(prefix string) NetworkCollectorOpt {
    53  	return func(nc *NetworkCollector) {
    54  		if prefix != "" {
    55  			nc.prefix = prefix + "_"
    56  		}
    57  	}
    58  }
    59  
    60  func NewNetworkCollector(logger zerolog.Logger, opts ...NetworkCollectorOpt) *NetworkCollector {
    61  	nc := &NetworkCollector{}
    62  
    63  	for _, opt := range opts {
    64  		opt(nc)
    65  	}
    66  
    67  	nc.LibP2PResourceManagerMetrics = NewLibP2PResourceManagerMetrics(logger, nc.prefix)
    68  	nc.GossipSubMetrics = NewGossipSubMetrics(nc.prefix)
    69  
    70  	nc.outboundMessageSize = promauto.NewHistogramVec(
    71  		prometheus.HistogramOpts{
    72  			Namespace: namespaceNetwork,
    73  			Subsystem: subsystemGossip,
    74  			Name:      nc.prefix + "outbound_message_size_bytes",
    75  			Help:      "size of the outbound network message",
    76  			Buckets:   []float64{KiB, 100 * KiB, 500 * KiB, 1 * MiB, 2 * MiB, 4 * MiB},
    77  		}, []string{LabelChannel, LabelProtocol, LabelMessage},
    78  	)
    79  
    80  	nc.inboundMessageSize = promauto.NewHistogramVec(
    81  		prometheus.HistogramOpts{
    82  			Namespace: namespaceNetwork,
    83  			Subsystem: subsystemGossip,
    84  			Name:      nc.prefix + "inbound_message_size_bytes",
    85  			Help:      "size of the inbound network message",
    86  			Buckets:   []float64{KiB, 100 * KiB, 500 * KiB, 1 * MiB, 2 * MiB, 4 * MiB},
    87  		}, []string{LabelChannel, LabelProtocol, LabelMessage},
    88  	)
    89  
    90  	nc.duplicateMessagesDropped = promauto.NewCounterVec(
    91  		prometheus.CounterOpts{
    92  			Namespace: namespaceNetwork,
    93  			Subsystem: subsystemGossip,
    94  			Name:      nc.prefix + "duplicate_messages_dropped",
    95  			Help:      "number of duplicate messages dropped",
    96  		}, []string{LabelChannel, LabelProtocol, LabelMessage},
    97  	)
    98  
    99  	nc.dnsLookupDuration = promauto.NewHistogram(
   100  		prometheus.HistogramOpts{
   101  			Namespace: namespaceNetwork,
   102  			Subsystem: subsystemGossip,
   103  			Name:      nc.prefix + "dns_lookup_duration_ms",
   104  			Buckets:   []float64{1, 10, 100, 500, 1000, 2000},
   105  			Help:      "the time spent on resolving a dns lookup (including cache hits)",
   106  		},
   107  	)
   108  
   109  	nc.dnsCacheMissCount = promauto.NewCounter(
   110  		prometheus.CounterOpts{
   111  			Namespace: namespaceNetwork,
   112  			Subsystem: subsystemGossip,
   113  			Name:      nc.prefix + "dns_cache_miss_total",
   114  			Help:      "the number of dns lookups that miss the cache and made through network",
   115  		},
   116  	)
   117  
   118  	nc.dnsCacheInvalidationCount = promauto.NewCounter(
   119  		prometheus.CounterOpts{
   120  			Namespace: namespaceNetwork,
   121  			Subsystem: subsystemGossip,
   122  			Name:      nc.prefix + "dns_cache_invalidation_total",
   123  			Help:      "the number of times dns cache is invalidated for an entry",
   124  		},
   125  	)
   126  
   127  	nc.dnsCacheHitCount = promauto.NewCounter(
   128  		prometheus.CounterOpts{
   129  			Namespace: namespaceNetwork,
   130  			Subsystem: subsystemGossip,
   131  			Name:      nc.prefix + "dns_cache_hit_total",
   132  			Help:      "the number of dns cache hits",
   133  		},
   134  	)
   135  
   136  	nc.dnsLookupRequestDroppedCount = promauto.NewCounter(
   137  		prometheus.CounterOpts{
   138  			Namespace: namespaceNetwork,
   139  			Subsystem: subsystemGossip,
   140  			Name:      nc.prefix + "dns_lookup_requests_dropped_total",
   141  			Help:      "the number of dns lookup requests dropped",
   142  		},
   143  	)
   144  
   145  	nc.queueSize = promauto.NewGaugeVec(
   146  		prometheus.GaugeOpts{
   147  			Namespace: namespaceNetwork,
   148  			Subsystem: subsystemQueue,
   149  			Name:      nc.prefix + "message_queue_size",
   150  			Help:      "the number of elements in the message receive queue",
   151  		}, []string{LabelPriority},
   152  	)
   153  
   154  	nc.queueDuration = promauto.NewHistogramVec(
   155  		prometheus.HistogramOpts{
   156  			Namespace: namespaceNetwork,
   157  			Subsystem: subsystemQueue,
   158  			Name:      nc.prefix + "message_queue_duration_seconds",
   159  			Help:      "duration [seconds; measured with float64 precision] of how long a message spent in the queue before delivered to an engine.",
   160  			Buckets:   []float64{0.01, 0.1, 0.5, 1, 2, 5}, // 10ms, 100ms, 500ms, 1s, 2s, 5s
   161  		}, []string{LabelPriority},
   162  	)
   163  
   164  	nc.numMessagesProcessing = promauto.NewGaugeVec(
   165  		prometheus.GaugeOpts{
   166  			Namespace: namespaceNetwork,
   167  			Subsystem: subsystemQueue,
   168  			Name:      nc.prefix + "current_messages_processing",
   169  			Help:      "the number of messages currently being processed",
   170  		}, []string{LabelChannel},
   171  	)
   172  
   173  	nc.numDirectMessagesSending = promauto.NewGaugeVec(
   174  		prometheus.GaugeOpts{
   175  			Namespace: namespaceNetwork,
   176  			Subsystem: subsystemGossip,
   177  			Name:      nc.prefix + "direct_messages_in_progress",
   178  			Help:      "the number of direct messages currently in the process of sending",
   179  		}, []string{LabelChannel},
   180  	)
   181  
   182  	nc.inboundProcessTime = promauto.NewCounterVec(
   183  		prometheus.CounterOpts{
   184  			Namespace: namespaceNetwork,
   185  			Subsystem: subsystemQueue,
   186  			Name:      nc.prefix + "engine_message_processing_time_seconds",
   187  			Help:      "duration [seconds; measured with float64 precision] of how long a queue worker blocked for an engine processing message",
   188  		}, []string{LabelChannel},
   189  	)
   190  
   191  	nc.outboundConnectionCount = promauto.NewGauge(
   192  		prometheus.GaugeOpts{
   193  			Namespace: namespaceNetwork,
   194  			Subsystem: subsystemQueue,
   195  			Name:      nc.prefix + "outbound_connection_count",
   196  			Help:      "the number of outbound connections of this node",
   197  		},
   198  	)
   199  
   200  	nc.inboundConnectionCount = promauto.NewGauge(
   201  		prometheus.GaugeOpts{
   202  			Namespace: namespaceNetwork,
   203  			Subsystem: subsystemQueue,
   204  			Name:      nc.prefix + "inbound_connection_count",
   205  			Help:      "the number of inbound connections of this node",
   206  		},
   207  	)
   208  
   209  	nc.routingTableSize = promauto.NewGauge(
   210  		prometheus.GaugeOpts{
   211  			Name:      nc.prefix + "routing_table_size",
   212  			Namespace: namespaceNetwork,
   213  			Subsystem: subsystemDHT,
   214  			Help:      "the size of the DHT routing table",
   215  		},
   216  	)
   217  
   218  	nc.unAuthorizedMessagesCount = promauto.NewCounterVec(
   219  		prometheus.CounterOpts{
   220  			Namespace: namespaceNetwork,
   221  			Subsystem: subsystemAuth,
   222  			Name:      nc.prefix + "unauthorized_messages_count",
   223  			Help:      "number of messages that failed authorization validation",
   224  		}, []string{LabelNodeRole, LabelMessage, LabelChannel, LabelViolationReason},
   225  	)
   226  
   227  	nc.rateLimitedUnicastMessagesCount = promauto.NewCounterVec(
   228  		prometheus.CounterOpts{
   229  			Namespace: namespaceNetwork,
   230  			Subsystem: subsystemRateLimiting,
   231  			Name:      nc.prefix + "rate_limited_unicast_messages_count",
   232  			Help:      "number of messages sent via unicast that have been rate limited",
   233  		}, []string{LabelNodeRole, LabelMessage, LabelChannel, LabelRateLimitReason},
   234  	)
   235  
   236  	return nc
   237  }
   238  
   239  // OutboundMessageSent collects metrics related to a message sent by the node.
   240  func (nc *NetworkCollector) OutboundMessageSent(sizeBytes int, topic, protocol, messageType string) {
   241  	nc.outboundMessageSize.WithLabelValues(topic, protocol, messageType).Observe(float64(sizeBytes))
   242  }
   243  
   244  // InboundMessageReceived collects metrics related to a message received by the node.
   245  func (nc *NetworkCollector) InboundMessageReceived(sizeBytes int, topic, protocol, messageType string) {
   246  	nc.inboundMessageSize.WithLabelValues(topic, protocol, messageType).Observe(float64(sizeBytes))
   247  }
   248  
   249  // DuplicateInboundMessagesDropped increments the metric tracking the number of duplicate messages dropped by the node.
   250  func (nc *NetworkCollector) DuplicateInboundMessagesDropped(topic, protocol, messageType string) {
   251  	nc.duplicateMessagesDropped.WithLabelValues(topic, protocol, messageType).Add(1)
   252  }
   253  
   254  func (nc *NetworkCollector) MessageAdded(priority int) {
   255  	nc.queueSize.WithLabelValues(strconv.Itoa(priority)).Inc()
   256  }
   257  
   258  func (nc *NetworkCollector) MessageRemoved(priority int) {
   259  	nc.queueSize.WithLabelValues(strconv.Itoa(priority)).Dec()
   260  }
   261  
   262  func (nc *NetworkCollector) QueueDuration(duration time.Duration, priority int) {
   263  	nc.queueDuration.WithLabelValues(strconv.Itoa(priority)).Observe(duration.Seconds())
   264  }
   265  
   266  // MessageProcessingStarted increments the metric tracking the number of messages being processed by the node.
   267  func (nc *NetworkCollector) MessageProcessingStarted(topic string) {
   268  	nc.numMessagesProcessing.WithLabelValues(topic).Inc()
   269  }
   270  
   271  // UnicastMessageSendingStarted increments the metric tracking the number of unicast messages sent by the node.
   272  func (nc *NetworkCollector) UnicastMessageSendingStarted(topic string) {
   273  	nc.numDirectMessagesSending.WithLabelValues(topic).Inc()
   274  }
   275  
   276  // UnicastMessageSendingCompleted decrements the metric tracking the number of unicast messages sent by the node.
   277  func (nc *NetworkCollector) UnicastMessageSendingCompleted(topic string) {
   278  	nc.numDirectMessagesSending.WithLabelValues(topic).Dec()
   279  }
   280  
   281  func (nc *NetworkCollector) RoutingTablePeerAdded() {
   282  	nc.routingTableSize.Inc()
   283  }
   284  
   285  func (nc *NetworkCollector) RoutingTablePeerRemoved() {
   286  	nc.routingTableSize.Dec()
   287  }
   288  
   289  // MessageProcessingFinished tracks the time spent by the node to process a message and decrements the metric tracking
   290  // the number of messages being processed by the node.
   291  func (nc *NetworkCollector) MessageProcessingFinished(topic string, duration time.Duration) {
   292  	nc.numMessagesProcessing.WithLabelValues(topic).Dec()
   293  	nc.inboundProcessTime.WithLabelValues(topic).Add(duration.Seconds())
   294  }
   295  
   296  // OutboundConnections updates the metric tracking the number of outbound connections of this node
   297  func (nc *NetworkCollector) OutboundConnections(connectionCount uint) {
   298  	nc.outboundConnectionCount.Set(float64(connectionCount))
   299  }
   300  
   301  // InboundConnections updates the metric tracking the number of inbound connections of this node
   302  func (nc *NetworkCollector) InboundConnections(connectionCount uint) {
   303  	nc.inboundConnectionCount.Set(float64(connectionCount))
   304  }
   305  
   306  // DNSLookupDuration tracks the time spent to resolve a DNS address.
   307  func (nc *NetworkCollector) DNSLookupDuration(duration time.Duration) {
   308  	nc.dnsLookupDuration.Observe(float64(duration.Milliseconds()))
   309  }
   310  
   311  // OnDNSCacheMiss tracks the total number of dns requests resolved through looking up the network.
   312  func (nc *NetworkCollector) OnDNSCacheMiss() {
   313  	nc.dnsCacheMissCount.Inc()
   314  }
   315  
   316  // OnDNSCacheInvalidated is called whenever dns cache is invalidated for an entry
   317  func (nc *NetworkCollector) OnDNSCacheInvalidated() {
   318  	nc.dnsCacheInvalidationCount.Inc()
   319  }
   320  
   321  // OnDNSCacheHit tracks the total number of dns requests resolved through the cache without
   322  // looking up the network.
   323  func (nc *NetworkCollector) OnDNSCacheHit() {
   324  	nc.dnsCacheHitCount.Inc()
   325  }
   326  
   327  // OnDNSLookupRequestDropped tracks the number of dns lookup requests that are dropped due to a full queue
   328  func (nc *NetworkCollector) OnDNSLookupRequestDropped() {
   329  	nc.dnsLookupRequestDroppedCount.Inc()
   330  }
   331  
   332  // OnUnauthorizedMessage tracks the number of unauthorized messages seen on the network.
   333  func (nc *NetworkCollector) OnUnauthorizedMessage(role, msgType, topic, offense string) {
   334  	nc.unAuthorizedMessagesCount.WithLabelValues(role, msgType, topic, offense).Inc()
   335  }
   336  
   337  // OnRateLimitedUnicastMessage tracks the number of rate limited messages seen on the network.
   338  func (nc *NetworkCollector) OnRateLimitedUnicastMessage(role, msgType, topic, reason string) {
   339  	nc.rateLimitedUnicastMessagesCount.WithLabelValues(role, msgType, topic, reason).Inc()
   340  }