github.com/MetalBlockchain/metalgo@v1.11.9/network/metrics.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package network
     5  
     6  import (
     7  	"errors"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/prometheus/client_golang/prometheus"
    12  
    13  	"github.com/MetalBlockchain/metalgo/ids"
    14  	"github.com/MetalBlockchain/metalgo/network/peer"
    15  	"github.com/MetalBlockchain/metalgo/utils/set"
    16  )
    17  
    18  type metrics struct {
    19  	// trackedSubnets does not include the primary network ID
    20  	trackedSubnets set.Set[ids.ID]
    21  
    22  	numTracked                      prometheus.Gauge
    23  	numPeers                        prometheus.Gauge
    24  	numSubnetPeers                  *prometheus.GaugeVec
    25  	timeSinceLastMsgSent            prometheus.Gauge
    26  	timeSinceLastMsgReceived        prometheus.Gauge
    27  	sendFailRate                    prometheus.Gauge
    28  	connected                       prometheus.Counter
    29  	disconnected                    prometheus.Counter
    30  	acceptFailed                    prometheus.Counter
    31  	inboundConnRateLimited          prometheus.Counter
    32  	inboundConnAllowed              prometheus.Counter
    33  	tlsConnRejected                 prometheus.Counter
    34  	numUselessPeerListBytes         prometheus.Counter
    35  	nodeUptimeWeightedAverage       prometheus.Gauge
    36  	nodeUptimeRewardingStake        prometheus.Gauge
    37  	nodeSubnetUptimeWeightedAverage *prometheus.GaugeVec
    38  	nodeSubnetUptimeRewardingStake  *prometheus.GaugeVec
    39  	peerConnectedLifetimeAverage    prometheus.Gauge
    40  
    41  	lock                       sync.RWMutex
    42  	peerConnectedStartTimes    map[ids.NodeID]float64
    43  	peerConnectedStartTimesSum float64
    44  }
    45  
    46  func newMetrics(
    47  	registerer prometheus.Registerer,
    48  	trackedSubnets set.Set[ids.ID],
    49  ) (*metrics, error) {
    50  	m := &metrics{
    51  		trackedSubnets: trackedSubnets,
    52  		numPeers: prometheus.NewGauge(prometheus.GaugeOpts{
    53  			Name: "peers",
    54  			Help: "Number of network peers",
    55  		}),
    56  		numTracked: prometheus.NewGauge(prometheus.GaugeOpts{
    57  			Name: "tracked",
    58  			Help: "Number of currently tracked IPs attempting to be connected to",
    59  		}),
    60  		numSubnetPeers: prometheus.NewGaugeVec(
    61  			prometheus.GaugeOpts{
    62  				Name: "peers_subnet",
    63  				Help: "Number of peers that are validating a particular subnet",
    64  			},
    65  			[]string{"subnetID"},
    66  		),
    67  		timeSinceLastMsgReceived: prometheus.NewGauge(prometheus.GaugeOpts{
    68  			Name: "time_since_last_msg_received",
    69  			Help: "Time (in ns) since the last msg was received",
    70  		}),
    71  		timeSinceLastMsgSent: prometheus.NewGauge(prometheus.GaugeOpts{
    72  			Name: "time_since_last_msg_sent",
    73  			Help: "Time (in ns) since the last msg was sent",
    74  		}),
    75  		sendFailRate: prometheus.NewGauge(prometheus.GaugeOpts{
    76  			Name: "send_fail_rate",
    77  			Help: "Portion of messages that recently failed to be sent over the network",
    78  		}),
    79  		connected: prometheus.NewCounter(prometheus.CounterOpts{
    80  			Name: "times_connected",
    81  			Help: "Times this node successfully completed a handshake with a peer",
    82  		}),
    83  		disconnected: prometheus.NewCounter(prometheus.CounterOpts{
    84  			Name: "times_disconnected",
    85  			Help: "Times this node disconnected from a peer it had completed a handshake with",
    86  		}),
    87  		acceptFailed: prometheus.NewCounter(prometheus.CounterOpts{
    88  			Name: "accept_failed",
    89  			Help: "Times this node's listener failed to accept an inbound connection",
    90  		}),
    91  		inboundConnAllowed: prometheus.NewCounter(prometheus.CounterOpts{
    92  			Name: "inbound_conn_throttler_allowed",
    93  			Help: "Times this node allowed (attempted to upgrade) an inbound connection",
    94  		}),
    95  		tlsConnRejected: prometheus.NewCounter(prometheus.CounterOpts{
    96  			Name: "tls_conn_rejected",
    97  			Help: "Times this node rejected a connection due to an unsupported TLS certificate",
    98  		}),
    99  		numUselessPeerListBytes: prometheus.NewCounter(prometheus.CounterOpts{
   100  			Name: "num_useless_peerlist_bytes",
   101  			Help: "Amount of useless bytes (i.e. information about nodes we already knew/don't want to connect to) received in PeerList messages",
   102  		}),
   103  		inboundConnRateLimited: prometheus.NewCounter(prometheus.CounterOpts{
   104  			Name: "inbound_conn_throttler_rate_limited",
   105  			Help: "Times this node rejected an inbound connection due to rate-limiting",
   106  		}),
   107  		nodeUptimeWeightedAverage: prometheus.NewGauge(prometheus.GaugeOpts{
   108  			Name: "node_uptime_weighted_average",
   109  			Help: "This node's uptime average weighted by observing peer stakes",
   110  		}),
   111  		nodeUptimeRewardingStake: prometheus.NewGauge(prometheus.GaugeOpts{
   112  			Name: "node_uptime_rewarding_stake",
   113  			Help: "The percentage of total stake which thinks this node is eligible for rewards",
   114  		}),
   115  		nodeSubnetUptimeWeightedAverage: prometheus.NewGaugeVec(
   116  			prometheus.GaugeOpts{
   117  				Name: "node_subnet_uptime_weighted_average",
   118  				Help: "This node's subnet uptime averages weighted by observing subnet peer stakes",
   119  			},
   120  			[]string{"subnetID"},
   121  		),
   122  		nodeSubnetUptimeRewardingStake: prometheus.NewGaugeVec(
   123  			prometheus.GaugeOpts{
   124  				Name: "node_subnet_uptime_rewarding_stake",
   125  				Help: "The percentage of subnet's total stake which thinks this node is eligible for subnet's rewards",
   126  			},
   127  			[]string{"subnetID"},
   128  		),
   129  		peerConnectedLifetimeAverage: prometheus.NewGauge(
   130  			prometheus.GaugeOpts{
   131  				Name: "peer_connected_duration_average",
   132  				Help: "The average duration of all peer connections in nanoseconds",
   133  			},
   134  		),
   135  		peerConnectedStartTimes: make(map[ids.NodeID]float64),
   136  	}
   137  
   138  	err := errors.Join(
   139  		registerer.Register(m.numTracked),
   140  		registerer.Register(m.numPeers),
   141  		registerer.Register(m.numSubnetPeers),
   142  		registerer.Register(m.timeSinceLastMsgReceived),
   143  		registerer.Register(m.timeSinceLastMsgSent),
   144  		registerer.Register(m.sendFailRate),
   145  		registerer.Register(m.connected),
   146  		registerer.Register(m.disconnected),
   147  		registerer.Register(m.acceptFailed),
   148  		registerer.Register(m.inboundConnAllowed),
   149  		registerer.Register(m.tlsConnRejected),
   150  		registerer.Register(m.numUselessPeerListBytes),
   151  		registerer.Register(m.inboundConnRateLimited),
   152  		registerer.Register(m.nodeUptimeWeightedAverage),
   153  		registerer.Register(m.nodeUptimeRewardingStake),
   154  		registerer.Register(m.nodeSubnetUptimeWeightedAverage),
   155  		registerer.Register(m.nodeSubnetUptimeRewardingStake),
   156  		registerer.Register(m.peerConnectedLifetimeAverage),
   157  	)
   158  
   159  	// init subnet tracker metrics with tracked subnets
   160  	for subnetID := range trackedSubnets {
   161  		// initialize to 0
   162  		subnetIDStr := subnetID.String()
   163  		m.numSubnetPeers.WithLabelValues(subnetIDStr).Set(0)
   164  		m.nodeSubnetUptimeWeightedAverage.WithLabelValues(subnetIDStr).Set(0)
   165  		m.nodeSubnetUptimeRewardingStake.WithLabelValues(subnetIDStr).Set(0)
   166  	}
   167  
   168  	return m, err
   169  }
   170  
   171  func (m *metrics) markConnected(peer peer.Peer) {
   172  	m.numPeers.Inc()
   173  	m.connected.Inc()
   174  
   175  	trackedSubnets := peer.TrackedSubnets()
   176  	for subnetID := range m.trackedSubnets {
   177  		if trackedSubnets.Contains(subnetID) {
   178  			m.numSubnetPeers.WithLabelValues(subnetID.String()).Inc()
   179  		}
   180  	}
   181  
   182  	m.lock.Lock()
   183  	defer m.lock.Unlock()
   184  
   185  	now := float64(time.Now().UnixNano())
   186  	m.peerConnectedStartTimes[peer.ID()] = now
   187  	m.peerConnectedStartTimesSum += now
   188  }
   189  
   190  func (m *metrics) markDisconnected(peer peer.Peer) {
   191  	m.numPeers.Dec()
   192  	m.disconnected.Inc()
   193  
   194  	trackedSubnets := peer.TrackedSubnets()
   195  	for subnetID := range m.trackedSubnets {
   196  		if trackedSubnets.Contains(subnetID) {
   197  			m.numSubnetPeers.WithLabelValues(subnetID.String()).Dec()
   198  		}
   199  	}
   200  
   201  	m.lock.Lock()
   202  	defer m.lock.Unlock()
   203  
   204  	peerID := peer.ID()
   205  	start := m.peerConnectedStartTimes[peerID]
   206  	m.peerConnectedStartTimesSum -= start
   207  
   208  	delete(m.peerConnectedStartTimes, peerID)
   209  }
   210  
   211  func (m *metrics) updatePeerConnectionLifetimeMetrics() {
   212  	m.lock.RLock()
   213  	defer m.lock.RUnlock()
   214  
   215  	avg := float64(0)
   216  	if n := len(m.peerConnectedStartTimes); n > 0 {
   217  		avgStartTime := m.peerConnectedStartTimesSum / float64(n)
   218  		avg = float64(time.Now().UnixNano()) - avgStartTime
   219  	}
   220  
   221  	m.peerConnectedLifetimeAverage.Set(avg)
   222  }