github.com/ava-labs/avalanchego@v1.11.11/network/metrics.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package network
     5  
     6  import (
     7  	"errors"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/prometheus/client_golang/prometheus"
    12  
    13  	"github.com/ava-labs/avalanchego/ids"
    14  	"github.com/ava-labs/avalanchego/network/peer"
    15  	"github.com/ava-labs/avalanchego/utils/set"
    16  )
    17  
    18  type metrics struct {
    19  	// trackedSubnets does not include the primary network ID
    20  	trackedSubnets set.Set[ids.ID]
    21  
    22  	numTracked                      prometheus.Gauge
    23  	numPeers                        prometheus.Gauge
    24  	numSubnetPeers                  *prometheus.GaugeVec
    25  	timeSinceLastMsgSent            prometheus.Gauge
    26  	timeSinceLastMsgReceived        prometheus.Gauge
    27  	sendFailRate                    prometheus.Gauge
    28  	connected                       prometheus.Counter
    29  	disconnected                    prometheus.Counter
    30  	acceptFailed                    prometheus.Counter
    31  	inboundConnRateLimited          prometheus.Counter
    32  	inboundConnAllowed              prometheus.Counter
    33  	tlsConnRejected                 prometheus.Counter
    34  	numUselessPeerListBytes         prometheus.Counter
    35  	nodeUptimeWeightedAverage       prometheus.Gauge
    36  	nodeUptimeRewardingStake        prometheus.Gauge
    37  	nodeSubnetUptimeWeightedAverage *prometheus.GaugeVec // Deprecated
    38  	nodeSubnetUptimeRewardingStake  *prometheus.GaugeVec // Deprecated
    39  	peerConnectedLifetimeAverage    prometheus.Gauge
    40  	lock                            sync.RWMutex
    41  	peerConnectedStartTimes         map[ids.NodeID]float64
    42  	peerConnectedStartTimesSum      float64
    43  }
    44  
    45  func newMetrics(
    46  	registerer prometheus.Registerer,
    47  	trackedSubnets set.Set[ids.ID],
    48  ) (*metrics, error) {
    49  	m := &metrics{
    50  		trackedSubnets: trackedSubnets,
    51  		numPeers: prometheus.NewGauge(prometheus.GaugeOpts{
    52  			Name: "peers",
    53  			Help: "Number of network peers",
    54  		}),
    55  		numTracked: prometheus.NewGauge(prometheus.GaugeOpts{
    56  			Name: "tracked",
    57  			Help: "Number of currently tracked IPs attempting to be connected to",
    58  		}),
    59  		numSubnetPeers: prometheus.NewGaugeVec(
    60  			prometheus.GaugeOpts{
    61  				Name: "peers_subnet",
    62  				Help: "Number of peers that are validating a particular subnet",
    63  			},
    64  			[]string{"subnetID"},
    65  		),
    66  		timeSinceLastMsgReceived: prometheus.NewGauge(prometheus.GaugeOpts{
    67  			Name: "time_since_last_msg_received",
    68  			Help: "Time (in ns) since the last msg was received",
    69  		}),
    70  		timeSinceLastMsgSent: prometheus.NewGauge(prometheus.GaugeOpts{
    71  			Name: "time_since_last_msg_sent",
    72  			Help: "Time (in ns) since the last msg was sent",
    73  		}),
    74  		sendFailRate: prometheus.NewGauge(prometheus.GaugeOpts{
    75  			Name: "send_fail_rate",
    76  			Help: "Portion of messages that recently failed to be sent over the network",
    77  		}),
    78  		connected: prometheus.NewCounter(prometheus.CounterOpts{
    79  			Name: "times_connected",
    80  			Help: "Times this node successfully completed a handshake with a peer",
    81  		}),
    82  		disconnected: prometheus.NewCounter(prometheus.CounterOpts{
    83  			Name: "times_disconnected",
    84  			Help: "Times this node disconnected from a peer it had completed a handshake with",
    85  		}),
    86  		acceptFailed: prometheus.NewCounter(prometheus.CounterOpts{
    87  			Name: "accept_failed",
    88  			Help: "Times this node's listener failed to accept an inbound connection",
    89  		}),
    90  		inboundConnAllowed: prometheus.NewCounter(prometheus.CounterOpts{
    91  			Name: "inbound_conn_throttler_allowed",
    92  			Help: "Times this node allowed (attempted to upgrade) an inbound connection",
    93  		}),
    94  		tlsConnRejected: prometheus.NewCounter(prometheus.CounterOpts{
    95  			Name: "tls_conn_rejected",
    96  			Help: "Times this node rejected a connection due to an unsupported TLS certificate",
    97  		}),
    98  		numUselessPeerListBytes: prometheus.NewCounter(prometheus.CounterOpts{
    99  			Name: "num_useless_peerlist_bytes",
   100  			Help: "Amount of useless bytes (i.e. information about nodes we already knew/don't want to connect to) received in PeerList messages",
   101  		}),
   102  		inboundConnRateLimited: prometheus.NewCounter(prometheus.CounterOpts{
   103  			Name: "inbound_conn_throttler_rate_limited",
   104  			Help: "Times this node rejected an inbound connection due to rate-limiting",
   105  		}),
   106  		nodeUptimeWeightedAverage: prometheus.NewGauge(prometheus.GaugeOpts{
   107  			Name: "node_uptime_weighted_average",
   108  			Help: "This node's uptime average weighted by observing peer stakes",
   109  		}),
   110  		nodeUptimeRewardingStake: prometheus.NewGauge(prometheus.GaugeOpts{
   111  			Name: "node_uptime_rewarding_stake",
   112  			Help: "The percentage of total stake which thinks this node is eligible for rewards",
   113  		}),
   114  		nodeSubnetUptimeWeightedAverage: prometheus.NewGaugeVec(
   115  			prometheus.GaugeOpts{
   116  				Name: "node_subnet_uptime_weighted_average",
   117  				Help: "This node's subnet uptime averages weighted by observing subnet peer stakes",
   118  			},
   119  			[]string{"subnetID"},
   120  		),
   121  		nodeSubnetUptimeRewardingStake: prometheus.NewGaugeVec(
   122  			prometheus.GaugeOpts{
   123  				Name: "node_subnet_uptime_rewarding_stake",
   124  				Help: "The percentage of subnet's total stake which thinks this node is eligible for subnet's rewards",
   125  			},
   126  			[]string{"subnetID"},
   127  		),
   128  		peerConnectedLifetimeAverage: prometheus.NewGauge(
   129  			prometheus.GaugeOpts{
   130  				Name: "peer_connected_duration_average",
   131  				Help: "The average duration of all peer connections in nanoseconds",
   132  			},
   133  		),
   134  		peerConnectedStartTimes: make(map[ids.NodeID]float64),
   135  	}
   136  
   137  	err := errors.Join(
   138  		registerer.Register(m.numTracked),
   139  		registerer.Register(m.numPeers),
   140  		registerer.Register(m.numSubnetPeers),
   141  		registerer.Register(m.timeSinceLastMsgReceived),
   142  		registerer.Register(m.timeSinceLastMsgSent),
   143  		registerer.Register(m.sendFailRate),
   144  		registerer.Register(m.connected),
   145  		registerer.Register(m.disconnected),
   146  		registerer.Register(m.acceptFailed),
   147  		registerer.Register(m.inboundConnAllowed),
   148  		registerer.Register(m.tlsConnRejected),
   149  		registerer.Register(m.numUselessPeerListBytes),
   150  		registerer.Register(m.inboundConnRateLimited),
   151  		registerer.Register(m.nodeUptimeWeightedAverage),
   152  		registerer.Register(m.nodeUptimeRewardingStake),
   153  		registerer.Register(m.nodeSubnetUptimeWeightedAverage),
   154  		registerer.Register(m.nodeSubnetUptimeRewardingStake),
   155  		registerer.Register(m.peerConnectedLifetimeAverage),
   156  	)
   157  
   158  	// init subnet tracker metrics with tracked subnets
   159  	for subnetID := range trackedSubnets {
   160  		// initialize to 0
   161  		subnetIDStr := subnetID.String()
   162  		m.numSubnetPeers.WithLabelValues(subnetIDStr).Set(0)
   163  		m.nodeSubnetUptimeWeightedAverage.WithLabelValues(subnetIDStr).Set(0)
   164  		m.nodeSubnetUptimeRewardingStake.WithLabelValues(subnetIDStr).Set(0)
   165  	}
   166  
   167  	return m, err
   168  }
   169  
   170  func (m *metrics) markConnected(peer peer.Peer) {
   171  	m.numPeers.Inc()
   172  	m.connected.Inc()
   173  
   174  	trackedSubnets := peer.TrackedSubnets()
   175  	for subnetID := range m.trackedSubnets {
   176  		if trackedSubnets.Contains(subnetID) {
   177  			m.numSubnetPeers.WithLabelValues(subnetID.String()).Inc()
   178  		}
   179  	}
   180  
   181  	m.lock.Lock()
   182  	defer m.lock.Unlock()
   183  
   184  	now := float64(time.Now().UnixNano())
   185  	m.peerConnectedStartTimes[peer.ID()] = now
   186  	m.peerConnectedStartTimesSum += now
   187  }
   188  
   189  func (m *metrics) markDisconnected(peer peer.Peer) {
   190  	m.numPeers.Dec()
   191  	m.disconnected.Inc()
   192  
   193  	trackedSubnets := peer.TrackedSubnets()
   194  	for subnetID := range m.trackedSubnets {
   195  		if trackedSubnets.Contains(subnetID) {
   196  			m.numSubnetPeers.WithLabelValues(subnetID.String()).Dec()
   197  		}
   198  	}
   199  
   200  	m.lock.Lock()
   201  	defer m.lock.Unlock()
   202  
   203  	peerID := peer.ID()
   204  	start := m.peerConnectedStartTimes[peerID]
   205  	m.peerConnectedStartTimesSum -= start
   206  
   207  	delete(m.peerConnectedStartTimes, peerID)
   208  }
   209  
   210  func (m *metrics) updatePeerConnectionLifetimeMetrics() {
   211  	m.lock.RLock()
   212  	defer m.lock.RUnlock()
   213  
   214  	avg := float64(0)
   215  	if n := len(m.peerConnectedStartTimes); n > 0 {
   216  		avgStartTime := m.peerConnectedStartTimesSum / float64(n)
   217  		avg = float64(time.Now().UnixNano()) - avgStartTime
   218  	}
   219  
   220  	m.peerConnectedLifetimeAverage.Set(avg)
   221  }