github.com/ava-labs/avalanchego@v1.11.11/network/metrics.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package network 5 6 import ( 7 "errors" 8 "sync" 9 "time" 10 11 "github.com/prometheus/client_golang/prometheus" 12 13 "github.com/ava-labs/avalanchego/ids" 14 "github.com/ava-labs/avalanchego/network/peer" 15 "github.com/ava-labs/avalanchego/utils/set" 16 ) 17 18 type metrics struct { 19 // trackedSubnets does not include the primary network ID 20 trackedSubnets set.Set[ids.ID] 21 22 numTracked prometheus.Gauge 23 numPeers prometheus.Gauge 24 numSubnetPeers *prometheus.GaugeVec 25 timeSinceLastMsgSent prometheus.Gauge 26 timeSinceLastMsgReceived prometheus.Gauge 27 sendFailRate prometheus.Gauge 28 connected prometheus.Counter 29 disconnected prometheus.Counter 30 acceptFailed prometheus.Counter 31 inboundConnRateLimited prometheus.Counter 32 inboundConnAllowed prometheus.Counter 33 tlsConnRejected prometheus.Counter 34 numUselessPeerListBytes prometheus.Counter 35 nodeUptimeWeightedAverage prometheus.Gauge 36 nodeUptimeRewardingStake prometheus.Gauge 37 nodeSubnetUptimeWeightedAverage *prometheus.GaugeVec // Deprecated 38 nodeSubnetUptimeRewardingStake *prometheus.GaugeVec // Deprecated 39 peerConnectedLifetimeAverage prometheus.Gauge 40 lock sync.RWMutex 41 peerConnectedStartTimes map[ids.NodeID]float64 42 peerConnectedStartTimesSum float64 43 } 44 45 func newMetrics( 46 registerer prometheus.Registerer, 47 trackedSubnets set.Set[ids.ID], 48 ) (*metrics, error) { 49 m := &metrics{ 50 trackedSubnets: trackedSubnets, 51 numPeers: prometheus.NewGauge(prometheus.GaugeOpts{ 52 Name: "peers", 53 Help: "Number of network peers", 54 }), 55 numTracked: prometheus.NewGauge(prometheus.GaugeOpts{ 56 Name: "tracked", 57 Help: "Number of currently tracked IPs attempting to be connected to", 58 }), 59 numSubnetPeers: prometheus.NewGaugeVec( 60 prometheus.GaugeOpts{ 61 Name: "peers_subnet", 62 Help: "Number of peers that are validating a particular subnet", 63 }, 64 []string{"subnetID"}, 65 ), 66 timeSinceLastMsgReceived: prometheus.NewGauge(prometheus.GaugeOpts{ 67 Name: "time_since_last_msg_received", 68 Help: "Time (in ns) since the last msg was received", 69 }), 70 timeSinceLastMsgSent: prometheus.NewGauge(prometheus.GaugeOpts{ 71 Name: "time_since_last_msg_sent", 72 Help: "Time (in ns) since the last msg was sent", 73 }), 74 sendFailRate: prometheus.NewGauge(prometheus.GaugeOpts{ 75 Name: "send_fail_rate", 76 Help: "Portion of messages that recently failed to be sent over the network", 77 }), 78 connected: prometheus.NewCounter(prometheus.CounterOpts{ 79 Name: "times_connected", 80 Help: "Times this node successfully completed a handshake with a peer", 81 }), 82 disconnected: prometheus.NewCounter(prometheus.CounterOpts{ 83 Name: "times_disconnected", 84 Help: "Times this node disconnected from a peer it had completed a handshake with", 85 }), 86 acceptFailed: prometheus.NewCounter(prometheus.CounterOpts{ 87 Name: "accept_failed", 88 Help: "Times this node's listener failed to accept an inbound connection", 89 }), 90 inboundConnAllowed: prometheus.NewCounter(prometheus.CounterOpts{ 91 Name: "inbound_conn_throttler_allowed", 92 Help: "Times this node allowed (attempted to upgrade) an inbound connection", 93 }), 94 tlsConnRejected: prometheus.NewCounter(prometheus.CounterOpts{ 95 Name: "tls_conn_rejected", 96 Help: "Times this node rejected a connection due to an unsupported TLS certificate", 97 }), 98 numUselessPeerListBytes: prometheus.NewCounter(prometheus.CounterOpts{ 99 Name: "num_useless_peerlist_bytes", 100 Help: "Amount of useless bytes (i.e. information about nodes we already knew/don't want to connect to) received in PeerList messages", 101 }), 102 inboundConnRateLimited: prometheus.NewCounter(prometheus.CounterOpts{ 103 Name: "inbound_conn_throttler_rate_limited", 104 Help: "Times this node rejected an inbound connection due to rate-limiting", 105 }), 106 nodeUptimeWeightedAverage: prometheus.NewGauge(prometheus.GaugeOpts{ 107 Name: "node_uptime_weighted_average", 108 Help: "This node's uptime average weighted by observing peer stakes", 109 }), 110 nodeUptimeRewardingStake: prometheus.NewGauge(prometheus.GaugeOpts{ 111 Name: "node_uptime_rewarding_stake", 112 Help: "The percentage of total stake which thinks this node is eligible for rewards", 113 }), 114 nodeSubnetUptimeWeightedAverage: prometheus.NewGaugeVec( 115 prometheus.GaugeOpts{ 116 Name: "node_subnet_uptime_weighted_average", 117 Help: "This node's subnet uptime averages weighted by observing subnet peer stakes", 118 }, 119 []string{"subnetID"}, 120 ), 121 nodeSubnetUptimeRewardingStake: prometheus.NewGaugeVec( 122 prometheus.GaugeOpts{ 123 Name: "node_subnet_uptime_rewarding_stake", 124 Help: "The percentage of subnet's total stake which thinks this node is eligible for subnet's rewards", 125 }, 126 []string{"subnetID"}, 127 ), 128 peerConnectedLifetimeAverage: prometheus.NewGauge( 129 prometheus.GaugeOpts{ 130 Name: "peer_connected_duration_average", 131 Help: "The average duration of all peer connections in nanoseconds", 132 }, 133 ), 134 peerConnectedStartTimes: make(map[ids.NodeID]float64), 135 } 136 137 err := errors.Join( 138 registerer.Register(m.numTracked), 139 registerer.Register(m.numPeers), 140 registerer.Register(m.numSubnetPeers), 141 registerer.Register(m.timeSinceLastMsgReceived), 142 registerer.Register(m.timeSinceLastMsgSent), 143 registerer.Register(m.sendFailRate), 144 registerer.Register(m.connected), 145 registerer.Register(m.disconnected), 146 registerer.Register(m.acceptFailed), 147 registerer.Register(m.inboundConnAllowed), 148 registerer.Register(m.tlsConnRejected), 149 registerer.Register(m.numUselessPeerListBytes), 150 registerer.Register(m.inboundConnRateLimited), 151 registerer.Register(m.nodeUptimeWeightedAverage), 152 registerer.Register(m.nodeUptimeRewardingStake), 153 registerer.Register(m.nodeSubnetUptimeWeightedAverage), 154 registerer.Register(m.nodeSubnetUptimeRewardingStake), 155 registerer.Register(m.peerConnectedLifetimeAverage), 156 ) 157 158 // init subnet tracker metrics with tracked subnets 159 for subnetID := range trackedSubnets { 160 // initialize to 0 161 subnetIDStr := subnetID.String() 162 m.numSubnetPeers.WithLabelValues(subnetIDStr).Set(0) 163 m.nodeSubnetUptimeWeightedAverage.WithLabelValues(subnetIDStr).Set(0) 164 m.nodeSubnetUptimeRewardingStake.WithLabelValues(subnetIDStr).Set(0) 165 } 166 167 return m, err 168 } 169 170 func (m *metrics) markConnected(peer peer.Peer) { 171 m.numPeers.Inc() 172 m.connected.Inc() 173 174 trackedSubnets := peer.TrackedSubnets() 175 for subnetID := range m.trackedSubnets { 176 if trackedSubnets.Contains(subnetID) { 177 m.numSubnetPeers.WithLabelValues(subnetID.String()).Inc() 178 } 179 } 180 181 m.lock.Lock() 182 defer m.lock.Unlock() 183 184 now := float64(time.Now().UnixNano()) 185 m.peerConnectedStartTimes[peer.ID()] = now 186 m.peerConnectedStartTimesSum += now 187 } 188 189 func (m *metrics) markDisconnected(peer peer.Peer) { 190 m.numPeers.Dec() 191 m.disconnected.Inc() 192 193 trackedSubnets := peer.TrackedSubnets() 194 for subnetID := range m.trackedSubnets { 195 if trackedSubnets.Contains(subnetID) { 196 m.numSubnetPeers.WithLabelValues(subnetID.String()).Dec() 197 } 198 } 199 200 m.lock.Lock() 201 defer m.lock.Unlock() 202 203 peerID := peer.ID() 204 start := m.peerConnectedStartTimes[peerID] 205 m.peerConnectedStartTimesSum -= start 206 207 delete(m.peerConnectedStartTimes, peerID) 208 } 209 210 func (m *metrics) updatePeerConnectionLifetimeMetrics() { 211 m.lock.RLock() 212 defer m.lock.RUnlock() 213 214 avg := float64(0) 215 if n := len(m.peerConnectedStartTimes); n > 0 { 216 avgStartTime := m.peerConnectedStartTimesSum / float64(n) 217 avg = float64(time.Now().UnixNano()) - avgStartTime 218 } 219 220 m.peerConnectedLifetimeAverage.Set(avg) 221 }