github.com/MetalBlockchain/metalgo@v1.11.9/network/metrics.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package network 5 6 import ( 7 "errors" 8 "sync" 9 "time" 10 11 "github.com/prometheus/client_golang/prometheus" 12 13 "github.com/MetalBlockchain/metalgo/ids" 14 "github.com/MetalBlockchain/metalgo/network/peer" 15 "github.com/MetalBlockchain/metalgo/utils/set" 16 ) 17 18 type metrics struct { 19 // trackedSubnets does not include the primary network ID 20 trackedSubnets set.Set[ids.ID] 21 22 numTracked prometheus.Gauge 23 numPeers prometheus.Gauge 24 numSubnetPeers *prometheus.GaugeVec 25 timeSinceLastMsgSent prometheus.Gauge 26 timeSinceLastMsgReceived prometheus.Gauge 27 sendFailRate prometheus.Gauge 28 connected prometheus.Counter 29 disconnected prometheus.Counter 30 acceptFailed prometheus.Counter 31 inboundConnRateLimited prometheus.Counter 32 inboundConnAllowed prometheus.Counter 33 tlsConnRejected prometheus.Counter 34 numUselessPeerListBytes prometheus.Counter 35 nodeUptimeWeightedAverage prometheus.Gauge 36 nodeUptimeRewardingStake prometheus.Gauge 37 nodeSubnetUptimeWeightedAverage *prometheus.GaugeVec 38 nodeSubnetUptimeRewardingStake *prometheus.GaugeVec 39 peerConnectedLifetimeAverage prometheus.Gauge 40 41 lock sync.RWMutex 42 peerConnectedStartTimes map[ids.NodeID]float64 43 peerConnectedStartTimesSum float64 44 } 45 46 func newMetrics( 47 registerer prometheus.Registerer, 48 trackedSubnets set.Set[ids.ID], 49 ) (*metrics, error) { 50 m := &metrics{ 51 trackedSubnets: trackedSubnets, 52 numPeers: prometheus.NewGauge(prometheus.GaugeOpts{ 53 Name: "peers", 54 Help: "Number of network peers", 55 }), 56 numTracked: prometheus.NewGauge(prometheus.GaugeOpts{ 57 Name: "tracked", 58 Help: "Number of currently tracked IPs attempting to be connected to", 59 }), 60 numSubnetPeers: prometheus.NewGaugeVec( 61 prometheus.GaugeOpts{ 62 Name: "peers_subnet", 63 Help: "Number of peers that are validating a particular subnet", 64 }, 65 []string{"subnetID"}, 66 ), 67 timeSinceLastMsgReceived: prometheus.NewGauge(prometheus.GaugeOpts{ 68 Name: "time_since_last_msg_received", 69 Help: "Time (in ns) since the last msg was received", 70 }), 71 timeSinceLastMsgSent: prometheus.NewGauge(prometheus.GaugeOpts{ 72 Name: "time_since_last_msg_sent", 73 Help: "Time (in ns) since the last msg was sent", 74 }), 75 sendFailRate: prometheus.NewGauge(prometheus.GaugeOpts{ 76 Name: "send_fail_rate", 77 Help: "Portion of messages that recently failed to be sent over the network", 78 }), 79 connected: prometheus.NewCounter(prometheus.CounterOpts{ 80 Name: "times_connected", 81 Help: "Times this node successfully completed a handshake with a peer", 82 }), 83 disconnected: prometheus.NewCounter(prometheus.CounterOpts{ 84 Name: "times_disconnected", 85 Help: "Times this node disconnected from a peer it had completed a handshake with", 86 }), 87 acceptFailed: prometheus.NewCounter(prometheus.CounterOpts{ 88 Name: "accept_failed", 89 Help: "Times this node's listener failed to accept an inbound connection", 90 }), 91 inboundConnAllowed: prometheus.NewCounter(prometheus.CounterOpts{ 92 Name: "inbound_conn_throttler_allowed", 93 Help: "Times this node allowed (attempted to upgrade) an inbound connection", 94 }), 95 tlsConnRejected: prometheus.NewCounter(prometheus.CounterOpts{ 96 Name: "tls_conn_rejected", 97 Help: "Times this node rejected a connection due to an unsupported TLS certificate", 98 }), 99 numUselessPeerListBytes: prometheus.NewCounter(prometheus.CounterOpts{ 100 Name: "num_useless_peerlist_bytes", 101 Help: "Amount of useless bytes (i.e. information about nodes we already knew/don't want to connect to) received in PeerList messages", 102 }), 103 inboundConnRateLimited: prometheus.NewCounter(prometheus.CounterOpts{ 104 Name: "inbound_conn_throttler_rate_limited", 105 Help: "Times this node rejected an inbound connection due to rate-limiting", 106 }), 107 nodeUptimeWeightedAverage: prometheus.NewGauge(prometheus.GaugeOpts{ 108 Name: "node_uptime_weighted_average", 109 Help: "This node's uptime average weighted by observing peer stakes", 110 }), 111 nodeUptimeRewardingStake: prometheus.NewGauge(prometheus.GaugeOpts{ 112 Name: "node_uptime_rewarding_stake", 113 Help: "The percentage of total stake which thinks this node is eligible for rewards", 114 }), 115 nodeSubnetUptimeWeightedAverage: prometheus.NewGaugeVec( 116 prometheus.GaugeOpts{ 117 Name: "node_subnet_uptime_weighted_average", 118 Help: "This node's subnet uptime averages weighted by observing subnet peer stakes", 119 }, 120 []string{"subnetID"}, 121 ), 122 nodeSubnetUptimeRewardingStake: prometheus.NewGaugeVec( 123 prometheus.GaugeOpts{ 124 Name: "node_subnet_uptime_rewarding_stake", 125 Help: "The percentage of subnet's total stake which thinks this node is eligible for subnet's rewards", 126 }, 127 []string{"subnetID"}, 128 ), 129 peerConnectedLifetimeAverage: prometheus.NewGauge( 130 prometheus.GaugeOpts{ 131 Name: "peer_connected_duration_average", 132 Help: "The average duration of all peer connections in nanoseconds", 133 }, 134 ), 135 peerConnectedStartTimes: make(map[ids.NodeID]float64), 136 } 137 138 err := errors.Join( 139 registerer.Register(m.numTracked), 140 registerer.Register(m.numPeers), 141 registerer.Register(m.numSubnetPeers), 142 registerer.Register(m.timeSinceLastMsgReceived), 143 registerer.Register(m.timeSinceLastMsgSent), 144 registerer.Register(m.sendFailRate), 145 registerer.Register(m.connected), 146 registerer.Register(m.disconnected), 147 registerer.Register(m.acceptFailed), 148 registerer.Register(m.inboundConnAllowed), 149 registerer.Register(m.tlsConnRejected), 150 registerer.Register(m.numUselessPeerListBytes), 151 registerer.Register(m.inboundConnRateLimited), 152 registerer.Register(m.nodeUptimeWeightedAverage), 153 registerer.Register(m.nodeUptimeRewardingStake), 154 registerer.Register(m.nodeSubnetUptimeWeightedAverage), 155 registerer.Register(m.nodeSubnetUptimeRewardingStake), 156 registerer.Register(m.peerConnectedLifetimeAverage), 157 ) 158 159 // init subnet tracker metrics with tracked subnets 160 for subnetID := range trackedSubnets { 161 // initialize to 0 162 subnetIDStr := subnetID.String() 163 m.numSubnetPeers.WithLabelValues(subnetIDStr).Set(0) 164 m.nodeSubnetUptimeWeightedAverage.WithLabelValues(subnetIDStr).Set(0) 165 m.nodeSubnetUptimeRewardingStake.WithLabelValues(subnetIDStr).Set(0) 166 } 167 168 return m, err 169 } 170 171 func (m *metrics) markConnected(peer peer.Peer) { 172 m.numPeers.Inc() 173 m.connected.Inc() 174 175 trackedSubnets := peer.TrackedSubnets() 176 for subnetID := range m.trackedSubnets { 177 if trackedSubnets.Contains(subnetID) { 178 m.numSubnetPeers.WithLabelValues(subnetID.String()).Inc() 179 } 180 } 181 182 m.lock.Lock() 183 defer m.lock.Unlock() 184 185 now := float64(time.Now().UnixNano()) 186 m.peerConnectedStartTimes[peer.ID()] = now 187 m.peerConnectedStartTimesSum += now 188 } 189 190 func (m *metrics) markDisconnected(peer peer.Peer) { 191 m.numPeers.Dec() 192 m.disconnected.Inc() 193 194 trackedSubnets := peer.TrackedSubnets() 195 for subnetID := range m.trackedSubnets { 196 if trackedSubnets.Contains(subnetID) { 197 m.numSubnetPeers.WithLabelValues(subnetID.String()).Dec() 198 } 199 } 200 201 m.lock.Lock() 202 defer m.lock.Unlock() 203 204 peerID := peer.ID() 205 start := m.peerConnectedStartTimes[peerID] 206 m.peerConnectedStartTimesSum -= start 207 208 delete(m.peerConnectedStartTimes, peerID) 209 } 210 211 func (m *metrics) updatePeerConnectionLifetimeMetrics() { 212 m.lock.RLock() 213 defer m.lock.RUnlock() 214 215 avg := float64(0) 216 if n := len(m.peerConnectedStartTimes); n > 0 { 217 avgStartTime := m.peerConnectedStartTimesSum / float64(n) 218 avg = float64(time.Now().UnixNano()) - avgStartTime 219 } 220 221 m.peerConnectedLifetimeAverage.Set(avg) 222 }