github.com/MetalBlockchain/metalgo@v1.11.9/network/p2p/peer_tracker.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package p2p 5 6 import ( 7 "errors" 8 "math" 9 "math/rand" 10 "sync" 11 "time" 12 13 "github.com/prometheus/client_golang/prometheus" 14 "go.uber.org/zap" 15 16 "github.com/MetalBlockchain/metalgo/ids" 17 "github.com/MetalBlockchain/metalgo/utils/heap" 18 "github.com/MetalBlockchain/metalgo/utils/logging" 19 "github.com/MetalBlockchain/metalgo/utils/set" 20 "github.com/MetalBlockchain/metalgo/version" 21 22 safemath "github.com/MetalBlockchain/metalgo/utils/math" 23 ) 24 25 const ( 26 bandwidthHalflife = 5 * time.Minute 27 28 // controls how eagerly we connect to new peers vs. using peers with known 29 // good response bandwidth. 30 desiredMinResponsivePeers = 20 31 newPeerConnectFactor = 0.1 32 33 // The probability that, when we select a peer, we select randomly rather 34 // than based on their performance. 35 randomPeerProbability = 0.2 36 ) 37 38 // Tracks the bandwidth of responses coming from peers, 39 // preferring to contact peers with known good bandwidth, connecting 40 // to new peers with an exponentially decaying probability. 41 type PeerTracker struct { 42 // Lock to protect concurrent access to the peer tracker 43 lock sync.RWMutex 44 // Peers that we're connected to that we haven't sent a request to since we 45 // most recently connected to them. 46 untrackedPeers set.Set[ids.NodeID] 47 // Peers that we're connected to that we've sent a request to since we most 48 // recently connected to them. 49 trackedPeers set.Set[ids.NodeID] 50 // Peers that we're connected to that responded to the last request they 51 // were sent. 52 responsivePeers set.Set[ids.NodeID] 53 // Bandwidth of peers that we have measured. 54 peerBandwidth map[ids.NodeID]safemath.Averager 55 // Max heap that contains the average bandwidth of peers that do not have an 56 // outstanding request. 57 bandwidthHeap heap.Map[ids.NodeID, safemath.Averager] 58 // Average bandwidth is only used for metrics. 59 averageBandwidth safemath.Averager 60 61 // The below fields are assumed to be constant and are not protected by the 62 // lock. 63 log logging.Logger 64 ignoredNodes set.Set[ids.NodeID] 65 minVersion *version.Application 66 metrics peerTrackerMetrics 67 } 68 69 type peerTrackerMetrics struct { 70 numTrackedPeers prometheus.Gauge 71 numResponsivePeers prometheus.Gauge 72 averageBandwidth prometheus.Gauge 73 } 74 75 func NewPeerTracker( 76 log logging.Logger, 77 metricsNamespace string, 78 registerer prometheus.Registerer, 79 ignoredNodes set.Set[ids.NodeID], 80 minVersion *version.Application, 81 ) (*PeerTracker, error) { 82 t := &PeerTracker{ 83 peerBandwidth: make(map[ids.NodeID]safemath.Averager), 84 bandwidthHeap: heap.NewMap[ids.NodeID, safemath.Averager](func(a, b safemath.Averager) bool { 85 return a.Read() > b.Read() 86 }), 87 averageBandwidth: safemath.NewAverager(0, bandwidthHalflife, time.Now()), 88 log: log, 89 ignoredNodes: ignoredNodes, 90 minVersion: minVersion, 91 metrics: peerTrackerMetrics{ 92 numTrackedPeers: prometheus.NewGauge( 93 prometheus.GaugeOpts{ 94 Namespace: metricsNamespace, 95 Name: "num_tracked_peers", 96 Help: "number of tracked peers", 97 }, 98 ), 99 numResponsivePeers: prometheus.NewGauge( 100 prometheus.GaugeOpts{ 101 Namespace: metricsNamespace, 102 Name: "num_responsive_peers", 103 Help: "number of responsive peers", 104 }, 105 ), 106 averageBandwidth: prometheus.NewGauge( 107 prometheus.GaugeOpts{ 108 Namespace: metricsNamespace, 109 Name: "average_bandwidth", 110 Help: "average sync bandwidth used by peers", 111 }, 112 ), 113 }, 114 } 115 116 err := errors.Join( 117 registerer.Register(t.metrics.numTrackedPeers), 118 registerer.Register(t.metrics.numResponsivePeers), 119 registerer.Register(t.metrics.averageBandwidth), 120 ) 121 return t, err 122 } 123 124 // Returns true if: 125 // - We have not observed the desired minimum number of responsive peers. 126 // - Randomly with the frequency decreasing as the number of responsive peers 127 // increases. 128 // 129 // Assumes the read lock is held. 130 func (p *PeerTracker) shouldSelectUntrackedPeer() bool { 131 numResponsivePeers := p.responsivePeers.Len() 132 if numResponsivePeers < desiredMinResponsivePeers { 133 return true 134 } 135 if p.untrackedPeers.Len() == 0 { 136 return false // already tracking all peers 137 } 138 139 // TODO danlaine: we should consider tuning this probability function. 140 // With [newPeerConnectFactor] as 0.1 the probabilities are: 141 // 142 // numResponsivePeers | probability 143 // 100 | 4.5399929762484854e-05 144 // 200 | 2.061153622438558e-09 145 // 500 | 1.9287498479639178e-22 146 // 1000 | 3.720075976020836e-44 147 // 2000 | 1.3838965267367376e-87 148 // 5000 | 7.124576406741286e-218 149 // 150 // In other words, the probability drops off extremely quickly. 151 newPeerProbability := math.Exp(-float64(numResponsivePeers) * newPeerConnectFactor) 152 return rand.Float64() < newPeerProbability // #nosec G404 153 } 154 155 // SelectPeer that we could send a request to. 156 // 157 // If we should track more peers, returns a random untracked peer, if any exist. 158 // Otherwise, with probability [randomPeerProbability] returns a random peer 159 // from [p.responsivePeers]. 160 // With probability [1-randomPeerProbability] returns the peer in 161 // [p.bandwidthHeap] with the highest bandwidth. 162 // 163 // Returns false if there are no connected peers. 164 func (p *PeerTracker) SelectPeer() (ids.NodeID, bool) { 165 p.lock.RLock() 166 defer p.lock.RUnlock() 167 168 if p.shouldSelectUntrackedPeer() { 169 if nodeID, ok := p.untrackedPeers.Peek(); ok { 170 p.log.Debug("selecting peer", 171 zap.String("reason", "untracked"), 172 zap.Stringer("nodeID", nodeID), 173 zap.Int("trackedPeers", p.trackedPeers.Len()), 174 zap.Int("responsivePeers", p.responsivePeers.Len()), 175 ) 176 return nodeID, true 177 } 178 } 179 180 useBandwidthHeap := rand.Float64() > randomPeerProbability // #nosec G404 181 if useBandwidthHeap { 182 if nodeID, bandwidth, ok := p.bandwidthHeap.Peek(); ok { 183 p.log.Debug("selecting peer", 184 zap.String("reason", "bandwidth"), 185 zap.Stringer("nodeID", nodeID), 186 zap.Float64("bandwidth", bandwidth.Read()), 187 ) 188 return nodeID, true 189 } 190 } else { 191 if nodeID, ok := p.responsivePeers.Peek(); ok { 192 p.log.Debug("selecting peer", 193 zap.String("reason", "responsive"), 194 zap.Stringer("nodeID", nodeID), 195 ) 196 return nodeID, true 197 } 198 } 199 200 if nodeID, ok := p.trackedPeers.Peek(); ok { 201 p.log.Debug("selecting peer", 202 zap.String("reason", "tracked"), 203 zap.Stringer("nodeID", nodeID), 204 zap.Bool("checkedBandwidthHeap", useBandwidthHeap), 205 ) 206 return nodeID, true 207 } 208 209 // We're not connected to any peers. 210 return ids.EmptyNodeID, false 211 } 212 213 // Record that we sent a request to [nodeID]. 214 // 215 // Removes the peer's bandwidth averager from the bandwidth heap. 216 func (p *PeerTracker) RegisterRequest(nodeID ids.NodeID) { 217 p.lock.Lock() 218 defer p.lock.Unlock() 219 220 p.untrackedPeers.Remove(nodeID) 221 p.trackedPeers.Add(nodeID) 222 p.bandwidthHeap.Remove(nodeID) 223 224 p.metrics.numTrackedPeers.Set(float64(p.trackedPeers.Len())) 225 } 226 227 // Record that we observed that [nodeID]'s bandwidth is [bandwidth]. 228 // 229 // Adds the peer's bandwidth averager to the bandwidth heap. 230 func (p *PeerTracker) RegisterResponse(nodeID ids.NodeID, bandwidth float64) { 231 p.updateBandwidth(nodeID, bandwidth, true) 232 } 233 234 // Record that a request failed to [nodeID]. 235 // 236 // Adds the peer's bandwidth averager to the bandwidth heap. 237 func (p *PeerTracker) RegisterFailure(nodeID ids.NodeID) { 238 p.updateBandwidth(nodeID, 0, false) 239 } 240 241 func (p *PeerTracker) updateBandwidth(nodeID ids.NodeID, bandwidth float64, responsive bool) { 242 p.lock.Lock() 243 defer p.lock.Unlock() 244 245 if !p.trackedPeers.Contains(nodeID) { 246 // we're not tracking this peer, nothing to do here 247 p.log.Debug("tracking bandwidth for untracked peer", 248 zap.Stringer("nodeID", nodeID), 249 ) 250 return 251 } 252 253 now := time.Now() 254 peerBandwidth, ok := p.peerBandwidth[nodeID] 255 if ok { 256 peerBandwidth.Observe(bandwidth, now) 257 } else { 258 peerBandwidth = safemath.NewAverager(bandwidth, bandwidthHalflife, now) 259 p.peerBandwidth[nodeID] = peerBandwidth 260 } 261 p.bandwidthHeap.Push(nodeID, peerBandwidth) 262 p.averageBandwidth.Observe(bandwidth, now) 263 264 if responsive { 265 p.responsivePeers.Add(nodeID) 266 } else { 267 p.responsivePeers.Remove(nodeID) 268 } 269 270 p.metrics.numResponsivePeers.Set(float64(p.responsivePeers.Len())) 271 p.metrics.averageBandwidth.Set(p.averageBandwidth.Read()) 272 } 273 274 // Connected should be called when [nodeID] connects to this node. 275 func (p *PeerTracker) Connected(nodeID ids.NodeID, nodeVersion *version.Application) { 276 // If this peer should be ignored, don't mark it as connected. 277 if p.ignoredNodes.Contains(nodeID) { 278 return 279 } 280 // If minVersion is specified and peer's version is less, don't mark it as 281 // connected. 282 if p.minVersion != nil && nodeVersion.Compare(p.minVersion) < 0 { 283 return 284 } 285 286 p.lock.Lock() 287 defer p.lock.Unlock() 288 289 p.untrackedPeers.Add(nodeID) 290 } 291 292 // Disconnected should be called when [nodeID] disconnects from this node. 293 func (p *PeerTracker) Disconnected(nodeID ids.NodeID) { 294 p.lock.Lock() 295 defer p.lock.Unlock() 296 297 // Because of the checks performed in Connected, it's possible that this 298 // node was never marked as connected here. However, all of the below 299 // functions are noops if called with a peer that was never marked as 300 // connected. 301 p.untrackedPeers.Remove(nodeID) 302 p.trackedPeers.Remove(nodeID) 303 p.responsivePeers.Remove(nodeID) 304 delete(p.peerBandwidth, nodeID) 305 p.bandwidthHeap.Remove(nodeID) 306 307 p.metrics.numTrackedPeers.Set(float64(p.trackedPeers.Len())) 308 p.metrics.numResponsivePeers.Set(float64(p.responsivePeers.Len())) 309 } 310 311 // Returns the number of peers the node is connected to. 312 func (p *PeerTracker) Size() int { 313 p.lock.RLock() 314 defer p.lock.RUnlock() 315 316 return p.untrackedPeers.Len() + p.trackedPeers.Len() 317 }