github.com/MetalBlockchain/metalgo@v1.11.9/network/p2p/peer_tracker.go (about)

     1  // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
     2  // See the file LICENSE for licensing terms.
     3  
     4  package p2p
     5  
     6  import (
     7  	"errors"
     8  	"math"
     9  	"math/rand"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"go.uber.org/zap"
    15  
    16  	"github.com/MetalBlockchain/metalgo/ids"
    17  	"github.com/MetalBlockchain/metalgo/utils/heap"
    18  	"github.com/MetalBlockchain/metalgo/utils/logging"
    19  	"github.com/MetalBlockchain/metalgo/utils/set"
    20  	"github.com/MetalBlockchain/metalgo/version"
    21  
    22  	safemath "github.com/MetalBlockchain/metalgo/utils/math"
    23  )
    24  
    25  const (
    26  	bandwidthHalflife = 5 * time.Minute
    27  
    28  	// controls how eagerly we connect to new peers vs. using peers with known
    29  	// good response bandwidth.
    30  	desiredMinResponsivePeers = 20
    31  	newPeerConnectFactor      = 0.1
    32  
    33  	// The probability that, when we select a peer, we select randomly rather
    34  	// than based on their performance.
    35  	randomPeerProbability = 0.2
    36  )
    37  
    38  // Tracks the bandwidth of responses coming from peers,
    39  // preferring to contact peers with known good bandwidth, connecting
    40  // to new peers with an exponentially decaying probability.
    41  type PeerTracker struct {
    42  	// Lock to protect concurrent access to the peer tracker
    43  	lock sync.RWMutex
    44  	// Peers that we're connected to that we haven't sent a request to since we
    45  	// most recently connected to them.
    46  	untrackedPeers set.Set[ids.NodeID]
    47  	// Peers that we're connected to that we've sent a request to since we most
    48  	// recently connected to them.
    49  	trackedPeers set.Set[ids.NodeID]
    50  	// Peers that we're connected to that responded to the last request they
    51  	// were sent.
    52  	responsivePeers set.Set[ids.NodeID]
    53  	// Bandwidth of peers that we have measured.
    54  	peerBandwidth map[ids.NodeID]safemath.Averager
    55  	// Max heap that contains the average bandwidth of peers that do not have an
    56  	// outstanding request.
    57  	bandwidthHeap heap.Map[ids.NodeID, safemath.Averager]
    58  	// Average bandwidth is only used for metrics.
    59  	averageBandwidth safemath.Averager
    60  
    61  	// The below fields are assumed to be constant and are not protected by the
    62  	// lock.
    63  	log          logging.Logger
    64  	ignoredNodes set.Set[ids.NodeID]
    65  	minVersion   *version.Application
    66  	metrics      peerTrackerMetrics
    67  }
    68  
    69  type peerTrackerMetrics struct {
    70  	numTrackedPeers    prometheus.Gauge
    71  	numResponsivePeers prometheus.Gauge
    72  	averageBandwidth   prometheus.Gauge
    73  }
    74  
    75  func NewPeerTracker(
    76  	log logging.Logger,
    77  	metricsNamespace string,
    78  	registerer prometheus.Registerer,
    79  	ignoredNodes set.Set[ids.NodeID],
    80  	minVersion *version.Application,
    81  ) (*PeerTracker, error) {
    82  	t := &PeerTracker{
    83  		peerBandwidth: make(map[ids.NodeID]safemath.Averager),
    84  		bandwidthHeap: heap.NewMap[ids.NodeID, safemath.Averager](func(a, b safemath.Averager) bool {
    85  			return a.Read() > b.Read()
    86  		}),
    87  		averageBandwidth: safemath.NewAverager(0, bandwidthHalflife, time.Now()),
    88  		log:              log,
    89  		ignoredNodes:     ignoredNodes,
    90  		minVersion:       minVersion,
    91  		metrics: peerTrackerMetrics{
    92  			numTrackedPeers: prometheus.NewGauge(
    93  				prometheus.GaugeOpts{
    94  					Namespace: metricsNamespace,
    95  					Name:      "num_tracked_peers",
    96  					Help:      "number of tracked peers",
    97  				},
    98  			),
    99  			numResponsivePeers: prometheus.NewGauge(
   100  				prometheus.GaugeOpts{
   101  					Namespace: metricsNamespace,
   102  					Name:      "num_responsive_peers",
   103  					Help:      "number of responsive peers",
   104  				},
   105  			),
   106  			averageBandwidth: prometheus.NewGauge(
   107  				prometheus.GaugeOpts{
   108  					Namespace: metricsNamespace,
   109  					Name:      "average_bandwidth",
   110  					Help:      "average sync bandwidth used by peers",
   111  				},
   112  			),
   113  		},
   114  	}
   115  
   116  	err := errors.Join(
   117  		registerer.Register(t.metrics.numTrackedPeers),
   118  		registerer.Register(t.metrics.numResponsivePeers),
   119  		registerer.Register(t.metrics.averageBandwidth),
   120  	)
   121  	return t, err
   122  }
   123  
   124  // Returns true if:
   125  //   - We have not observed the desired minimum number of responsive peers.
   126  //   - Randomly with the frequency decreasing as the number of responsive peers
   127  //     increases.
   128  //
   129  // Assumes the read lock is held.
   130  func (p *PeerTracker) shouldSelectUntrackedPeer() bool {
   131  	numResponsivePeers := p.responsivePeers.Len()
   132  	if numResponsivePeers < desiredMinResponsivePeers {
   133  		return true
   134  	}
   135  	if p.untrackedPeers.Len() == 0 {
   136  		return false // already tracking all peers
   137  	}
   138  
   139  	// TODO danlaine: we should consider tuning this probability function.
   140  	// With [newPeerConnectFactor] as 0.1 the probabilities are:
   141  	//
   142  	// numResponsivePeers | probability
   143  	// 100                | 4.5399929762484854e-05
   144  	// 200                | 2.061153622438558e-09
   145  	// 500                | 1.9287498479639178e-22
   146  	// 1000               | 3.720075976020836e-44
   147  	// 2000               | 1.3838965267367376e-87
   148  	// 5000               | 7.124576406741286e-218
   149  	//
   150  	// In other words, the probability drops off extremely quickly.
   151  	newPeerProbability := math.Exp(-float64(numResponsivePeers) * newPeerConnectFactor)
   152  	return rand.Float64() < newPeerProbability // #nosec G404
   153  }
   154  
   155  // SelectPeer that we could send a request to.
   156  //
   157  // If we should track more peers, returns a random untracked peer, if any exist.
   158  // Otherwise, with probability [randomPeerProbability] returns a random peer
   159  // from [p.responsivePeers].
   160  // With probability [1-randomPeerProbability] returns the peer in
   161  // [p.bandwidthHeap] with the highest bandwidth.
   162  //
   163  // Returns false if there are no connected peers.
   164  func (p *PeerTracker) SelectPeer() (ids.NodeID, bool) {
   165  	p.lock.RLock()
   166  	defer p.lock.RUnlock()
   167  
   168  	if p.shouldSelectUntrackedPeer() {
   169  		if nodeID, ok := p.untrackedPeers.Peek(); ok {
   170  			p.log.Debug("selecting peer",
   171  				zap.String("reason", "untracked"),
   172  				zap.Stringer("nodeID", nodeID),
   173  				zap.Int("trackedPeers", p.trackedPeers.Len()),
   174  				zap.Int("responsivePeers", p.responsivePeers.Len()),
   175  			)
   176  			return nodeID, true
   177  		}
   178  	}
   179  
   180  	useBandwidthHeap := rand.Float64() > randomPeerProbability // #nosec G404
   181  	if useBandwidthHeap {
   182  		if nodeID, bandwidth, ok := p.bandwidthHeap.Peek(); ok {
   183  			p.log.Debug("selecting peer",
   184  				zap.String("reason", "bandwidth"),
   185  				zap.Stringer("nodeID", nodeID),
   186  				zap.Float64("bandwidth", bandwidth.Read()),
   187  			)
   188  			return nodeID, true
   189  		}
   190  	} else {
   191  		if nodeID, ok := p.responsivePeers.Peek(); ok {
   192  			p.log.Debug("selecting peer",
   193  				zap.String("reason", "responsive"),
   194  				zap.Stringer("nodeID", nodeID),
   195  			)
   196  			return nodeID, true
   197  		}
   198  	}
   199  
   200  	if nodeID, ok := p.trackedPeers.Peek(); ok {
   201  		p.log.Debug("selecting peer",
   202  			zap.String("reason", "tracked"),
   203  			zap.Stringer("nodeID", nodeID),
   204  			zap.Bool("checkedBandwidthHeap", useBandwidthHeap),
   205  		)
   206  		return nodeID, true
   207  	}
   208  
   209  	// We're not connected to any peers.
   210  	return ids.EmptyNodeID, false
   211  }
   212  
   213  // Record that we sent a request to [nodeID].
   214  //
   215  // Removes the peer's bandwidth averager from the bandwidth heap.
   216  func (p *PeerTracker) RegisterRequest(nodeID ids.NodeID) {
   217  	p.lock.Lock()
   218  	defer p.lock.Unlock()
   219  
   220  	p.untrackedPeers.Remove(nodeID)
   221  	p.trackedPeers.Add(nodeID)
   222  	p.bandwidthHeap.Remove(nodeID)
   223  
   224  	p.metrics.numTrackedPeers.Set(float64(p.trackedPeers.Len()))
   225  }
   226  
   227  // Record that we observed that [nodeID]'s bandwidth is [bandwidth].
   228  //
   229  // Adds the peer's bandwidth averager to the bandwidth heap.
   230  func (p *PeerTracker) RegisterResponse(nodeID ids.NodeID, bandwidth float64) {
   231  	p.updateBandwidth(nodeID, bandwidth, true)
   232  }
   233  
   234  // Record that a request failed to [nodeID].
   235  //
   236  // Adds the peer's bandwidth averager to the bandwidth heap.
   237  func (p *PeerTracker) RegisterFailure(nodeID ids.NodeID) {
   238  	p.updateBandwidth(nodeID, 0, false)
   239  }
   240  
   241  func (p *PeerTracker) updateBandwidth(nodeID ids.NodeID, bandwidth float64, responsive bool) {
   242  	p.lock.Lock()
   243  	defer p.lock.Unlock()
   244  
   245  	if !p.trackedPeers.Contains(nodeID) {
   246  		// we're not tracking this peer, nothing to do here
   247  		p.log.Debug("tracking bandwidth for untracked peer",
   248  			zap.Stringer("nodeID", nodeID),
   249  		)
   250  		return
   251  	}
   252  
   253  	now := time.Now()
   254  	peerBandwidth, ok := p.peerBandwidth[nodeID]
   255  	if ok {
   256  		peerBandwidth.Observe(bandwidth, now)
   257  	} else {
   258  		peerBandwidth = safemath.NewAverager(bandwidth, bandwidthHalflife, now)
   259  		p.peerBandwidth[nodeID] = peerBandwidth
   260  	}
   261  	p.bandwidthHeap.Push(nodeID, peerBandwidth)
   262  	p.averageBandwidth.Observe(bandwidth, now)
   263  
   264  	if responsive {
   265  		p.responsivePeers.Add(nodeID)
   266  	} else {
   267  		p.responsivePeers.Remove(nodeID)
   268  	}
   269  
   270  	p.metrics.numResponsivePeers.Set(float64(p.responsivePeers.Len()))
   271  	p.metrics.averageBandwidth.Set(p.averageBandwidth.Read())
   272  }
   273  
   274  // Connected should be called when [nodeID] connects to this node.
   275  func (p *PeerTracker) Connected(nodeID ids.NodeID, nodeVersion *version.Application) {
   276  	// If this peer should be ignored, don't mark it as connected.
   277  	if p.ignoredNodes.Contains(nodeID) {
   278  		return
   279  	}
   280  	// If minVersion is specified and peer's version is less, don't mark it as
   281  	// connected.
   282  	if p.minVersion != nil && nodeVersion.Compare(p.minVersion) < 0 {
   283  		return
   284  	}
   285  
   286  	p.lock.Lock()
   287  	defer p.lock.Unlock()
   288  
   289  	p.untrackedPeers.Add(nodeID)
   290  }
   291  
   292  // Disconnected should be called when [nodeID] disconnects from this node.
   293  func (p *PeerTracker) Disconnected(nodeID ids.NodeID) {
   294  	p.lock.Lock()
   295  	defer p.lock.Unlock()
   296  
   297  	// Because of the checks performed in Connected, it's possible that this
   298  	// node was never marked as connected here. However, all of the below
   299  	// functions are noops if called with a peer that was never marked as
   300  	// connected.
   301  	p.untrackedPeers.Remove(nodeID)
   302  	p.trackedPeers.Remove(nodeID)
   303  	p.responsivePeers.Remove(nodeID)
   304  	delete(p.peerBandwidth, nodeID)
   305  	p.bandwidthHeap.Remove(nodeID)
   306  
   307  	p.metrics.numTrackedPeers.Set(float64(p.trackedPeers.Len()))
   308  	p.metrics.numResponsivePeers.Set(float64(p.responsivePeers.Len()))
   309  }
   310  
   311  // Returns the number of peers the node is connected to.
   312  func (p *PeerTracker) Size() int {
   313  	p.lock.RLock()
   314  	defer p.lock.RUnlock()
   315  
   316  	return p.untrackedPeers.Len() + p.trackedPeers.Len()
   317  }