github.com/MetalBlockchain/metalgo@v1.11.9/network/network.go (about) 1 // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 2 // See the file LICENSE for licensing terms. 3 4 package network 5 6 import ( 7 "context" 8 "errors" 9 "fmt" 10 "math" 11 "net" 12 "net/netip" 13 "strings" 14 "sync" 15 "sync/atomic" 16 "time" 17 18 "github.com/pires/go-proxyproto" 19 "github.com/prometheus/client_golang/prometheus" 20 "go.uber.org/zap" 21 22 "github.com/MetalBlockchain/metalgo/api/health" 23 "github.com/MetalBlockchain/metalgo/genesis" 24 "github.com/MetalBlockchain/metalgo/ids" 25 "github.com/MetalBlockchain/metalgo/message" 26 "github.com/MetalBlockchain/metalgo/network/dialer" 27 "github.com/MetalBlockchain/metalgo/network/peer" 28 "github.com/MetalBlockchain/metalgo/network/throttling" 29 "github.com/MetalBlockchain/metalgo/snow/engine/common" 30 "github.com/MetalBlockchain/metalgo/snow/networking/router" 31 "github.com/MetalBlockchain/metalgo/snow/networking/sender" 32 "github.com/MetalBlockchain/metalgo/subnets" 33 "github.com/MetalBlockchain/metalgo/utils/bloom" 34 "github.com/MetalBlockchain/metalgo/utils/constants" 35 "github.com/MetalBlockchain/metalgo/utils/ips" 36 "github.com/MetalBlockchain/metalgo/utils/logging" 37 "github.com/MetalBlockchain/metalgo/utils/set" 38 "github.com/MetalBlockchain/metalgo/utils/wrappers" 39 "github.com/MetalBlockchain/metalgo/version" 40 41 safemath "github.com/MetalBlockchain/metalgo/utils/math" 42 ) 43 44 const ( 45 ConnectedPeersKey = "connectedPeers" 46 TimeSinceLastMsgReceivedKey = "timeSinceLastMsgReceived" 47 TimeSinceLastMsgSentKey = "timeSinceLastMsgSent" 48 SendFailRateKey = "sendFailRate" 49 ) 50 51 var ( 52 _ Network = (*network)(nil) 53 54 errNotValidator = errors.New("node is not a validator") 55 errNotTracked = errors.New("subnet is not tracked") 56 errExpectedProxy = errors.New("expected proxy") 57 errExpectedTCPProtocol = errors.New("expected TCP protocol") 58 ) 59 60 // Network defines the functionality 
// Network defines the functionality of the networking library.
type Network interface {
	// All consensus messages can be sent through this interface. Thread safety
	// must be managed internally in the network.
	sender.ExternalSender

	// Has a health check
	health.Checker

	peer.Network

	// StartClose this network and all existing connections it has. Calling
	// StartClose multiple times is handled gracefully.
	StartClose()

	// Should only be called once, will run until either a fatal error occurs,
	// or the network is closed.
	Dispatch() error

	// Attempt to connect to this IP. The network will never stop attempting to
	// connect to this ID.
	ManuallyTrack(nodeID ids.NodeID, ip netip.AddrPort)

	// PeerInfo returns information about peers. If [nodeIDs] is empty, returns
	// info about all peers that have finished the handshake. Otherwise, returns
	// info about the peers in [nodeIDs] that have finished the handshake.
	PeerInfo(nodeIDs []ids.NodeID) []peer.Info

	// NodeUptime returns given node's [subnetID] UptimeResults in the view of
	// this node's peer validators.
	NodeUptime(subnetID ids.ID) (UptimeResult, error)
}

// UptimeResult summarizes how this node's uptime is perceived by its peer
// validators, weighted by stake.
type UptimeResult struct {
	// RewardingStakePercentage shows what percent of network stake thinks we're
	// above the uptime requirement.
	RewardingStakePercentage float64

	// WeightedAveragePercentage is the average perceived uptime of this node,
	// weighted by stake.
	// Note that this is different from RewardingStakePercentage, which shows
	// the percent of the network stake that thinks this node is above the
	// uptime requirement. WeightedAveragePercentage is weighted by uptime.
	// i.e. if the uptime requirement is 85 and a peer reports 40 percent it
	// will be counted (40*weight) in WeightedAveragePercentage but not in
	// RewardingStakePercentage since 40 < 85
	WeightedAveragePercentage float64
}

// To avoid potential deadlocks, we maintain that locks must be grabbed in the
// following order:
//
//  1. peersLock
//  2. manuallyTrackedIDsLock
//
// If a higher lock (e.g. manuallyTrackedIDsLock) is held when trying to grab a
// lower lock (e.g. peersLock) a deadlock could occur.
type network struct {
	config     *Config
	peerConfig *peer.Config
	metrics    *metrics

	outboundMsgThrottler throttling.OutboundMsgThrottler

	// Limits the number of connection attempts based on IP.
	inboundConnUpgradeThrottler throttling.InboundConnUpgradeThrottler
	// Listens for and accepts new inbound connections
	listener net.Listener
	// Makes new outbound connections
	dialer dialer.Dialer
	// Does TLS handshakes for inbound connections
	serverUpgrader peer.Upgrader
	// Does TLS handshakes for outbound connections
	clientUpgrader peer.Upgrader

	// ensures the close of the network only happens once.
	closeOnce sync.Once
	// Cancelled on close
	onCloseCtx context.Context
	// Call [onCloseCtxCancel] to cancel [onCloseCtx] during close()
	onCloseCtxCancel context.CancelFunc

	// Exponentially-weighted moving average of message send failures; read by
	// HealthCheck.
	sendFailRateCalculator safemath.Averager

	// Tracks which peers know about which peers
	ipTracker *ipTracker
	// Guards trackedIPs, connectingPeers, connectedPeers, and closing.
	peersLock sync.RWMutex
	// trackedIPs contains the set of IPs that we are currently attempting to
	// connect to. An entry is added to this set when we first start attempting
	// to connect to the peer. An entry is deleted from this set once we have
	// finished the handshake.
	trackedIPs map[ids.NodeID]*trackedIP
	// Peers that have an in-flight handshake.
	connectingPeers peer.Set
	// Peers that have completed the handshake.
	connectedPeers peer.Set
	// Set once StartClose has begun; new upgraded connections are dropped.
	closing bool

	// router is notified about all peer [Connected] and [Disconnected] events
	// as well as all non-handshake peer messages.
	//
	// It is ensured that [Connected] and [Disconnected] are called in
	// consistent ways. Specifically, a peer starts in the disconnected
	// state and the network can change the peer's state from disconnected to
	// connected and back.
	//
	// It is ensured that [HandleInbound] is only called with a message from a
	// peer that is in the connected state.
	//
	// It is expected that the implementation of this interface can handle
	// concurrent calls to [Connected], [Disconnected], and [HandleInbound].
	router router.ExternalHandler
}

// NewNetwork returns a new Network implementation with the provided parameters.
func NewNetwork(
	config *Config,
	msgCreator message.Creator,
	metricsRegisterer prometheus.Registerer,
	log logging.Logger,
	listener net.Listener,
	dialer dialer.Dialer,
	router router.ExternalHandler,
) (Network, error) {
	if config.ProxyEnabled {
		// Wrap the listener to process the proxy header.
		listener = &proxyproto.Listener{
			Listener: listener,
			Policy: func(net.Addr) (proxyproto.Policy, error) {
				// Do not perform any fuzzy matching, the header must be
				// provided.
				return proxyproto.REQUIRE, nil
			},
			ValidateHeader: func(h *proxyproto.Header) error {
				if !h.Command.IsProxy() {
					return errExpectedProxy
				}
				if h.TransportProtocol != proxyproto.TCPv4 && h.TransportProtocol != proxyproto.TCPv6 {
					return errExpectedTCPProtocol
				}
				return nil
			},
			ReadHeaderTimeout: config.ProxyReadHeaderTimeout,
		}
	}

	inboundMsgThrottler, err := throttling.NewInboundMsgThrottler(
		log,
		metricsRegisterer,
		config.Validators,
		config.ThrottlerConfig.InboundMsgThrottlerConfig,
		config.ResourceTracker,
		config.CPUTargeter,
		config.DiskTargeter,
	)
	if err != nil {
		return nil, fmt.Errorf("initializing inbound message throttler failed with: %w", err)
	}

	outboundMsgThrottler, err := throttling.NewSybilOutboundMsgThrottler(
		log,
		metricsRegisterer,
		config.Validators,
		config.ThrottlerConfig.OutboundMsgThrottlerConfig,
	)
	if err != nil {
		return nil, fmt.Errorf("initializing outbound message throttler failed with: %w", err)
	}

	peerMetrics, err := peer.NewMetrics(metricsRegisterer)
	if err != nil {
		return nil, fmt.Errorf("initializing peer metrics failed with: %w", err)
	}

	metrics, err := newMetrics(metricsRegisterer, config.TrackedSubnets)
	if err != nil {
		return nil, fmt.Errorf("initializing network metrics failed with: %w", err)
	}

	ipTracker, err := newIPTracker(log, metricsRegisterer)
	if err != nil {
		return nil, fmt.Errorf("initializing ip tracker failed with: %w", err)
	}
	// Keep the IP tracker's validator view in sync with the primary network
	// validator set.
	config.Validators.RegisterSetCallbackListener(constants.PrimaryNetworkID, ipTracker)

	// Track all default bootstrappers to ensure their current IPs are gossiped
	// like validator IPs.
	for _, bootstrapper := range genesis.GetBootstrappers(config.NetworkID) {
		ipTracker.ManuallyGossip(bootstrapper.ID)
	}
	// Track all recent validators to optimistically connect to them before the
	// P-chain has finished syncing.
	for nodeID := range genesis.GetValidators(config.NetworkID) {
		ipTracker.ManuallyTrack(nodeID)
	}

	peerConfig := &peer.Config{
		ReadBufferSize:  config.PeerReadBufferSize,
		WriteBufferSize: config.PeerWriteBufferSize,
		Metrics:         peerMetrics,
		MessageCreator:  msgCreator,

		Log:                  log,
		InboundMsgThrottler:  inboundMsgThrottler,
		Network:              nil, // This is set below.
		Router:               router,
		VersionCompatibility: version.GetCompatibility(config.NetworkID),
		MySubnets:            config.TrackedSubnets,
		Beacons:              config.Beacons,
		Validators:           config.Validators,
		NetworkID:            config.NetworkID,
		PingFrequency:        config.PingFrequency,
		PongTimeout:          config.PingPongTimeout,
		MaxClockDifference:   config.MaxClockDifference,
		SupportedACPs:        config.SupportedACPs.List(),
		ObjectedACPs:         config.ObjectedACPs.List(),
		ResourceTracker:      config.ResourceTracker,
		UptimeCalculator:     config.UptimeCalculator,
		IPSigner:             peer.NewIPSigner(config.MyIPPort, config.TLSKey, config.BLSKey),
	}

	onCloseCtx, cancel := context.WithCancel(context.Background())
	n := &network{
		config:               config,
		peerConfig:           peerConfig,
		metrics:              metrics,
		outboundMsgThrottler: outboundMsgThrottler,

		inboundConnUpgradeThrottler: throttling.NewInboundConnUpgradeThrottler(log, config.ThrottlerConfig.InboundConnUpgradeThrottlerConfig),
		listener:                    listener,
		dialer:                      dialer,
		serverUpgrader:              peer.NewTLSServerUpgrader(config.TLSConfig, metrics.tlsConnRejected),
		clientUpgrader:              peer.NewTLSClientUpgrader(config.TLSConfig, metrics.tlsConnRejected),

		onCloseCtx:       onCloseCtx,
		onCloseCtxCancel: cancel,

		sendFailRateCalculator: safemath.NewSyncAverager(safemath.NewAverager(
			0,
			config.SendFailRateHalflife,
			time.Now(),
		)),

		trackedIPs:      make(map[ids.NodeID]*trackedIP),
		ipTracker:       ipTracker,
		connectingPeers: peer.NewSet(),
		connectedPeers:  peer.NewSet(),
		router:          router,
	}
	// Break the construction cycle: peers created from this config call back
	// into the network.
	n.peerConfig.Network = n
	return n, nil
}

// Send delivers [msg] to the peers named in [config.NodeIDs] plus an extra
// sample of connected peers, both filtered by [subnetID] membership and
// [allower]. It returns the set of nodeIDs the message was queued to.
func (n *network) Send(
	msg message.OutboundMessage,
	config common.SendConfig,
	subnetID ids.ID,
	allower subnets.Allower,
) set.Set[ids.NodeID] {
	namedPeers := n.getPeers(config.NodeIDs, subnetID, allower)
	// Any explicitly requested peer that we aren't connected to (or that was
	// filtered out) counts as a failed send.
	n.peerConfig.Metrics.MultipleSendsFailed(
		msg.Op(),
		config.NodeIDs.Len()-len(namedPeers),
	)

	var (
		sampledPeers = n.samplePeers(config, subnetID, allower)
		sentTo       = set.NewSet[ids.NodeID](len(namedPeers) + len(sampledPeers))
		now          = n.peerConfig.Clock.Time()
	)

	// send to peers and update metrics
	//
	// Note: It is guaranteed that namedPeers and sampledPeers are disjoint.
	for _, peers := range [][]peer.Peer{namedPeers, sampledPeers} {
		for _, peer := range peers {
			if peer.Send(n.onCloseCtx, msg) {
				sentTo.Add(peer.ID())

				// TODO: move send fail rate calculations into the peer metrics
				// record metrics for success
				n.sendFailRateCalculator.Observe(0, now)
			} else {
				// record metrics for failure
				n.sendFailRateCalculator.Observe(1, now)
			}
		}
	}
	return sentTo
}
350 // 1) Information about health check results 351 // 2) An error if the health check reports unhealthy 352 func (n *network) HealthCheck(context.Context) (interface{}, error) { 353 n.peersLock.RLock() 354 connectedTo := n.connectedPeers.Len() 355 n.peersLock.RUnlock() 356 357 sendFailRate := n.sendFailRateCalculator.Read() 358 359 // Make sure we're connected to at least the minimum number of peers 360 isConnected := connectedTo >= int(n.config.HealthConfig.MinConnectedPeers) 361 healthy := isConnected 362 details := map[string]interface{}{ 363 ConnectedPeersKey: connectedTo, 364 } 365 366 // Make sure we've received an incoming message within the threshold 367 now := n.peerConfig.Clock.Time() 368 369 lastMsgReceivedAt, msgReceived := n.getLastReceived() 370 wasMsgReceivedRecently := msgReceived 371 timeSinceLastMsgReceived := time.Duration(0) 372 if msgReceived { 373 timeSinceLastMsgReceived = now.Sub(lastMsgReceivedAt) 374 wasMsgReceivedRecently = timeSinceLastMsgReceived <= n.config.HealthConfig.MaxTimeSinceMsgReceived 375 details[TimeSinceLastMsgReceivedKey] = timeSinceLastMsgReceived.String() 376 n.metrics.timeSinceLastMsgReceived.Set(float64(timeSinceLastMsgReceived)) 377 } 378 healthy = healthy && wasMsgReceivedRecently 379 380 // Make sure we've sent an outgoing message within the threshold 381 lastMsgSentAt, msgSent := n.getLastSent() 382 wasMsgSentRecently := msgSent 383 timeSinceLastMsgSent := time.Duration(0) 384 if msgSent { 385 timeSinceLastMsgSent = now.Sub(lastMsgSentAt) 386 wasMsgSentRecently = timeSinceLastMsgSent <= n.config.HealthConfig.MaxTimeSinceMsgSent 387 details[TimeSinceLastMsgSentKey] = timeSinceLastMsgSent.String() 388 n.metrics.timeSinceLastMsgSent.Set(float64(timeSinceLastMsgSent)) 389 } 390 healthy = healthy && wasMsgSentRecently 391 392 // Make sure the message send failed rate isn't too high 393 isMsgFailRate := sendFailRate <= n.config.HealthConfig.MaxSendFailRate 394 healthy = healthy && isMsgFailRate 395 
details[SendFailRateKey] = sendFailRate 396 n.metrics.sendFailRate.Set(sendFailRate) 397 398 // emit metrics about the lifetime of peer connections 399 n.metrics.updatePeerConnectionLifetimeMetrics() 400 401 // Network layer is healthy 402 if healthy || !n.config.HealthConfig.Enabled { 403 return details, nil 404 } 405 406 var errorReasons []string 407 if !isConnected { 408 errorReasons = append(errorReasons, fmt.Sprintf("not connected to a minimum of %d peer(s) only %d", n.config.HealthConfig.MinConnectedPeers, connectedTo)) 409 } 410 if !msgReceived { 411 errorReasons = append(errorReasons, "no messages received from network") 412 } else if !wasMsgReceivedRecently { 413 errorReasons = append(errorReasons, fmt.Sprintf("no messages from network received in %s > %s", timeSinceLastMsgReceived, n.config.HealthConfig.MaxTimeSinceMsgReceived)) 414 } 415 if !msgSent { 416 errorReasons = append(errorReasons, "no messages sent to network") 417 } else if !wasMsgSentRecently { 418 errorReasons = append(errorReasons, fmt.Sprintf("no messages from network sent in %s > %s", timeSinceLastMsgSent, n.config.HealthConfig.MaxTimeSinceMsgSent)) 419 } 420 421 if !isMsgFailRate { 422 errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate)) 423 } 424 return details, fmt.Errorf("network layer is unhealthy reason: %s", strings.Join(errorReasons, ", ")) 425 } 426 427 // Connected is called after the peer finishes the handshake. 428 // Will not be called after [Disconnected] is called with this peer. 
429 func (n *network) Connected(nodeID ids.NodeID) { 430 n.peersLock.Lock() 431 peer, ok := n.connectingPeers.GetByID(nodeID) 432 if !ok { 433 n.peerConfig.Log.Error( 434 "unexpectedly connected to peer when not marked as attempting to connect", 435 zap.Stringer("nodeID", nodeID), 436 ) 437 n.peersLock.Unlock() 438 return 439 } 440 441 if tracked, ok := n.trackedIPs[nodeID]; ok { 442 tracked.stopTracking() 443 delete(n.trackedIPs, nodeID) 444 } 445 n.connectingPeers.Remove(nodeID) 446 n.connectedPeers.Add(peer) 447 n.peersLock.Unlock() 448 449 peerIP := peer.IP() 450 newIP := ips.NewClaimedIPPort( 451 peer.Cert(), 452 peerIP.AddrPort, 453 peerIP.Timestamp, 454 peerIP.TLSSignature, 455 ) 456 n.ipTracker.Connected(newIP) 457 458 n.metrics.markConnected(peer) 459 460 peerVersion := peer.Version() 461 n.router.Connected(nodeID, peerVersion, constants.PrimaryNetworkID) 462 463 trackedSubnets := peer.TrackedSubnets() 464 for subnetID := range n.peerConfig.MySubnets { 465 if trackedSubnets.Contains(subnetID) { 466 n.router.Connected(nodeID, peerVersion, subnetID) 467 } 468 } 469 } 470 471 // AllowConnection returns true if this node should have a connection to the 472 // provided nodeID. If the node is attempting to connect to the minimum number 473 // of peers, then it should only connect if this node is a validator, or the 474 // peer is a validator/beacon. 475 func (n *network) AllowConnection(nodeID ids.NodeID) bool { 476 if !n.config.RequireValidatorToConnect { 477 return true 478 } 479 _, iAmAValidator := n.config.Validators.GetValidator(constants.PrimaryNetworkID, n.config.MyNodeID) 480 return iAmAValidator || n.ipTracker.WantsConnection(nodeID) 481 } 482 483 func (n *network) Track(claimedIPPorts []*ips.ClaimedIPPort) error { 484 for _, ip := range claimedIPPorts { 485 if err := n.track(ip); err != nil { 486 return err 487 } 488 } 489 return nil 490 } 491 492 // Disconnected is called after the peer's handling has been shutdown. 
// It is not guaranteed that [Connected] was previously called with [nodeID].
// It is guaranteed that [Connected] will not be called with [nodeID] after this
// call. Note that this is from the perspective of a single peer object, because
// a peer with the same ID can reconnect to this network instance.
func (n *network) Disconnected(nodeID ids.NodeID) {
	n.peersLock.RLock()
	_, connecting := n.connectingPeers.GetByID(nodeID)
	peer, connected := n.connectedPeers.GetByID(nodeID)
	n.peersLock.RUnlock()

	// A peer is either connecting or connected, never both; dispatch to the
	// matching cleanup path.
	if connecting {
		n.disconnectedFromConnecting(nodeID)
	}
	if connected {
		n.disconnectedFromConnected(peer, nodeID)
	}
}

// KnownPeers returns this node's bloom filter of known peer IPs (filter bytes
// and salt) for peer-list gossip.
func (n *network) KnownPeers() ([]byte, []byte) {
	return n.ipTracker.Bloom()
}

// Peers returns gossipable IPs not already known to the requester, excluding
// [except].
func (n *network) Peers(except ids.NodeID, knownPeers *bloom.ReadFilter, salt []byte) []*ips.ClaimedIPPort {
	return n.ipTracker.GetGossipableIPs(
		except,
		knownPeers,
		salt,
		int(n.config.PeerListNumValidatorIPs),
	)
}

// Dispatch starts accepting connections from other nodes attempting to connect
// to this node.
func (n *network) Dispatch() error {
	go n.runTimers() // Periodically perform operations
	go n.inboundConnUpgradeThrottler.Dispatch()
	for { // Continuously accept new connections
		if n.onCloseCtx.Err() != nil {
			break
		}

		conn, err := n.listener.Accept() // Returns error when n.Close() is called
		if err != nil {
			n.peerConfig.Log.Debug("error during server accept", zap.Error(err))
			// Sleep for a small amount of time to try to wait for the
			// error to go away.
			time.Sleep(time.Millisecond)
			n.metrics.acceptFailed.Inc()
			continue
		}

		// Note: listener.Accept is rate limited outside of this package, so a
		// peer can not just arbitrarily spin up goroutines here.
		go func() {
			// Note: Calling [RemoteAddr] with the Proxy protocol enabled may
			// block for up to ProxyReadHeaderTimeout. Therefore, we ensure to
			// call this function inside the go-routine, rather than the main
			// accept loop.
			remoteAddr := conn.RemoteAddr().String()
			ip, err := ips.ParseAddrPort(remoteAddr)
			if err != nil {
				n.peerConfig.Log.Error("failed to parse remote address",
					zap.String("peerIP", remoteAddr),
					zap.Error(err),
				)
				_ = conn.Close()
				return
			}

			if !n.inboundConnUpgradeThrottler.ShouldUpgrade(ip) {
				n.peerConfig.Log.Debug("failed to upgrade connection",
					zap.String("reason", "rate-limiting"),
					zap.Stringer("peerIP", ip),
				)
				n.metrics.inboundConnRateLimited.Inc()
				_ = conn.Close()
				return
			}
			n.metrics.inboundConnAllowed.Inc()

			n.peerConfig.Log.Verbo("starting to upgrade connection",
				zap.String("direction", "inbound"),
				zap.Stringer("peerIP", ip),
			)

			if err := n.upgrade(conn, n.serverUpgrader); err != nil {
				n.peerConfig.Log.Verbo("failed to upgrade connection",
					zap.String("direction", "inbound"),
					zap.Error(err),
				)
			}
		}()
	}
	// The accept loop exited, so the network is shutting down; stop the
	// throttler and close all peer connections, then wait for them to finish.
	n.inboundConnUpgradeThrottler.Stop()
	n.StartClose()

	n.peersLock.RLock()
	connecting := n.connectingPeers.Sample(n.connectingPeers.Len(), peer.NoPrecondition)
	connected := n.connectedPeers.Sample(n.connectedPeers.Len(), peer.NoPrecondition)
	n.peersLock.RUnlock()

	errs := wrappers.Errs{}
	for _, peer := range append(connecting, connected...) {
		errs.Add(peer.AwaitClosed(context.TODO()))
	}
	return errs.Err
}

func (n *network) ManuallyTrack(nodeID ids.NodeID, ip netip.AddrPort) {
	n.ipTracker.ManuallyTrack(nodeID)

	n.peersLock.Lock()
	defer n.peersLock.Unlock()

	_, connected := n.connectedPeers.GetByID(nodeID)
	if connected {
		// If I'm currently connected to [nodeID] then they will have told me
		// how to connect to them in the future, and I don't need to attempt to
		// connect to them now.
		return
	}

	_, isTracked := n.trackedIPs[nodeID]
	if !isTracked {
		tracked := newTrackedIP(ip)
		n.trackedIPs[nodeID] = tracked
		n.dial(nodeID, tracked)
	}
}

// track verifies a single gossiped IP claim and, if it is new and wanted,
// starts dialing the node. Returns an error only on signature verification
// failure.
func (n *network) track(ip *ips.ClaimedIPPort) error {
	// To avoid signature verification when the IP isn't needed, we
	// optimistically filter out IPs. This can result in us not tracking an IP
	// that we otherwise would have. This case can only happen if the node
	// became a validator between the time we verified the signature and when we
	// processed the IP; which should be very rare.
	//
	// Note: Avoiding signature verification when the IP isn't needed is a
	// **significant** performance optimization.
	if !n.ipTracker.ShouldVerifyIP(ip) {
		n.metrics.numUselessPeerListBytes.Add(float64(ip.Size()))
		return nil
	}

	// Perform all signature verification and hashing before grabbing the peer
	// lock.
	signedIP := peer.SignedIP{
		UnsignedIP: peer.UnsignedIP{
			AddrPort:  ip.AddrPort,
			Timestamp: ip.Timestamp,
		},
		TLSSignature: ip.Signature,
	}
	maxTimestamp := n.peerConfig.Clock.Time().Add(n.peerConfig.MaxClockDifference)
	if err := signedIP.Verify(ip.Cert, maxTimestamp); err != nil {
		return err
	}

	n.peersLock.Lock()
	defer n.peersLock.Unlock()

	// AddIP returning false means the IP is not newer than what we already
	// have, so there is nothing to do.
	if !n.ipTracker.AddIP(ip) {
		return nil
	}

	if _, connected := n.connectedPeers.GetByID(ip.NodeID); connected {
		// If I'm currently connected to [nodeID] then I'll attempt to dial them
		// when we disconnect.
		return nil
	}

	tracked, isTracked := n.trackedIPs[ip.NodeID]
	if isTracked {
		// Stop tracking the old IP and start tracking the new one.
		tracked = tracked.trackNewIP(ip.AddrPort)
	} else {
		tracked = newTrackedIP(ip.AddrPort)
	}
	n.trackedIPs[ip.NodeID] = tracked
	n.dial(ip.NodeID, tracked)
	return nil
}

// getPeers returns a slice of connected peers from a set of [nodeIDs].
//
//   - [nodeIDs] the IDs of the peers that should be returned if they are
//     connected.
//   - [subnetID] the subnetID whose membership should be considered if
//     [validatorOnly] is set to true.
//   - [validatorOnly] is the flag to drop any nodes from [nodeIDs] that are not
//     validators in [subnetID].
func (n *network) getPeers(
	nodeIDs set.Set[ids.NodeID],
	subnetID ids.ID,
	allower subnets.Allower,
) []peer.Peer {
	peers := make([]peer.Peer, 0, nodeIDs.Len())

	n.peersLock.RLock()
	defer n.peersLock.RUnlock()

	for nodeID := range nodeIDs {
		peer, ok := n.connectedPeers.GetByID(nodeID)
		if !ok {
			continue
		}

		// Skip peers that don't track the target subnet.
		if trackedSubnets := peer.TrackedSubnets(); !trackedSubnets.Contains(subnetID) {
			continue
		}

		_, isValidator := n.config.Validators.GetValidator(subnetID, nodeID)
		// check if the peer is allowed to connect to the subnet
		if !allower.IsAllowed(nodeID, isValidator) {
			continue
		}

		peers = append(peers, peer)
	}

	return peers
}

// samplePeers samples connected peers attempting to align with the number of
// requested validators, non-validators, and peers. This function will
// explicitly ignore nodeIDs already included in the send config.
func (n *network) samplePeers(
	config common.SendConfig,
	subnetID ids.ID,
	allower subnets.Allower,
) []peer.Peer {
	// As an optimization, if there are fewer validators than
	// [numValidatorsToSample], only attempt to sample [numValidatorsToSample]
	// validators to potentially avoid iterating over the entire peer set.
	numValidatorsToSample := min(config.Validators, n.config.Validators.Count(subnetID))

	n.peersLock.RLock()
	defer n.peersLock.RUnlock()

	// Note: config is a local copy, so decrementing its counter fields inside
	// the predicate below does not affect the caller.
	return n.connectedPeers.Sample(
		numValidatorsToSample+config.NonValidators+config.Peers,
		func(p peer.Peer) bool {
			// Only return peers that are tracking [subnetID]
			if trackedSubnets := p.TrackedSubnets(); !trackedSubnets.Contains(subnetID) {
				return false
			}

			peerID := p.ID()
			// if the peer was already explicitly included, don't include in the
			// sample
			if config.NodeIDs.Contains(peerID) {
				return false
			}

			_, isValidator := n.config.Validators.GetValidator(subnetID, peerID)
			// check if the peer is allowed to connect to the subnet
			if !allower.IsAllowed(peerID, isValidator) {
				return false
			}

			// Generic "peer" slots are consumed first, regardless of validator
			// status.
			if config.Peers > 0 {
				config.Peers--
				return true
			}

			if isValidator {
				numValidatorsToSample--
				return numValidatorsToSample >= 0
			}

			config.NonValidators--
			return config.NonValidators >= 0
		},
	)
}

// disconnectedFromConnecting cleans up a peer that dropped before finishing
// the handshake, re-dialing it if it is still wanted.
func (n *network) disconnectedFromConnecting(nodeID ids.NodeID) {
	n.peersLock.Lock()
	defer n.peersLock.Unlock()

	n.connectingPeers.Remove(nodeID)

	// The peer that is disconnecting from us didn't finish the handshake
	tracked, ok := n.trackedIPs[nodeID]
	if ok {
		if n.ipTracker.WantsConnection(nodeID) {
			// Restart tracking with a fresh backoff so we keep retrying.
			tracked := tracked.trackNewIP(tracked.ip)
			n.trackedIPs[nodeID] = tracked
			n.dial(nodeID, tracked)
		} else {
			tracked.stopTracking()
			delete(n.trackedIPs, nodeID)
		}
	}

	n.metrics.disconnected.Inc()
}

// disconnectedFromConnected cleans up a peer that completed the handshake and
// later disconnected, notifying the router and re-dialing if still wanted.
func (n *network) disconnectedFromConnected(peer peer.Peer, nodeID ids.NodeID) {
	n.ipTracker.Disconnected(nodeID)
	n.router.Disconnected(nodeID)

	n.peersLock.Lock()
	defer n.peersLock.Unlock()

	n.connectedPeers.Remove(nodeID)

	// The peer that is disconnecting from us finished the handshake
	if ip, wantsConnection := n.ipTracker.GetIP(nodeID); wantsConnection {
		tracked := newTrackedIP(ip.AddrPort)
		n.trackedIPs[nodeID] = tracked
		n.dial(nodeID, tracked)
	}

	n.metrics.markDisconnected(peer)
}

// dial will spin up a new goroutine and attempt to establish a connection with
// [nodeID] at [ip].
//
// If the connection established at [ip] doesn't match [nodeID]:
// - attempts to reach [nodeID] at [ip] will be halted.
// - the connection will be checked to see if the connection is desired or not.
//
// If [ip] has been flagged with [ip.stopTracking] then this goroutine will
// exit.
//
// If [nodeID] is marked as connecting or connected then this goroutine will
// exit.
//
// If [nodeID] is no longer marked as desired then this goroutine will exit and
// the entry in the [trackedIP]s set will be removed.
//
// If initiating a connection to [ip] fails, then dial will reattempt. However,
// there is a randomized exponential backoff to avoid spamming connection
// attempts.
func (n *network) dial(nodeID ids.NodeID, ip *trackedIP) {
	n.peerConfig.Log.Verbo("attempting to dial node",
		zap.Stringer("nodeID", nodeID),
		zap.Stringer("ip", ip.ip),
	)
	go func() {
		n.metrics.numTracked.Inc()
		defer n.metrics.numTracked.Dec()

		for {
			timer := time.NewTimer(ip.getDelay())

			select {
			case <-n.onCloseCtx.Done():
				timer.Stop()
				return
			case <-ip.onStopTracking:
				timer.Stop()
				return
			case <-timer.C:
			}

			n.peersLock.Lock()
			// If we no longer desire a connect to nodeID, we should cleanup
			// trackedIPs and this goroutine. This prevents a memory leak when
			// the tracked nodeID leaves the validator set and is never able to
			// be connected to.
			if !n.ipTracker.WantsConnection(nodeID) {
				// Typically [n.trackedIPs[nodeID]] will already equal [ip], but
				// the reference to [ip] is refreshed to avoid any potential
				// race conditions before removing the entry.
				if ip, exists := n.trackedIPs[nodeID]; exists {
					ip.stopTracking()
					delete(n.trackedIPs, nodeID)
				}
				n.peersLock.Unlock()
				return
			}
			_, connecting := n.connectingPeers.GetByID(nodeID)
			_, connected := n.connectedPeers.GetByID(nodeID)
			n.peersLock.Unlock()

			// While it may not be strictly needed to stop attempting to connect
			// to an already connected peer here. It does prevent unnecessary
			// outbound connections. Additionally, because the peer would
			// immediately drop a duplicated connection, this prevents any
			// "connection reset by peer" errors from interfering with the
			// later duplicated connection check.
			if connecting || connected {
				n.peerConfig.Log.Verbo(
					"exiting attempt to dial peer",
					zap.String("reason", "already connected"),
					zap.Stringer("nodeID", nodeID),
				)
				return
			}

			// Increase the delay that we will use for a future connection
			// attempt.
			ip.increaseDelay(
				n.config.InitialReconnectDelay,
				n.config.MaxReconnectDelay,
			)

			// If the network is configured to disallow private IPs and the
			// provided IP is private, we skip all attempts to initiate a
			// connection.
			//
			// Invariant: We perform this check inside of the looping goroutine
			// because this goroutine must clean up the trackedIPs entry if
			// nodeID leaves the validator set. This is why we continue the loop
			// rather than returning even though we will never initiate an
			// outbound connection with this IP.
			if !n.config.AllowPrivateIPs && !ips.IsPublic(ip.ip.Addr()) {
				n.peerConfig.Log.Verbo("skipping connection dial",
					zap.String("reason", "outbound connections to private IPs are prohibited"),
					zap.Stringer("nodeID", nodeID),
					zap.Stringer("peerIP", ip.ip),
					zap.Duration("delay", ip.delay),
				)
				continue
			}

			conn, err := n.dialer.Dial(n.onCloseCtx, ip.ip)
			if err != nil {
				n.peerConfig.Log.Verbo(
					"failed to reach peer, attempting again",
					zap.Stringer("nodeID", nodeID),
					zap.Stringer("peerIP", ip.ip),
					zap.Duration("delay", ip.delay),
				)
				continue
			}

			n.peerConfig.Log.Verbo("starting to upgrade connection",
				zap.String("direction", "outbound"),
				zap.Stringer("nodeID", nodeID),
				zap.Stringer("peerIP", ip.ip),
			)

			err = n.upgrade(conn, n.clientUpgrader)
			if err != nil {
				n.peerConfig.Log.Verbo(
					"failed to upgrade, attempting again",
					zap.Stringer("nodeID", nodeID),
					zap.Stringer("peerIP", ip.ip),
					zap.Duration("delay", ip.delay),
				)
				continue
			}
			return
		}
	}()
}

// upgrade the provided connection, which may be an inbound connection or an
// outbound connection, with the provided [upgrader].
//
// If the connection is successfully upgraded, [nil] will be returned.
//
// If the connection is desired by the node, then the resulting upgraded
// connection will be used to create a new peer. Otherwise the connection will
// be immediately closed.
952 func (n *network) upgrade(conn net.Conn, upgrader peer.Upgrader) error { 953 upgradeTimeout := n.peerConfig.Clock.Time().Add(n.config.ReadHandshakeTimeout) 954 if err := conn.SetReadDeadline(upgradeTimeout); err != nil { 955 _ = conn.Close() 956 n.peerConfig.Log.Verbo("failed to set the read deadline", 957 zap.Error(err), 958 ) 959 return err 960 } 961 962 nodeID, tlsConn, cert, err := upgrader.Upgrade(conn) 963 if err != nil { 964 _ = conn.Close() 965 n.peerConfig.Log.Verbo("failed to upgrade connection", 966 zap.Error(err), 967 ) 968 return err 969 } 970 971 if err := tlsConn.SetReadDeadline(time.Time{}); err != nil { 972 _ = tlsConn.Close() 973 n.peerConfig.Log.Verbo("failed to clear the read deadline", 974 zap.Error(err), 975 ) 976 return err 977 } 978 979 // At this point we have successfully upgraded the connection and will 980 // return a nil error. 981 982 if nodeID == n.config.MyNodeID { 983 _ = tlsConn.Close() 984 n.peerConfig.Log.Verbo("dropping connection to myself") 985 return nil 986 } 987 988 if !n.AllowConnection(nodeID) { 989 _ = tlsConn.Close() 990 n.peerConfig.Log.Verbo( 991 "dropping undesired connection", 992 zap.Stringer("nodeID", nodeID), 993 ) 994 return nil 995 } 996 997 n.peersLock.Lock() 998 if n.closing { 999 n.peersLock.Unlock() 1000 1001 _ = tlsConn.Close() 1002 n.peerConfig.Log.Verbo( 1003 "dropping connection", 1004 zap.String("reason", "shutting down the p2p network"), 1005 zap.Stringer("nodeID", nodeID), 1006 ) 1007 return nil 1008 } 1009 1010 if _, connecting := n.connectingPeers.GetByID(nodeID); connecting { 1011 n.peersLock.Unlock() 1012 1013 _ = tlsConn.Close() 1014 n.peerConfig.Log.Verbo( 1015 "dropping connection", 1016 zap.String("reason", "already connecting to peer"), 1017 zap.Stringer("nodeID", nodeID), 1018 ) 1019 return nil 1020 } 1021 1022 if _, connected := n.connectedPeers.GetByID(nodeID); connected { 1023 n.peersLock.Unlock() 1024 1025 _ = tlsConn.Close() 1026 n.peerConfig.Log.Verbo( 1027 "dropping connection", 
1028 zap.String("reason", "already connecting to peer"), 1029 zap.Stringer("nodeID", nodeID), 1030 ) 1031 return nil 1032 } 1033 1034 n.peerConfig.Log.Verbo("starting handshake", 1035 zap.Stringer("nodeID", nodeID), 1036 ) 1037 1038 // peer.Start requires there is only ever one peer instance running with the 1039 // same [peerConfig.InboundMsgThrottler]. This is guaranteed by the above 1040 // de-duplications for [connectingPeers] and [connectedPeers]. 1041 peer := peer.Start( 1042 n.peerConfig, 1043 tlsConn, 1044 cert, 1045 nodeID, 1046 peer.NewThrottledMessageQueue( 1047 n.peerConfig.Metrics, 1048 nodeID, 1049 n.peerConfig.Log, 1050 n.outboundMsgThrottler, 1051 ), 1052 ) 1053 n.connectingPeers.Add(peer) 1054 n.peersLock.Unlock() 1055 return nil 1056 } 1057 1058 func (n *network) PeerInfo(nodeIDs []ids.NodeID) []peer.Info { 1059 n.peersLock.RLock() 1060 defer n.peersLock.RUnlock() 1061 1062 if len(nodeIDs) == 0 { 1063 return n.connectedPeers.AllInfo() 1064 } 1065 return n.connectedPeers.Info(nodeIDs) 1066 } 1067 1068 func (n *network) StartClose() { 1069 n.closeOnce.Do(func() { 1070 n.peerConfig.Log.Info("shutting down the p2p networking") 1071 1072 if err := n.listener.Close(); err != nil { 1073 n.peerConfig.Log.Debug("closing the network listener", 1074 zap.Error(err), 1075 ) 1076 } 1077 1078 n.peersLock.Lock() 1079 defer n.peersLock.Unlock() 1080 1081 n.closing = true 1082 n.onCloseCtxCancel() 1083 1084 for nodeID, tracked := range n.trackedIPs { 1085 tracked.stopTracking() 1086 delete(n.trackedIPs, nodeID) 1087 } 1088 1089 for i := 0; i < n.connectingPeers.Len(); i++ { 1090 peer, _ := n.connectingPeers.GetByIndex(i) 1091 peer.StartClose() 1092 } 1093 1094 for i := 0; i < n.connectedPeers.Len(); i++ { 1095 peer, _ := n.connectedPeers.GetByIndex(i) 1096 peer.StartClose() 1097 } 1098 }) 1099 } 1100 1101 func (n *network) NodeUptime(subnetID ids.ID) (UptimeResult, error) { 1102 if subnetID != constants.PrimaryNetworkID && 
!n.config.TrackedSubnets.Contains(subnetID) { 1103 return UptimeResult{}, errNotTracked 1104 } 1105 1106 myStake := n.config.Validators.GetWeight(subnetID, n.config.MyNodeID) 1107 if myStake == 0 { 1108 return UptimeResult{}, errNotValidator 1109 } 1110 1111 totalWeightInt, err := n.config.Validators.TotalWeight(subnetID) 1112 if err != nil { 1113 return UptimeResult{}, fmt.Errorf("error while fetching weight for subnet %s: %w", subnetID, err) 1114 } 1115 1116 var ( 1117 totalWeight = float64(totalWeightInt) 1118 totalWeightedPercent = 100 * float64(myStake) 1119 rewardingStake = float64(myStake) 1120 ) 1121 1122 n.peersLock.RLock() 1123 defer n.peersLock.RUnlock() 1124 1125 for i := 0; i < n.connectedPeers.Len(); i++ { 1126 peer, _ := n.connectedPeers.GetByIndex(i) 1127 1128 nodeID := peer.ID() 1129 weight := n.config.Validators.GetWeight(subnetID, nodeID) 1130 if weight == 0 { 1131 // this is not a validator skip it. 1132 continue 1133 } 1134 1135 observedUptime, exist := peer.ObservedUptime(subnetID) 1136 if !exist { 1137 observedUptime = 0 1138 } 1139 percent := float64(observedUptime) 1140 weightFloat := float64(weight) 1141 totalWeightedPercent += percent * weightFloat 1142 1143 // if this peer thinks we're above requirement add the weight 1144 // TODO: use subnet-specific uptime requirements 1145 if percent/100 >= n.config.UptimeRequirement { 1146 rewardingStake += weightFloat 1147 } 1148 } 1149 1150 return UptimeResult{ 1151 WeightedAveragePercentage: math.Abs(totalWeightedPercent / totalWeight), 1152 RewardingStakePercentage: math.Abs(100 * rewardingStake / totalWeight), 1153 }, nil 1154 } 1155 1156 func (n *network) runTimers() { 1157 pullGossipPeerlists := time.NewTicker(n.config.PeerListPullGossipFreq) 1158 resetPeerListBloom := time.NewTicker(n.config.PeerListBloomResetFreq) 1159 updateUptimes := time.NewTicker(n.config.UptimeMetricFreq) 1160 defer func() { 1161 resetPeerListBloom.Stop() 1162 updateUptimes.Stop() 1163 }() 1164 1165 for { 1166 select { 
1167 case <-n.onCloseCtx.Done(): 1168 return 1169 case <-pullGossipPeerlists.C: 1170 n.pullGossipPeerLists() 1171 case <-resetPeerListBloom.C: 1172 if err := n.ipTracker.ResetBloom(); err != nil { 1173 n.peerConfig.Log.Error("failed to reset ip tracker bloom filter", 1174 zap.Error(err), 1175 ) 1176 } else { 1177 n.peerConfig.Log.Debug("reset ip tracker bloom filter") 1178 } 1179 case <-updateUptimes.C: 1180 primaryUptime, err := n.NodeUptime(constants.PrimaryNetworkID) 1181 if err != nil { 1182 n.peerConfig.Log.Debug("failed to get primary network uptime", 1183 zap.Error(err), 1184 ) 1185 } 1186 n.metrics.nodeUptimeWeightedAverage.Set(primaryUptime.WeightedAveragePercentage) 1187 n.metrics.nodeUptimeRewardingStake.Set(primaryUptime.RewardingStakePercentage) 1188 1189 for subnetID := range n.config.TrackedSubnets { 1190 result, err := n.NodeUptime(subnetID) 1191 if err != nil { 1192 n.peerConfig.Log.Debug("failed to get subnet uptime", 1193 zap.Stringer("subnetID", subnetID), 1194 zap.Error(err), 1195 ) 1196 } 1197 subnetIDStr := subnetID.String() 1198 n.metrics.nodeSubnetUptimeWeightedAverage.WithLabelValues(subnetIDStr).Set(result.WeightedAveragePercentage) 1199 n.metrics.nodeSubnetUptimeRewardingStake.WithLabelValues(subnetIDStr).Set(result.RewardingStakePercentage) 1200 } 1201 } 1202 } 1203 } 1204 1205 // pullGossipPeerLists requests validators from peers in the network 1206 func (n *network) pullGossipPeerLists() { 1207 peers := n.samplePeers( 1208 common.SendConfig{ 1209 Validators: 1, 1210 }, 1211 constants.PrimaryNetworkID, 1212 subnets.NoOpAllower, 1213 ) 1214 1215 for _, p := range peers { 1216 p.StartSendGetPeerList() 1217 } 1218 } 1219 1220 func (n *network) getLastReceived() (time.Time, bool) { 1221 lastReceived := atomic.LoadInt64(&n.peerConfig.LastReceived) 1222 if lastReceived == 0 { 1223 return time.Time{}, false 1224 } 1225 return time.Unix(lastReceived, 0), true 1226 } 1227 1228 func (n *network) getLastSent() (time.Time, bool) { 1229 lastSent := 
atomic.LoadInt64(&n.peerConfig.LastSent) 1230 if lastSent == 0 { 1231 return time.Time{}, false 1232 } 1233 return time.Unix(lastSent, 0), true 1234 }