// Copyright 2015 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.

package p2p

import (
	"context"
	crand "crypto/rand"
	"encoding/binary"
	"errors"
	"fmt"
	mrand "math/rand"
	"net"
	"net/netip"
	"sync"
	"sync/atomic"
	"time"

	"github.com/ethereum/go-ethereum/common/mclock"
	"github.com/ethereum/go-ethereum/log"
	"github.com/ethereum/go-ethereum/p2p/enode"
	"github.com/ethereum/go-ethereum/p2p/enr"
	"github.com/ethereum/go-ethereum/p2p/netutil"
)

const (
	// This is the amount of time spent waiting in between redialing a certain node. The
	// limit is a bit higher than inboundThrottleTime to prevent failing dials in small
	// private networks.
	dialHistoryExpiration = inboundThrottleTime + 5*time.Second

	// Config for the "Looking for peers" message.
	dialStatsLogInterval = 10 * time.Second // printed at most this often
	dialStatsPeerLimit   = 3                // but not if more than this many dialed peers

	// Endpoint resolution is throttled with bounded backoff: the delay starts at
	// initialResolveDelay, doubles after every failed resolve and is capped at
	// maxResolveDelay (see dialTask.resolve).
	initialResolveDelay = 60 * time.Second
	maxResolveDelay     = time.Hour
)

// NodeDialer is used to connect to nodes in the network, typically by using
// an underlying net.Dialer but also using net.Pipe in tests.
type NodeDialer interface {
	Dial(context.Context, *enode.Node) (net.Conn, error)
}

// nodeResolver looks up the current record of a node. In this file, a nil
// result is treated as "node not found" (see dialTask.resolve).
type nodeResolver interface {
	Resolve(*enode.Node) *enode.Node
}

// tcpDialer implements NodeDialer using real TCP connections.
type tcpDialer struct {
	d *net.Dialer
}

// Dial opens a TCP connection to the TCP endpoint of dest using the wrapped
// net.Dialer. The connection attempt is bound to ctx.
//
// NOTE(review): the second return value of TCPEndpoint is discarded here;
// presumably a node without a usable endpoint then fails inside DialContext
// rather than being rejected up front — confirm against enode.Node.TCPEndpoint.
func (t tcpDialer) Dial(ctx context.Context, dest *enode.Node) (net.Conn, error) {
	addr, _ := dest.TCPEndpoint()
	return t.d.DialContext(ctx, "tcp", addr.String())
}

// checkDial errors:
var (
	errSelf             = errors.New("is self")
	errAlreadyDialing   = errors.New("already dialing")
	errAlreadyConnected = errors.New("already connected")
	errRecentlyDialed   = errors.New("recently dialed")
	errNetRestrict      = errors.New("not contained in netrestrict list")
	errNoPort           = errors.New("node does not provide TCP port")
	errNoResolvedIP     = errors.New("node does not provide a resolved IP")
)

// dialScheduler creates outbound connections and submits them into Server.
// Two types of peer connections can be created:
//
//   - static dials are pre-configured connections. The dialer attempts
//     to keep these nodes connected at all times.
//
//   - dynamic dials are created from node discovery results. The dialer
//     continuously reads candidate nodes from its input iterator and attempts
//     to create peer connections to nodes arriving through the iterator.
94 type dialScheduler struct { 95 dialConfig 96 setupFunc dialSetupFunc 97 dnsLookupFunc func(ctx context.Context, network string, name string) ([]netip.Addr, error) 98 wg sync.WaitGroup 99 cancel context.CancelFunc 100 ctx context.Context 101 nodesIn chan *enode.Node 102 doneCh chan *dialTask 103 addStaticCh chan *enode.Node 104 remStaticCh chan *enode.Node 105 addPeerCh chan *conn 106 remPeerCh chan *conn 107 108 // Everything below here belongs to loop and 109 // should only be accessed by code on the loop goroutine. 110 dialing map[enode.ID]*dialTask // active tasks 111 peers map[enode.ID]struct{} // all connected peers 112 dialPeers int // current number of dialed peers 113 114 // The static map tracks all static dial tasks. The subset of usable static dial tasks 115 // (i.e. those passing checkDial) is kept in staticPool. The scheduler prefers 116 // launching random static tasks from the pool over launching dynamic dials from the 117 // iterator. 118 static map[enode.ID]*dialTask 119 staticPool []*dialTask 120 121 // The dial history keeps recently dialed nodes. Members of history are not dialed. 
122 history expHeap 123 historyTimer *mclock.Alarm 124 125 // for logStats 126 lastStatsLog mclock.AbsTime 127 doneSinceLastLog int 128 } 129 130 type dialSetupFunc func(net.Conn, connFlag, *enode.Node) error 131 132 type dialConfig struct { 133 self enode.ID // our own ID 134 maxDialPeers int // maximum number of dialed peers 135 maxActiveDials int // maximum number of active dials 136 netRestrict *netutil.Netlist // IP netrestrict list, disabled if nil 137 resolver nodeResolver 138 dialer NodeDialer 139 log log.Logger 140 clock mclock.Clock 141 rand *mrand.Rand 142 } 143 144 func (cfg dialConfig) withDefaults() dialConfig { 145 if cfg.maxActiveDials == 0 { 146 cfg.maxActiveDials = defaultMaxPendingPeers 147 } 148 if cfg.log == nil { 149 cfg.log = log.Root() 150 } 151 if cfg.clock == nil { 152 cfg.clock = mclock.System{} 153 } 154 if cfg.rand == nil { 155 seedb := make([]byte, 8) 156 crand.Read(seedb) 157 seed := int64(binary.BigEndian.Uint64(seedb)) 158 cfg.rand = mrand.New(mrand.NewSource(seed)) 159 } 160 return cfg 161 } 162 163 func newDialScheduler(config dialConfig, it enode.Iterator, setupFunc dialSetupFunc) *dialScheduler { 164 cfg := config.withDefaults() 165 d := &dialScheduler{ 166 dialConfig: cfg, 167 historyTimer: mclock.NewAlarm(cfg.clock), 168 setupFunc: setupFunc, 169 dnsLookupFunc: net.DefaultResolver.LookupNetIP, 170 dialing: make(map[enode.ID]*dialTask), 171 static: make(map[enode.ID]*dialTask), 172 peers: make(map[enode.ID]struct{}), 173 doneCh: make(chan *dialTask), 174 nodesIn: make(chan *enode.Node), 175 addStaticCh: make(chan *enode.Node), 176 remStaticCh: make(chan *enode.Node), 177 addPeerCh: make(chan *conn), 178 remPeerCh: make(chan *conn), 179 } 180 d.lastStatsLog = d.clock.Now() 181 d.ctx, d.cancel = context.WithCancel(context.Background()) 182 d.wg.Add(2) 183 go d.readNodes(it) 184 go d.loop(it) 185 return d 186 } 187 188 // stop shuts down the dialer, canceling all current dial tasks. 
func (d *dialScheduler) stop() {
	// Cancel the scheduler context, then wait for the goroutines tracked by wg.
	d.cancel()
	d.wg.Wait()
}

// addStatic adds a static dial candidate. The request is handed to the main
// loop; it is dropped if the scheduler context is already canceled.
func (d *dialScheduler) addStatic(n *enode.Node) {
	select {
	case d.addStaticCh <- n:
	case <-d.ctx.Done():
	}
}

// removeStatic removes a static dial candidate. The request is handed to the
// main loop; it is dropped if the scheduler context is already canceled.
func (d *dialScheduler) removeStatic(n *enode.Node) {
	select {
	case d.remStaticCh <- n:
	case <-d.ctx.Done():
	}
}

// peerAdded updates the peer set. The notification is handed to the main
// loop; it is dropped if the scheduler context is already canceled.
func (d *dialScheduler) peerAdded(c *conn) {
	select {
	case d.addPeerCh <- c:
	case <-d.ctx.Done():
	}
}

// peerRemoved updates the peer set. The notification is handed to the main
// loop; it is dropped if the scheduler context is already canceled.
func (d *dialScheduler) peerRemoved(c *conn) {
	select {
	case d.remPeerCh <- c:
	case <-d.ctx.Done():
	}
}

// loop is the main loop of the dialer. It owns all scheduler state below the
// "belongs to loop" marker in dialScheduler and processes one event per
// iteration. On shutdown it closes the iterator and waits for every running
// dial task to report completion before signalling wg.
func (d *dialScheduler) loop(it enode.Iterator) {
	var (
		// nodesCh is either d.nodesIn or nil. Receiving from a nil channel
		// blocks forever, so a nil nodesCh disables the dynamic-dial case
		// of the select below.
		nodesCh chan *enode.Node
	)

loop:
	for {
		// Launch new dials if slots are available.
		// Static dials are started first; dynamic candidates are only
		// accepted when slots remain afterwards.
		slots := d.freeDialSlots()
		slots -= d.startStaticDials(slots)
		if slots > 0 {
			nodesCh = d.nodesIn
		} else {
			nodesCh = nil
		}
		d.rearmHistoryTimer()
		d.logStats()

		select {
		case node := <-nodesCh:
			// A dynamic dial candidate arrived from readNodes.
			if err := d.checkDial(node); err != nil {
				d.log.Trace("Discarding dial candidate", "id", node.ID(), "ip", node.IPAddr(), "reason", err)
			} else {
				d.startDial(newDialTask(node, dynDialedConn))
			}

		case task := <-d.doneCh:
			// A dial task finished. If it belongs to a static node, the
			// node may become eligible for the static pool again.
			id := task.dest().ID()
			delete(d.dialing, id)
			d.updateStaticPool(id)
			d.doneSinceLastLog++

		case c := <-d.addPeerCh:
			// Only connections that were dialed count towards dialPeers.
			if c.is(dynDialedConn) || c.is(staticDialedConn) {
				d.dialPeers++
			}
			id := c.node.ID()
			d.peers[id] = struct{}{}
			// Remove from static pool because the node is now connected.
			task := d.static[id]
			if task != nil && task.staticPoolIndex >= 0 {
				d.removeFromStaticPool(task.staticPoolIndex)
			}
			// TODO: cancel dials to connected peers

		case c := <-d.remPeerCh:
			if c.is(dynDialedConn) || c.is(staticDialedConn) {
				d.dialPeers--
			}
			delete(d.peers, c.node.ID())
			// A disconnected static node can be dialed again.
			d.updateStaticPool(c.node.ID())

		case node := <-d.addStaticCh:
			id := node.ID()
			_, exists := d.static[id]
			d.log.Trace("Adding static node", "id", id, "endpoint", nodeEndpointForLog(node), "added", !exists)
			if exists {
				// Already known; the existing task is kept as is.
				continue loop
			}
			task := newDialTask(node, staticDialedConn)
			d.static[id] = task
			if d.checkDial(node) == nil {
				d.addToStaticPool(task)
			}

		case node := <-d.remStaticCh:
			id := node.ID()
			task := d.static[id]
			d.log.Trace("Removing static node", "id", id, "ok", task != nil)
			if task != nil {
				delete(d.static, id)
				if task.staticPoolIndex >= 0 {
					d.removeFromStaticPool(task.staticPoolIndex)
				}
			}

		case <-d.historyTimer.C():
			d.expireHistory()

		case <-d.ctx.Done():
			// Shutdown was requested. Closing the iterator here is what is
			// expected to end readNodes (see the note there).
			it.Close()
			break loop
		}
	}

	d.historyTimer.Stop()
	// Every entry in d.dialing has a goroutine that sends exactly once on
	// doneCh (see startDial); receive once per entry so none of them blocks.
	for range d.dialing {
		<-d.doneCh
	}
	d.wg.Done()
}

// readNodes runs in its own goroutine and delivers nodes from
// the input iterator to the nodesIn channel.
//
// The function returns when it.Next reports false. When the scheduler context
// is canceled, a pending send is abandoned but the loop calls it.Next again.
// NOTE(review): termination presumably relies on it.Close (called by loop)
// making Next return false — confirm against the enode.Iterator contract.
func (d *dialScheduler) readNodes(it enode.Iterator) {
	defer d.wg.Done()

	for it.Next() {
		select {
		case d.nodesIn <- it.Node():
		case <-d.ctx.Done():
		}
	}
}

// logStats prints dialer statistics to the log. The message is suppressed when enough
// peers are connected because users should only see it while their client is starting up
// or comes back online.
335 func (d *dialScheduler) logStats() { 336 now := d.clock.Now() 337 if d.lastStatsLog.Add(dialStatsLogInterval) > now { 338 return 339 } 340 if d.dialPeers < dialStatsPeerLimit && d.dialPeers < d.maxDialPeers { 341 d.log.Info("Looking for peers", "peercount", len(d.peers), "tried", d.doneSinceLastLog, "static", len(d.static)) 342 } 343 d.doneSinceLastLog = 0 344 d.lastStatsLog = now 345 } 346 347 // rearmHistoryTimer configures d.historyTimer to fire when the 348 // next item in d.history expires. 349 func (d *dialScheduler) rearmHistoryTimer() { 350 if len(d.history) == 0 { 351 return 352 } 353 d.historyTimer.Schedule(d.history.nextExpiry()) 354 } 355 356 // expireHistory removes expired items from d.history. 357 func (d *dialScheduler) expireHistory() { 358 d.history.expire(d.clock.Now(), func(hkey string) { 359 var id enode.ID 360 copy(id[:], hkey) 361 d.updateStaticPool(id) 362 }) 363 } 364 365 // freeDialSlots returns the number of free dial slots. The result can be negative 366 // when peers are connected while their task is still running. 367 func (d *dialScheduler) freeDialSlots() int { 368 slots := (d.maxDialPeers - d.dialPeers) * 2 369 if slots > d.maxActiveDials { 370 slots = d.maxActiveDials 371 } 372 free := slots - len(d.dialing) 373 return free 374 } 375 376 // checkDial returns an error if node n should not be dialed. 377 func (d *dialScheduler) checkDial(n *enode.Node) error { 378 if n.ID() == d.self { 379 return errSelf 380 } 381 if n.IPAddr().IsValid() && n.TCP() == 0 { 382 // This check can trigger if a non-TCP node is found 383 // by discovery. If there is no IP, the node is a static 384 // node and the actual endpoint will be resolved later in dialTask. 
385 return errNoPort 386 } 387 if _, ok := d.dialing[n.ID()]; ok { 388 return errAlreadyDialing 389 } 390 if _, ok := d.peers[n.ID()]; ok { 391 return errAlreadyConnected 392 } 393 if d.netRestrict != nil && !d.netRestrict.ContainsAddr(n.IPAddr()) { 394 return errNetRestrict 395 } 396 if d.history.contains(string(n.ID().Bytes())) { 397 return errRecentlyDialed 398 } 399 return nil 400 } 401 402 // startStaticDials starts n static dial tasks. 403 func (d *dialScheduler) startStaticDials(n int) (started int) { 404 for started = 0; started < n && len(d.staticPool) > 0; started++ { 405 idx := d.rand.Intn(len(d.staticPool)) 406 task := d.staticPool[idx] 407 d.startDial(task) 408 d.removeFromStaticPool(idx) 409 } 410 return started 411 } 412 413 // updateStaticPool attempts to move the given static dial back into staticPool. 414 func (d *dialScheduler) updateStaticPool(id enode.ID) { 415 task, ok := d.static[id] 416 if ok && task.staticPoolIndex < 0 && d.checkDial(task.dest()) == nil { 417 d.addToStaticPool(task) 418 } 419 } 420 421 func (d *dialScheduler) addToStaticPool(task *dialTask) { 422 if task.staticPoolIndex >= 0 { 423 panic("attempt to add task to staticPool twice") 424 } 425 d.staticPool = append(d.staticPool, task) 426 task.staticPoolIndex = len(d.staticPool) - 1 427 } 428 429 // removeFromStaticPool removes the task at idx from staticPool. It does that by moving the 430 // current last element of the pool to idx and then shortening the pool by one. 431 func (d *dialScheduler) removeFromStaticPool(idx int) { 432 task := d.staticPool[idx] 433 end := len(d.staticPool) - 1 434 d.staticPool[idx] = d.staticPool[end] 435 d.staticPool[idx].staticPoolIndex = idx 436 d.staticPool[end] = nil 437 d.staticPool = d.staticPool[:end] 438 task.staticPoolIndex = -1 439 } 440 441 // dnsResolveHostname updates the given node from its DNS hostname. 442 // This is used to resolve static dial targets. 
443 func (d *dialScheduler) dnsResolveHostname(n *enode.Node) (*enode.Node, error) { 444 if n.Hostname() == "" { 445 return n, nil 446 } 447 448 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 449 defer cancel() 450 foundIPs, err := d.dnsLookupFunc(ctx, "ip", n.Hostname()) 451 if err != nil { 452 return n, err 453 } 454 455 // Check for IP updates. 456 var ( 457 nodeIP4, nodeIP6 netip.Addr 458 foundIP4, foundIP6 netip.Addr 459 ) 460 n.Load((*enr.IPv4Addr)(&nodeIP4)) 461 n.Load((*enr.IPv6Addr)(&nodeIP6)) 462 for _, ip := range foundIPs { 463 if ip.Is4() && !foundIP4.IsValid() { 464 foundIP4 = ip 465 } 466 if ip.Is6() && !foundIP6.IsValid() { 467 foundIP6 = ip 468 } 469 } 470 471 if !foundIP4.IsValid() && !foundIP6.IsValid() { 472 // Lookup failed. 473 return n, errNoResolvedIP 474 } 475 if foundIP4 == nodeIP4 && foundIP6 == nodeIP6 { 476 // No updates necessary. 477 d.log.Trace("Node DNS lookup had no update", "id", n.ID(), "name", n.Hostname(), "ip", foundIP4, "ip6", foundIP6) 478 return n, nil 479 } 480 481 // Update the node. Note this invalidates the ENR signature, because we use SignNull 482 // to create a modified copy. But this should be OK, since we just use the node as a 483 // dial target. And nodes will usually only have a DNS hostname if they came from a 484 // enode:// URL, which has no signature anyway. If it ever becomes a problem, the 485 // resolved IP could also be stored into dialTask instead of the node. 486 rec := n.Record() 487 if foundIP4.IsValid() { 488 rec.Set(enr.IPv4Addr(foundIP4)) 489 } 490 if foundIP6.IsValid() { 491 rec.Set(enr.IPv6Addr(foundIP6)) 492 } 493 rec.SetSeq(n.Seq()) // ensure seq not bumped by update 494 newNode := enode.SignNull(rec, n.ID()).WithHostname(n.Hostname()) 495 d.log.Debug("Node updated from DNS lookup", "id", n.ID(), "name", n.Hostname(), "ip", newNode.IP()) 496 return newNode, nil 497 } 498 499 // startDial runs the given dial task in a separate goroutine. 
500 func (d *dialScheduler) startDial(task *dialTask) { 501 node := task.dest() 502 d.log.Trace("Starting p2p dial", "id", node.ID(), "endpoint", nodeEndpointForLog(node), "flag", task.flags) 503 hkey := string(node.ID().Bytes()) 504 d.history.add(hkey, d.clock.Now().Add(dialHistoryExpiration)) 505 d.dialing[node.ID()] = task 506 go func() { 507 task.run(d) 508 d.doneCh <- task 509 }() 510 } 511 512 // A dialTask generated for each node that is dialed. 513 type dialTask struct { 514 staticPoolIndex int 515 flags connFlag 516 517 // These fields are private to the task and should not be 518 // accessed by dialScheduler while the task is running. 519 destPtr atomic.Pointer[enode.Node] 520 lastResolved mclock.AbsTime 521 resolveDelay time.Duration 522 } 523 524 func newDialTask(dest *enode.Node, flags connFlag) *dialTask { 525 t := &dialTask{flags: flags, staticPoolIndex: -1} 526 t.destPtr.Store(dest) 527 return t 528 } 529 530 type dialError struct { 531 error 532 } 533 534 func (t *dialTask) dest() *enode.Node { 535 return t.destPtr.Load() 536 } 537 538 func (t *dialTask) run(d *dialScheduler) { 539 if t.isStatic() { 540 // Resolve DNS. 541 if n := t.dest(); n.Hostname() != "" { 542 resolved, err := d.dnsResolveHostname(n) 543 if err != nil { 544 d.log.Warn("DNS lookup of static node failed", "id", n.ID(), "name", n.Hostname(), "err", err) 545 } else { 546 t.destPtr.Store(resolved) 547 } 548 } 549 // Try resolving node ID through the DHT if there is no IP address. 550 if !t.dest().IPAddr().IsValid() { 551 if !t.resolve(d) { 552 return // DHT resolve failed, skip dial. 553 } 554 } 555 } 556 557 err := t.dial(d, t.dest()) 558 if err != nil { 559 // For static nodes, resolve one more time if dialing fails. 
560 var dialErr *dialError 561 if errors.As(err, &dialErr) && t.isStatic() { 562 if t.resolve(d) { 563 t.dial(d, t.dest()) 564 } 565 } 566 } 567 } 568 569 func (t *dialTask) isStatic() bool { 570 return t.flags&staticDialedConn != 0 571 } 572 573 // resolve attempts to find the current endpoint for the destination 574 // using discovery. 575 // 576 // Resolve operations are throttled with backoff to avoid flooding the 577 // discovery network with useless queries for nodes that don't exist. 578 // The backoff delay resets when the node is found. 579 func (t *dialTask) resolve(d *dialScheduler) bool { 580 if d.resolver == nil { 581 return false 582 } 583 if t.resolveDelay == 0 { 584 t.resolveDelay = initialResolveDelay 585 } 586 if t.lastResolved > 0 && time.Duration(d.clock.Now()-t.lastResolved) < t.resolveDelay { 587 return false 588 } 589 590 node := t.dest() 591 resolved := d.resolver.Resolve(node) 592 t.lastResolved = d.clock.Now() 593 if resolved == nil { 594 t.resolveDelay *= 2 595 if t.resolveDelay > maxResolveDelay { 596 t.resolveDelay = maxResolveDelay 597 } 598 d.log.Debug("Resolving node failed", "id", node.ID(), "newdelay", t.resolveDelay) 599 return false 600 } 601 // The node was found. 602 t.resolveDelay = initialResolveDelay 603 t.destPtr.Store(resolved) 604 resAddr, _ := resolved.TCPEndpoint() 605 d.log.Debug("Resolved node", "id", resolved.ID(), "addr", resAddr) 606 return true 607 } 608 609 // dial performs the actual connection attempt. 
610 func (t *dialTask) dial(d *dialScheduler, dest *enode.Node) error { 611 dialMeter.Mark(1) 612 fd, err := d.dialer.Dial(d.ctx, dest) 613 if err != nil { 614 addr, _ := dest.TCPEndpoint() 615 d.log.Trace("Dial error", "id", dest.ID(), "addr", addr, "conn", t.flags, "err", cleanupDialErr(err)) 616 dialConnectionError.Mark(1) 617 return &dialError{err} 618 } 619 return d.setupFunc(newMeteredConn(fd), t.flags, dest) 620 } 621 622 func (t *dialTask) String() string { 623 node := t.dest() 624 id := node.ID() 625 return fmt.Sprintf("%v %x %v:%d", t.flags, id[:8], node.IPAddr(), node.TCP()) 626 } 627 628 func cleanupDialErr(err error) error { 629 if netErr, ok := err.(*net.OpError); ok && netErr.Op == "dial" { 630 return netErr.Err 631 } 632 return err 633 } 634 635 func nodeEndpointForLog(n *enode.Node) string { 636 if n.Hostname() != "" { 637 return n.Hostname() 638 } 639 return n.IPAddr().String() 640 }