github.com/celestiaorg/celestia-node@v0.15.0-beta.1/share/p2p/discovery/discovery.go

package discovery

import (
	"context"
	"errors"
	"fmt"
	"time"

	logging "github.com/ipfs/go-log/v2"
	"github.com/libp2p/go-libp2p/core/discovery"
	"github.com/libp2p/go-libp2p/core/event"
	"github.com/libp2p/go-libp2p/core/host"
	"github.com/libp2p/go-libp2p/core/network"
	"github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/p2p/host/eventbus"
	"golang.org/x/sync/errgroup"
)

var log = logging.Logger("share/discovery")

const (
	// eventbusBufSize is the size of the buffered channel used to handle
	// events in libp2p. We specify a larger buffer size for the channel
	// to avoid overflowing and blocking the subscription during disconnection
	// bursts (the default is 16).
	eventbusBufSize = 64

	// findPeersTimeout limits the duration of the FindPeers operation.
	findPeersTimeout = time.Minute

	// retryTimeout defines the time interval between discovery and advertise attempts.
	retryTimeout = time.Second

	// logInterval defines the time interval at which a warning message is logged
	// if the desired number of nodes has not been discovered.
	logInterval = 5 * time.Minute
)

// discoveryRetryTimeout defines the time interval between discovery attempts.
// It is a variable rather than a const so tests can override it.
var discoveryRetryTimeout = retryTimeout

// Discovery combines the advertise and discover services and stores discovered nodes.
// TODO: The code here gets horribly hairy, so we should refactor this at some point
type Discovery struct {
	// tag is used as the rendezvous point for the discovery service
	tag       string
	set       *limitedSet
	host      host.Host
	disc      discovery.Discovery
	connector *backoffConnector
	// onUpdatedPeers will be called on peer set changes
	onUpdatedPeers OnUpdatedPeers

	triggerDisc chan struct{}

	metrics *metrics

	cancel context.CancelFunc

	params *Parameters
}

// OnUpdatedPeers is a callback invoked whenever a peer is added to or removed
// from the discovered peer set.
type OnUpdatedPeers func(peerID peer.ID, isAdded bool)

// add composes two callbacks so that both are invoked on every peer set change.
func (f OnUpdatedPeers) add(next OnUpdatedPeers) OnUpdatedPeers {
	return func(peerID peer.ID, isAdded bool) {
		f(peerID, isAdded)
		next(peerID, isAdded)
	}
}

// NewDiscovery constructs a new discovery.
func NewDiscovery(
	params *Parameters,
	h host.Host,
	d discovery.Discovery,
	tag string,
	opts ...Option,
) (*Discovery, error) {
	if err := params.Validate(); err != nil {
		return nil, err
	}

	if tag == "" {
		return nil, fmt.Errorf("discovery: tag cannot be empty")
	}
	o := newOptions(opts...)
	return &Discovery{
		tag:            tag,
		set:            newLimitedSet(params.PeersLimit),
		host:           h,
		disc:           d,
		connector:      newBackoffConnector(h, defaultBackoffFactory),
		onUpdatedPeers: o.onUpdatedPeers,
		params:         params,
		triggerDisc:    make(chan struct{}),
	}, nil
}

// Start launches the discovery, disconnect-tracking, and connector GC loops.
func (d *Discovery) Start(context.Context) error {
	ctx, cancel := context.WithCancel(context.Background())
	d.cancel = cancel

	sub, err := d.host.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}, eventbus.BufSize(eventbusBufSize))
	if err != nil {
		return fmt.Errorf("subscribing for connection events: %w", err)
	}

	go d.discoveryLoop(ctx)
	go d.disconnectsLoop(ctx, sub)
	go d.connector.GC(ctx)
	return nil
}

// Stop shuts down all background loops started by Start.
func (d *Discovery) Stop(context.Context) error {
	d.cancel()
	return nil
}
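
// Usage sketch (illustrative only; the DHT wiring, the "full" tag value, and
// the DefaultParameters/WithOnPeersUpdate helpers are assumptions drawn from
// this package's options, not defined in this file). Discovery is typically
// backed by go-libp2p's routing discovery over the Kademlia DHT:
//
//	kad, err := dht.New(ctx, h) // github.com/libp2p/go-libp2p-kad-dht
//	if err != nil {
//		return err
//	}
//	disc, err := NewDiscovery(
//		DefaultParameters(),
//		h,
//		routing.NewRoutingDiscovery(kad), // github.com/libp2p/go-libp2p/p2p/discovery/routing
//		"full", // rendezvous tag
//		WithOnPeersUpdate(func(id peer.ID, isAdded bool) {
//			log.Debugw("peer set updated", "peer", id, "added", isAdded)
//		}),
//	)
//	if err != nil {
//		return err
//	}
//	if err := disc.Start(ctx); err != nil {
//		return err
//	}
//	go disc.Advertise(ctx) // only nodes that provide the service advertise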

// Peers provides a list of discovered peers in the discovery topic (e.g. "full").
// If Discovery hasn't found any peers, it blocks until at least one peer is found.
func (d *Discovery) Peers(ctx context.Context) ([]peer.ID, error) {
	return d.set.Peers(ctx)
}

// Discard removes the peer from the peer set and rediscovers more if the soft peer limit
// has not been reached. Reports whether the peer was removed.
func (d *Discovery) Discard(id peer.ID) bool {
	if !d.set.Contains(id) {
		return false
	}

	d.host.ConnManager().Unprotect(id, d.tag)
	d.connector.Backoff(id)
	d.set.Remove(id)
	d.onUpdatedPeers(id, false)
	log.Debugw("removed peer from the peer set", "peer", id.String())

	if d.set.Size() < d.set.Limit() {
		// trigger discovery without blocking if a signal is already pending
		select {
		case d.triggerDisc <- struct{}{}:
		default:
		}
	}

	return true
}

// Advertise is a utility function that persistently advertises a service through an Advertiser.
// TODO: Start advertising only after the reachability is confirmed by AutoNAT
func (d *Discovery) Advertise(ctx context.Context) {
	timer := time.NewTimer(d.params.AdvertiseInterval)
	defer timer.Stop()
	for {
		_, err := d.disc.Advertise(ctx, d.tag)
		d.metrics.observeAdvertise(ctx, err)
		if err != nil {
			if ctx.Err() != nil {
				return
			}
			log.Warnw("error advertising", "rendezvous", d.tag, "err", err)

			// we don't want to retry indefinitely in a busy loop;
			// the internal discovery mechanism may need some time before the next attempt
			errTimer := time.NewTimer(retryTimeout)
			select {
			case <-errTimer.C:
				errTimer.Stop()
				if !timer.Stop() {
					// Drain non-blockingly: the channel may already be empty
					// if a previous iteration's select consumed the tick, in
					// which case a blocking `<-timer.C` would hang forever.
					select {
					case <-timer.C:
					default:
					}
				}
				continue
			case <-ctx.Done():
				errTimer.Stop()
				return
			}
		}

		log.Debugf("advertised")
		if !timer.Stop() {
			// non-blocking drain before reuse, as above
			select {
			case <-timer.C:
			default:
			}
		}
		timer.Reset(d.params.AdvertiseInterval)
		select {
		case <-timer.C:
		case <-ctx.Done():
			return
		}
	}
}

// discoveryLoop ensures we always have '~peerLimit' connected peers.
// It initiates peer discovery upon request and restarts the process until the soft limit is
// reached.
func (d *Discovery) discoveryLoop(ctx context.Context) {
	t := time.NewTicker(discoveryRetryTimeout)
	defer t.Stop()

	warnTicker := time.NewTicker(logInterval)
	defer warnTicker.Stop()

	for {
		// drain all previous ticks from the channel
		drainChannel(t.C)
		select {
		case <-t.C:
			if !d.discover(ctx) {
				// rerun discovery if the number of peers hasn't reached the limit
				continue
			}
		case <-warnTicker.C:
			if d.set.Size() < d.set.Limit() {
				log.Warnf(
					"Potentially degraded connectivity: unable to discover the desired amount of full node peers in %v. "+
						"Number of peers discovered: %d. Required: %d.",
					logInterval, d.set.Size(), d.set.Limit(),
				)
			}
			// Do not break the loop; just continue.
			continue
		case <-ctx.Done():
			return
		}
	}
}
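
// Test hook sketch (illustrative; the test body is an assumption, not part of
// this package): because discoveryRetryTimeout is a package-level variable
// rather than a const, tests can shrink the retry interval to keep the loop fast:
//
//	func TestDiscoveryRetries(t *testing.T) {
//		discoveryRetryTimeout = time.Millisecond
//		t.Cleanup(func() { discoveryRetryTimeout = retryTimeout })
//		// ... exercise discoveryLoop with the short retry interval ...
//	}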

// disconnectsLoop listens for disconnect events and ensures the Discovery state
// is updated.
func (d *Discovery) disconnectsLoop(ctx context.Context, sub event.Subscription) {
	defer sub.Close()

	for {
		select {
		case <-ctx.Done():
			return
		case e, ok := <-sub.Out():
			if !ok {
				log.Error("connection subscription was closed unexpectedly")
				return
			}

			if evnt := e.(event.EvtPeerConnectednessChanged); evnt.Connectedness == network.NotConnected {
				d.Discard(evnt.Peer)
			}
		}
	}
}

// discover finds new peers and reports whether it succeeded.
func (d *Discovery) discover(ctx context.Context) bool {
	size := d.set.Size()
	want := d.set.Limit() - size
	if want == 0 {
		log.Debugw("reached soft peer limit, skipping discovery", "size", size)
		return true
	}
	// TODO @renaynay: eventually, have a mechanism to catch if wanted amount of peers
	// has not been discovered in X amount of time so that users are warned of degraded
	// FN connectivity.
	log.Debugw("discovering peers", "want", want)

	// we use errgroup because it provides a concurrency limit
	var wg errgroup.Group
	// limit the number of workers to minimize the chance of overshooting the peer limit
	wg.SetLimit(int(d.set.Limit()))

	findCtx, findCancel := context.WithTimeout(ctx, findPeersTimeout)
	defer func() {
		// some workers could still be running; wait for them to finish before canceling findCtx
		wg.Wait() //nolint:errcheck
		findCancel()
	}()

	peers, err := d.disc.FindPeers(findCtx, d.tag)
	if err != nil {
		log.Errorw("unable to start discovery", "err", err)
		return false
	}

	for {
		select {
		case p, ok := <-peers:
			if !ok {
				// the discovery backend closed the channel; this `break` exits
				// only the select, falling through to the result reporting below
				break
			}

			peer := p
			wg.Go(func() error {
				if findCtx.Err() != nil {
					log.Debug("find has been canceled, skip peer")
					return nil
				}

				// we don't pass findCtx so that we don't cancel in-progress connections
				// that are likely to be valuable
				if !d.handleDiscoveredPeer(ctx, peer) {
					return nil
				}

				size := d.set.Size()
				log.Debugw("found peer", "peer", peer.ID.String(), "found_amount", size)
				if size < d.set.Limit() {
					return nil
				}

				log.Infow("discovered wanted peers", "amount", size)
				findCancel() // stop discovery when we are done
				return nil
			})

			continue
		case <-findCtx.Done():
		}

		// reached when the peer channel closes or findCtx is done
		isEnoughPeers := d.set.Size() >= d.set.Limit()
		d.metrics.observeFindPeers(ctx, isEnoughPeers)
		log.Debugw("discovery finished", "discovered_wanted", isEnoughPeers)
		return isEnoughPeers
	}
}
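
// Concurrency sketch (illustrative; tryConnect is a hypothetical stand-in for
// handleDiscoveredPeer): discover bounds in-flight connection attempts with
// errgroup's SetLimit, which makes Go block once the limit is reached:
//
//	var wg errgroup.Group
//	wg.SetLimit(8) // at most 8 goroutines run concurrently
//	for _, p := range candidates {
//		p := p // capture the loop variable (needed before Go 1.22)
//		wg.Go(func() error {
//			tryConnect(ctx, p)
//			return nil
//		})
//	}
//	_ = wg.Wait() // wait for all attempts before returning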

// handleDiscoveredPeer adds the peer to the internal peer set if it is already
// connected or a connection attempt succeeds. Reports whether it succeeded.
func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo) bool {
	logger := log.With("peer", peer.ID.String())
	switch {
	case peer.ID == d.host.ID():
		d.metrics.observeHandlePeer(ctx, handlePeerSkipSelf)
		logger.Debug("skip handle: self discovery")
		return false
	case d.set.Size() >= d.set.Limit():
		d.metrics.observeHandlePeer(ctx, handlePeerEnoughPeers)
		logger.Debug("skip handle: enough peers found")
		return false
	}

	switch d.host.Network().Connectedness(peer.ID) {
	case network.Connected:
		d.connector.Backoff(peer.ID) // we still have to backoff the connected peer
	case network.NotConnected:
		err := d.connector.Connect(ctx, peer)
		if errors.Is(err, errBackoffNotEnded) {
			d.metrics.observeHandlePeer(ctx, handlePeerBackoff)
			logger.Debug("skip handle: backoff")
			return false
		}
		if err != nil {
			d.metrics.observeHandlePeer(ctx, handlePeerConnErr)
			logger.Debugw("unable to connect", "err", err)
			return false
		}
	default:
		panic("unknown connectedness")
	}

	if !d.set.Add(peer.ID) {
		d.metrics.observeHandlePeer(ctx, handlePeerInSet)
		logger.Debug("peer is already in discovery set")
		return false
	}
	d.onUpdatedPeers(peer.ID, true)
	d.metrics.observeHandlePeer(ctx, handlePeerConnected)
	logger.Debug("added peer to set")

	// Tag the peer to protect it from being killed by the ConnManager.
	// NOTE: This does not protect from the remote side killing the connection.
	// In the future, we should design a protocol that keeps bidirectional agreement on whether
	// a connection should be kept or not, similar to a mesh link in GossipSub.
	d.host.ConnManager().Protect(peer.ID, d.tag)
	return true
}

// drainChannel non-blockingly discards any buffered values from a time channel.
func drainChannel(c <-chan time.Time) {
	for {
		select {
		case <-c:
		default:
			return
		}
	}
}
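
// Usage sketch for drainChannel (illustrative): discoveryLoop calls it before
// each select so that a backlog of ticks collapses into a single fresh wait:
//
//	t := time.NewTicker(discoveryRetryTimeout)
//	defer t.Stop()
//	drainChannel(t.C) // discard stale ticks accumulated while we were busy
//	select {
//	case <-t.C: // wait for the next fresh tick
//	case <-ctx.Done():
//	}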