github.com/celestiaorg/celestia-node@v0.15.0-beta.1/share/p2p/discovery/discovery.go

package discovery

import (
	"context"
	"errors"
	"fmt"
	"time"

	logging "github.com/ipfs/go-log/v2"
	"github.com/libp2p/go-libp2p/core/discovery"
	"github.com/libp2p/go-libp2p/core/event"
	"github.com/libp2p/go-libp2p/core/host"
	"github.com/libp2p/go-libp2p/core/network"
	"github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/p2p/host/eventbus"
	"golang.org/x/sync/errgroup"
)

var log = logging.Logger("share/discovery")

const (
	// eventbusBufSize is the size of the buffered channel used to handle
	// libp2p events. We specify a larger buffer size for the channel
	// to avoid overflowing and blocking the subscription during disconnection bursts
	// (the default is 16).
	eventbusBufSize = 64

	// findPeersTimeout bounds the duration of a single FindPeers operation.
	findPeersTimeout = time.Minute

	// retryTimeout defines the time interval between discovery and advertise attempts.
	retryTimeout = time.Second

	// logInterval defines the time interval at which a warning is logged
	// if the desired number of peers has not been discovered.
	logInterval = 5 * time.Minute
)

// discoveryRetryTimeout defines the time interval between discovery attempts.
// It is a variable so tests can override it.
var discoveryRetryTimeout = retryTimeout

// Discovery combines the advertise and discover services and stores discovered nodes.
// TODO: The code here gets horribly hairy, so we should refactor this at some point
type Discovery struct {
	// tag is used as the rendezvous point for the discovery service
	tag       string
	set       *limitedSet
	host      host.Host
	disc      discovery.Discovery
	connector *backoffConnector
	// onUpdatedPeers will be called on peer set changes
	onUpdatedPeers OnUpdatedPeers

	triggerDisc chan struct{}

	metrics *metrics

	cancel context.CancelFunc

	params *Parameters
}

// OnUpdatedPeers is a callback invoked whenever a peer is added to or removed
// from the discovery peer set.
type OnUpdatedPeers func(peerID peer.ID, isAdded bool)

// add composes two callbacks into one that invokes them in order.
func (f OnUpdatedPeers) add(next OnUpdatedPeers) OnUpdatedPeers {
	return func(peerID peer.ID, isAdded bool) {
		f(peerID, isAdded)
		next(peerID, isAdded)
	}
}
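
// The composition above can be used to fan a single peer-set update out to
// several listeners. A minimal sketch (editor's illustration, not part of the
// original file; the callback names are hypothetical):
//
//	logUpdates := OnUpdatedPeers(func(id peer.ID, isAdded bool) {
//		log.Debugw("peer set changed", "peer", id, "added", isAdded)
//	})
//	notifyPool := OnUpdatedPeers(func(id peer.ID, isAdded bool) {
//		// e.g. forward the update to a peer pool or a metrics collector
//	})
//	combined := logUpdates.add(notifyPool) // calls logUpdates first, then notifyPool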

// NewDiscovery constructs a new discovery.
func NewDiscovery(
	params *Parameters,
	h host.Host,
	d discovery.Discovery,
	tag string,
	opts ...Option,
) (*Discovery, error) {
	if err := params.Validate(); err != nil {
		return nil, err
	}

	if tag == "" {
		return nil, fmt.Errorf("discovery: tag cannot be empty")
	}
	o := newOptions(opts...)
	return &Discovery{
		tag:            tag,
		set:            newLimitedSet(params.PeersLimit),
		host:           h,
		disc:           d,
		connector:      newBackoffConnector(h, defaultBackoffFactory),
		onUpdatedPeers: o.onUpdatedPeers,
		params:         params,
		triggerDisc:    make(chan struct{}),
	}, nil
}
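
// A minimal construction sketch (editor's illustration, not part of the original
// file). It assumes an existing libp2p host `h` and a discovery.Discovery
// implementation `routingDisc` (e.g. a DHT-backed routing discovery); the
// DefaultParameters and WithOnPeersUpdate names are assumed from the surrounding
// package and may differ:
//
//	disc, err := NewDiscovery(
//		DefaultParameters(),
//		h,
//		routingDisc,
//		"full", // rendezvous tag shared by advertisers and discoverers
//		WithOnPeersUpdate(func(id peer.ID, isAdded bool) {
//			// react to peer set changes
//		}),
//	)
//	if err != nil {
//		// handle the error
//	}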

// Start launches the discovery and disconnect-tracking loops. The passed context
// is unused; an internal lifecycle context is created and cancelled by Stop.
func (d *Discovery) Start(context.Context) error {
	ctx, cancel := context.WithCancel(context.Background())
	d.cancel = cancel

	sub, err := d.host.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}, eventbus.BufSize(eventbusBufSize))
	if err != nil {
		return fmt.Errorf("subscribing for connection events: %w", err)
	}

	go d.discoveryLoop(ctx)
	go d.disconnectsLoop(ctx, sub)
	go d.connector.GC(ctx)
	return nil
}

// Stop cancels the lifecycle context created in Start, terminating the
// background loops.
func (d *Discovery) Stop(context.Context) error {
	d.cancel()
	return nil
}
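
// A rough lifecycle sketch (editor's illustration, not part of the original
// file). Advertise is not started by Start and is typically run in its own
// goroutine by callers that provide the service:
//
//	if err := disc.Start(ctx); err != nil {
//		return err
//	}
//	go disc.Advertise(ctx)
//	defer func() { _ = disc.Stop(ctx) }()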

// Peers provides a list of discovered peers for the configured rendezvous tag
// (e.g. the "full" node topic).
// If Discovery hasn't found any peers, it blocks until at least one peer is found.
func (d *Discovery) Peers(ctx context.Context) ([]peer.ID, error) {
	return d.set.Peers(ctx)
}
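
// Because Peers blocks until at least one peer is known, callers usually bound
// the wait with a context. A small sketch (editor's illustration, not part of
// the original file):
//
//	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
//	defer cancel()
//	peers, err := disc.Peers(ctx)
//	if err != nil {
//		// the context expired before any peer was discovered
//	}
//	_ = peers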

// Discard removes the peer from the peer set and triggers rediscovery if the
// set has dropped below the soft peer limit. It reports whether the peer was removed.
func (d *Discovery) Discard(id peer.ID) bool {
	if !d.set.Contains(id) {
		return false
	}

	d.host.ConnManager().Unprotect(id, d.tag)
	d.connector.Backoff(id)
	d.set.Remove(id)
	d.onUpdatedPeers(id, false)
	log.Debugw("removed peer from the peer set", "peer", id.String())

	if d.set.Size() < d.set.Limit() {
		// trigger discovery
		select {
		case d.triggerDisc <- struct{}{}:
		default:
		}
	}

	return true
}
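
// Discard is typically called when a previously discovered peer misbehaves or
// becomes unusable; removing it frees a slot and re-triggers discovery. A tiny
// sketch (editor's illustration, not part of the original file; requestFailed
// and badPeer are hypothetical):
//
//	if requestFailed {
//		_ = disc.Discard(badPeer)
//	}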

// Advertise is a utility function that persistently advertises a service through an Advertiser.
// TODO: Start advertising only after the reachability is confirmed by AutoNAT
func (d *Discovery) Advertise(ctx context.Context) {
	timer := time.NewTimer(d.params.AdvertiseInterval)
	defer timer.Stop()
	for {
		_, err := d.disc.Advertise(ctx, d.tag)
		d.metrics.observeAdvertise(ctx, err)
		if err != nil {
			if ctx.Err() != nil {
				return
			}
			log.Warnw("error advertising", "rendezvous", d.tag, "err", err)

			// we don't want to retry indefinitely in a busy loop;
			// the underlying discovery mechanism may need some time between attempts
			errTimer := time.NewTimer(retryTimeout)
			select {
			case <-errTimer.C:
				errTimer.Stop()
				continue
			case <-ctx.Done():
				errTimer.Stop()
				return
			}
		}

		log.Debugf("advertised")
		// drain the timer only if it has already fired, then reset it;
		// a blocking receive here could hang when the channel is already empty
		if !timer.Stop() {
			select {
			case <-timer.C:
			default:
			}
		}
		timer.Reset(d.params.AdvertiseInterval)
		select {
		case <-timer.C:
		case <-ctx.Done():
			return
		}
	}
}
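
// The timer handling above follows the stop-drain-Reset idiom for reusing a
// time.Timer across loop iterations; condensed, the pattern looks roughly like
// this (editor's illustration, not part of the original file):
//
//	if !t.Stop() {
//		select {
//		case <-t.C: // consume a tick that fired but was never read
//		default: // the channel is already empty
//		}
//	}
//	t.Reset(interval)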

// discoveryLoop ensures we always have '~peerLimit' connected peers.
// It initiates peer discovery upon request and restarts the process until the soft limit is
// reached.
func (d *Discovery) discoveryLoop(ctx context.Context) {
	t := time.NewTicker(discoveryRetryTimeout)
	defer t.Stop()

	warnTicker := time.NewTicker(logInterval)
	defer warnTicker.Stop()

	for {
		// drain all previous ticks from the channel
		drainChannel(t.C)
		select {
		case <-t.C:
			if !d.discover(ctx) {
				// rerun discovery if the number of peers hasn't reached the limit
				continue
			}
		case <-warnTicker.C:
			if d.set.Size() < d.set.Limit() {
				log.Warnf(
					"Potentially degraded connectivity, unable to discover the desired amount of full node peers in %v. "+
						"Number of peers discovered: %d. Required: %d.",
					logInterval, d.set.Size(), d.set.Limit(),
				)
			}
			// Do not break the loop; just continue
			continue
		case <-ctx.Done():
			return
		}
	}
}

// disconnectsLoop listens for disconnect events and ensures the Discovery state
// is updated.
func (d *Discovery) disconnectsLoop(ctx context.Context, sub event.Subscription) {
	defer sub.Close()

	for {
		select {
		case <-ctx.Done():
			return
		case e, ok := <-sub.Out():
			if !ok {
				log.Error("connection subscription was closed unexpectedly")
				return
			}

			if evnt := e.(event.EvtPeerConnectednessChanged); evnt.Connectedness == network.NotConnected {
				d.Discard(evnt.Peer)
			}
		}
	}
}

// discover finds new peers and reports whether it succeeded.
func (d *Discovery) discover(ctx context.Context) bool {
	size := d.set.Size()
	want := d.set.Limit() - size
	if want == 0 {
		log.Debugw("reached soft peer limit, skipping discovery", "size", size)
		return true
	}
	// TODO @renaynay: eventually, have a mechanism to catch if wanted amount of peers
	//  has not been discovered in X amount of time so that users are warned of degraded
	//  FN connectivity.
	log.Debugw("discovering peers", "want", want)

	// we use errgroup as it provides a concurrency limit
	var wg errgroup.Group
	// limit the number of workers to minimize the chance of overshooting the peer limit
	wg.SetLimit(int(d.set.Limit()))

	findCtx, findCancel := context.WithTimeout(ctx, findPeersTimeout)
	defer func() {
		// some workers could still be running; wait for them to finish before canceling findCtx
		wg.Wait() //nolint:errcheck
		findCancel()
	}()

	peers, err := d.disc.FindPeers(findCtx, d.tag)
	if err != nil {
		log.Errorw("unable to start discovery", "err", err)
		return false
	}

	for {
		select {
		case p, ok := <-peers:
			if !ok {
				// the peer channel is closed: break out of the select and report below
				break
			}

			peer := p
			wg.Go(func() error {
				if findCtx.Err() != nil {
					log.Debug("find has been canceled, skip peer")
					return nil
				}

				// we don't pass findCtx so that we don't cancel in-progress connections
				// that are likely to be valuable
				if !d.handleDiscoveredPeer(ctx, peer) {
					return nil
				}

				size := d.set.Size()
				log.Debugw("found peer", "peer", peer.ID.String(), "found_amount", size)
				if size < d.set.Limit() {
					return nil
				}

				log.Infow("discovered wanted peers", "amount", size)
				findCancel() // stop discovery when we are done
				return nil
			})

			continue
		case <-findCtx.Done():
		}

		isEnoughPeers := d.set.Size() >= d.set.Limit()
		d.metrics.observeFindPeers(ctx, isEnoughPeers)
		log.Debugw("discovery finished", "discovered_wanted", isEnoughPeers)
		return isEnoughPeers
	}
}
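
// discover bounds its fan-out with errgroup's SetLimit; in isolation the
// pattern looks roughly like this (editor's illustration, not part of the
// original file; maxWorkers, tasks and handle are hypothetical):
//
//	var wg errgroup.Group
//	wg.SetLimit(maxWorkers) // at most maxWorkers goroutines run at once; Go blocks once the limit is hit
//	for _, task := range tasks {
//		task := task // capture the loop variable (pre-Go 1.22 semantics)
//		wg.Go(func() error {
//			return handle(task)
//		})
//	}
//	_ = wg.Wait() // wait for all workers and collect the first error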

// handleDiscoveredPeer adds the peer to the internal peer set if it is already
// connected or a connection attempt succeeds. It reports whether it succeeded.
func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo) bool {
	logger := log.With("peer", peer.ID.String())
	switch {
	case peer.ID == d.host.ID():
		d.metrics.observeHandlePeer(ctx, handlePeerSkipSelf)
		logger.Debug("skip handle: self discovery")
		return false
	case d.set.Size() >= d.set.Limit():
		d.metrics.observeHandlePeer(ctx, handlePeerEnoughPeers)
		logger.Debug("skip handle: enough peers found")
		return false
	}

	switch d.host.Network().Connectedness(peer.ID) {
	case network.Connected:
		d.connector.Backoff(peer.ID) // we still have to backoff the connected peer
	case network.NotConnected:
		err := d.connector.Connect(ctx, peer)
		if errors.Is(err, errBackoffNotEnded) {
			d.metrics.observeHandlePeer(ctx, handlePeerBackoff)
			logger.Debug("skip handle: backoff")
			return false
		}
		if err != nil {
			d.metrics.observeHandlePeer(ctx, handlePeerConnErr)
			logger.Debugw("unable to connect", "err", err)
			return false
		}
	default:
		panic("unknown connectedness")
	}

	if !d.set.Add(peer.ID) {
		d.metrics.observeHandlePeer(ctx, handlePeerInSet)
		logger.Debug("peer is already in discovery set")
		return false
	}
	d.onUpdatedPeers(peer.ID, true)
	d.metrics.observeHandlePeer(ctx, handlePeerConnected)
	logger.Debug("added peer to set")

	// Tag to protect the peer from being killed by the ConnManager.
	// NOTE: This does not protect from the remote side killing the connection.
	//  In the future, we should design a protocol that keeps bidirectional agreement on whether
	//  a connection should be kept or not, similar to a mesh link in GossipSub.
	d.host.ConnManager().Protect(peer.ID, d.tag)
	return true
}

// drainChannel empties any buffered ticks from the channel without blocking.
func drainChannel(c <-chan time.Time) {
	for {
		select {
		case <-c:
		default:
			return
		}
	}
}