github.com/ari-anchor/sei-tendermint@v0.0.0-20230519144642-dc826b7b56bb/internal/p2p/pex/reactor.go

package pex

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/ari-anchor/sei-tendermint/config"
	"github.com/ari-anchor/sei-tendermint/internal/p2p"
	"github.com/ari-anchor/sei-tendermint/internal/p2p/conn"
	"github.com/ari-anchor/sei-tendermint/libs/log"
	"github.com/ari-anchor/sei-tendermint/libs/service"
	protop2p "github.com/ari-anchor/sei-tendermint/proto/tendermint/p2p"
	"github.com/ari-anchor/sei-tendermint/types"
)

var (
	_ service.Service = (*Reactor)(nil)
	_ p2p.Wrapper     = (*protop2p.PexMessage)(nil)
)

const (
	// PexChannel is the channel ID for PEX messages.
	PexChannel = 0x00

	// maxAddressSize is an over-estimate of the maximum NetAddress size:
	// hexID (40) + IP (16) + Port (2) + Name (100), with headroom.
	// NOTE: don't use massive DNS names.
	maxAddressSize = 256

	// maxGetSelection is the maximum number of addresses returned by
	// GetSelection.
	// NOTE: this must match "maxMsgSize".
	maxGetSelection = 250

	// NOTE: amplification factor!
	// A small request can result in up to a maxMsgSize response.
	maxMsgSize = maxAddressSize * maxGetSelection

	// minReceiveRequestInterval is the minimum interval allowed between
	// two PEX requests from the same peer.
	minReceiveRequestInterval = 100 * time.Millisecond

	// maxAddresses is the maximum number of addresses that can be included
	// in a response.
	maxAddresses = 100

	// noAvailablePeersWaitPeriod is how long to wait when there are no
	// peers available before trying again.
	noAvailablePeersWaitPeriod = 1 * time.Second

	// fullCapacityInterval is the polling interval of the PEX reactor when
	// the peer store is full. The reactor should still look to add new
	// peers in order to flush out low-scoring peers that are still in the
	// peer store.
	fullCapacityInterval = 10 * time.Minute
)

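// maxResponseBytes is a hypothetical helper, not part of the original file;
// it restates the arithmetic behind the amplification note above: a request
// of a few bytes can elicit a response of up to maxGetSelection addresses of
// maxAddressSize bytes each.
func maxResponseBytes() int {
	// 250 addresses * 256 bytes = 64000 bytes = maxMsgSize.
	return maxGetSelection * maxAddressSize
}
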
type NoPeersAvailableError struct {
	error
}

func (e *NoPeersAvailableError) Error() string {
	return "no available peers to send a PEX request to (retrying)"
}

// TODO: We should decide whether we want channel descriptors to be housed
// within each reactor (as they are now) or, considering that the reactor
// doesn't really need to care about the channel descriptors, whether they
// should be housed in the node module.
func ChannelDescriptor() *conn.ChannelDescriptor {
	return &conn.ChannelDescriptor{
		ID:                  PexChannel,
		MessageType:         new(protop2p.PexMessage),
		Priority:            1,
		SendQueueCapacity:   10,
		RecvMessageCapacity: maxMsgSize,
		RecvBufferCapacity:  128,
		Name:                "pex",
	}
}

// The peer exchange or PEX reactor supports the peer manager by sending
// requests to other peers for addresses that can be given to the peer manager
// and at the same time advertises addresses to peers that need more.
//
// The reactor is able to tweak the intensity of its search by decreasing or
// increasing the interval between each request. It tracks available peers in
// a set, removing a peer when a request is sent to it and re-adding it once a
// response is received.
type Reactor struct {
	service.BaseService
	logger log.Logger

	peerManager *p2p.PeerManager
	peerEvents  p2p.PeerEventSubscriber
	// availablePeers is the set of peers to loop through and send peer
	// requests to.
	availablePeers map[types.NodeID]struct{}
	// lastNoAvailablePeers tracks the last time we saw no available peers,
	// so we can restart if it has been too long.
	lastNoAvailablePeers time.Time

	mtx sync.RWMutex

	// requestsSent keeps track of which peers the PEX reactor has sent
	// requests to. This prevents the sending of spurious responses.
	// NOTE: If a node never responds, it will remain in this map until a
	// peer down status update is sent.
	requestsSent map[types.NodeID]struct{}

	// lastReceivedRequests keeps track of when peers send a request to
	// prevent peers from sending requests too often (as defined by
	// minReceiveRequestInterval).
	lastReceivedRequests map[types.NodeID]time.Time

	// totalPeers is the total number of unique peers added.
	totalPeers int

	channel *p2p.Channel

	// restartCh is used to signal the application layer to restart the node.
	restartCh                     chan struct{}
	restartNoAvailablePeersWindow time.Duration
}

// NewReactor returns a reference to a new reactor.
func NewReactor(
	logger log.Logger,
	peerManager *p2p.PeerManager,
	peerEvents p2p.PeerEventSubscriber,
	restartCh chan struct{},
	selfRemediationConfig *config.SelfRemediationConfig,
) *Reactor {
	r := &Reactor{
		logger:                        logger,
		peerManager:                   peerManager,
		peerEvents:                    peerEvents,
		availablePeers:                make(map[types.NodeID]struct{}),
		lastNoAvailablePeers:          time.Time{},
		requestsSent:                  make(map[types.NodeID]struct{}),
		lastReceivedRequests:          make(map[types.NodeID]time.Time),
		restartCh:                     restartCh,
		restartNoAvailablePeersWindow: time.Duration(selfRemediationConfig.P2pNoPeersRestarWindowSeconds) * time.Second,
	}

	r.BaseService = *service.NewBaseService(logger, "PEX", r)
	return r
}

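// newPexReactorExample is an illustrative construction sketch, not part of
// the original file; the channel buffering and the zero remediation window
// are assumptions, not defaults from this repository. A window of 0 disables
// the no-peers restart path in processPeerUpdate.
func newPexReactorExample(
	logger log.Logger,
	peerManager *p2p.PeerManager,
	peerEvents p2p.PeerEventSubscriber,
) (*Reactor, chan struct{}) {
	restartCh := make(chan struct{}, 1)
	cfg := &config.SelfRemediationConfig{
		P2pNoPeersRestarWindowSeconds: 0, // 0 disables the restart window
	}
	return NewReactor(logger, peerManager, peerEvents, restartCh, cfg), restartCh
}
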
// SetChannel sets the PEX channel the reactor will use; it must be called
// before OnStart, which hands the channel to the processing goroutine.
func (r *Reactor) SetChannel(ch *p2p.Channel) {
	r.channel = ch
}

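// channelOpener is a hypothetical seam used only by the wiring sketch below;
// the router in this repository may expose a different method set.
type channelOpener interface {
	OpenChannel(*conn.ChannelDescriptor) (*p2p.Channel, error)
}

// wirePexChannel sketches how a caller could open the PEX channel with the
// reactor's descriptor and hand it to the reactor before starting it. The
// names here are assumptions for illustration, not the node package's actual
// wiring.
func wirePexChannel(r *Reactor, opener channelOpener) error {
	ch, err := opener.OpenChannel(ChannelDescriptor())
	if err != nil {
		return err
	}
	r.SetChannel(ch)
	return nil
}
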
// OnStart starts goroutines that process messages on the PEX channel and
// listen for peer updates. The goroutines exit when the context passed to
// OnStart is canceled.
func (r *Reactor) OnStart(ctx context.Context) error {
	peerUpdates := r.peerEvents(ctx)
	go r.processPexCh(ctx, r.channel)
	go r.processPeerUpdates(ctx, peerUpdates)
	return nil
}

// OnStop is a no-op; the goroutines spawned in OnStart exit when the start
// context is canceled.
func (r *Reactor) OnStop() {}

// processPexCh implements a blocking event loop where we listen for p2p
// Envelope messages from the pexCh.
func (r *Reactor) processPexCh(ctx context.Context, pexCh *p2p.Channel) {
	incoming := make(chan *p2p.Envelope)
	go func() {
		defer close(incoming)
		iter := pexCh.Receive(ctx)
		for iter.Next(ctx) {
			select {
			case <-ctx.Done():
				return
			case incoming <- iter.Envelope():
			}
		}
	}()

	// Initially, we will request peers quickly to bootstrap. This duration
	// will be adjusted upward as knowledge of the network grows.
	var nextPeerRequest = minReceiveRequestInterval
	noAvailablePeerFailCounter := 0
	lastNoAvailablePeersTime := time.Now()

	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		timer.Reset(nextPeerRequest)

		select {
		case <-ctx.Done():
			return

		case <-timer.C:
			// Back off from sending peer requests when none are available,
			// but keep the loop running to handle incoming PEX messages.
			if noAvailablePeerFailCounter > 0 {
				waitPeriod := time.Duration(noAvailablePeerFailCounter) * noAvailablePeersWaitPeriod
				if time.Since(lastNoAvailablePeersTime) < waitPeriod {
					r.logger.Debug("waiting for more peers to become available", "wait_period", waitPeriod)
					continue
				}
			}

			// Send a request for more peer addresses.
			if err := r.sendRequestForPeers(ctx, pexCh); err != nil {
				r.logger.Error("failed to send request for peers", "err", err)
				if _, ok := err.(*NoPeersAvailableError); ok {
					noAvailablePeerFailCounter++
					lastNoAvailablePeersTime = time.Now()
					continue
				}
				return
			}
			noAvailablePeerFailCounter = 0
		case envelope, ok := <-incoming:
			if !ok {
				return // channel closed
			}

			// A request from another peer, or a response to one of our requests.
			dur, err := r.handlePexMessage(ctx, envelope, pexCh)
			if err != nil {
				r.logger.Error("failed to process message",
					"ch_id", envelope.ChannelID, "envelope", envelope, "err", err)
				if serr := pexCh.SendError(ctx, p2p.PeerError{
					NodeID: envelope.From,
					Err:    err,
				}); serr != nil {
					return
				}
			} else if dur != 0 {
				// We got a useful result; update the poll timer.
				nextPeerRequest = dur
			}
		}
	}
}

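// noPeersBackoff is an illustrative restatement, not part of the original
// file, of the linear backoff applied in the timer branch above: after n
// consecutive NoPeersAvailableError results, the loop waits
// n * noAvailablePeersWaitPeriod (1s, 2s, 3s, ...) before sending the next
// peer request.
func noPeersBackoff(failCount int) time.Duration {
	return time.Duration(failCount) * noAvailablePeersWaitPeriod
}
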
// processPeerUpdates initiates a blocking process where we listen for and
// handle PeerUpdate messages. The loop exits when the context is canceled.
func (r *Reactor) processPeerUpdates(ctx context.Context, peerUpdates *p2p.PeerUpdates) {
	for {
		select {
		case <-ctx.Done():
			return
		case peerUpdate := <-peerUpdates.Updates():
			r.processPeerUpdate(peerUpdate)
		}
	}
}

// handlePexMessage handles envelopes sent from peers on the PexChannel.
// If an update was received, a new polling interval is returned; otherwise the
// duration is 0.
func (r *Reactor) handlePexMessage(ctx context.Context, envelope *p2p.Envelope, pexCh *p2p.Channel) (time.Duration, error) {
	logger := r.logger.With("peer", envelope.From)

	switch msg := envelope.Message.(type) {
	case *protop2p.PexRequest:
		// Verify that this peer hasn't sent us another request too recently.
		if err := r.markPeerRequest(envelope.From); err != nil {
			r.logger.Error(fmt.Sprintf("PEX mark peer req from %s error %s", envelope.From, err))
			return 0, err
		}

		// Fetch peers from the peer manager, convert NodeAddresses into URL
		// strings, and send them back to the caller.
		nodeAddresses := r.peerManager.Advertise(envelope.From, maxAddresses)
		pexAddresses := make([]protop2p.PexAddress, len(nodeAddresses))
		for idx, addr := range nodeAddresses {
			pexAddresses[idx] = protop2p.PexAddress{
				URL: addr.String(),
			}
		}
		return 0, pexCh.Send(ctx, p2p.Envelope{
			To:      envelope.From,
			Message: &protop2p.PexResponse{Addresses: pexAddresses},
		})

	case *protop2p.PexResponse:
		// Verify that this response corresponds to one of our pending requests.
		if err := r.markPeerResponse(envelope.From); err != nil {
			r.logger.Error(fmt.Sprintf("PEX mark peer resp from %s error %s", envelope.From, err))
			return 0, err
		}

		// Verify that the response does not exceed the safety limit.
		if len(msg.Addresses) > maxAddresses {
			r.logger.Error(fmt.Sprintf("peer %s sent too many addresses (%d > maximum %d)",
				envelope.From, len(msg.Addresses), maxAddresses))
			return 0, fmt.Errorf("peer sent too many addresses (%d > maximum %d)",
				len(msg.Addresses), maxAddresses)
		}

		var numAdded int
		for _, pexAddress := range msg.Addresses {
			peerAddress, err := p2p.ParseNodeAddress(pexAddress.URL)
			if err != nil {
				r.logger.Error(fmt.Sprintf("PEX parse node address error %s", err))
				continue
			}
			added, err := r.peerManager.Add(peerAddress)
			if err != nil {
				logger.Error("failed to add PEX address", "address", peerAddress, "err", err)
				continue
			}
			if added {
				numAdded++
				logger.Debug("added PEX address", "address", peerAddress)
			}
		}

		return r.calculateNextRequestTime(numAdded), nil

	default:
		return 0, fmt.Errorf("received unknown message: %T", msg)
	}
}

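// pexAddressRoundTrip is an illustrative helper, not used by the reactor: it
// shows the URL round trip performed in handlePexMessage above, where
// outgoing addresses are rendered with NodeAddress.String and incoming ones
// are recovered with p2p.ParseNodeAddress.
func pexAddressRoundTrip(addr p2p.NodeAddress) (p2p.NodeAddress, error) {
	wire := protop2p.PexAddress{URL: addr.String()}
	return p2p.ParseNodeAddress(wire.URL)
}
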
// processPeerUpdate processes a PeerUpdate. For PeerStatusUp we add the peer
// to the available set; for PeerStatusDown we clear its state and, if no
// peers have been available for longer than the configured window, signal an
// application-level restart.
func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
	r.logger.Debug("received PEX peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)

	r.mtx.Lock()
	defer r.mtx.Unlock()

	switch peerUpdate.Status {
	case p2p.PeerStatusUp:
		r.availablePeers[peerUpdate.NodeID] = struct{}{}
		r.lastNoAvailablePeers = time.Time{} // reset
	case p2p.PeerStatusDown:
		delete(r.availablePeers, peerUpdate.NodeID)
		delete(r.requestsSent, peerUpdate.NodeID)
		delete(r.lastReceivedRequests, peerUpdate.NodeID)

		// p2p can be flaky. If no peers have been available for the whole
		// restart window, restart the entire router.
		if len(r.availablePeers) == 0 && r.restartNoAvailablePeersWindow > 0 {
			r.logger.Error("no available peers to send a PEX request to (restarting router)")
			if r.lastNoAvailablePeers.IsZero() {
				r.lastNoAvailablePeers = time.Now()
			} else if time.Since(r.lastNoAvailablePeers) > r.restartNoAvailablePeersWindow {
				r.restartCh <- struct{}{}
			}
		}
	default:
	}
}

// sendRequestForPeers chooses a peer from the set of available peers and sends
// that peer a request for more peer addresses. The chosen peer is moved into
// the requestsSent bucket so that we will not attempt to contact it again
// until it has replied or its status has been updated.
func (r *Reactor) sendRequestForPeers(ctx context.Context, pexCh *p2p.Channel) error {
	r.mtx.Lock()
	defer r.mtx.Unlock()
	if len(r.availablePeers) == 0 {
		return &NoPeersAvailableError{}
	}

	// Select an arbitrary peer from the available set; Go's randomized map
	// iteration order makes this an effectively random choice.
	var peerID types.NodeID
	for peerID = range r.availablePeers {
		break
	}

	if err := pexCh.Send(ctx, p2p.Envelope{
		To:      peerID,
		Message: &protop2p.PexRequest{},
	}); err != nil {
		return err
	}

	// Move the peer from available to pending.
	delete(r.availablePeers, peerID)
	r.requestsSent[peerID] = struct{}{}

	return nil
}

// calculateNextRequestTime selects how long we should wait before attempting
// to send out another request for peer addresses.
//
// This implements a simplified proportional control mechanism to poll more
// often when our knowledge of the network is incomplete, and less often as our
// knowledge grows. To estimate our knowledge of the network, we use the
// fraction of "new" peers (addresses we have not previously seen) to the total
// so far observed. When we first join the network, this fraction will be close
// to 1, meaning most discovered peers are new to us, and as we discover more
// peers, the fraction will go toward zero.
//
// The minimum interval will be minReceiveRequestInterval to ensure we will not
// request from any peer more often than we would allow them to do from us.
func (r *Reactor) calculateNextRequestTime(added int) time.Duration {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	r.totalPeers += added

	// If the peer store is nearly full, wait the maximum interval.
	if ratio := r.peerManager.PeerRatio(); ratio >= 0.95 {
		r.logger.Debug("Peer manager is nearly full",
			"sleep_period", fullCapacityInterval, "ratio", ratio)
		return fullCapacityInterval
	}

	// If there are no available peers to query, poll less aggressively.
	if len(r.availablePeers) == 0 {
		r.logger.Debug("No available peers to send a PEX request",
			"sleep_period", noAvailablePeersWaitPeriod)
		return noAvailablePeersWaitPeriod
	}

	// Reaching here, there are available peers to query and the peer store
	// still has space. Estimate our knowledge of the network from the latest
	// update and choose a new interval.
	base := float64(minReceiveRequestInterval) / float64(len(r.availablePeers))
	multiplier := float64(r.totalPeers+1) / float64(added+1) // +1 to avoid division by zero
	return time.Duration(base*multiplier*multiplier) + minReceiveRequestInterval
}

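// nextRequestIntervalExample restates the proportional control formula above
// with concrete numbers; it is illustrative only and not used by the reactor.
// With 10 available peers, 90 peers observed in total, and 9 newly added
// addresses in the latest response:
//
//	base       = 100ms / 10          = 10ms
//	multiplier = (90 + 1) / (9 + 1)  = 9.1
//	interval   = 10ms * 9.1 * 9.1 + 100ms ≈ 928ms
func nextRequestIntervalExample() time.Duration {
	base := float64(minReceiveRequestInterval) / 10
	multiplier := float64(90+1) / float64(9+1)
	return time.Duration(base*multiplier*multiplier) + minReceiveRequestInterval
}
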
func (r *Reactor) markPeerRequest(peer types.NodeID) error {
	r.mtx.Lock()
	defer r.mtx.Unlock()
	if lastRequestTime, ok := r.lastReceivedRequests[peer]; ok {
		if d := time.Since(lastRequestTime); d < minReceiveRequestInterval {
			return fmt.Errorf("peer %v sent PEX request too soon (%v < minimum %v)",
				peer, d, minReceiveRequestInterval)
		}
	}
	r.lastReceivedRequests[peer] = time.Now()
	return nil
}

func (r *Reactor) markPeerResponse(peer types.NodeID) error {
	r.mtx.Lock()
	defer r.mtx.Unlock()
	// Check that a request was actually sent to this peer.
	if _, ok := r.requestsSent[peer]; !ok {
		return fmt.Errorf("peer sent a PEX response when none was requested (%v)", peer)
	}
	delete(r.requestsSent, peer)
	// Add the peer back to the available set so it can be queried again by
	// future requests.
	r.availablePeers[peer] = struct{}{}
	return nil
}
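
// Lifecycle sketch (illustrative only; error handling elided, and the Start
// signature is assumed from the service package used here). A caller wires
// the channel, starts the reactor, and cancels the context to stop it, since
// OnStop is a no-op:
//
//	r := NewReactor(logger, peerManager, peerEvents, restartCh, remediationCfg)
//	r.SetChannel(pexCh)
//	if err := r.Start(ctx); err != nil {
//		// handle startup failure
//	}
//	// ... run ...
//	cancel() // stops processPexCh and processPeerUpdates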