github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/access/ping/engine.go (about)

     1  package ping
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"github.com/rs/zerolog"
     8  	"golang.org/x/sync/errgroup"
     9  
    10  	"github.com/onflow/flow-go/model/flow"
    11  	"github.com/onflow/flow-go/model/flow/filter"
    12  	"github.com/onflow/flow-go/module"
    13  	"github.com/onflow/flow-go/module/component"
    14  	"github.com/onflow/flow-go/module/irrecoverable"
    15  	"github.com/onflow/flow-go/network"
    16  	"github.com/onflow/flow-go/network/p2p"
    17  )
    18  
    19  const (
    20  	// PingTimeout is maximum time to wait for a ping reply from a remote node
    21  	PingTimeout = time.Second * 4
    22  
    23  	// PingInterval is the interval between pings to remote nodes
    24  	PingInterval = time.Minute
    25  
    26  	// MaxConcurrentPings is the maximum number of ping requests that can be sent concurrently
    27  	MaxConcurrentPings = 100
    28  
    29  	// MaxJitter is the maximum time to pause between nodes during ping
    30  	MaxJitter = 5 * time.Second
    31  )
    32  
// Engine pings the nodes returned by the identity provider (excluding this node) once per
// PingInterval and reports the outcome of every ping through the ping metrics.
type Engine struct {
	// Component provides the engine's lifecycle; New builds it with pingLoop as its only worker.
	component.Component

	log          zerolog.Logger
	// idProvider supplies the identities that are pinged.
	idProvider   module.IdentityProvider
	// idTranslator converts a flow node ID into the peer ID handed to the ping service.
	idTranslator p2p.IDTranslator
	// me identifies the local node so that it can be excluded from the ping targets.
	me           module.Local
	// metrics receives the reachability and node info results of every ping.
	metrics      module.PingMetrics

	// pingService performs the actual ping of a remote peer.
	pingService network.PingService
	nodeInfo    map[flow.Identifier]string // additional details about a node such as operator name
}
    45  
    46  func New(
    47  	log zerolog.Logger,
    48  	idProvider module.IdentityProvider,
    49  	idTranslator p2p.IDTranslator,
    50  	me module.Local,
    51  	metrics module.PingMetrics,
    52  	nodeInfoFile string,
    53  	pingService network.PingService,
    54  ) (*Engine, error) {
    55  	eng := &Engine{
    56  		log:          log.With().Str("engine", "ping").Logger(),
    57  		idProvider:   idProvider,
    58  		idTranslator: idTranslator,
    59  		me:           me,
    60  		metrics:      metrics,
    61  		pingService:  pingService,
    62  	}
    63  	eng.nodeInfo = eng.loadNodeInfo(nodeInfoFile)
    64  
    65  	eng.Component = component.NewComponentManagerBuilder().
    66  		AddWorker(eng.pingLoop).
    67  		Build()
    68  
    69  	return eng, nil
    70  }
    71  
    72  func (e *Engine) loadNodeInfo(nodeInfoFile string) map[flow.Identifier]string {
    73  	if nodeInfoFile == "" {
    74  		// initialize nodeInfo with an empty map
    75  		// the node info file is not mandatory and should not stop the Ping engine from running
    76  		e.log.Trace().Msg("no node info file specified")
    77  		return make(map[flow.Identifier]string)
    78  	}
    79  
    80  	nodeInfo, err := readExtraNodeInfoJSON(nodeInfoFile)
    81  	if err != nil {
    82  		e.log.Error().Err(err).
    83  			Str("node_info_file", nodeInfoFile).
    84  			Msg("failed to read node info file")
    85  		return make(map[flow.Identifier]string)
    86  	}
    87  
    88  	e.log.Debug().
    89  		Str("node_info_file", nodeInfoFile).
    90  		Msg("using node info file")
    91  	return nodeInfo
    92  }
    93  
    94  func (e *Engine) pingLoop(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) {
    95  	ticker := time.NewTicker(PingInterval)
    96  	defer ticker.Stop()
    97  
    98  	for {
    99  		select {
   100  		case <-ctx.Done():
   101  			return
   102  		case <-ticker.C:
   103  			e.pingAllNodes(ctx)
   104  		}
   105  	}
   106  }
   107  
   108  func (e *Engine) pingAllNodes(ctx context.Context) {
   109  	start := time.Now()
   110  	e.log.Debug().Msg("pinging all nodes")
   111  
   112  	g := new(errgroup.Group)
   113  
   114  	// restrict the number of concurrently running ping requests.
   115  	g.SetLimit(MaxConcurrentPings)
   116  
   117  	peers := e.idProvider.Identities(filter.Not(filter.HasNodeID[flow.Identity](e.me.NodeID())))
   118  	for i, peer := range peers {
   119  		peer := peer
   120  		delay := makeJitter(i)
   121  
   122  		g.Go(func() error {
   123  			select {
   124  			case <-ctx.Done():
   125  				return nil
   126  			case <-time.After(delay):
   127  			}
   128  
   129  			e.pingNode(ctx, peer)
   130  			return nil
   131  		})
   132  	}
   133  
   134  	_ = g.Wait()
   135  
   136  	e.log.Debug().
   137  		Dur("duration", time.Since(start)).
   138  		Int("node_count", len(peers)).
   139  		Msg("finished pinging all nodes")
   140  }
   141  
   142  // pingNode pings the given peer and updates the metrics with the result and the additional node information
   143  func (e *Engine) pingNode(ctx context.Context, peer *flow.Identity) {
   144  	pid, err := e.idTranslator.GetPeerID(peer.ID())
   145  
   146  	if err != nil {
   147  		e.log.Error().Err(err).Str("peer", peer.String()).Msg("failed to get peer ID")
   148  		return
   149  	}
   150  
   151  	ctx, cancel := context.WithTimeout(ctx, PingTimeout)
   152  	defer cancel()
   153  
   154  	// ping the node
   155  	resp, rtt, pingErr := e.pingService.Ping(ctx, pid) // ping will timeout in PingTimeout seconds
   156  	if pingErr != nil {
   157  		e.log.Debug().Err(pingErr).Str("target", peer.ID().String()).Msg("failed to ping")
   158  		// report the rtt duration as negative to make it easier to distinguish between pingable and non-pingable nodes
   159  		rtt = -1
   160  	}
   161  
   162  	// get the additional info about the node
   163  	info := e.nodeInfo[peer.ID()]
   164  
   165  	// update metric
   166  	e.metrics.NodeReachable(peer, info, rtt)
   167  
   168  	// if ping succeeded then update the node info metric
   169  	if pingErr == nil {
   170  		e.metrics.NodeInfo(peer, info, resp.Version, resp.BlockHeight, resp.HotstuffView)
   171  	}
   172  }
   173  
   174  // makeJitter returns a jitter between 0 and MaxJitter
   175  func makeJitter(offset int) time.Duration {
   176  	jitter := float64(MaxJitter) * float64(offset%MaxConcurrentPings) / float64(MaxConcurrentPings)
   177  	return time.Duration(jitter)
   178  }