github.com/onflow/flow-go@v0.35.7-crescendo-preview.23-atree-inlining/engine/access/ping/engine.go (about) 1 package ping 2 3 import ( 4 "context" 5 "time" 6 7 "github.com/rs/zerolog" 8 "golang.org/x/sync/errgroup" 9 10 "github.com/onflow/flow-go/model/flow" 11 "github.com/onflow/flow-go/model/flow/filter" 12 "github.com/onflow/flow-go/module" 13 "github.com/onflow/flow-go/module/component" 14 "github.com/onflow/flow-go/module/irrecoverable" 15 "github.com/onflow/flow-go/network" 16 "github.com/onflow/flow-go/network/p2p" 17 ) 18 19 const ( 20 // PingTimeout is maximum time to wait for a ping reply from a remote node 21 PingTimeout = time.Second * 4 22 23 // PingInterval is the interval between pings to remote nodes 24 PingInterval = time.Minute 25 26 // MaxConcurrentPings is the maximum number of ping requests that can be sent concurrently 27 MaxConcurrentPings = 100 28 29 // MaxJitter is the maximum time to pause between nodes during ping 30 MaxJitter = 5 * time.Second 31 ) 32 33 type Engine struct { 34 component.Component 35 36 log zerolog.Logger 37 idProvider module.IdentityProvider 38 idTranslator p2p.IDTranslator 39 me module.Local 40 metrics module.PingMetrics 41 42 pingService network.PingService 43 nodeInfo map[flow.Identifier]string // additional details about a node such as operator name 44 } 45 46 func New( 47 log zerolog.Logger, 48 idProvider module.IdentityProvider, 49 idTranslator p2p.IDTranslator, 50 me module.Local, 51 metrics module.PingMetrics, 52 nodeInfoFile string, 53 pingService network.PingService, 54 ) (*Engine, error) { 55 eng := &Engine{ 56 log: log.With().Str("engine", "ping").Logger(), 57 idProvider: idProvider, 58 idTranslator: idTranslator, 59 me: me, 60 metrics: metrics, 61 pingService: pingService, 62 } 63 eng.nodeInfo = eng.loadNodeInfo(nodeInfoFile) 64 65 eng.Component = component.NewComponentManagerBuilder(). 66 AddWorker(eng.pingLoop). 67 Build() 68 69 return eng, nil 70 } 71 72 func (e *Engine) loadNodeInfo(nodeInfoFile string) map[flow.Identifier]string { 73 if nodeInfoFile == "" { 74 // initialize nodeInfo with an empty map 75 // the node info file is not mandatory and should not stop the Ping engine from running 76 e.log.Trace().Msg("no node info file specified") 77 return make(map[flow.Identifier]string) 78 } 79 80 nodeInfo, err := readExtraNodeInfoJSON(nodeInfoFile) 81 if err != nil { 82 e.log.Error().Err(err). 83 Str("node_info_file", nodeInfoFile). 84 Msg("failed to read node info file") 85 return make(map[flow.Identifier]string) 86 } 87 88 e.log.Debug(). 89 Str("node_info_file", nodeInfoFile). 90 Msg("using node info file") 91 return nodeInfo 92 } 93 94 func (e *Engine) pingLoop(ctx irrecoverable.SignalerContext, ready component.ReadyFunc) { 95 ticker := time.NewTicker(PingInterval) 96 defer ticker.Stop() 97 98 for { 99 select { 100 case <-ctx.Done(): 101 return 102 case <-ticker.C: 103 e.pingAllNodes(ctx) 104 } 105 } 106 } 107 108 func (e *Engine) pingAllNodes(ctx context.Context) { 109 start := time.Now() 110 e.log.Debug().Msg("pinging all nodes") 111 112 g := new(errgroup.Group) 113 114 // restrict the number of concurrently running ping requests. 115 g.SetLimit(MaxConcurrentPings) 116 117 peers := e.idProvider.Identities(filter.Not(filter.HasNodeID[flow.Identity](e.me.NodeID()))) 118 for i, peer := range peers { 119 peer := peer 120 delay := makeJitter(i) 121 122 g.Go(func() error { 123 select { 124 case <-ctx.Done(): 125 return nil 126 case <-time.After(delay): 127 } 128 129 e.pingNode(ctx, peer) 130 return nil 131 }) 132 } 133 134 _ = g.Wait() 135 136 e.log.Debug(). 137 Dur("duration", time.Since(start)). 138 Int("node_count", len(peers)). 139 Msg("finished pinging all nodes") 140 } 141 142 // pingNode pings the given peer and updates the metrics with the result and the additional node information 143 func (e *Engine) pingNode(ctx context.Context, peer *flow.Identity) { 144 pid, err := e.idTranslator.GetPeerID(peer.ID()) 145 146 if err != nil { 147 e.log.Error().Err(err).Str("peer", peer.String()).Msg("failed to get peer ID") 148 return 149 } 150 151 ctx, cancel := context.WithTimeout(ctx, PingTimeout) 152 defer cancel() 153 154 // ping the node 155 resp, rtt, pingErr := e.pingService.Ping(ctx, pid) // ping will timeout in PingTimeout seconds 156 if pingErr != nil { 157 e.log.Debug().Err(pingErr).Str("target", peer.ID().String()).Msg("failed to ping") 158 // report the rtt duration as negative to make it easier to distinguish between pingable and non-pingable nodes 159 rtt = -1 160 } 161 162 // get the additional info about the node 163 info := e.nodeInfo[peer.ID()] 164 165 // update metric 166 e.metrics.NodeReachable(peer, info, rtt) 167 168 // if ping succeeded then update the node info metric 169 if pingErr == nil { 170 e.metrics.NodeInfo(peer, info, resp.Version, resp.BlockHeight, resp.HotstuffView) 171 } 172 } 173 174 // makeJitter returns a jitter between 0 and MaxJitter 175 func makeJitter(offset int) time.Duration { 176 jitter := float64(MaxJitter) * float64(offset%MaxConcurrentPings) / float64(MaxConcurrentPings) 177 return time.Duration(jitter) 178 }