github.com/elfadel/cilium@v1.6.12/pkg/health/server/prober.go (about) 1 // Copyright 2017-2019 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package server 16 17 import ( 18 "net" 19 "strconv" 20 "strings" 21 "time" 22 23 "github.com/cilium/cilium/api/v1/health/models" 24 ciliumModels "github.com/cilium/cilium/api/v1/models" 25 "github.com/cilium/cilium/pkg/health/defaults" 26 "github.com/cilium/cilium/pkg/health/probe" 27 "github.com/cilium/cilium/pkg/lock" 28 "github.com/cilium/cilium/pkg/logging/logfields" 29 30 "github.com/servak/go-fastping" 31 "github.com/sirupsen/logrus" 32 ) 33 34 // healthReport is a snapshot of the health of the cluster. 35 type healthReport struct { 36 startTime time.Time 37 nodes []*models.NodeStatus 38 } 39 40 type prober struct { 41 *fastping.Pinger 42 server *Server 43 44 // 'stop' is closed upon a call to prober.Stop(). When the stopping is 45 // finished, then prober.Done() will be notified. 46 stop chan bool 47 proberExited chan bool 48 done chan bool 49 50 // The lock protects multiple requests attempting to update the status 51 // at the same time - ie, serialize updates between the periodic prober 52 // and probes initiated via "GET /status/probe". It is also used to 53 // co-ordinate updates of the ICMP responses and the HTTP responses. 54 lock.RWMutex 55 56 // start is the start time for the current probe cycle. 57 start time.Time 58 results map[ipString]*models.PathStatus 59 nodes nodeMap 60 } 61 62 // copyResultRLocked makes a copy of the path status for the specified IP. 63 func (p *prober) copyResultRLocked(ip string) *models.PathStatus { 64 status := p.results[ipString(ip)] 65 if status == nil { 66 return nil 67 } 68 69 result := &models.PathStatus{ 70 IP: ip, 71 } 72 paths := map[**models.ConnectivityStatus]*models.ConnectivityStatus{ 73 &result.Icmp: status.Icmp, 74 &result.HTTP: status.HTTP, 75 } 76 for res, value := range paths { 77 if value != nil { 78 *res = &*value 79 } 80 } 81 return result 82 } 83 84 // getResults gathers a copy of all of the results for nodes currently in the 85 // cluster. 86 func (p *prober) getResults() *healthReport { 87 p.RLock() 88 defer p.RUnlock() 89 90 // De-duplicate IPs in 'p.nodes' by building a map based on node.Name. 91 resultMap := map[string]*models.NodeStatus{} 92 for _, node := range p.nodes { 93 if resultMap[node.Name] != nil { 94 continue 95 } 96 primaryIP := node.PrimaryIP() 97 healthIP := node.HealthIP() 98 status := &models.NodeStatus{ 99 Name: node.Name, 100 Host: &models.HostStatus{ 101 PrimaryAddress: p.copyResultRLocked(primaryIP), 102 }, 103 } 104 if healthIP != "" { 105 status.Endpoint = p.copyResultRLocked(healthIP) 106 } 107 secondaryResults := []*models.PathStatus{} 108 for _, addr := range node.SecondaryAddresses { 109 if addr.Enabled { 110 secondaryStatus := p.copyResultRLocked(addr.IP) 111 secondaryResults = append(secondaryResults, secondaryStatus) 112 } 113 } 114 status.Host.SecondaryAddresses = secondaryResults 115 resultMap[node.Name] = status 116 } 117 118 result := &healthReport{startTime: p.start} 119 for _, res := range resultMap { 120 result.nodes = append(result.nodes, res) 121 } 122 return result 123 } 124 125 func isIPv4(ip string) bool { 126 netIP := net.ParseIP(ip) 127 return netIP != nil && !strings.Contains(ip, ":") 128 } 129 130 func skipAddress(elem *ciliumModels.NodeAddressingElement) bool { 131 return elem == nil || !elem.Enabled || elem.IP == "<nil>" 132 } 133 134 // resolveIP attempts to sanitize 'node' and 'ip', and if successful, returns 135 // the name of the node and the IP address specified in the addressing element. 136 // If validation fails or this IP should not be pinged, 'ip' is returned as nil. 137 func resolveIP(n *healthNode, addr *ciliumModels.NodeAddressingElement, proto string, primary bool) (string, *net.IPAddr) { 138 node := n.NodeElement 139 network := "ip6:icmp" 140 if isIPv4(addr.IP) { 141 network = "ip4:icmp" 142 } 143 scopedLog := log.WithFields(logrus.Fields{ 144 logfields.NodeName: node.Name, 145 logfields.IPAddr: addr.IP, 146 "primary": primary, 147 }) 148 149 if skipAddress(addr) { 150 scopedLog.Debug("Skipping probe for address") 151 return "", nil 152 } 153 154 ra, err := net.ResolveIPAddr(network, addr.IP) 155 if err != nil { 156 scopedLog.Debug("Unable to resolve address") 157 return "", nil 158 } 159 160 scopedLog.WithField("protocol", proto).Debug("Probing for connectivity to node") 161 return node.Name, ra 162 } 163 164 // RemoveIP removes all traces of the specified IP from the prober, including 165 // clearing all cached results, mapping from this IP to a node, and entries in 166 // the ICMP and TCP pingers. 167 func (p *prober) RemoveIP(ip string) { 168 nodeIP := ipString(ip) 169 delete(p.results, nodeIP) 170 p.Pinger.RemoveIP(ip) // ICMP pinger 171 delete(p.nodes, nodeIP) // TCP prober 172 } 173 174 // setNodes sets the list of nodes for the prober, and updates the pinger to 175 // start sending pings to all nodes added. 176 // 'removed' nodes will be removed from the pinger to stop sending pings to 177 // those removed nodes. 178 // setNodes will steal references to nodes referenced from 'added', so the 179 // caller should not modify them after a call to setNodes. 180 // If a node is updated, it will appear in both maps and will be removed then 181 // added (potentially with different information). 182 func (p *prober) setNodes(added nodeMap, removed nodeMap) { 183 p.Lock() 184 defer p.Unlock() 185 186 for _, n := range removed { 187 for elem := range n.Addresses() { 188 p.RemoveIP(elem.IP) 189 } 190 } 191 192 for _, n := range added { 193 for elem, primary := range n.Addresses() { 194 _, addr := resolveIP(&n, elem, "icmp", primary) 195 196 ip := ipString(elem.IP) 197 result := &models.ConnectivityStatus{} 198 if addr == nil { 199 result.Status = "Failed to resolve IP" 200 } else { 201 result.Status = "Connection timed out" 202 p.AddIPAddr(addr) 203 p.nodes[ip] = n 204 } 205 206 if p.results[ip] == nil { 207 p.results[ip] = &models.PathStatus{ 208 IP: elem.IP, 209 } 210 } 211 p.results[ip].Icmp = result 212 } 213 } 214 } 215 216 func (p *prober) httpProbe(node string, ip string, port int) *models.ConnectivityStatus { 217 result := &models.ConnectivityStatus{} 218 219 host := "http://" + net.JoinHostPort(ip, strconv.Itoa(port)) 220 scopedLog := log.WithFields(logrus.Fields{ 221 logfields.NodeName: node, 222 logfields.IPAddr: ip, 223 "host": host, 224 "path": PortToPaths[port], 225 }) 226 227 scopedLog.Debug("Greeting host") 228 start := time.Now() 229 err := probe.GetHello(host) 230 rtt := time.Since(start) 231 if err == nil { 232 scopedLog.WithField("rtt", rtt).Debug("Greeting successful") 233 result.Status = "" 234 result.Latency = rtt.Nanoseconds() 235 } else { 236 scopedLog.WithError(err).Debug("Greeting failed") 237 result.Status = err.Error() 238 } 239 240 return result 241 } 242 243 func (p *prober) getIPsByNode() map[string][]*net.IPAddr { 244 p.RLock() 245 defer p.RUnlock() 246 247 // p.nodes is mapped from all known IPs -> nodes in N:M configuration, 248 // so multiple IPs could refer to the same node. To ensure we only 249 // ping each node once, deduplicate nodes into map of nodeName -> []IP. 250 nodes := make(map[string][]*net.IPAddr) 251 for _, node := range p.nodes { 252 if nodes[node.Name] != nil { 253 // Already handled this node. 254 continue 255 } 256 nodes[node.Name] = []*net.IPAddr{} 257 for elem, primary := range node.Addresses() { 258 if _, addr := resolveIP(&node, elem, "http", primary); addr != nil { 259 nodes[node.Name] = append(nodes[node.Name], addr) 260 } 261 } 262 } 263 264 return nodes 265 } 266 267 func (p *prober) runHTTPProbe() { 268 startTime := time.Now() 269 p.Lock() 270 p.start = startTime 271 p.Unlock() 272 273 for name, ips := range p.getIPsByNode() { 274 for _, ip := range ips { 275 scopedLog := log.WithFields(logrus.Fields{ 276 logfields.NodeName: name, 277 logfields.IPAddr: ip.String(), 278 }) 279 280 status := &models.PathStatus{} 281 ports := map[int]**models.ConnectivityStatus{ 282 defaults.HTTPPathPort: &status.HTTP, 283 } 284 for port, result := range ports { 285 *result = p.httpProbe(name, ip.String(), port) 286 if status.HTTP.Status != "" { 287 scopedLog.WithFields(logrus.Fields{ 288 logfields.Port: port, 289 }).Debugf("Failed to probe: %s", status.HTTP.Status) 290 } 291 } 292 293 peer := ipString(ip.String()) 294 p.Lock() 295 if _, ok := p.results[peer]; ok { 296 p.results[peer].HTTP = status.HTTP 297 } else { 298 // While we weren't holding the lock, the 299 // pinger's OnIdle() callback fired and updated 300 // the set of nodes to remove this node. 301 scopedLog.Debug("Node disappeared before result written") 302 } 303 p.Unlock() 304 } 305 } 306 } 307 308 // Done returns a channel that is closed when RunLoop() is stopped by an error. 309 // It must be called after the RunLoop() call. 310 func (p *prober) Done() <-chan bool { 311 return p.done 312 } 313 314 // Run sends a single probes out to all of the other cilium nodes to gather 315 // connectivity status for the cluster. 316 func (p *prober) Run() error { 317 err := p.Pinger.Run() 318 p.runHTTPProbe() 319 return err 320 } 321 322 // Stop disrupts the currently running RunLoop(). This may only be called after 323 // a call to RunLoop(). 324 func (p *prober) Stop() { 325 p.Pinger.Stop() 326 close(p.stop) 327 <-p.proberExited 328 close(p.done) 329 } 330 331 // RunLoop periodically sends probes out to all of the other cilium nodes to 332 // gather connectivity status for the cluster. 333 // 334 // This is a non-blocking method so it immediately returns. If you want to 335 // stop sending packets, call Stop(). 336 func (p *prober) RunLoop() { 337 // FIXME: Spread the probes out across the probing interval 338 p.Pinger.RunLoop() 339 340 go func() { 341 tick := time.NewTicker(p.server.ProbeInterval) 342 loop: 343 for { 344 select { 345 case <-p.stop: 346 break loop 347 case <-tick.C: 348 p.runHTTPProbe() 349 continue 350 } 351 } 352 tick.Stop() 353 close(p.proberExited) 354 }() 355 } 356 357 // newPinger prepares a prober. The caller may invoke one the Run* methods of 358 // the prober to populate its 'results' map. 359 func newProber(s *Server, nodes nodeMap) *prober { 360 prober := &prober{ 361 Pinger: fastping.NewPinger(), 362 server: s, 363 done: make(chan bool), 364 proberExited: make(chan bool), 365 stop: make(chan bool), 366 results: make(map[ipString]*models.PathStatus), 367 nodes: make(nodeMap), 368 } 369 prober.MaxRTT = s.ProbeDeadline 370 371 prober.setNodes(nodes, nil) 372 prober.OnRecv = func(addr *net.IPAddr, rtt time.Duration) { 373 prober.Lock() 374 defer prober.Unlock() 375 node, exists := prober.nodes[ipString(addr.String())] 376 377 scopedLog := log.WithFields(logrus.Fields{ 378 logfields.IPAddr: addr, 379 "rtt": rtt, 380 }) 381 if !exists { 382 scopedLog.Debugf("Node disappeared, skip result") 383 return 384 } 385 386 prober.results[ipString(addr.String())].Icmp = &models.ConnectivityStatus{ 387 Latency: rtt.Nanoseconds(), 388 Status: "", 389 } 390 scopedLog.WithFields(logrus.Fields{ 391 logfields.NodeName: node.Name, 392 }).Debugf("Probe successful") 393 } 394 395 return prober 396 }