github.com/cilium/cilium@v1.16.2/pkg/health/server/prober.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package server 5 6 import ( 7 "net" 8 "strconv" 9 "strings" 10 "time" 11 12 "github.com/servak/go-fastping" 13 "github.com/sirupsen/logrus" 14 15 "github.com/cilium/cilium/api/v1/health/models" 16 ciliumModels "github.com/cilium/cilium/api/v1/models" 17 "github.com/cilium/cilium/pkg/health/probe" 18 "github.com/cilium/cilium/pkg/lock" 19 "github.com/cilium/cilium/pkg/logging" 20 "github.com/cilium/cilium/pkg/logging/logfields" 21 ) 22 23 // healthReport is a snapshot of the health of the cluster. 24 type healthReport struct { 25 startTime time.Time 26 nodes []*models.NodeStatus 27 } 28 29 type prober struct { 30 *fastping.Pinger 31 server *Server 32 33 // 'stop' is closed upon a call to prober.Stop(). When the stopping is 34 // finished, then prober.Done() will be notified. 35 stop chan bool 36 proberExited chan bool 37 38 // The lock protects multiple requests attempting to update the status 39 // at the same time - ie, serialize updates between the periodic prober 40 // and probes initiated via "GET /status/probe". It is also used to 41 // co-ordinate updates of the ICMP responses and the HTTP responses. 42 lock.RWMutex 43 44 // start is the start time for the current probe cycle. 45 start time.Time 46 results map[ipString]*models.PathStatus 47 nodes nodeMap 48 } 49 50 // copyResultRLocked makes a copy of the path status for the specified IP. 51 func (p *prober) copyResultRLocked(ip string) *models.PathStatus { 52 status := p.results[ipString(ip)] 53 if status == nil { 54 return nil 55 } 56 57 result := &models.PathStatus{ 58 IP: ip, 59 } 60 paths := map[**models.ConnectivityStatus]*models.ConnectivityStatus{ 61 &result.Icmp: status.Icmp, 62 &result.HTTP: status.HTTP, 63 } 64 for res, value := range paths { 65 if value != nil { 66 *res = value 67 } 68 } 69 return result 70 } 71 72 // getResults gathers a copy of all of the results for nodes currently in the 73 // cluster. 74 func (p *prober) getResults() *healthReport { 75 p.RLock() 76 defer p.RUnlock() 77 78 // De-duplicate IPs in 'p.nodes' by building a map based on node.Name. 79 resultMap := map[string]*models.NodeStatus{} 80 for _, node := range p.nodes { 81 if resultMap[node.Name] != nil { 82 continue 83 } 84 primaryIP := node.PrimaryIP() 85 primaryHealthIP := node.HealthIP() 86 87 secondaryAddresses := []*models.PathStatus{} 88 for _, ip := range node.SecondaryIPs() { 89 if addr := p.copyResultRLocked(ip); addr != nil { 90 secondaryAddresses = append(secondaryAddresses, addr) 91 } 92 } 93 94 status := &models.NodeStatus{ 95 Name: node.Name, 96 Host: &models.HostStatus{ 97 PrimaryAddress: p.copyResultRLocked(primaryIP), 98 SecondaryAddresses: secondaryAddresses, 99 }, 100 } 101 102 secondaryEndpointAddresses := []*models.PathStatus{} 103 for _, ip := range node.SecondaryHealthIPs() { 104 if addr := p.copyResultRLocked(ip); addr != nil { 105 secondaryEndpointAddresses = append(secondaryEndpointAddresses, addr) 106 } 107 } 108 109 if primaryHealthIP != "" { 110 primaryEndpointAddress := p.copyResultRLocked(primaryHealthIP) 111 status.Endpoint = primaryEndpointAddress 112 status.HealthEndpoint = &models.EndpointStatus{ 113 PrimaryAddress: primaryEndpointAddress, 114 SecondaryAddresses: secondaryEndpointAddresses, 115 } 116 } 117 118 resultMap[node.Name] = status 119 } 120 121 result := &healthReport{startTime: p.start} 122 for _, res := range resultMap { 123 result.nodes = append(result.nodes, res) 124 } 125 return result 126 } 127 128 func isIPv4(ip string) bool { 129 netIP := net.ParseIP(ip) 130 return netIP != nil && !strings.Contains(ip, ":") 131 } 132 133 func skipAddress(elem *ciliumModels.NodeAddressingElement) bool { 134 return elem == nil || !elem.Enabled || elem.IP == "<nil>" 135 } 136 137 // resolveIP attempts to sanitize 'node' and 'ip', and if successful, returns 138 // the name of the node and the IP address specified in the addressing element. 139 // If validation fails or this IP should not be pinged, 'ip' is returned as nil. 140 func resolveIP(n *healthNode, addr *ciliumModels.NodeAddressingElement, primary bool) (string, *net.IPAddr) { 141 node := n.NodeElement 142 network := "ip6:icmp" 143 if isIPv4(addr.IP) { 144 network = "ip4:icmp" 145 } 146 147 // Only add fields to the scoped logger if debug is enabled, to save on resources. 148 // This can be done since all logs in this function are debug-level only. 149 scopedLog := log 150 if logging.CanLogAt(scopedLog.Logger, logrus.DebugLevel) { 151 scopedLog = log.WithFields(logrus.Fields{ 152 logfields.NodeName: node.Name, 153 logfields.IPAddr: addr.IP, 154 "primary": primary, 155 }) 156 } 157 158 if skipAddress(addr) { 159 scopedLog.Debug("Skipping probe for address") 160 return "", nil 161 } 162 163 ra, err := net.ResolveIPAddr(network, addr.IP) 164 if err != nil || ra.String() == "" { 165 scopedLog.Debug("Unable to resolve address") 166 return "", nil 167 } 168 169 scopedLog.Debug("Probing for connectivity to node") 170 return node.Name, ra 171 } 172 173 // RemoveIP removes all traces of the specified IP from the prober, including 174 // clearing all cached results, mapping from this IP to a node, and entries in 175 // the ICMP and TCP pingers. 176 func (p *prober) RemoveIP(ip string) { 177 nodeIP := ipString(ip) 178 delete(p.results, nodeIP) 179 p.Pinger.RemoveIP(ip) // ICMP pinger 180 delete(p.nodes, nodeIP) // TCP prober 181 } 182 183 // setNodes sets the list of nodes for the prober, and updates the pinger to 184 // start sending pings to all nodes added. 185 // 'removed' nodes will be removed from the pinger to stop sending pings to 186 // those removed nodes. 187 // setNodes will steal references to nodes referenced from 'added', so the 188 // caller should not modify them after a call to setNodes. 189 // If a node is updated, it will appear in both maps and will be removed then 190 // added (potentially with different information). We want to do it only if relevant 191 // health-information changes to preserve previous health-checking results. 192 func (p *prober) setNodes(added nodeMap, removed nodeMap) { 193 p.Lock() 194 defer p.Unlock() 195 196 // Check what IPs will be readded 197 // so we don't remove results that we already have for them. 198 readdedIPs := map[string]struct{}{} 199 for _, n := range added { 200 for elem, primary := range n.Addresses() { 201 _, addr := resolveIP(&n, elem, primary) 202 if addr == nil { 203 continue 204 } 205 readdedIPs[elem.IP] = struct{}{} 206 } 207 } 208 209 for _, n := range removed { 210 for elem := range n.Addresses() { 211 if _, ok := readdedIPs[elem.IP]; !ok { 212 p.RemoveIP(elem.IP) 213 } 214 } 215 } 216 217 for _, n := range added { 218 for elem, primary := range n.Addresses() { 219 _, addr := resolveIP(&n, elem, primary) 220 if addr == nil { 221 continue 222 } 223 224 ip := ipString(elem.IP) 225 p.AddIPAddr(addr) 226 p.nodes[ip] = n 227 228 if p.results[ip] == nil { 229 p.results[ip] = &models.PathStatus{ 230 IP: elem.IP, 231 } 232 } 233 } 234 } 235 } 236 237 func (p *prober) updateIcmpStatus() { 238 p.Lock() 239 defer p.Unlock() 240 241 for _, status := range p.results { 242 if status.Icmp == nil { 243 status.Icmp = &models.ConnectivityStatus{} 244 status.Icmp.Status = "Connection timed out" 245 } 246 } 247 } 248 249 func (p *prober) clearIcmpStatus() { 250 p.Lock() 251 defer p.Unlock() 252 253 for _, status := range p.results { 254 status.Icmp = nil 255 } 256 } 257 258 const httpPathDescription = "Via L3" 259 260 func (p *prober) httpProbe(node string, ip string) *models.ConnectivityStatus { 261 result := &models.ConnectivityStatus{} 262 263 host := "http://" + net.JoinHostPort(ip, strconv.Itoa(p.server.Config.HTTPPathPort)) 264 265 // Only add fields to the scoped logger if debug is enabled, to save on resources. 266 // This can be done since all logs in this function are debug-level only. 267 scopedLog := log 268 debugLogsEnabled := logging.CanLogAt(scopedLog.Logger, logrus.DebugLevel) 269 if debugLogsEnabled { 270 scopedLog = log.WithFields(logrus.Fields{ 271 logfields.NodeName: node, 272 logfields.IPAddr: ip, 273 "host": host, 274 "path": httpPathDescription, 275 }) 276 } 277 278 scopedLog.Debug("Greeting host") 279 start := time.Now() 280 err := probe.GetHello(host) 281 rtt := time.Since(start) 282 if err == nil { 283 if debugLogsEnabled { 284 scopedLog.WithField("rtt", rtt).Debug("Greeting successful") 285 } 286 287 result.Status = "" 288 result.Latency = rtt.Nanoseconds() 289 } else { 290 if debugLogsEnabled { 291 scopedLog.WithError(err).Debug("Greeting failed") 292 } 293 294 result.Status = err.Error() 295 } 296 297 return result 298 } 299 300 func (p *prober) getIPsByNode() map[string][]*net.IPAddr { 301 p.RLock() 302 defer p.RUnlock() 303 304 // p.nodes is mapped from all known IPs -> nodes in N:M configuration, 305 // so multiple IPs could refer to the same node. To ensure we only 306 // ping each node once, deduplicate nodes into map of nodeName -> []IP. 307 nodes := make(map[string][]*net.IPAddr) 308 for _, node := range p.nodes { 309 if nodes[node.Name] != nil { 310 // Already handled this node. 311 continue 312 } 313 nodes[node.Name] = []*net.IPAddr{} 314 for elem, primary := range node.Addresses() { 315 if _, addr := resolveIP(&node, elem, primary); addr != nil { 316 nodes[node.Name] = append(nodes[node.Name], addr) 317 } 318 } 319 } 320 321 return nodes 322 } 323 324 func (p *prober) runHTTPProbe() { 325 startTime := time.Now() 326 p.Lock() 327 p.start = startTime 328 p.Unlock() 329 330 // Only add fields to the scoped logger if debug is enabled, to save on resources. 331 // This can be done since all logs in this function are debug-level only. 332 debugLogsEnabled := logging.CanLogAt(log.Logger, logrus.DebugLevel) 333 334 for name, ips := range p.getIPsByNode() { 335 scopedLog := log 336 337 if debugLogsEnabled { 338 scopedLog = log.WithField(logfields.NodeName, name) 339 } 340 341 for _, ip := range ips { 342 if debugLogsEnabled { 343 scopedLog = scopedLog.WithFields(logrus.Fields{ 344 logfields.IPAddr: ip.String(), 345 logfields.Port: p.server.Config.HTTPPathPort, 346 }) 347 } 348 349 resp := p.httpProbe(name, ip.String()) 350 if resp.Status != "" { 351 scopedLog.Debugf("Failed to probe: %s", resp.Status) 352 } 353 354 peer := ipString(ip.String()) 355 p.Lock() 356 if _, ok := p.results[peer]; ok { 357 p.results[peer].HTTP = resp 358 } else { 359 // While we weren't holding the lock, the 360 // pinger's OnIdle() callback fired and updated 361 // the set of nodes to remove this node. 362 scopedLog.Debug("Node disappeared before result written") 363 } 364 p.Unlock() 365 } 366 } 367 } 368 369 // Run sends a single probes out to all of the other cilium nodes to gather 370 // connectivity status for the cluster. 371 func (p *prober) Run() error { 372 err := p.Pinger.Run() 373 p.runHTTPProbe() 374 p.updateIcmpStatus() 375 return err 376 } 377 378 // Stop disrupts the currently running RunLoop(). This may only be called after 379 // a call to RunLoop(). 380 func (p *prober) Stop() { 381 p.Pinger.Stop() 382 close(p.stop) 383 <-p.proberExited 384 } 385 386 // RunLoop periodically sends probes out to all of the other cilium nodes to 387 // gather connectivity status for the cluster. 388 // 389 // This is a non-blocking method so it immediately returns. If you want to 390 // stop sending packets, call Stop(). 391 func (p *prober) RunLoop() { 392 // FIXME: Spread the probes out across the probing interval 393 p.Pinger.RunLoop() 394 395 go func() { 396 tick := time.NewTicker(p.server.ProbeInterval) 397 p.runHTTPProbe() 398 loop: 399 for { 400 select { 401 case <-p.stop: 402 break loop 403 case <-tick.C: 404 p.runHTTPProbe() 405 continue 406 } 407 } 408 tick.Stop() 409 close(p.proberExited) 410 }() 411 } 412 413 // newPinger prepares a prober. The caller may invoke one the Run* methods of 414 // the prober to populate its 'results' map. 415 func newProber(s *Server, nodes nodeMap) *prober { 416 prober := &prober{ 417 Pinger: fastping.NewPinger(), 418 server: s, 419 proberExited: make(chan bool), 420 stop: make(chan bool), 421 results: make(map[ipString]*models.PathStatus), 422 nodes: make(nodeMap), 423 } 424 prober.MaxRTT = s.ProbeDeadline 425 // FIXME: Doubling the default payload size to 16 is a workaround for GH-18177 426 prober.Size = 2 * fastping.TimeSliceLength 427 prober.setNodes(nodes, nil) 428 prober.OnRecv = func(addr *net.IPAddr, rtt time.Duration) { 429 prober.Lock() 430 defer prober.Unlock() 431 node, exists := prober.nodes[ipString(addr.String())] 432 433 // Only add fields to the scoped logger if debug is enabled, to save on resources. 434 // This can be done since all logs in this function are debug-level only. 435 scopedLog := log 436 if logging.CanLogAt(log.Logger, logrus.DebugLevel) { 437 scopedLog = log.WithFields(logrus.Fields{ 438 logfields.IPAddr: addr, 439 logfields.NodeName: node.Name, 440 "rtt": rtt, 441 }) 442 } 443 444 if !exists { 445 scopedLog.Debug("Node disappeared, skip result") 446 return 447 } 448 449 prober.results[ipString(addr.String())].Icmp = &models.ConnectivityStatus{ 450 Latency: rtt.Nanoseconds(), 451 Status: "", 452 } 453 scopedLog.Debug("probe successful") 454 } 455 456 return prober 457 }