// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium
//
// Source: github.com/cilium/cilium@v1.16.2/pkg/health/server/server.go

package server

import (
	"fmt"
	"path"
	"time"

	"github.com/cilium/cilium/api/v1/client/daemon"
	healthModels "github.com/cilium/cilium/api/v1/health/models"
	healthApi "github.com/cilium/cilium/api/v1/health/server"
	"github.com/cilium/cilium/api/v1/health/server/restapi"
	"github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/pkg/api"
	ciliumPkg "github.com/cilium/cilium/pkg/client"
	ciliumDefaults "github.com/cilium/cilium/pkg/defaults"
	healthClientPkg "github.com/cilium/cilium/pkg/health/client"
	"github.com/cilium/cilium/pkg/health/defaults"
	"github.com/cilium/cilium/pkg/health/probe/responder"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/metrics"
	"github.com/cilium/cilium/pkg/node"
	"github.com/cilium/cilium/pkg/option"
)

var (
	// log is the package logger, tagged with the "health-server" subsystem.
	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "health-server")
)

// Config stores the configuration data for a cilium-health server.
type Config struct {
	// Debug is not read anywhere in this file.
	// NOTE(review): presumably toggles debug behavior elsewhere — confirm.
	Debug bool
	// CiliumURI is handed to the cilium client constructor in NewServer and,
	// when non-empty, attached as the "URI" field to node-fetch log entries.
	CiliumURI string
	// ProbeInterval is assigned to the prober's MaxRTT in runActiveServices.
	ProbeInterval time.Duration
	// ProbeDeadline is not read anywhere in this file.
	// NOTE(review): presumably consumed by the prober — confirm.
	ProbeDeadline time.Duration
	// HTTPPathPort is the port passed to responder.NewServers in NewServer.
	HTTPPathPort int
	// HealthAPISpec supplies the API document and the denied APIs used by
	// newServer to build the health REST API.
	HealthAPISpec *healthApi.Spec
}

// ipString is an IP address used as a more descriptive type name in maps.
type ipString string

// nodeMap maps IP addresses to healthNode objects for convenient access to
// node information.
type nodeMap map[ipString]healthNode

// Server is the cilium-health daemon that is in charge of performing health
// and connectivity checks periodically, and serving the cilium-health API.
53 type Server struct { 54 healthApi.Server // Server to provide cilium-health API 55 *ciliumPkg.Client // Client to "GET /healthz" on cilium daemon 56 Config 57 // clientID is the client ID returned by the cilium-agent that should 58 // be used when making frequent requests. The server will return 59 // a diff of the nodes added and removed based on this clientID. 60 clientID int64 61 62 httpPathServer *responder.Server // HTTP server for external pings 63 startTime time.Time 64 65 // The lock protects against read and write access to the IP->Node map, 66 // the list of statuses as most recently seen, and the last time a 67 // probe was conducted. 68 lock.RWMutex 69 connectivity *healthReport 70 localStatus *healthModels.SelfStatus 71 } 72 73 // DumpUptime returns the time that this server has been running. 74 func (s *Server) DumpUptime() string { 75 return time.Since(s.startTime).String() 76 } 77 78 // getNodes fetches the nodes added and removed from the last time the server 79 // made a request to the daemon. 
80 func (s *Server) getNodes() (nodeMap, nodeMap, error) { 81 scopedLog := log 82 if s.CiliumURI != "" { 83 scopedLog = log.WithField("URI", s.CiliumURI) 84 } 85 scopedLog.Debug("Sending request for /cluster/nodes ...") 86 87 clusterNodesParam := daemon.NewGetClusterNodesParams() 88 s.RWMutex.RLock() 89 cID := s.clientID 90 s.RWMutex.RUnlock() 91 clusterNodesParam.SetClientID(&cID) 92 resp, err := s.Daemon.GetClusterNodes(clusterNodesParam) 93 if err != nil { 94 return nil, nil, fmt.Errorf("unable to get nodes' cluster: %w", err) 95 } 96 log.Debug("Got cilium /cluster/nodes") 97 98 if resp == nil || resp.Payload == nil { 99 return nil, nil, fmt.Errorf("received nil health response") 100 } 101 102 s.RWMutex.Lock() 103 s.clientID = resp.Payload.ClientID 104 105 if resp.Payload.Self != "" { 106 s.localStatus = &healthModels.SelfStatus{ 107 Name: resp.Payload.Self, 108 } 109 } 110 s.RWMutex.Unlock() 111 112 nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded) 113 nodesRemoved := nodeElementSliceToNodeMap(resp.Payload.NodesRemoved) 114 115 return nodesAdded, nodesRemoved, nil 116 } 117 118 // getAllNodes fetches all nodes the daemon is aware of. 119 func (s *Server) getAllNodes() (nodeMap, error) { 120 scopedLog := log 121 if s.CiliumURI != "" { 122 scopedLog = log.WithField("URI", s.CiliumURI) 123 } 124 scopedLog.Debug("Sending request for /cluster/nodes ...") 125 126 resp, err := s.Daemon.GetClusterNodes(nil) 127 if err != nil { 128 return nil, fmt.Errorf("unable to get nodes' cluster: %w", err) 129 } 130 log.Debug("Got cilium /cluster/nodes") 131 132 if resp == nil || resp.Payload == nil { 133 return nil, fmt.Errorf("received nil health response") 134 } 135 136 nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded) 137 138 return nodesAdded, nil 139 } 140 141 // nodeElementSliceToNodeMap returns a slice of models.NodeElement into a 142 // nodeMap. 
143 func nodeElementSliceToNodeMap(nodeElements []*models.NodeElement) nodeMap { 144 nodes := make(nodeMap) 145 for _, n := range nodeElements { 146 if n.PrimaryAddress != nil { 147 if n.PrimaryAddress.IPV4 != nil { 148 nodes[ipString(n.PrimaryAddress.IPV4.IP)] = NewHealthNode(n) 149 } 150 if n.PrimaryAddress.IPV6 != nil { 151 nodes[ipString(n.PrimaryAddress.IPV6.IP)] = NewHealthNode(n) 152 } 153 } 154 for _, addr := range n.SecondaryAddresses { 155 nodes[ipString(addr.IP)] = NewHealthNode(n) 156 } 157 if n.HealthEndpointAddress != nil { 158 if n.HealthEndpointAddress.IPV4 != nil { 159 nodes[ipString(n.HealthEndpointAddress.IPV4.IP)] = NewHealthNode(n) 160 } 161 if n.HealthEndpointAddress.IPV6 != nil { 162 nodes[ipString(n.HealthEndpointAddress.IPV6.IP)] = NewHealthNode(n) 163 } 164 } 165 } 166 return nodes 167 } 168 169 // updateCluster makes the specified health report visible to the API. 170 // 171 // It only updates the server's API-visible health report if the provided 172 // report started after the current report. 
173 func (s *Server) updateCluster(report *healthReport) { 174 s.Lock() 175 defer s.Unlock() 176 177 if s.connectivity.startTime.Before(report.startTime) { 178 s.connectivity = report 179 s.collectNodeConnectivityMetrics() 180 } 181 } 182 183 func (s *Server) collectNodeConnectivityMetrics() { 184 if s.localStatus == nil || s.connectivity == nil { 185 return 186 } 187 localClusterName, localNodeName := getClusterNodeName(s.localStatus.Name) 188 189 for _, n := range s.connectivity.nodes { 190 if n == nil || n.Host == nil || n.Host.PrimaryAddress == nil || n.HealthEndpoint == nil || n.HealthEndpoint.PrimaryAddress == nil { 191 continue 192 } 193 194 targetClusterName, targetNodeName := getClusterNodeName(n.Name) 195 nodePathPrimaryAddress := healthClientPkg.GetHostPrimaryAddress(n) 196 nodePathSecondaryAddress := healthClientPkg.GetHostSecondaryAddresses(n) 197 198 endpointPathStatus := n.HealthEndpoint 199 isEndpointReachable := healthClientPkg.SummarizePathConnectivityStatusType(healthClientPkg.GetAllEndpointAddresses(n)) == healthClientPkg.ConnStatusReachable 200 isNodeReachable := healthClientPkg.SummarizePathConnectivityStatusType(healthClientPkg.GetAllHostAddresses(n)) == healthClientPkg.ConnStatusReachable 201 202 location := metrics.LabelLocationLocalNode 203 if targetClusterName != localClusterName { 204 location = metrics.LabelLocationRemoteInterCluster 205 } else if targetNodeName != localNodeName { 206 location = metrics.LabelLocationRemoteIntraCluster 207 } 208 209 // Aggregated status for endpoint connectivity 210 metrics.NodeConnectivityStatus.WithLabelValues( 211 localClusterName, localNodeName, targetClusterName, targetNodeName, location, metrics.LabelPeerEndpoint). 212 Set(metrics.BoolToFloat64(isEndpointReachable)) 213 214 // Aggregated status for node connectivity 215 metrics.NodeConnectivityStatus.WithLabelValues( 216 localClusterName, localNodeName, targetClusterName, targetNodeName, location, metrics.LabelPeerNode). 
217 Set(metrics.BoolToFloat64(isNodeReachable)) 218 219 // HTTP endpoint primary 220 collectConnectivityMetric(endpointPathStatus.PrimaryAddress.HTTP, localClusterName, localNodeName, 221 targetClusterName, targetNodeName, endpointPathStatus.PrimaryAddress.IP, 222 location, metrics.LabelPeerEndpoint, metrics.LabelTrafficHTTP, metrics.LabelAddressTypePrimary) 223 224 // HTTP endpoint secondary 225 for _, secondary := range endpointPathStatus.SecondaryAddresses { 226 collectConnectivityMetric(secondary.HTTP, localClusterName, localNodeName, 227 targetClusterName, targetNodeName, secondary.IP, 228 location, metrics.LabelPeerEndpoint, metrics.LabelTrafficHTTP, metrics.LabelAddressTypeSecondary) 229 } 230 231 // HTTP node primary 232 collectConnectivityMetric(nodePathPrimaryAddress.HTTP, localClusterName, localNodeName, 233 targetClusterName, targetNodeName, nodePathPrimaryAddress.IP, 234 location, metrics.LabelPeerNode, metrics.LabelTrafficHTTP, metrics.LabelAddressTypePrimary) 235 236 // HTTP node secondary 237 for _, secondary := range nodePathSecondaryAddress { 238 collectConnectivityMetric(secondary.HTTP, localClusterName, localNodeName, 239 targetClusterName, targetNodeName, secondary.IP, 240 location, metrics.LabelPeerNode, metrics.LabelTrafficHTTP, metrics.LabelAddressTypeSecondary) 241 } 242 243 // ICMP endpoint primary 244 collectConnectivityMetric(endpointPathStatus.PrimaryAddress.Icmp, localClusterName, localNodeName, 245 targetClusterName, targetNodeName, endpointPathStatus.PrimaryAddress.IP, 246 location, metrics.LabelPeerEndpoint, metrics.LabelTrafficICMP, metrics.LabelAddressTypePrimary) 247 248 // ICMP endpoint secondary 249 for _, secondary := range endpointPathStatus.SecondaryAddresses { 250 collectConnectivityMetric(secondary.Icmp, localClusterName, localNodeName, 251 targetClusterName, targetNodeName, secondary.IP, 252 location, metrics.LabelPeerEndpoint, metrics.LabelTrafficICMP, metrics.LabelAddressTypeSecondary) 253 } 254 255 // ICMP node primary 
256 collectConnectivityMetric(nodePathPrimaryAddress.Icmp, localClusterName, localNodeName, 257 targetClusterName, targetNodeName, nodePathPrimaryAddress.IP, 258 location, metrics.LabelPeerNode, metrics.LabelTrafficICMP, metrics.LabelAddressTypePrimary) 259 260 // ICMP node secondary 261 for _, secondary := range nodePathSecondaryAddress { 262 collectConnectivityMetric(secondary.Icmp, localClusterName, localNodeName, 263 targetClusterName, targetNodeName, secondary.IP, 264 location, metrics.LabelPeerNode, metrics.LabelTrafficICMP, metrics.LabelAddressTypeSecondary) 265 } 266 } 267 } 268 269 func collectConnectivityMetric(status *healthModels.ConnectivityStatus, labels ...string) { 270 var metricValue float64 = -1 271 if status != nil { 272 metricValue = float64(status.Latency) / float64(time.Second) 273 } 274 metrics.NodeConnectivityLatency.WithLabelValues(labels...).Set(metricValue) 275 } 276 277 // getClusterNodeName returns the cluster name and node name if possible. 278 func getClusterNodeName(str string) (string, string) { 279 clusterName, nodeName := path.Split(str) 280 if len(clusterName) == 0 { 281 return ciliumDefaults.ClusterName, nodeName 282 } 283 // remove forward slash at the end if any for cluster name 284 return path.Dir(clusterName), nodeName 285 } 286 287 // GetStatusResponse returns the most recent cluster connectivity status. 288 func (s *Server) GetStatusResponse() *healthModels.HealthStatusResponse { 289 s.RLock() 290 defer s.RUnlock() 291 292 var name string 293 // Check if localStatus is populated already. 
If not, the name is empty 294 if s.localStatus != nil { 295 name = s.localStatus.Name 296 } 297 298 return &healthModels.HealthStatusResponse{ 299 Local: &healthModels.SelfStatus{ 300 Name: name, 301 }, 302 Nodes: s.connectivity.nodes, 303 Timestamp: s.connectivity.startTime.Format(time.RFC3339), 304 } 305 } 306 307 // FetchStatusResponse updates the cluster with the latest set of nodes, 308 // runs a synchronous probe across the cluster, updates the connectivity cache 309 // and returns the results. 310 func (s *Server) FetchStatusResponse() (*healthModels.HealthStatusResponse, error) { 311 nodes, err := s.getAllNodes() 312 if err != nil { 313 return nil, err 314 } 315 316 prober := newProber(s, nodes) 317 if err := prober.Run(); err != nil { 318 log.WithError(err).Info("Failed to run ping") 319 return nil, err 320 } 321 log.Debug("Run complete") 322 s.updateCluster(prober.getResults()) 323 324 return s.GetStatusResponse(), nil 325 } 326 327 // Run services that are actively probing other hosts and endpoints over 328 // ICMP and HTTP, and hosting the health admin API on a local Unix socket. 329 // Blocks indefinitely, or returns any errors that occur hosting the Unix 330 // socket API server. 331 func (s *Server) runActiveServices() error { 332 // Run it once at the start so we get some initial status 333 s.FetchStatusResponse() 334 335 // We can safely ignore nodesRemoved since it's the first time we are 336 // fetching the nodes from the server. 337 nodesAdded, _, _ := s.getNodes() 338 prober := newProber(s, nodesAdded) 339 prober.MaxRTT = s.ProbeInterval 340 prober.OnIdle = func() { 341 // OnIdle is called every ProbeInterval after sending out all icmp pings. 
342 // There are a few important consideration here: 343 // (1) ICMP prober doesn't report failed probes 344 // (2) We can receive the same nodes multiple times, 345 // updated node is present in both nodesAdded and nodesRemoved 346 // (3) We need to clean icmp status to not retain stale probe results 347 // (4) We don't want to report stale nodes in metrics 348 349 if nodesAdded, nodesRemoved, err := s.getNodes(); err != nil { 350 // reset the cache by setting clientID to 0 and removing all current nodes 351 s.clientID = 0 352 prober.setNodes(nil, prober.nodes) 353 log.WithError(err).Error("unable to get cluster nodes") 354 return 355 } else { 356 // (1) Mark ips that did not receive ICMP as unreachable. 357 prober.updateIcmpStatus() 358 // (2) setNodes implementation doesn't override results for existing nodes. 359 // (4) Remove stale nodes so we don't report them in metrics before updating results 360 prober.setNodes(nodesAdded, nodesRemoved) 361 // (4) Update results without stale nodes 362 s.updateCluster(prober.getResults()) 363 // (3) Cleanup icmp results for next iteration of probing 364 prober.clearIcmpStatus() 365 } 366 } 367 prober.RunLoop() 368 defer prober.Stop() 369 370 return s.Server.Serve() 371 } 372 373 // Serve spins up the following goroutines: 374 // - HTTP API Server: Responder to the health API "/hello" message 375 // - Prober: Periodically run pings across the cluster at a configured interval 376 // and update the server's connectivity status cache. 377 // - Unix API Server: Handle all health API requests over a unix socket. 378 // 379 // Callers should first defer the Server.Shutdown(), then call Serve(). 380 func (s *Server) Serve() (err error) { 381 errors := make(chan error) 382 383 go func() { 384 errors <- s.httpPathServer.Serve() 385 }() 386 387 go func() { 388 errors <- s.runActiveServices() 389 }() 390 391 // Block for the first error, then return. 
392 err = <-errors 393 return err 394 } 395 396 // Shutdown server and clean up resources 397 func (s *Server) Shutdown() { 398 s.httpPathServer.Shutdown() 399 s.Server.Shutdown() 400 } 401 402 // newServer instantiates a new instance of the health API server on the 403 // defaults unix socket. 404 func (s *Server) newServer(spec *healthApi.Spec) *healthApi.Server { 405 restAPI := restapi.NewCiliumHealthAPIAPI(spec.Document) 406 restAPI.Logger = log.Printf 407 408 // Admin API 409 restAPI.GetHealthzHandler = NewGetHealthzHandler(s) 410 restAPI.ConnectivityGetStatusHandler = NewGetStatusHandler(s) 411 restAPI.ConnectivityPutStatusProbeHandler = NewPutStatusProbeHandler(s) 412 413 api.DisableAPIs(spec.DeniedAPIs, restAPI.AddMiddlewareFor) 414 srv := healthApi.NewServer(restAPI) 415 srv.EnabledListeners = []string{"unix"} 416 srv.SocketPath = defaults.SockPath 417 418 srv.ConfigureAPI() 419 420 return srv 421 } 422 423 // NewServer creates a server to handle health requests. 424 func NewServer(config Config) (*Server, error) { 425 server := &Server{ 426 startTime: time.Now(), 427 Config: config, 428 connectivity: &healthReport{}, 429 } 430 431 cl, err := ciliumPkg.NewClient(config.CiliumURI) 432 if err != nil { 433 return nil, err 434 } 435 436 server.Client = cl 437 server.Server = *server.newServer(config.HealthAPISpec) 438 439 server.httpPathServer = responder.NewServers(getAddresses(), config.HTTPPathPort) 440 441 return server, nil 442 } 443 444 // Get internal node ipv4/ipv6 addresses based on config enabled. 445 // If it fails to get either of internal node address, it returns "0.0.0.0" if ipv4 or "::" if ipv6. 
446 func getAddresses() []string { 447 addresses := make([]string, 0, 2) 448 449 // listen on all interfaces and all families in case of external-workloads 450 if option.Config.JoinCluster { 451 return []string{""} 452 } 453 454 if option.Config.EnableIPv4 { 455 if ipv4 := node.GetInternalIPv4(); ipv4 != nil { 456 addresses = append(addresses, ipv4.String()) 457 } else { 458 // if Get ipv4 fails, then listen on all ipv4 addr. 459 addresses = append(addresses, "0.0.0.0") 460 } 461 } 462 463 if option.Config.EnableIPv6 { 464 if ipv6 := node.GetInternalIPv6(); ipv6 != nil { 465 addresses = append(addresses, ipv6.String()) 466 } else { 467 // if Get ipv6 fails, then listen on all ipv6 addr. 468 addresses = append(addresses, "::") 469 } 470 } 471 472 return addresses 473 }