github.com/cilium/cilium@v1.16.2/pkg/health/server/server.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package server
     5  
     6  import (
     7  	"fmt"
     8  	"path"
     9  	"time"
    10  
    11  	"github.com/cilium/cilium/api/v1/client/daemon"
    12  	healthModels "github.com/cilium/cilium/api/v1/health/models"
    13  	healthApi "github.com/cilium/cilium/api/v1/health/server"
    14  	"github.com/cilium/cilium/api/v1/health/server/restapi"
    15  	"github.com/cilium/cilium/api/v1/models"
    16  	"github.com/cilium/cilium/pkg/api"
    17  	ciliumPkg "github.com/cilium/cilium/pkg/client"
    18  	ciliumDefaults "github.com/cilium/cilium/pkg/defaults"
    19  	healthClientPkg "github.com/cilium/cilium/pkg/health/client"
    20  	"github.com/cilium/cilium/pkg/health/defaults"
    21  	"github.com/cilium/cilium/pkg/health/probe/responder"
    22  	"github.com/cilium/cilium/pkg/lock"
    23  	"github.com/cilium/cilium/pkg/logging"
    24  	"github.com/cilium/cilium/pkg/logging/logfields"
    25  	"github.com/cilium/cilium/pkg/metrics"
    26  	"github.com/cilium/cilium/pkg/node"
    27  	"github.com/cilium/cilium/pkg/option"
    28  )
    29  
var (
	// log is the package-level logger; every entry it emits carries the
	// "health-server" value in the subsystem field.
	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "health-server")
)
    33  
// Config stores the configuration data for a cilium-health server.
type Config struct {
	// Debug is not read anywhere in this file.
	// NOTE(review): presumably enables debug behaviour elsewhere — confirm.
	Debug         bool
	// CiliumURI is handed to the cilium API client constructor in NewServer
	// and, when non-empty, attached as the "URI" field on request logs.
	CiliumURI     string
	// ProbeInterval is used as the periodic prober's MaxRTT in
	// runActiveServices.
	ProbeInterval time.Duration
	// ProbeDeadline is not read anywhere in this file.
	// NOTE(review): presumably bounds a single probe — confirm.
	ProbeDeadline time.Duration
	// HTTPPathPort is the port passed to responder.NewServers for the HTTP
	// responder started by Serve.
	HTTPPathPort  int
	// HealthAPISpec supplies the API document and the list of denied APIs
	// that newServer uses to build the health API server.
	HealthAPISpec *healthApi.Spec
}
    43  
// ipString is the string form of an IP address. It exists only to give map
// keys (see nodeMap) a more descriptive type than a plain string.
type ipString string
    46  
// nodeMap maps IP addresses to healthNode objects for convenient access to
// node information. nodeElementSliceToNodeMap inserts one entry per address
// of a node, so the same node may be reachable under several keys.
type nodeMap map[ipString]healthNode
    50  
// Server is the cilium-health daemon that is in charge of performing health
// and connectivity checks periodically, and serving the cilium-health API.
type Server struct {
	healthApi.Server  // Server to provide cilium-health API
	*ciliumPkg.Client // Client for the cilium daemon API; used in this file to fetch /cluster/nodes
	// Config is the configuration the server was constructed with (see
	// NewServer).
	Config
	// clientID is the client ID returned by the cilium-agent that should
	// be used when making frequent requests. The server will return
	// a diff of the nodes added and removed based on this clientID.
	clientID int64

	httpPathServer *responder.Server // HTTP server for external pings
	// startTime is set when the server is constructed; DumpUptime reports
	// the time elapsed since then.
	startTime      time.Time

	// The lock protects against read and write access to the IP->Node map,
	// the list of statuses as most recently seen, and the last time a
	// probe was conducted. getNodes also takes it around its accesses to
	// clientID and localStatus.
	lock.RWMutex
	// connectivity is the health report currently exposed through the API.
	// NewServer initializes it to an empty report and updateCluster
	// replaces it when a newer report arrives.
	connectivity *healthReport
	// localStatus carries the local node's name as reported by the agent.
	// It stays nil until getNodes receives a response with a non-empty Self.
	localStatus  *healthModels.SelfStatus
}
    72  
    73  // DumpUptime returns the time that this server has been running.
    74  func (s *Server) DumpUptime() string {
    75  	return time.Since(s.startTime).String()
    76  }
    77  
    78  // getNodes fetches the nodes added and removed from the last time the server
    79  // made a request to the daemon.
    80  func (s *Server) getNodes() (nodeMap, nodeMap, error) {
    81  	scopedLog := log
    82  	if s.CiliumURI != "" {
    83  		scopedLog = log.WithField("URI", s.CiliumURI)
    84  	}
    85  	scopedLog.Debug("Sending request for /cluster/nodes ...")
    86  
    87  	clusterNodesParam := daemon.NewGetClusterNodesParams()
    88  	s.RWMutex.RLock()
    89  	cID := s.clientID
    90  	s.RWMutex.RUnlock()
    91  	clusterNodesParam.SetClientID(&cID)
    92  	resp, err := s.Daemon.GetClusterNodes(clusterNodesParam)
    93  	if err != nil {
    94  		return nil, nil, fmt.Errorf("unable to get nodes' cluster: %w", err)
    95  	}
    96  	log.Debug("Got cilium /cluster/nodes")
    97  
    98  	if resp == nil || resp.Payload == nil {
    99  		return nil, nil, fmt.Errorf("received nil health response")
   100  	}
   101  
   102  	s.RWMutex.Lock()
   103  	s.clientID = resp.Payload.ClientID
   104  
   105  	if resp.Payload.Self != "" {
   106  		s.localStatus = &healthModels.SelfStatus{
   107  			Name: resp.Payload.Self,
   108  		}
   109  	}
   110  	s.RWMutex.Unlock()
   111  
   112  	nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded)
   113  	nodesRemoved := nodeElementSliceToNodeMap(resp.Payload.NodesRemoved)
   114  
   115  	return nodesAdded, nodesRemoved, nil
   116  }
   117  
   118  // getAllNodes fetches all nodes the daemon is aware of.
   119  func (s *Server) getAllNodes() (nodeMap, error) {
   120  	scopedLog := log
   121  	if s.CiliumURI != "" {
   122  		scopedLog = log.WithField("URI", s.CiliumURI)
   123  	}
   124  	scopedLog.Debug("Sending request for /cluster/nodes ...")
   125  
   126  	resp, err := s.Daemon.GetClusterNodes(nil)
   127  	if err != nil {
   128  		return nil, fmt.Errorf("unable to get nodes' cluster: %w", err)
   129  	}
   130  	log.Debug("Got cilium /cluster/nodes")
   131  
   132  	if resp == nil || resp.Payload == nil {
   133  		return nil, fmt.Errorf("received nil health response")
   134  	}
   135  
   136  	nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded)
   137  
   138  	return nodesAdded, nil
   139  }
   140  
   141  // nodeElementSliceToNodeMap returns a slice of models.NodeElement into a
   142  // nodeMap.
   143  func nodeElementSliceToNodeMap(nodeElements []*models.NodeElement) nodeMap {
   144  	nodes := make(nodeMap)
   145  	for _, n := range nodeElements {
   146  		if n.PrimaryAddress != nil {
   147  			if n.PrimaryAddress.IPV4 != nil {
   148  				nodes[ipString(n.PrimaryAddress.IPV4.IP)] = NewHealthNode(n)
   149  			}
   150  			if n.PrimaryAddress.IPV6 != nil {
   151  				nodes[ipString(n.PrimaryAddress.IPV6.IP)] = NewHealthNode(n)
   152  			}
   153  		}
   154  		for _, addr := range n.SecondaryAddresses {
   155  			nodes[ipString(addr.IP)] = NewHealthNode(n)
   156  		}
   157  		if n.HealthEndpointAddress != nil {
   158  			if n.HealthEndpointAddress.IPV4 != nil {
   159  				nodes[ipString(n.HealthEndpointAddress.IPV4.IP)] = NewHealthNode(n)
   160  			}
   161  			if n.HealthEndpointAddress.IPV6 != nil {
   162  				nodes[ipString(n.HealthEndpointAddress.IPV6.IP)] = NewHealthNode(n)
   163  			}
   164  		}
   165  	}
   166  	return nodes
   167  }
   168  
   169  // updateCluster makes the specified health report visible to the API.
   170  //
   171  // It only updates the server's API-visible health report if the provided
   172  // report started after the current report.
   173  func (s *Server) updateCluster(report *healthReport) {
   174  	s.Lock()
   175  	defer s.Unlock()
   176  
   177  	if s.connectivity.startTime.Before(report.startTime) {
   178  		s.connectivity = report
   179  		s.collectNodeConnectivityMetrics()
   180  	}
   181  }
   182  
   183  func (s *Server) collectNodeConnectivityMetrics() {
   184  	if s.localStatus == nil || s.connectivity == nil {
   185  		return
   186  	}
   187  	localClusterName, localNodeName := getClusterNodeName(s.localStatus.Name)
   188  
   189  	for _, n := range s.connectivity.nodes {
   190  		if n == nil || n.Host == nil || n.Host.PrimaryAddress == nil || n.HealthEndpoint == nil || n.HealthEndpoint.PrimaryAddress == nil {
   191  			continue
   192  		}
   193  
   194  		targetClusterName, targetNodeName := getClusterNodeName(n.Name)
   195  		nodePathPrimaryAddress := healthClientPkg.GetHostPrimaryAddress(n)
   196  		nodePathSecondaryAddress := healthClientPkg.GetHostSecondaryAddresses(n)
   197  
   198  		endpointPathStatus := n.HealthEndpoint
   199  		isEndpointReachable := healthClientPkg.SummarizePathConnectivityStatusType(healthClientPkg.GetAllEndpointAddresses(n)) == healthClientPkg.ConnStatusReachable
   200  		isNodeReachable := healthClientPkg.SummarizePathConnectivityStatusType(healthClientPkg.GetAllHostAddresses(n)) == healthClientPkg.ConnStatusReachable
   201  
   202  		location := metrics.LabelLocationLocalNode
   203  		if targetClusterName != localClusterName {
   204  			location = metrics.LabelLocationRemoteInterCluster
   205  		} else if targetNodeName != localNodeName {
   206  			location = metrics.LabelLocationRemoteIntraCluster
   207  		}
   208  
   209  		// Aggregated status for endpoint connectivity
   210  		metrics.NodeConnectivityStatus.WithLabelValues(
   211  			localClusterName, localNodeName, targetClusterName, targetNodeName, location, metrics.LabelPeerEndpoint).
   212  			Set(metrics.BoolToFloat64(isEndpointReachable))
   213  
   214  		// Aggregated status for node connectivity
   215  		metrics.NodeConnectivityStatus.WithLabelValues(
   216  			localClusterName, localNodeName, targetClusterName, targetNodeName, location, metrics.LabelPeerNode).
   217  			Set(metrics.BoolToFloat64(isNodeReachable))
   218  
   219  		// HTTP endpoint primary
   220  		collectConnectivityMetric(endpointPathStatus.PrimaryAddress.HTTP, localClusterName, localNodeName,
   221  			targetClusterName, targetNodeName, endpointPathStatus.PrimaryAddress.IP,
   222  			location, metrics.LabelPeerEndpoint, metrics.LabelTrafficHTTP, metrics.LabelAddressTypePrimary)
   223  
   224  		// HTTP endpoint secondary
   225  		for _, secondary := range endpointPathStatus.SecondaryAddresses {
   226  			collectConnectivityMetric(secondary.HTTP, localClusterName, localNodeName,
   227  				targetClusterName, targetNodeName, secondary.IP,
   228  				location, metrics.LabelPeerEndpoint, metrics.LabelTrafficHTTP, metrics.LabelAddressTypeSecondary)
   229  		}
   230  
   231  		// HTTP node primary
   232  		collectConnectivityMetric(nodePathPrimaryAddress.HTTP, localClusterName, localNodeName,
   233  			targetClusterName, targetNodeName, nodePathPrimaryAddress.IP,
   234  			location, metrics.LabelPeerNode, metrics.LabelTrafficHTTP, metrics.LabelAddressTypePrimary)
   235  
   236  		// HTTP node secondary
   237  		for _, secondary := range nodePathSecondaryAddress {
   238  			collectConnectivityMetric(secondary.HTTP, localClusterName, localNodeName,
   239  				targetClusterName, targetNodeName, secondary.IP,
   240  				location, metrics.LabelPeerNode, metrics.LabelTrafficHTTP, metrics.LabelAddressTypeSecondary)
   241  		}
   242  
   243  		// ICMP endpoint primary
   244  		collectConnectivityMetric(endpointPathStatus.PrimaryAddress.Icmp, localClusterName, localNodeName,
   245  			targetClusterName, targetNodeName, endpointPathStatus.PrimaryAddress.IP,
   246  			location, metrics.LabelPeerEndpoint, metrics.LabelTrafficICMP, metrics.LabelAddressTypePrimary)
   247  
   248  		// ICMP endpoint secondary
   249  		for _, secondary := range endpointPathStatus.SecondaryAddresses {
   250  			collectConnectivityMetric(secondary.Icmp, localClusterName, localNodeName,
   251  				targetClusterName, targetNodeName, secondary.IP,
   252  				location, metrics.LabelPeerEndpoint, metrics.LabelTrafficICMP, metrics.LabelAddressTypeSecondary)
   253  		}
   254  
   255  		// ICMP node primary
   256  		collectConnectivityMetric(nodePathPrimaryAddress.Icmp, localClusterName, localNodeName,
   257  			targetClusterName, targetNodeName, nodePathPrimaryAddress.IP,
   258  			location, metrics.LabelPeerNode, metrics.LabelTrafficICMP, metrics.LabelAddressTypePrimary)
   259  
   260  		// ICMP node secondary
   261  		for _, secondary := range nodePathSecondaryAddress {
   262  			collectConnectivityMetric(secondary.Icmp, localClusterName, localNodeName,
   263  				targetClusterName, targetNodeName, secondary.IP,
   264  				location, metrics.LabelPeerNode, metrics.LabelTrafficICMP, metrics.LabelAddressTypeSecondary)
   265  		}
   266  	}
   267  }
   268  
   269  func collectConnectivityMetric(status *healthModels.ConnectivityStatus, labels ...string) {
   270  	var metricValue float64 = -1
   271  	if status != nil {
   272  		metricValue = float64(status.Latency) / float64(time.Second)
   273  	}
   274  	metrics.NodeConnectivityLatency.WithLabelValues(labels...).Set(metricValue)
   275  }
   276  
   277  // getClusterNodeName returns the cluster name and node name if possible.
   278  func getClusterNodeName(str string) (string, string) {
   279  	clusterName, nodeName := path.Split(str)
   280  	if len(clusterName) == 0 {
   281  		return ciliumDefaults.ClusterName, nodeName
   282  	}
   283  	// remove forward slash at the end if any for cluster name
   284  	return path.Dir(clusterName), nodeName
   285  }
   286  
   287  // GetStatusResponse returns the most recent cluster connectivity status.
   288  func (s *Server) GetStatusResponse() *healthModels.HealthStatusResponse {
   289  	s.RLock()
   290  	defer s.RUnlock()
   291  
   292  	var name string
   293  	// Check if localStatus is populated already. If not, the name is empty
   294  	if s.localStatus != nil {
   295  		name = s.localStatus.Name
   296  	}
   297  
   298  	return &healthModels.HealthStatusResponse{
   299  		Local: &healthModels.SelfStatus{
   300  			Name: name,
   301  		},
   302  		Nodes:     s.connectivity.nodes,
   303  		Timestamp: s.connectivity.startTime.Format(time.RFC3339),
   304  	}
   305  }
   306  
   307  // FetchStatusResponse updates the cluster with the latest set of nodes,
   308  // runs a synchronous probe across the cluster, updates the connectivity cache
   309  // and returns the results.
   310  func (s *Server) FetchStatusResponse() (*healthModels.HealthStatusResponse, error) {
   311  	nodes, err := s.getAllNodes()
   312  	if err != nil {
   313  		return nil, err
   314  	}
   315  
   316  	prober := newProber(s, nodes)
   317  	if err := prober.Run(); err != nil {
   318  		log.WithError(err).Info("Failed to run ping")
   319  		return nil, err
   320  	}
   321  	log.Debug("Run complete")
   322  	s.updateCluster(prober.getResults())
   323  
   324  	return s.GetStatusResponse(), nil
   325  }
   326  
   327  // Run services that are actively probing other hosts and endpoints over
   328  // ICMP and HTTP, and hosting the health admin API on a local Unix socket.
   329  // Blocks indefinitely, or returns any errors that occur hosting the Unix
   330  // socket API server.
   331  func (s *Server) runActiveServices() error {
   332  	// Run it once at the start so we get some initial status
   333  	s.FetchStatusResponse()
   334  
   335  	// We can safely ignore nodesRemoved since it's the first time we are
   336  	// fetching the nodes from the server.
   337  	nodesAdded, _, _ := s.getNodes()
   338  	prober := newProber(s, nodesAdded)
   339  	prober.MaxRTT = s.ProbeInterval
   340  	prober.OnIdle = func() {
   341  		// OnIdle is called every ProbeInterval after sending out all icmp pings.
   342  		// There are a few important consideration here:
   343  		// (1) ICMP prober doesn't report failed probes
   344  		// (2) We can receive the same nodes multiple times,
   345  		// updated node is present in both nodesAdded and nodesRemoved
   346  		// (3) We need to clean icmp status to not retain stale probe results
   347  		// (4) We don't want to report stale nodes in metrics
   348  
   349  		if nodesAdded, nodesRemoved, err := s.getNodes(); err != nil {
   350  			// reset the cache by setting clientID to 0 and removing all current nodes
   351  			s.clientID = 0
   352  			prober.setNodes(nil, prober.nodes)
   353  			log.WithError(err).Error("unable to get cluster nodes")
   354  			return
   355  		} else {
   356  			// (1) Mark ips that did not receive ICMP as unreachable.
   357  			prober.updateIcmpStatus()
   358  			// (2) setNodes implementation doesn't override results for existing nodes.
   359  			// (4) Remove stale nodes so we don't report them in metrics before updating results
   360  			prober.setNodes(nodesAdded, nodesRemoved)
   361  			// (4) Update results without stale nodes
   362  			s.updateCluster(prober.getResults())
   363  			// (3) Cleanup icmp results for next iteration of probing
   364  			prober.clearIcmpStatus()
   365  		}
   366  	}
   367  	prober.RunLoop()
   368  	defer prober.Stop()
   369  
   370  	return s.Server.Serve()
   371  }
   372  
   373  // Serve spins up the following goroutines:
   374  //   - HTTP API Server: Responder to the health API "/hello" message
   375  //   - Prober: Periodically run pings across the cluster at a configured interval
   376  //     and update the server's connectivity status cache.
   377  //   - Unix API Server: Handle all health API requests over a unix socket.
   378  //
   379  // Callers should first defer the Server.Shutdown(), then call Serve().
   380  func (s *Server) Serve() (err error) {
   381  	errors := make(chan error)
   382  
   383  	go func() {
   384  		errors <- s.httpPathServer.Serve()
   385  	}()
   386  
   387  	go func() {
   388  		errors <- s.runActiveServices()
   389  	}()
   390  
   391  	// Block for the first error, then return.
   392  	err = <-errors
   393  	return err
   394  }
   395  
// Shutdown stops the server and cleans up its resources: it shuts down the
// HTTP responder first and then the health API server.
func (s *Server) Shutdown() {
	s.httpPathServer.Shutdown()
	s.Server.Shutdown()
}
   401  
   402  // newServer instantiates a new instance of the health API server on the
   403  // defaults unix socket.
   404  func (s *Server) newServer(spec *healthApi.Spec) *healthApi.Server {
   405  	restAPI := restapi.NewCiliumHealthAPIAPI(spec.Document)
   406  	restAPI.Logger = log.Printf
   407  
   408  	// Admin API
   409  	restAPI.GetHealthzHandler = NewGetHealthzHandler(s)
   410  	restAPI.ConnectivityGetStatusHandler = NewGetStatusHandler(s)
   411  	restAPI.ConnectivityPutStatusProbeHandler = NewPutStatusProbeHandler(s)
   412  
   413  	api.DisableAPIs(spec.DeniedAPIs, restAPI.AddMiddlewareFor)
   414  	srv := healthApi.NewServer(restAPI)
   415  	srv.EnabledListeners = []string{"unix"}
   416  	srv.SocketPath = defaults.SockPath
   417  
   418  	srv.ConfigureAPI()
   419  
   420  	return srv
   421  }
   422  
   423  // NewServer creates a server to handle health requests.
   424  func NewServer(config Config) (*Server, error) {
   425  	server := &Server{
   426  		startTime:    time.Now(),
   427  		Config:       config,
   428  		connectivity: &healthReport{},
   429  	}
   430  
   431  	cl, err := ciliumPkg.NewClient(config.CiliumURI)
   432  	if err != nil {
   433  		return nil, err
   434  	}
   435  
   436  	server.Client = cl
   437  	server.Server = *server.newServer(config.HealthAPISpec)
   438  
   439  	server.httpPathServer = responder.NewServers(getAddresses(), config.HTTPPathPort)
   440  
   441  	return server, nil
   442  }
   443  
   444  // Get internal node ipv4/ipv6 addresses based on config enabled.
   445  // If it fails to get either of internal node address, it returns "0.0.0.0" if ipv4 or "::" if ipv6.
   446  func getAddresses() []string {
   447  	addresses := make([]string, 0, 2)
   448  
   449  	// listen on all interfaces and all families in case of external-workloads
   450  	if option.Config.JoinCluster {
   451  		return []string{""}
   452  	}
   453  
   454  	if option.Config.EnableIPv4 {
   455  		if ipv4 := node.GetInternalIPv4(); ipv4 != nil {
   456  			addresses = append(addresses, ipv4.String())
   457  		} else {
   458  			// if Get ipv4 fails, then listen on all ipv4 addr.
   459  			addresses = append(addresses, "0.0.0.0")
   460  		}
   461  	}
   462  
   463  	if option.Config.EnableIPv6 {
   464  		if ipv6 := node.GetInternalIPv6(); ipv6 != nil {
   465  			addresses = append(addresses, ipv6.String())
   466  		} else {
   467  			// if Get ipv6 fails, then listen on all ipv6 addr.
   468  			addresses = append(addresses, "::")
   469  		}
   470  	}
   471  
   472  	return addresses
   473  }