github.com/elfadel/cilium@v1.6.12/pkg/health/server/server.go (about)

     1  // Copyright 2017-2019 Authors of Cilium
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package server
    16  
    17  import (
    18  	"fmt"
    19  	"time"
    20  
    21  	"github.com/cilium/cilium/api/v1/client/daemon"
    22  	healthModels "github.com/cilium/cilium/api/v1/health/models"
    23  	healthApi "github.com/cilium/cilium/api/v1/health/server"
    24  	"github.com/cilium/cilium/api/v1/health/server/restapi"
    25  	"github.com/cilium/cilium/api/v1/models"
    26  	ciliumPkg "github.com/cilium/cilium/pkg/client"
    27  	"github.com/cilium/cilium/pkg/health/defaults"
    28  	"github.com/cilium/cilium/pkg/health/probe/responder"
    29  	"github.com/cilium/cilium/pkg/lock"
    30  	"github.com/cilium/cilium/pkg/logging"
    31  	"github.com/cilium/cilium/pkg/logging/logfields"
    32  
    33  	"github.com/go-openapi/loads"
    34  	"github.com/jessevdk/go-flags"
    35  )
    36  
var (
	// log is the package-level logger; every entry it emits carries the
	// "health-server" value in the subsystem field.
	log = logging.DefaultLogger.WithField(logfields.LogSubsys, "health-server")

	// PortToPaths is a convenience map for access to the ports and their
	// common string representations. NewServer creates one responder.Server
	// per key of this map.
	PortToPaths = map[int]string{
		defaults.HTTPPathPort: "Via L3",
	}
)
    46  
// Config stores the configuration data for a cilium-health server.
type Config struct {
	// Debug is not read anywhere in this file.
	// NOTE(review): presumably enables debug behaviour elsewhere — confirm.
	Debug         bool
	// CiliumURI is the address handed to the cilium client constructor in
	// NewServer; when non-empty it is also attached to request log entries.
	CiliumURI     string
	// ProbeInterval is assigned to the background prober's MaxRTT in
	// runActiveServices.
	ProbeInterval time.Duration
	// ProbeDeadline is not read anywhere in this file.
	// NOTE(review): presumably bounds a single probe — confirm against the
	// prober implementation.
	ProbeDeadline time.Duration
}
    54  
// ipString is an IP address in string form. It exists only to give the key
// type of nodeMap a more descriptive name than plain string.
type ipString string
    57  
// nodeMap maps IP addresses to healthNode objects for convenient access to
// node information. A node that has several addresses appears once per
// address (see nodeElementSliceToNodeMap).
type nodeMap map[ipString]healthNode
    61  
// Server is the cilium-health daemon that is in charge of performing health
// and connectivity checks periodically, and serving the cilium-health API.
//
// It embeds the generated health API server, the cilium daemon client, its
// Config and an RWMutex, so the methods and fields of all four are promoted
// onto Server.
type Server struct {
	healthApi.Server  // Server to provide cilium-health API
	*ciliumPkg.Client // Client to "GET /healthz" on cilium daemon
	Config
	// clientID is the client ID returned by the cilium-agent that should
	// be used when making frequent requests. The server will return
	// a diff of the nodes added and removed based on this clientID.
	// getNodes reads and writes it while holding the embedded RWMutex.
	clientID int64

	tcpServers []*responder.Server // Servers for external pings
	startTime  time.Time           // Set once in NewServer; basis for DumpUptime

	// The lock protects against read and write access to the IP->Node map,
	// the list of statuses as most recently seen, and the last time a
	// probe was conducted.
	lock.RWMutex
	// connectivity is the most recent report published by updateCluster.
	// NewServer initialises it to an empty report, so it is never nil.
	connectivity *healthReport
	// localStatus carries the local node's name as reported by the daemon.
	// It stays nil until getNodes receives a non-empty name.
	localStatus  *healthModels.SelfStatus
}
    83  
    84  // DumpUptime returns the time that this server has been running.
    85  func (s *Server) DumpUptime() string {
    86  	return time.Since(s.startTime).String()
    87  }
    88  
    89  // getNodes fetches the nodes added and removed from the last time the server
    90  // made a request to the daemon.
    91  func (s *Server) getNodes() (nodeMap, nodeMap, error) {
    92  	scopedLog := log
    93  	if s.CiliumURI != "" {
    94  		scopedLog = log.WithField("URI", s.CiliumURI)
    95  	}
    96  	scopedLog.Debug("Sending request for /cluster/nodes ...")
    97  
    98  	clusterNodesParam := daemon.NewGetClusterNodesParams()
    99  	s.RWMutex.RLock()
   100  	cID := s.clientID
   101  	s.RWMutex.RUnlock()
   102  	clusterNodesParam.SetClientID(&cID)
   103  	resp, err := s.Daemon.GetClusterNodes(clusterNodesParam)
   104  	if err != nil {
   105  		return nil, nil, fmt.Errorf("unable to get nodes' cluster: %s", err)
   106  	}
   107  	log.Debug("Got cilium /cluster/nodes")
   108  
   109  	if resp == nil || resp.Payload == nil {
   110  		return nil, nil, fmt.Errorf("received nil health response")
   111  	}
   112  
   113  	s.RWMutex.Lock()
   114  	s.clientID = resp.Payload.ClientID
   115  
   116  	if resp.Payload.Self != "" {
   117  		s.localStatus = &healthModels.SelfStatus{
   118  			Name: resp.Payload.Self,
   119  		}
   120  	}
   121  	s.RWMutex.Unlock()
   122  
   123  	nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded)
   124  	nodesRemoved := nodeElementSliceToNodeMap(resp.Payload.NodesRemoved)
   125  
   126  	return nodesAdded, nodesRemoved, nil
   127  }
   128  
   129  // getAllNodes fetches all nodes the daemon is aware of.
   130  func (s *Server) getAllNodes() (nodeMap, error) {
   131  	scopedLog := log
   132  	if s.CiliumURI != "" {
   133  		scopedLog = log.WithField("URI", s.CiliumURI)
   134  	}
   135  	scopedLog.Debug("Sending request for /cluster/nodes ...")
   136  
   137  	resp, err := s.Daemon.GetClusterNodes(nil)
   138  	if err != nil {
   139  		return nil, fmt.Errorf("unable to get nodes' cluster: %s", err)
   140  	}
   141  	log.Debug("Got cilium /cluster/nodes")
   142  
   143  	if resp == nil || resp.Payload == nil {
   144  		return nil, fmt.Errorf("received nil health response")
   145  	}
   146  
   147  	nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded)
   148  
   149  	return nodesAdded, nil
   150  }
   151  
   152  // nodeElementSliceToNodeMap returns a slice of models.NodeElement into a
   153  // nodeMap.
   154  func nodeElementSliceToNodeMap(nodeElements []*models.NodeElement) nodeMap {
   155  	nodes := make(nodeMap)
   156  	for _, n := range nodeElements {
   157  		if n.PrimaryAddress != nil {
   158  			if n.PrimaryAddress.IPV4 != nil {
   159  				nodes[ipString(n.PrimaryAddress.IPV4.IP)] = NewHealthNode(n)
   160  			}
   161  			if n.PrimaryAddress.IPV6 != nil {
   162  				nodes[ipString(n.PrimaryAddress.IPV6.IP)] = NewHealthNode(n)
   163  			}
   164  		}
   165  		for _, addr := range n.SecondaryAddresses {
   166  			nodes[ipString(addr.IP)] = NewHealthNode(n)
   167  		}
   168  		if n.HealthEndpointAddress != nil {
   169  			if n.HealthEndpointAddress.IPV4 != nil {
   170  				nodes[ipString(n.HealthEndpointAddress.IPV4.IP)] = NewHealthNode(n)
   171  			}
   172  			if n.HealthEndpointAddress.IPV6 != nil {
   173  				nodes[ipString(n.HealthEndpointAddress.IPV6.IP)] = NewHealthNode(n)
   174  			}
   175  		}
   176  	}
   177  	return nodes
   178  }
   179  
   180  // updateCluster makes the specified health report visible to the API.
   181  //
   182  // It only updates the server's API-visible health report if the provided
   183  // report started after the current report.
   184  func (s *Server) updateCluster(report *healthReport) {
   185  	s.Lock()
   186  	defer s.Unlock()
   187  
   188  	if s.connectivity.startTime.Before(report.startTime) {
   189  		s.connectivity = report
   190  	}
   191  }
   192  
   193  // GetStatusResponse returns the most recent cluster connectivity status.
   194  func (s *Server) GetStatusResponse() *healthModels.HealthStatusResponse {
   195  	s.RLock()
   196  	defer s.RUnlock()
   197  
   198  	var name string
   199  	// Check if localStatus is populated already. If not, the name is empty
   200  	if s.localStatus != nil {
   201  		name = s.localStatus.Name
   202  	}
   203  
   204  	return &healthModels.HealthStatusResponse{
   205  		Local: &healthModels.SelfStatus{
   206  			Name: name,
   207  		},
   208  		Nodes:     s.connectivity.nodes,
   209  		Timestamp: s.connectivity.startTime.Format(time.RFC3339),
   210  	}
   211  }
   212  
   213  // FetchStatusResponse updates the cluster with the latest set of nodes,
   214  // runs a synchronous probe across the cluster, updates the connectivity cache
   215  // and returns the results.
   216  func (s *Server) FetchStatusResponse() (*healthModels.HealthStatusResponse, error) {
   217  	nodes, err := s.getAllNodes()
   218  	if err != nil {
   219  		return nil, err
   220  	}
   221  
   222  	prober := newProber(s, nodes)
   223  	if err := prober.Run(); err != nil {
   224  		log.WithError(err).Info("Failed to run ping")
   225  		return nil, err
   226  	}
   227  	log.Debug("Run complete")
   228  	s.updateCluster(prober.getResults())
   229  
   230  	return s.GetStatusResponse(), nil
   231  }
   232  
   233  // Run services that are actively probing other hosts and endpoints over
   234  // ICMP and HTTP, and hosting the health admin API on a local Unix socket.
   235  // Blocks indefinitely, or returns any errors that occur hosting the Unix
   236  // socket API server.
   237  func (s *Server) runActiveServices() error {
   238  	// Run it once at the start so we get some initial status
   239  	s.FetchStatusResponse()
   240  
   241  	// We can safely ignore nodesRemoved since it's the first time we are
   242  	// fetching the nodes from the server.
   243  	nodesAdded, _, _ := s.getNodes()
   244  	prober := newProber(s, nodesAdded)
   245  	prober.MaxRTT = s.ProbeInterval
   246  	prober.OnIdle = func() {
   247  		// Fetch results and update set of nodes to probe every
   248  		// ProbeInterval
   249  		s.updateCluster(prober.getResults())
   250  		if nodesAdded, nodesRemoved, err := s.getNodes(); err != nil {
   251  			log.WithError(err).Error("unable to get cluster nodes")
   252  		} else {
   253  			prober.setNodes(nodesAdded, nodesRemoved)
   254  		}
   255  	}
   256  	prober.RunLoop()
   257  	defer prober.Stop()
   258  
   259  	return s.Server.Serve()
   260  }
   261  
   262  // Serve spins up the following goroutines:
   263  // * TCP API Server: Responders to the health API "/hello" message, one per path
   264  // * Prober: Periodically run pings across the cluster at a configured interval
   265  //   and update the server's connectivity status cache.
   266  // * Unix API Server: Handle all health API requests over a unix socket.
   267  //
   268  // Callers should first defer the Server.Shutdown(), then call Serve().
   269  func (s *Server) Serve() (err error) {
   270  	errors := make(chan error)
   271  
   272  	for i := range s.tcpServers {
   273  		srv := s.tcpServers[i]
   274  		go func() {
   275  			errors <- srv.Serve()
   276  		}()
   277  	}
   278  
   279  	go func() {
   280  		errors <- s.runActiveServices()
   281  	}()
   282  
   283  	// Block for the first error, then return.
   284  	err = <-errors
   285  	return err
   286  }
   287  
   288  // Shutdown server and clean up resources
   289  func (s *Server) Shutdown() {
   290  	for i := range s.tcpServers {
   291  		s.tcpServers[i].Shutdown()
   292  	}
   293  	s.Server.Shutdown()
   294  }
   295  
   296  // newServer instantiates a new instance of the health API server on the
   297  // defaults unix socket.
   298  func (s *Server) newServer(spec *loads.Document) *healthApi.Server {
   299  	api := restapi.NewCiliumHealthAPI(spec)
   300  	api.Logger = log.Printf
   301  
   302  	// Admin API
   303  	api.GetHealthzHandler = NewGetHealthzHandler(s)
   304  	api.ConnectivityGetStatusHandler = NewGetStatusHandler(s)
   305  	api.ConnectivityPutStatusProbeHandler = NewPutStatusProbeHandler(s)
   306  
   307  	srv := healthApi.NewServer(api)
   308  	srv.EnabledListeners = []string{"unix"}
   309  	srv.SocketPath = flags.Filename(defaults.SockPath)
   310  
   311  	srv.ConfigureAPI()
   312  
   313  	return srv
   314  }
   315  
   316  // NewServer creates a server to handle health requests.
   317  func NewServer(config Config) (*Server, error) {
   318  	server := &Server{
   319  		startTime:    time.Now(),
   320  		Config:       config,
   321  		tcpServers:   []*responder.Server{},
   322  		connectivity: &healthReport{},
   323  	}
   324  
   325  	swaggerSpec, err := loads.Analyzed(healthApi.SwaggerJSON, "")
   326  	if err != nil {
   327  		return nil, err
   328  	}
   329  
   330  	cl, err := ciliumPkg.NewClient(config.CiliumURI)
   331  	if err != nil {
   332  		return nil, err
   333  	}
   334  
   335  	server.Client = cl
   336  	server.Server = *server.newServer(swaggerSpec)
   337  
   338  	for port := range PortToPaths {
   339  		srv := responder.NewServer(port)
   340  		server.tcpServers = append(server.tcpServers, srv)
   341  	}
   342  
   343  	return server, nil
   344  }