github.com/elfadel/cilium@v1.6.12/pkg/health/server/server.go (about) 1 // Copyright 2017-2019 Authors of Cilium 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package server 16 17 import ( 18 "fmt" 19 "time" 20 21 "github.com/cilium/cilium/api/v1/client/daemon" 22 healthModels "github.com/cilium/cilium/api/v1/health/models" 23 healthApi "github.com/cilium/cilium/api/v1/health/server" 24 "github.com/cilium/cilium/api/v1/health/server/restapi" 25 "github.com/cilium/cilium/api/v1/models" 26 ciliumPkg "github.com/cilium/cilium/pkg/client" 27 "github.com/cilium/cilium/pkg/health/defaults" 28 "github.com/cilium/cilium/pkg/health/probe/responder" 29 "github.com/cilium/cilium/pkg/lock" 30 "github.com/cilium/cilium/pkg/logging" 31 "github.com/cilium/cilium/pkg/logging/logfields" 32 33 "github.com/go-openapi/loads" 34 "github.com/jessevdk/go-flags" 35 ) 36 37 var ( 38 log = logging.DefaultLogger.WithField(logfields.LogSubsys, "health-server") 39 40 // PortToPaths is a convenience map for access to the ports and their 41 // common string representations 42 PortToPaths = map[int]string{ 43 defaults.HTTPPathPort: "Via L3", 44 } 45 ) 46 47 // Config stores the configuration data for a cilium-health server. 48 type Config struct { 49 Debug bool 50 CiliumURI string 51 ProbeInterval time.Duration 52 ProbeDeadline time.Duration 53 } 54 55 // ipString is an IP address used as a more descriptive type name in maps. 56 type ipString string 57 58 // nodeMap maps IP addresses to healthNode objectss for convenient access to 59 // node information. 60 type nodeMap map[ipString]healthNode 61 62 // Server is the cilium-health daemon that is in charge of performing health 63 // and connectivity checks periodically, and serving the cilium-health API. 64 type Server struct { 65 healthApi.Server // Server to provide cilium-health API 66 *ciliumPkg.Client // Client to "GET /healthz" on cilium daemon 67 Config 68 // clientID is the client ID returned by the cilium-agent that should 69 // be used when making frequent requests. The server will return 70 // a diff of the nodes added and removed based on this clientID. 71 clientID int64 72 73 tcpServers []*responder.Server // Servers for external pings 74 startTime time.Time 75 76 // The lock protects against read and write access to the IP->Node map, 77 // the list of statuses as most recently seen, and the last time a 78 // probe was conducted. 79 lock.RWMutex 80 connectivity *healthReport 81 localStatus *healthModels.SelfStatus 82 } 83 84 // DumpUptime returns the time that this server has been running. 85 func (s *Server) DumpUptime() string { 86 return time.Since(s.startTime).String() 87 } 88 89 // getNodes fetches the nodes added and removed from the last time the server 90 // made a request to the daemon. 91 func (s *Server) getNodes() (nodeMap, nodeMap, error) { 92 scopedLog := log 93 if s.CiliumURI != "" { 94 scopedLog = log.WithField("URI", s.CiliumURI) 95 } 96 scopedLog.Debug("Sending request for /cluster/nodes ...") 97 98 clusterNodesParam := daemon.NewGetClusterNodesParams() 99 s.RWMutex.RLock() 100 cID := s.clientID 101 s.RWMutex.RUnlock() 102 clusterNodesParam.SetClientID(&cID) 103 resp, err := s.Daemon.GetClusterNodes(clusterNodesParam) 104 if err != nil { 105 return nil, nil, fmt.Errorf("unable to get nodes' cluster: %s", err) 106 } 107 log.Debug("Got cilium /cluster/nodes") 108 109 if resp == nil || resp.Payload == nil { 110 return nil, nil, fmt.Errorf("received nil health response") 111 } 112 113 s.RWMutex.Lock() 114 s.clientID = resp.Payload.ClientID 115 116 if resp.Payload.Self != "" { 117 s.localStatus = &healthModels.SelfStatus{ 118 Name: resp.Payload.Self, 119 } 120 } 121 s.RWMutex.Unlock() 122 123 nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded) 124 nodesRemoved := nodeElementSliceToNodeMap(resp.Payload.NodesRemoved) 125 126 return nodesAdded, nodesRemoved, nil 127 } 128 129 // getAllNodes fetches all nodes the daemon is aware of. 130 func (s *Server) getAllNodes() (nodeMap, error) { 131 scopedLog := log 132 if s.CiliumURI != "" { 133 scopedLog = log.WithField("URI", s.CiliumURI) 134 } 135 scopedLog.Debug("Sending request for /cluster/nodes ...") 136 137 resp, err := s.Daemon.GetClusterNodes(nil) 138 if err != nil { 139 return nil, fmt.Errorf("unable to get nodes' cluster: %s", err) 140 } 141 log.Debug("Got cilium /cluster/nodes") 142 143 if resp == nil || resp.Payload == nil { 144 return nil, fmt.Errorf("received nil health response") 145 } 146 147 nodesAdded := nodeElementSliceToNodeMap(resp.Payload.NodesAdded) 148 149 return nodesAdded, nil 150 } 151 152 // nodeElementSliceToNodeMap returns a slice of models.NodeElement into a 153 // nodeMap. 154 func nodeElementSliceToNodeMap(nodeElements []*models.NodeElement) nodeMap { 155 nodes := make(nodeMap) 156 for _, n := range nodeElements { 157 if n.PrimaryAddress != nil { 158 if n.PrimaryAddress.IPV4 != nil { 159 nodes[ipString(n.PrimaryAddress.IPV4.IP)] = NewHealthNode(n) 160 } 161 if n.PrimaryAddress.IPV6 != nil { 162 nodes[ipString(n.PrimaryAddress.IPV6.IP)] = NewHealthNode(n) 163 } 164 } 165 for _, addr := range n.SecondaryAddresses { 166 nodes[ipString(addr.IP)] = NewHealthNode(n) 167 } 168 if n.HealthEndpointAddress != nil { 169 if n.HealthEndpointAddress.IPV4 != nil { 170 nodes[ipString(n.HealthEndpointAddress.IPV4.IP)] = NewHealthNode(n) 171 } 172 if n.HealthEndpointAddress.IPV6 != nil { 173 nodes[ipString(n.HealthEndpointAddress.IPV6.IP)] = NewHealthNode(n) 174 } 175 } 176 } 177 return nodes 178 } 179 180 // updateCluster makes the specified health report visible to the API. 181 // 182 // It only updates the server's API-visible health report if the provided 183 // report started after the current report. 184 func (s *Server) updateCluster(report *healthReport) { 185 s.Lock() 186 defer s.Unlock() 187 188 if s.connectivity.startTime.Before(report.startTime) { 189 s.connectivity = report 190 } 191 } 192 193 // GetStatusResponse returns the most recent cluster connectivity status. 194 func (s *Server) GetStatusResponse() *healthModels.HealthStatusResponse { 195 s.RLock() 196 defer s.RUnlock() 197 198 var name string 199 // Check if localStatus is populated already. If not, the name is empty 200 if s.localStatus != nil { 201 name = s.localStatus.Name 202 } 203 204 return &healthModels.HealthStatusResponse{ 205 Local: &healthModels.SelfStatus{ 206 Name: name, 207 }, 208 Nodes: s.connectivity.nodes, 209 Timestamp: s.connectivity.startTime.Format(time.RFC3339), 210 } 211 } 212 213 // FetchStatusResponse updates the cluster with the latest set of nodes, 214 // runs a synchronous probe across the cluster, updates the connectivity cache 215 // and returns the results. 216 func (s *Server) FetchStatusResponse() (*healthModels.HealthStatusResponse, error) { 217 nodes, err := s.getAllNodes() 218 if err != nil { 219 return nil, err 220 } 221 222 prober := newProber(s, nodes) 223 if err := prober.Run(); err != nil { 224 log.WithError(err).Info("Failed to run ping") 225 return nil, err 226 } 227 log.Debug("Run complete") 228 s.updateCluster(prober.getResults()) 229 230 return s.GetStatusResponse(), nil 231 } 232 233 // Run services that are actively probing other hosts and endpoints over 234 // ICMP and HTTP, and hosting the health admin API on a local Unix socket. 235 // Blocks indefinitely, or returns any errors that occur hosting the Unix 236 // socket API server. 237 func (s *Server) runActiveServices() error { 238 // Run it once at the start so we get some initial status 239 s.FetchStatusResponse() 240 241 // We can safely ignore nodesRemoved since it's the first time we are 242 // fetching the nodes from the server. 243 nodesAdded, _, _ := s.getNodes() 244 prober := newProber(s, nodesAdded) 245 prober.MaxRTT = s.ProbeInterval 246 prober.OnIdle = func() { 247 // Fetch results and update set of nodes to probe every 248 // ProbeInterval 249 s.updateCluster(prober.getResults()) 250 if nodesAdded, nodesRemoved, err := s.getNodes(); err != nil { 251 log.WithError(err).Error("unable to get cluster nodes") 252 } else { 253 prober.setNodes(nodesAdded, nodesRemoved) 254 } 255 } 256 prober.RunLoop() 257 defer prober.Stop() 258 259 return s.Server.Serve() 260 } 261 262 // Serve spins up the following goroutines: 263 // * TCP API Server: Responders to the health API "/hello" message, one per path 264 // * Prober: Periodically run pings across the cluster at a configured interval 265 // and update the server's connectivity status cache. 266 // * Unix API Server: Handle all health API requests over a unix socket. 267 // 268 // Callers should first defer the Server.Shutdown(), then call Serve(). 269 func (s *Server) Serve() (err error) { 270 errors := make(chan error) 271 272 for i := range s.tcpServers { 273 srv := s.tcpServers[i] 274 go func() { 275 errors <- srv.Serve() 276 }() 277 } 278 279 go func() { 280 errors <- s.runActiveServices() 281 }() 282 283 // Block for the first error, then return. 284 err = <-errors 285 return err 286 } 287 288 // Shutdown server and clean up resources 289 func (s *Server) Shutdown() { 290 for i := range s.tcpServers { 291 s.tcpServers[i].Shutdown() 292 } 293 s.Server.Shutdown() 294 } 295 296 // newServer instantiates a new instance of the health API server on the 297 // defaults unix socket. 298 func (s *Server) newServer(spec *loads.Document) *healthApi.Server { 299 api := restapi.NewCiliumHealthAPI(spec) 300 api.Logger = log.Printf 301 302 // Admin API 303 api.GetHealthzHandler = NewGetHealthzHandler(s) 304 api.ConnectivityGetStatusHandler = NewGetStatusHandler(s) 305 api.ConnectivityPutStatusProbeHandler = NewPutStatusProbeHandler(s) 306 307 srv := healthApi.NewServer(api) 308 srv.EnabledListeners = []string{"unix"} 309 srv.SocketPath = flags.Filename(defaults.SockPath) 310 311 srv.ConfigureAPI() 312 313 return srv 314 } 315 316 // NewServer creates a server to handle health requests. 317 func NewServer(config Config) (*Server, error) { 318 server := &Server{ 319 startTime: time.Now(), 320 Config: config, 321 tcpServers: []*responder.Server{}, 322 connectivity: &healthReport{}, 323 } 324 325 swaggerSpec, err := loads.Analyzed(healthApi.SwaggerJSON, "") 326 if err != nil { 327 return nil, err 328 } 329 330 cl, err := ciliumPkg.NewClient(config.CiliumURI) 331 if err != nil { 332 return nil, err 333 } 334 335 server.Client = cl 336 server.Server = *server.newServer(swaggerSpec) 337 338 for port := range PortToPaths { 339 srv := responder.NewServer(port) 340 server.tcpServers = append(server.tcpServers, srv) 341 } 342 343 return server, nil 344 }