github.com/elfadel/cilium@v1.6.12/pkg/health/server/prober.go

// Copyright 2017-2019 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package server

import (
	"net"
	"strconv"
	"strings"
	"time"

	"github.com/cilium/cilium/api/v1/health/models"
	ciliumModels "github.com/cilium/cilium/api/v1/models"
	"github.com/cilium/cilium/pkg/health/defaults"
	"github.com/cilium/cilium/pkg/health/probe"
	"github.com/cilium/cilium/pkg/lock"
	"github.com/cilium/cilium/pkg/logging/logfields"

	"github.com/servak/go-fastping"
	"github.com/sirupsen/logrus"
)

// healthReport is a snapshot of the health of the cluster.
type healthReport struct {
	startTime time.Time
	nodes     []*models.NodeStatus
}
type prober struct {
	*fastping.Pinger
	server *Server

	// 'stop' is closed upon a call to prober.Stop(). Once stopping has
	// finished, the channel returned by prober.Done() is closed.
	stop         chan bool
	proberExited chan bool
	done         chan bool

	// The lock protects multiple requests attempting to update the status
	// at the same time - i.e., it serializes updates between the periodic
	// prober and probes initiated via "GET /status/probe". It is also used
	// to coordinate updates of the ICMP responses and the HTTP responses.
	lock.RWMutex

	// start is the start time for the current probe cycle.
	start   time.Time
	results map[ipString]*models.PathStatus
	nodes   nodeMap
}

// copyResultRLocked makes a deep copy of the path status for the specified
// IP, so the caller can safely use it after the lock is released.
func (p *prober) copyResultRLocked(ip string) *models.PathStatus {
	status := p.results[ipString(ip)]
	if status == nil {
		return nil
	}

	result := &models.PathStatus{
		IP: ip,
	}
	paths := map[**models.ConnectivityStatus]*models.ConnectivityStatus{
		&result.Icmp: status.Icmp,
		&result.HTTP: status.HTTP,
	}
	for res, value := range paths {
		if value != nil {
			// Copy the ConnectivityStatus by value; '&*value' would
			// only alias the original pointer, not copy it.
			cs := *value
			*res = &cs
		}
	}
	return result
}

// getResults gathers a copy of all of the results for nodes currently in the
// cluster.
func (p *prober) getResults() *healthReport {
	p.RLock()
	defer p.RUnlock()

	// De-duplicate IPs in 'p.nodes' by building a map based on node.Name.
	resultMap := map[string]*models.NodeStatus{}
	for _, node := range p.nodes {
		if resultMap[node.Name] != nil {
			continue
		}
		primaryIP := node.PrimaryIP()
		healthIP := node.HealthIP()
		status := &models.NodeStatus{
			Name: node.Name,
			Host: &models.HostStatus{
				PrimaryAddress: p.copyResultRLocked(primaryIP),
			},
		}
		if healthIP != "" {
			status.Endpoint = p.copyResultRLocked(healthIP)
		}
		secondaryResults := []*models.PathStatus{}
		for _, addr := range node.SecondaryAddresses {
			if addr.Enabled {
				secondaryStatus := p.copyResultRLocked(addr.IP)
				secondaryResults = append(secondaryResults, secondaryStatus)
			}
		}
		status.Host.SecondaryAddresses = secondaryResults
		resultMap[node.Name] = status
	}

	result := &healthReport{startTime: p.start}
	for _, res := range resultMap {
		result.nodes = append(result.nodes, res)
	}
	return result
}

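// An illustrative consumer of the snapshot above (printing is for the
// sketch only; fields are as defined in the models package):
//
//	report := p.getResults()
//	for _, node := range report.nodes {
//		fmt.Printf("%s primary=%+v\n", node.Name, node.Host.PrimaryAddress)
//	}
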
func isIPv4(ip string) bool {
	netIP := net.ParseIP(ip)
	return netIP != nil && !strings.Contains(ip, ":")
}
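
// Behaviour sketch for the helper (the addresses are documentation examples,
// not values taken from this file):
//
//	isIPv4("192.0.2.1")   // true:  parses and contains no ':'
//	isIPv4("2001:db8::1") // false: an IPv6 literal keeps the "ip6:icmp" network
//	isIPv4("not-an-ip")   // false: fails to parse at all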

func skipAddress(elem *ciliumModels.NodeAddressingElement) bool {
	return elem == nil || !elem.Enabled || elem.IP == "<nil>"
}

// resolveIP attempts to validate the given node addressing element and, if
// successful, returns the node's name along with the resolved IP address.
// If validation fails or this IP should not be pinged, ("", nil) is returned.
func resolveIP(n *healthNode, addr *ciliumModels.NodeAddressingElement, proto string, primary bool) (string, *net.IPAddr) {
	node := n.NodeElement

	// Check the element before dereferencing it; 'addr' may be nil.
	if skipAddress(addr) {
		log.WithFields(logrus.Fields{
			logfields.NodeName: node.Name,
			"primary":          primary,
		}).Debug("Skipping probe for address")
		return "", nil
	}

	network := "ip6:icmp"
	if isIPv4(addr.IP) {
		network = "ip4:icmp"
	}
	scopedLog := log.WithFields(logrus.Fields{
		logfields.NodeName: node.Name,
		logfields.IPAddr:   addr.IP,
		"primary":          primary,
	})

	ra, err := net.ResolveIPAddr(network, addr.IP)
	if err != nil {
		scopedLog.Debug("Unable to resolve address")
		return "", nil
	}

	scopedLog.WithField("protocol", proto).Debug("Probing for connectivity to node")
	return node.Name, ra
}
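
// For illustration, a disabled element short-circuits to the ("", nil)
// contract (the element here is constructed purely for the example):
//
//	name, ra := resolveIP(&n, &ciliumModels.NodeAddressingElement{Enabled: false}, "icmp", true)
//	// name == "" && ra == nil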

// RemoveIP removes all traces of the specified IP from the prober, including
// clearing all cached results, the mapping from this IP to a node, and the
// entries in the ICMP and TCP pingers.
func (p *prober) RemoveIP(ip string) {
	nodeIP := ipString(ip)
	delete(p.results, nodeIP)
	p.Pinger.RemoveIP(ip)   // ICMP pinger
	delete(p.nodes, nodeIP) // TCP prober
}

// setNodes sets the list of nodes for the prober, and updates the pinger to
// start sending pings to all newly added nodes.
// Nodes in 'removed' are removed from the pinger so that pings to them stop.
// setNodes steals references to nodes in 'added', so the caller must not
// modify them after a call to setNodes.
// If a node is updated, it appears in both maps and is removed first, then
// re-added (potentially with different information).
func (p *prober) setNodes(added nodeMap, removed nodeMap) {
	p.Lock()
	defer p.Unlock()

	for _, n := range removed {
		for elem := range n.Addresses() {
			p.RemoveIP(elem.IP)
		}
	}

	for _, n := range added {
		for elem, primary := range n.Addresses() {
			_, addr := resolveIP(&n, elem, "icmp", primary)

			ip := ipString(elem.IP)
			result := &models.ConnectivityStatus{}
			if addr == nil {
				result.Status = "Failed to resolve IP"
			} else {
				result.Status = "Connection timed out"
				p.AddIPAddr(addr)
				p.nodes[ip] = n
			}

			if p.results[ip] == nil {
				p.results[ip] = &models.PathStatus{
					IP: elem.IP,
				}
			}
			p.results[ip].Icmp = result
		}
	}
}
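
// A sketch of the calling convention (the keys and values here are
// illustrative): an updated node appears in both maps and is removed before
// being re-added:
//
//	p.setNodes(
//		nodeMap{"192.0.2.1": updatedNode}, // nodes to add (or re-add)
//		nodeMap{"192.0.2.1": staleNode},   // nodes to remove first
//	)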

func (p *prober) httpProbe(node string, ip string, port int) *models.ConnectivityStatus {
	result := &models.ConnectivityStatus{}

	host := "http://" + net.JoinHostPort(ip, strconv.Itoa(port))
	scopedLog := log.WithFields(logrus.Fields{
		logfields.NodeName: node,
		logfields.IPAddr:   ip,
		"host":             host,
		"path":             PortToPaths[port],
	})

	scopedLog.Debug("Greeting host")
	start := time.Now()
	err := probe.GetHello(host)
	rtt := time.Since(start)
	if err == nil {
		scopedLog.WithField("rtt", rtt).Debug("Greeting successful")
		result.Status = ""
		result.Latency = rtt.Nanoseconds()
	} else {
		scopedLog.WithError(err).Debug("Greeting failed")
		result.Status = err.Error()
	}

	return result
}
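
// A minimal usage sketch: an empty Status marks success, in which case
// Latency carries the round-trip time in nanoseconds (values assumed):
//
//	st := p.httpProbe("node1", "192.0.2.1", defaults.HTTPPathPort)
//	if st.Status == "" {
//		rtt := time.Duration(st.Latency) // e.g. a few hundred µs on a healthy link
//		_ = rtt
//	}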

func (p *prober) getIPsByNode() map[string][]*net.IPAddr {
	p.RLock()
	defer p.RUnlock()

	// p.nodes maps from all known IPs to nodes in an N:M configuration,
	// so multiple IPs may refer to the same node. To ensure we only ping
	// each node once, deduplicate the nodes into a map of nodeName -> []IP.
	nodes := make(map[string][]*net.IPAddr)
	for _, node := range p.nodes {
		if nodes[node.Name] != nil {
			// Already handled this node.
			continue
		}
		nodes[node.Name] = []*net.IPAddr{}
		for elem, primary := range node.Addresses() {
			if _, addr := resolveIP(&node, elem, "http", primary); addr != nil {
				nodes[node.Name] = append(nodes[node.Name], addr)
			}
		}
	}

	return nodes
}

func (p *prober) runHTTPProbe() {
	startTime := time.Now()
	p.Lock()
	p.start = startTime
	p.Unlock()

	for name, ips := range p.getIPsByNode() {
		for _, ip := range ips {
			scopedLog := log.WithFields(logrus.Fields{
				logfields.NodeName: name,
				logfields.IPAddr:   ip.String(),
			})

			status := &models.PathStatus{}
			ports := map[int]**models.ConnectivityStatus{
				defaults.HTTPPathPort: &status.HTTP,
			}
			for port, result := range ports {
				*result = p.httpProbe(name, ip.String(), port)
				if status.HTTP.Status != "" {
					scopedLog.WithFields(logrus.Fields{
						logfields.Port: port,
					}).Debugf("Failed to probe: %s", status.HTTP.Status)
				}
			}

			peer := ipString(ip.String())
			p.Lock()
			if _, ok := p.results[peer]; ok {
				p.results[peer].HTTP = status.HTTP
			} else {
				// While we weren't holding the lock, the
				// pinger's OnIdle() callback fired and updated
				// the set of nodes to remove this node.
				scopedLog.Debug("Node disappeared before result written")
			}
			p.Unlock()
		}
	}
}

// Done returns a channel that is closed when RunLoop() is stopped by an
// error or a call to Stop(). It may only be called after RunLoop().
func (p *prober) Done() <-chan bool {
	return p.done
}

// Run sends a single round of probes out to all of the other cilium nodes to
// gather connectivity status for the cluster.
func (p *prober) Run() error {
	err := p.Pinger.Run()
	p.runHTTPProbe()
	return err
}

// Stop disrupts the currently running RunLoop(). This may only be called
// after a call to RunLoop().
func (p *prober) Stop() {
	p.Pinger.Stop()
	close(p.stop)
	<-p.proberExited
	close(p.done)
}

// RunLoop periodically sends probes out to all of the other cilium nodes to
// gather connectivity status for the cluster.
//
// This is a non-blocking method so it immediately returns. If you want to
// stop sending packets, call Stop().
func (p *prober) RunLoop() {
	// FIXME: Spread the probes out across the probing interval
	p.Pinger.RunLoop()

	go func() {
		tick := time.NewTicker(p.server.ProbeInterval)
	loop:
		for {
			select {
			case <-p.stop:
				break loop
			case <-tick.C:
				p.runHTTPProbe()
				continue
			}
		}
		tick.Stop()
		close(p.proberExited)
	}()
}

// newProber prepares a prober. The caller may invoke one of the Run* methods
// of the prober to populate its 'results' map.
func newProber(s *Server, nodes nodeMap) *prober {
	prober := &prober{
		Pinger:       fastping.NewPinger(),
		server:       s,
		done:         make(chan bool),
		proberExited: make(chan bool),
		stop:         make(chan bool),
		results:      make(map[ipString]*models.PathStatus),
		nodes:        make(nodeMap),
	}
	prober.MaxRTT = s.ProbeDeadline

	prober.setNodes(nodes, nil)
	prober.OnRecv = func(addr *net.IPAddr, rtt time.Duration) {
		prober.Lock()
		defer prober.Unlock()
		node, exists := prober.nodes[ipString(addr.String())]

		scopedLog := log.WithFields(logrus.Fields{
			logfields.IPAddr: addr,
			"rtt":            rtt,
		})
		if !exists {
			scopedLog.Debug("Node disappeared, skipping result")
			return
		}

		prober.results[ipString(addr.String())].Icmp = &models.ConnectivityStatus{
			Latency: rtt.Nanoseconds(),
			Status:  "",
		}
		scopedLog.WithFields(logrus.Fields{
			logfields.NodeName: node.Name,
		}).Debug("Probe successful")
	}

	return prober
}
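
// runProberExample is an illustrative lifecycle sketch, not part of the
// upstream API: it assumes a configured *Server and an initial nodeMap,
// both defined elsewhere in this package.
func runProberExample(s *Server, nodes nodeMap) {
	p := newProber(s, nodes)
	p.RunLoop()                     // non-blocking; starts ICMP and HTTP probing
	time.Sleep(2 * s.ProbeInterval) // let a couple of probe cycles complete
	report := p.getResults()        // snapshot of per-node connectivity
	_ = report

	p.Stop()   // stops the pinger and the HTTP probe loop
	<-p.Done() // closed once Stop() has finished
}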