github.com/elfadel/cilium@v1.6.12/pkg/status/status.go (about)

     1  // Copyright 2018 Authors of Cilium
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package status
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"time"
    21  
    22  	"github.com/sirupsen/logrus"
    23  
    24  	"github.com/cilium/cilium/pkg/defaults"
    25  	"github.com/cilium/cilium/pkg/lock"
    26  	"github.com/cilium/cilium/pkg/logging"
    27  	"github.com/cilium/cilium/pkg/logging/logfields"
    28  )
    29  
    30  const (
    31  	subsystem = "status"
    32  )
    33  
    34  var (
    35  	log = logging.DefaultLogger.WithField(logfields.LogSubsys, subsystem)
    36  )
    37  
// Status describes the outcome of a probe run. It is passed to the probe's
// OnStatusUpdate callback by the collector.
type Status struct {
	// Data is the value returned by Probe() when the probe completed
	// without error. It is nil when the probe failed or became stale.
	Data interface{}

	// Err is non-nil if either the probe failed or the FailureThreshold
	// has been reached without the probe returning
	Err error

	// StaleWarning is true when the status is reported because the
	// FailureThreshold has been reached while the probe was still running
	StaleWarning bool
}
    51  
// Probe is run by the collector in its own goroutine, waiting for an interval
// between invocations
type Probe struct {
	// Name identifies the probe in log messages and is the key under which
	// the collector tracks the probe's start time and staleness.
	// NOTE(review): names presumably have to be unique within a collector,
	// as the bookkeeping maps are keyed by Name only -- confirm.
	Name string

	// Probe is the function performing the actual check. The context it
	// receives expires after the collector's FailureThreshold and is
	// cancelled once the collector stops waiting for the probe.
	Probe func(ctx context.Context) (interface{}, error)

	// OnStatusUpdate is called with the result of a probe run (data or
	// error), and with a stale status if the probe has not returned within
	// the FailureThreshold
	OnStatusUpdate func(status Status)

	// Interval allows to specify a probe specific interval that can be
	// mutated based on whether the probe is failing or based on external
	// factors such as current cluster size. It is called with the current
	// number of consecutive failures. If nil, the collector's configured
	// Interval is used instead.
	Interval func(failures int) time.Duration

	// consecutiveFailures is the number of consecutive failures in the
	// probe becoming stale or failing. It is managed by
	// updateProbeStatus()
	consecutiveFailures int
}
    71  
// Collector concurrently runs probes used to check status of various subsystems
//
// Fields:
//   - config holds the thresholds and the default interval, with zero
//     values replaced by defaults in NewCollector().
//   - stop is closed by Close() to signal all probe goroutines to exit.
//   - staleProbes is the set of probe names that have exceeded the
//     FailureThreshold and have not reported a result since.
//   - probeStartTime maps a probe name to the time its most recent run was
//     started.
type Collector struct {
	lock.RWMutex   // protects staleProbes and probeStartTime
	config         Config
	stop           chan struct{}
	staleProbes    map[string]struct{}
	probeStartTime map[string]time.Time
}
    80  
// Config is the collector configuration
//
// Any field left at its zero value is replaced by the corresponding default
// from the defaults package in NewCollector().
//
//   - WarningThreshold is how long a probe may run before a warning is
//     logged.
//   - FailureThreshold is the timeout of the context passed to a probe; once
//     it is reached the probe is reported as stale.
//   - Interval is the time to wait between two runs of a probe which does
//     not define its own Interval function.
type Config struct {
	WarningThreshold time.Duration
	FailureThreshold time.Duration
	Interval         time.Duration
}
    87  
    88  // NewCollector creates a collector and starts the given probes.
    89  //
    90  // Each probe runs in a separate goroutine.
    91  func NewCollector(probes []Probe, config Config) *Collector {
    92  	c := &Collector{
    93  		config:         config,
    94  		stop:           make(chan struct{}),
    95  		staleProbes:    make(map[string]struct{}),
    96  		probeStartTime: make(map[string]time.Time),
    97  	}
    98  
    99  	if c.config.Interval == time.Duration(0) {
   100  		c.config.Interval = defaults.StatusCollectorInterval
   101  	}
   102  
   103  	if c.config.FailureThreshold == time.Duration(0) {
   104  		c.config.FailureThreshold = defaults.StatusCollectorFailureThreshold
   105  	}
   106  
   107  	if c.config.WarningThreshold == time.Duration(0) {
   108  		c.config.WarningThreshold = defaults.StatusCollectorWarningThreshold
   109  	}
   110  
   111  	for i := range probes {
   112  		c.spawnProbe(&probes[i])
   113  	}
   114  
   115  	return c
   116  }
   117  
   118  // Close exits all probes and shuts down the collector
   119  // TODO(brb): call it when daemon exits (after GH#6248).
   120  func (c *Collector) Close() {
   121  	close(c.stop)
   122  }
   123  
   124  // GetStaleProbes returns a map of stale probes which key is a probe name and
   125  // value is a time when the last instance of the probe has been started.
   126  //
   127  // A probe is declared stale if it hasn't returned in FailureThreshold.
   128  func (c *Collector) GetStaleProbes() map[string]time.Time {
   129  	c.RLock()
   130  	defer c.RUnlock()
   131  
   132  	probes := make(map[string]time.Time)
   133  
   134  	for p := range c.staleProbes {
   135  		probes[p] = c.probeStartTime[p]
   136  	}
   137  
   138  	return probes
   139  }
   140  
   141  // spawnProbe starts a goroutine which invokes the probe at the particular interval.
   142  func (c *Collector) spawnProbe(p *Probe) {
   143  	go func() {
   144  		for {
   145  			c.runProbe(p)
   146  
   147  			interval := c.config.Interval
   148  			if p.Interval != nil {
   149  				interval = p.Interval(p.consecutiveFailures)
   150  			}
   151  
   152  			select {
   153  			case <-c.stop:
   154  				// collector is closed, stop looping
   155  				return
   156  			case <-time.After(interval):
   157  				// keep looping
   158  			}
   159  		}
   160  	}()
   161  }
   162  
   163  // runProbe runs the given probe, and returns either after the probe has returned
   164  // or after the collector has been closed.
   165  func (c *Collector) runProbe(p *Probe) {
   166  	var (
   167  		statusData       interface{}
   168  		err              error
   169  		warningThreshold = time.After(c.config.WarningThreshold)
   170  		hardTimeout      = false
   171  		probeReturned    = make(chan struct{}, 1)
   172  		ctx, cancel      = context.WithTimeout(context.Background(), c.config.FailureThreshold)
   173  		ctxTimeout       = make(chan struct{}, 1)
   174  	)
   175  
   176  	c.Lock()
   177  	c.probeStartTime[p.Name] = time.Now()
   178  	c.Unlock()
   179  
   180  	go func() {
   181  		statusData, err = p.Probe(ctx)
   182  		close(probeReturned)
   183  	}()
   184  
   185  	go func() {
   186  		// Once ctx.Done() has been closed, we notify the polling loop by
   187  		// sending to the ctxTimeout channel. We cannot just close the channel,
   188  		// because otherwise the loop will always enter the "<-ctxTimeout" case.
   189  		<-ctx.Done()
   190  		ctxTimeout <- struct{}{}
   191  	}()
   192  
   193  	// This is a loop so that, when we hit a FailureThreshold, we still do
   194  	// not return until the probe returns. This is to ensure the same probe
   195  	// does not run again while it is blocked.
   196  	for {
   197  		select {
   198  		case <-c.stop:
   199  			// Collector was closed. The probe will complete in the background
   200  			// and won't be restarted again.
   201  			cancel()
   202  			return
   203  
   204  		case <-warningThreshold:
   205  			// Just warn and continue waiting for probe
   206  			log.WithField(logfields.Probe, p.Name).
   207  				Warnf("No response from probe within %v seconds",
   208  					c.config.WarningThreshold.Seconds())
   209  
   210  		case <-probeReturned:
   211  			// The probe completed and we can return from runProbe
   212  			switch {
   213  			case hardTimeout:
   214  				// FailureThreshold was already reached. Keep the failure error
   215  				// message
   216  			case err != nil:
   217  				c.updateProbeStatus(p, nil, false, err)
   218  			default:
   219  				c.updateProbeStatus(p, statusData, false, nil)
   220  			}
   221  
   222  			cancel()
   223  			return
   224  
   225  		case <-ctxTimeout:
   226  			// We have timed out. Report a status and mark that we timed out so we
   227  			// do not emit status later.
   228  			staleErr := fmt.Errorf("no response from %s probe within %v seconds",
   229  				p.Name, c.config.FailureThreshold.Seconds())
   230  			c.updateProbeStatus(p, nil, true, staleErr)
   231  			hardTimeout = true
   232  		}
   233  	}
   234  }
   235  
   236  func (c *Collector) updateProbeStatus(p *Probe, data interface{}, stale bool, err error) {
   237  	// Update stale status of the probe
   238  	c.Lock()
   239  	startTime := c.probeStartTime[p.Name]
   240  	if stale {
   241  		c.staleProbes[p.Name] = struct{}{}
   242  		p.consecutiveFailures++
   243  	} else {
   244  		delete(c.staleProbes, p.Name)
   245  		if err == nil {
   246  			p.consecutiveFailures = 0
   247  		} else {
   248  			p.consecutiveFailures++
   249  		}
   250  	}
   251  	c.Unlock()
   252  
   253  	if stale {
   254  		log.WithFields(logrus.Fields{
   255  			logfields.StartTime: startTime,
   256  			logfields.Probe:     p.Name,
   257  		}).Warn("Timeout while waiting probe")
   258  	}
   259  
   260  	// Notify the probe about status update
   261  	p.OnStatusUpdate(Status{Err: err, Data: data, StaleWarning: stale})
   262  }