github.com/decred/dcrlnd@v0.7.6/healthcheck/healthcheck.go

// Package healthcheck contains a monitor which takes a set of liveliness checks
// which it periodically checks. If a check fails after its configured number
// of allowed call attempts, the monitor will request shutdown using the
// function it is provided in its config. Checks are dispatched in their own
// goroutines so that they do not block each other.
package healthcheck

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/decred/dcrlnd/ticker"
)

// Config contains configuration settings for our monitor.
type Config struct {
	// Checks is a set of health checks that assert that lnd has access to
	// critical resources.
	Checks []*Observation

	// Shutdown should be called to request safe shutdown on failure of a
	// health check.
	Shutdown shutdownFunc
}

// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown.
type shutdownFunc func(format string, params ...interface{})

// Monitor periodically checks a series of configured liveliness checks to
// ensure that lnd has access to all critical resources.
type Monitor struct {
	started int32 // To be used atomically.
	stopped int32 // To be used atomically.

	cfg *Config

	quit chan struct{}
	wg   sync.WaitGroup
}

// NewMonitor returns a monitor with the provided config.
func NewMonitor(cfg *Config) *Monitor {
	return &Monitor{
		cfg:  cfg,
		quit: make(chan struct{}),
	}
}

// Start launches the goroutines required to run our monitor.
func (m *Monitor) Start() error {
	if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
		return errors.New("monitor already started")
	}

	// Run through all of the health checks that we have configured and
	// start a goroutine for each check.
	for _, check := range m.cfg.Checks {
		// Pin the loop variable so that each goroutine launched below
		// captures its own check rather than the shared iterator
		// value.
		check := check

		// Skip over health checks that are disabled by setting zero
		// attempts.
		if check.Attempts == 0 {
			log.Warnf("check: %v configured with 0 attempts, "+
				"skipping it", check.Name)

			continue
		}

		m.wg.Add(1)
		go func() {
			defer m.wg.Done()
			check.monitor(m.cfg.Shutdown, m.quit)
		}()
	}

	return nil
}

// Stop signals all of our goroutines to exit and waits for them to finish.
func (m *Monitor) Stop() error {
	if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
		return errors.New("monitor already stopped")
	}

	log.Info("Health monitor shutting down")

	close(m.quit)
	m.wg.Wait()

	return nil
}

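// exampleLifecycle is an illustrative sketch, not part of the original file:
// it shows one way a caller might wire the monitor up. The checkDiskSpace
// probe and all durations here are assumptions chosen for illustration.
func exampleLifecycle(checkDiskSpace func() error) error {
	monitor := NewMonitor(&Config{
		Checks: []*Observation{
			NewObservation(
				"disk space",   // name used in log messages
				checkDiskSpace, // hypothetical probe
				time.Minute,    // interval between checks
				time.Second*30, // timeout for a single call
				time.Second*5,  // backoff between failed calls
				3,              // attempts before shutdown
			),
		},
		// A real caller would request a graceful daemon shutdown
		// here; printing is a stand-in.
		Shutdown: func(format string, params ...interface{}) {
			fmt.Printf(format+"\n", params...)
		},
	})

	if err := monitor.Start(); err != nil {
		return err
	}

	// Stop signals every check goroutine and waits for it to exit.
	return monitor.Stop()
}
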
// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
// to be dealing with health checks that may block; if we wait group them, we
// may wait forever. Ideally future health checks will allow callers to cancel
// them early, and we can wait group this.
func CreateCheck(checkFunc func() error) func() chan error {
	return func() chan error {
		errChan := make(chan error, 1)
		go func() {
			errChan <- checkFunc()
		}()

		return errChan
	}
}

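// exampleTimedCheck is an illustrative sketch, not part of the original file:
// because CreateCheck reports on a buffered channel, a caller can bound a
// potentially blocking probe with a timeout, mirroring what retryCheck does
// below. The one second timeout is an assumption.
func exampleTimedCheck(probe func() error) error {
	check := CreateCheck(probe)

	select {
	case err := <-check():
		// The probe returned in time; err is nil on success.
		return err

	case <-time.After(time.Second):
		// The goroutine running the probe may still be blocked, but
		// the buffered channel lets it exit once the probe returns.
		return fmt.Errorf("health check timed out")
	}
}
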
// Observation represents a liveliness check that we periodically run.
type Observation struct {
	// Name describes the health check.
	Name string

	// Check runs the health check itself, returning an error channel that
	// is expected to receive nil or an error.
	Check func() chan error

	// Interval is a ticker which triggers running our check function. This
	// ticker must be started and stopped by the observation.
	Interval ticker.Ticker

	// Attempts is the number of calls we make for a single check before
	// failing.
	Attempts int

	// Timeout is the amount of time we allow our check function to take
	// before we time it out.
	Timeout time.Duration

	// Backoff is the amount of time we back off between retries for failed
	// checks.
	Backoff time.Duration
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
	timeout, backoff time.Duration, attempts int) *Observation {

	return &Observation{
		Name:     name,
		Check:    CreateCheck(check),
		Interval: ticker.New(interval),
		Attempts: attempts,
		Timeout:  timeout,
		Backoff:  backoff,
	}
}

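// exampleTestObservation is an illustrative sketch, not part of the original
// file: because Interval is a ticker.Ticker interface, a test can build an
// Observation directly and drive its ticks deterministically. This assumes
// the ticker package exposes a NewForce constructor with a Force channel, as
// in upstream lnd; the field values are arbitrary.
func exampleTestObservation(probe func() error) *Observation {
	// The hour-long interval never fires on its own; the test sends on
	// forceTicker.Force to trigger a check.
	forceTicker := ticker.NewForce(time.Hour)

	return &Observation{
		Name:     "forced check",
		Check:    CreateCheck(probe),
		Interval: forceTicker,
		Attempts: 2,
		Timeout:  time.Second,
		Backoff:  time.Millisecond * 100,
	}
}
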
// String returns a string representation of an observation.
func (o *Observation) String() string {
	return o.Name
}

// monitor executes a health check every time its interval ticks until the quit
// channel signals that we should shut down. This function is also responsible
// for starting and stopping our ticker.
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
	log.Debugf("Monitoring: %v", o)

	o.Interval.Resume()
	defer o.Interval.Stop()

	for {
		select {
		case <-o.Interval.Ticks():
			// retryCheck returns true when the check has failed
			// after the max number of attempts. In that case we
			// stop the ticker and exit.
			if o.retryCheck(quit, shutdown) {
				log.Debugf("Health check: max attempts " +
					"failed, monitor exiting")
				return
			}

		// Exit if we receive the instruction to shut down.
		case <-quit:
			log.Debug("Health check: monitor quit")
			return
		}
	}
}

// retryCheck calls a check function until it succeeds, or we reach our
// configured number of attempts, waiting for our backoff period between failed
// calls. If we fail to obtain a passing health check after the allowed number
// of calls, we will request shutdown. It returns a bool to indicate whether
// the max number of attempts has been reached; in the worst case a single run
// blocks for roughly Attempts*Timeout + (Attempts-1)*Backoff before shutdown
// is requested.
func (o *Observation) retryCheck(quit chan struct{},
	shutdown shutdownFunc) bool {

	var count int

	for count < o.Attempts {
		// Increment our call count and call the health check endpoint.
		count++

		// Wait for our check to return, timeout to elapse, or quit
		// signal to be received.
		var err error
		select {
		case err = <-o.Check():

		case <-time.After(o.Timeout):
			err = fmt.Errorf("health check: %v timed out after: "+
				"%v", o, o.Timeout)

		case <-quit:
			log.Debug("Health check: monitor quit")
			return false
		}

		// If our error is nil, we have passed our health check, so we
		// can exit.
		if err == nil {
			return false
		}

		// If we have reached our allowed number of attempts, this
		// check has failed, so we request shutdown.
		if count == o.Attempts {
			shutdown("Health check: %v failed after %v "+
				"calls", o, o.Attempts)
			return true
		}

		log.Infof("Health check: %v, call: %v failed with: %v, "+
			"backing off for: %v", o, count, err, o.Backoff)

		// If we are still within the number of calls allowed for this
		// check, we wait for our backoff period to elapse, or exit if
		// we get the signal to shut down.
		select {
		case <-time.After(o.Backoff):

		case <-quit:
			log.Debug("Health check: monitor quit")
			return false
		}
	}

	return false
}