github.com/decred/dcrlnd@v0.7.6/healthcheck/healthcheck.go

// Package healthcheck contains a monitor which periodically runs a set of
// configured liveliness checks. If a check fails after its configured number
// of allowed call attempts, the monitor will request shutdown using the
// function it is provided in its config. Checks are dispatched in their own
// goroutines so that they do not block each other.
package healthcheck

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/decred/dcrlnd/ticker"
)

// Config contains configuration settings for our monitor.
type Config struct {
	// Checks is a set of health checks that assert that lnd has access to
	// critical resources.
	Checks []*Observation

	// Shutdown should be called to request safe shutdown on failure of a
	// health check.
	Shutdown shutdownFunc
}

// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown.
type shutdownFunc func(format string, params ...interface{})

// Monitor periodically runs a series of configured liveliness checks to
// ensure that lnd has access to all critical resources.
type Monitor struct {
	started int32 // To be used atomically.
	stopped int32 // To be used atomically.

	cfg *Config

	quit chan struct{}
	wg   sync.WaitGroup
}

// NewMonitor returns a monitor with the provided config.
func NewMonitor(cfg *Config) *Monitor {
	return &Monitor{
		cfg:  cfg,
		quit: make(chan struct{}),
	}
}

// Start launches the goroutines required to run our monitor.
func (m *Monitor) Start() error {
	if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
		return errors.New("monitor already started")
	}

	// Run through all of the health checks that we have configured and
	// start a goroutine for each check.
	for _, check := range m.cfg.Checks {
		check := check

		// Skip over health checks that are disabled by setting zero
		// attempts.
		if check.Attempts == 0 {
			log.Warnf("check: %v configured with 0 attempts, "+
				"skipping it", check.Name)

			continue
		}

		m.wg.Add(1)
		go func() {
			defer m.wg.Done()
			check.monitor(m.cfg.Shutdown, m.quit)
		}()
	}

	return nil
}

// Stop signals all goroutines to exit and waits for them to finish.
func (m *Monitor) Stop() error {
	if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
		return fmt.Errorf("monitor already stopped")
	}

	log.Info("Health monitor shutting down")

	close(m.quit)
	m.wg.Wait()

	return nil
}

// CreateCheck is a helper function that takes a function that produces an
// error and wraps it in a function that returns its result on an error
// channel. We do not add the goroutine running our checkFunc to a wait group
// because we expect to be dealing with health checks that may block; if we
// waited for them, we might wait forever. Ideally future health checks will
// allow callers to cancel them early, and then we can wait group this.
func CreateCheck(checkFunc func() error) func() chan error {
	return func() chan error {
		errChan := make(chan error, 1)
		go func() {
			errChan <- checkFunc()
		}()

		return errChan
	}
}
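// exampleBlockingCheck is an illustrative sketch, not part of the original
// file. It shows the shape of check function that CreateCheck expects: a
// plain func() error that may block and that returns nil when the resource
// is healthy. The sleep below is a made-up stand-in for real work such as
// querying a chain backend or statting a data directory.
func exampleBlockingCheck() error {
	// Stand-in for a potentially slow call; a real check would probe an
	// external resource and return an error on failure.
	time.Sleep(10 * time.Millisecond)
	return nil
}

// exampleWrappedCheck (also illustrative only) shows exampleBlockingCheck
// wrapped by CreateCheck; the resulting func() chan error is what an
// Observation's Check field expects.
var exampleWrappedCheck = CreateCheck(exampleBlockingCheck)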
// Observation represents a liveliness check that we periodically run.
type Observation struct {
	// Name describes the health check.
	Name string

	// Check runs the health check itself, returning an error channel that
	// is expected to receive nil or an error.
	Check func() chan error

	// Interval is a ticker which triggers running our check function. This
	// ticker must be started and stopped by the observation.
	Interval ticker.Ticker

	// Attempts is the number of calls we make for a single check before
	// failing.
	Attempts int

	// Timeout is the amount of time we allow our check function to take
	// before we time it out.
	Timeout time.Duration

	// Backoff is the amount of time we back off between retries for failed
	// checks.
	Backoff time.Duration
}

// NewObservation creates an observation.
func NewObservation(name string, check func() error, interval,
	timeout, backoff time.Duration, attempts int) *Observation {

	return &Observation{
		Name:     name,
		Check:    CreateCheck(check),
		Interval: ticker.New(interval),
		Attempts: attempts,
		Timeout:  timeout,
		Backoff:  backoff,
	}
}

// String returns a string representation of an observation.
func (o *Observation) String() string {
	return o.Name
}

// monitor executes a health check every time its interval ticks until the quit
// channel signals that we should shut down. This function is also responsible
// for starting and stopping our ticker.
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
	log.Debugf("Monitoring: %v", o)

	o.Interval.Resume()
	defer o.Interval.Stop()

	for {
		select {
		case <-o.Interval.Ticks():
			// retryCheck returns true when the maximum number of
			// attempts has been reached. In that case we exit,
			// and our deferred call stops the ticker.
			if o.retryCheck(quit, shutdown) {
				log.Debugf("Health check: max attempts " +
					"failed, monitor exiting")
				return
			}

		// Exit if we receive the instruction to shut down.
		case <-quit:
			log.Debug("Health check: monitor quit")
			return
		}
	}
}
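// worstCaseCheckTime is an illustrative helper, not part of the original
// file: it spells out the arithmetic implied by retryCheck below. A failing
// check can occupy the monitor for up to Attempts*Timeout spent waiting on
// check calls plus (Attempts-1)*Backoff spent waiting between them; for
// example, 3 attempts with a 30s timeout and 5s backoff can take up to 100s
// before shutdown is requested.
func (o *Observation) worstCaseCheckTime() time.Duration {
	if o.Attempts == 0 {
		return 0
	}

	return time.Duration(o.Attempts)*o.Timeout +
		time.Duration(o.Attempts-1)*o.Backoff
}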
// retryCheck calls a check function until it succeeds, or until we reach our
// configured number of attempts, waiting for our backoff period between
// failed calls. If we fail to obtain a passing health check after the allowed
// number of calls, we request shutdown. It returns a bool indicating whether
// the maximum number of attempts was reached.
func (o *Observation) retryCheck(quit chan struct{},
	shutdown shutdownFunc) bool {

	var count int

	for count < o.Attempts {
		// Increment our call count and call the health check endpoint.
		count++

		// Wait for our check to return, timeout to elapse, or quit
		// signal to be received.
		var err error
		select {
		case err = <-o.Check():

		case <-time.After(o.Timeout):
			err = fmt.Errorf("health check: %v timed out after: "+
				"%v", o, o.Timeout)

		case <-quit:
			log.Debug("Health check: monitor quit")
			return false
		}

		// If our error is nil, we have passed our health check, so we
		// can exit.
		if err == nil {
			return false
		}

		// If we have reached our allowed number of attempts, this
		// check has failed so we request shutdown.
		if count == o.Attempts {
			shutdown("Health check: %v failed after %v "+
				"calls", o, o.Attempts)

			return true
		}

		log.Infof("Health check: %v, call: %v failed with: %v, "+
			"backing off for: %v", o, count, err, o.Backoff)

		// If we are still within the number of calls allowed for this
		// check, we wait for our backoff period to elapse, or exit if
		// we get the signal to shut down.
		select {
		case <-time.After(o.Backoff):

		case <-quit:
			log.Debug("Health check: monitor quit")
			return false
		}
	}

	return false
}
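// exampleUsage is an illustrative sketch only, not part of the original file.
// It shows how a caller might wire a single health check into a Monitor; the
// check, its schedule, and the shutdown callback are all assumptions made up
// for this example.
func exampleUsage() error {
	chainCheck := NewObservation(
		"chain backend",             // Name used in log messages.
		func() error { return nil }, // Blocking check; nil means healthy.
		time.Minute,                 // Interval between checks.
		30*time.Second,              // Timeout for a single call.
		5*time.Second,               // Backoff between failed calls.
		3,                           // Attempts before requesting shutdown.
	)

	monitor := NewMonitor(&Config{
		Checks: []*Observation{chainCheck},
		Shutdown: func(format string, params ...interface{}) {
			log.Infof("Shutting down: "+format, params...)
		},
	})

	if err := monitor.Start(); err != nil {
		return err
	}

	// ... run until shutdown is requested ...

	return monitor.Stop()
}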