github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/status/health_check.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package status

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)

type threshold struct {
	gauge bool
	min   int64
}

var (
	counterZero = threshold{}
	gaugeZero   = threshold{gauge: true}
)

// TODO(tschottdorf): I think we should just export the metric metadata from
// their respective packages and reference them here, instead of duplicating
// them. It also seems useful to specify the metric type in the metadata so
// that we don't have to "guess" whether it's a gauge or counter. However,
// there's some massaging for latency histograms that happens in NodeStatus,
// so the logic likely has to be moved up a bit. A thread not worth pulling on
// at the moment, I suppose.
//
// TODO(tschottdorf): there are some metrics that could be used in alerts but
// need special treatment. For example, we want to alert when compactions are
// queued but not processed over long periods of time, or when queues have a
// large backlog but show no sign of processing.
var trackedMetrics = map[string]threshold{
	// Gauges.
	"ranges.unavailable":          gaugeZero,
	"ranges.underreplicated":      gaugeZero,
	"requests.backpressure.split": gaugeZero,
	"requests.slow.latch":         gaugeZero,
	"requests.slow.lease":         gaugeZero,
	"requests.slow.raft":          gaugeZero,
	// TODO(tbg): this fires too eagerly. On a large machine that can handle many
	// concurrent requests, we'll blow a limit that would be disastrous to a smaller
	// machine. This will be hard to fix. We could track the max goroutine count
	// seen or the growth in goroutine count (like the goroutine dumper does)
	// but it's unclear that this will ever make a good alert. CPU load might
	// work a lot better.
	// "sys.goroutines":              {gauge: true, min: 5000},

	// Latencies (which are really histograms, but we get to see a fixed number
	// of percentiles as gauges)
	"raft.process.logcommit.latency-90": {gauge: true, min: int64(100 * time.Millisecond)},
	"round-trip-latency-p90":            {gauge: true, min: int64(time.Second)},

	// Counters.

	"liveness.heartbeatfailures": counterZero,
	"timeseries.write.errors":    counterZero,

	// Queue processing errors. This might be too aggressive. For example, if the
	// replicate queue is waiting for a split, does that generate an error? If so,
	// is that worth alerting about? We might need severities here at some point
	// or some other way to guard against "blips".
	//
	// TODO(tbg): as the comment above suspected, these were usually spammy and
	// not useful to the untrained eye.
	// "compactor.compactions.failure":       counterZero,
	// "queue.replicagc.process.failure":     counterZero,
	// "queue.raftlog.process.failure":       counterZero,
	// "queue.gc.process.failure":            counterZero,
	// "queue.split.process.failure":         counterZero,
	// "queue.replicate.process.failure":     counterZero,
	// "queue.raftsnapshot.process.failure":  counterZero,
	// "queue.tsmaintenance.process.failure": counterZero,
	// "queue.consistency.process.failure":   counterZero,

	// When there are more than 100 pending items in the Raft snapshot queue,
	// this is certainly worth pointing out.
	"queue.raftsnapshot.pending": {gauge: true, min: 100},
}
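
// exampleExtraTracked is a hypothetical illustration (the metric name is made
// up and nothing references this variable) of how an additional entry would be
// expressed: gauges report whenever they exceed min, and latency minima follow
// the convention above of int64 nanoseconds derived from a time.Duration.
var exampleExtraTracked = map[string]threshold{
	"some.latency-p99": {gauge: true, min: int64(250 * time.Millisecond)},
}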

type metricsMap map[roachpb.StoreID]map[string]float64

// update takes a populated metrics map and extracts the tracked metrics. Gauges
// are returned verbatim, while counters are returned as the diff from their
// last seen value. Only values exceeding the threshold's minimum are reported,
// and the seen (absolute) counter values are persisted for the next call.
func (d metricsMap) update(tracked map[string]threshold, m metricsMap) metricsMap {
	out := metricsMap{}
	for storeID := range m {
		for name, threshold := range tracked {
			val, ok := m[storeID][name]
			if !ok {
				continue
			}

			if !threshold.gauge {
				prevVal, havePrev := d[storeID][name]
				if d[storeID] == nil {
					d[storeID] = map[string]float64{}
				}
				d[storeID][name] = val
				if havePrev {
					val -= prevVal
				} else {
					// Can't report the first time around if we don't know the previous
					// value of the counter.
					val = 0
				}
			}

			if val > float64(threshold.min) {
				if out[storeID] == nil {
					out[storeID] = map[string]float64{}
				}
				out[storeID][name] = val
			}
		}
	}
	return out
}
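
// exampleUpdateSemantics is an illustrative sketch of update's behavior, not
// used by the package: a counter's first observation is only recorded, a
// subsequent increment is reported as a delta, and a gauge is reported
// whenever it exceeds its threshold.
func exampleUpdateSemantics() metricsMap {
	prev := metricsMap{} // previously seen counter values, normally held under HealthChecker.mu
	tracked := map[string]threshold{
		"timeseries.write.errors": counterZero, // counter: report positive deltas
		"ranges.unavailable":      gaugeZero,   // gauge: report when > 0
	}

	// First scrape: the counter value is recorded but not reported; the nonzero
	// gauge is reported immediately.
	first := prev.update(tracked, metricsMap{
		1: {"timeseries.write.errors": 5, "ranges.unavailable": 2},
	})
	_ = first // {1: {"ranges.unavailable": 2}}

	// Second scrape: the counter grew by 3, so a delta of 3 is reported; the
	// gauge dropped to zero and is omitted.
	return prev.update(tracked, metricsMap{
		1: {"timeseries.write.errors": 8, "ranges.unavailable": 0},
	})
	// Result: {1: {"timeseries.write.errors": 3}}
}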

// A HealthChecker inspects the node metrics and optionally a NodeStatus for
// anomalous conditions that the operator should be alerted to.
type HealthChecker struct {
	mu struct {
		syncutil.Mutex
		metricsMap // the last recorded values of all counters
	}
	tracked map[string]threshold
}

// NewHealthChecker creates a new health checker that emits alerts whenever the
// given metrics exceed their thresholds. Setting a threshold's gauge field
// marks the metric as a gauge (in which case it is reported whenever it
// exceeds the minimum); otherwise the metric is treated as a counter and is
// reported whenever it is incremented between consecutive calls of
// `CheckHealth`.
func NewHealthChecker(trackedMetrics map[string]threshold) *HealthChecker {
	h := &HealthChecker{tracked: trackedMetrics}
	h.mu.metricsMap = metricsMap{}
	return h
}
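
// newDefaultHealthChecker is a minimal sketch of wiring the constructor to the
// package-level tracked set above; it is illustrative only and not necessarily
// how the server builds its checker. Because counter deltas depend on the state
// kept under mu, the same instance should be reused across consecutive
// CheckHealth calls.
func newDefaultHealthChecker() *HealthChecker {
	return NewHealthChecker(trackedMetrics)
}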

// CheckHealth performs a (cheap) health check.
func (h *HealthChecker) CheckHealth(
	ctx context.Context, nodeStatus statuspb.NodeStatus,
) statuspb.HealthCheckResult {
	h.mu.Lock()
	defer h.mu.Unlock()
	// Gauges that trigger alerts when nonzero.
	var alerts []statuspb.HealthAlert

	// Node-level metrics are reported under the zero StoreID; per-store metrics
	// are keyed by their actual StoreID.
	m := map[roachpb.StoreID]map[string]float64{
		0: nodeStatus.Metrics,
	}
	for _, storeStatus := range nodeStatus.StoreStatuses {
		m[storeStatus.Desc.StoreID] = storeStatus.Metrics
	}

	diffs := h.mu.update(h.tracked, m)

	for storeID, storeDiff := range diffs {
		for name, value := range storeDiff {
			alerts = append(alerts, statuspb.HealthAlert{
				StoreID:     storeID,
				Category:    statuspb.HealthAlert_METRICS,
				Description: name,
				Value:       value,
			})
		}
	}

	return statuspb.HealthCheckResult{Alerts: alerts}
}
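
// exampleCheckHealth is an illustrative sketch of driving the checker from a
// NodeStatus. The literal below only populates the fields that CheckHealth
// reads (Metrics; StoreStatuses is left empty); real callers pass the node's
// periodically generated status.
func exampleCheckHealth(ctx context.Context) []statuspb.HealthAlert {
	h := NewHealthChecker(trackedMetrics)

	status := statuspb.NodeStatus{
		Metrics: map[string]float64{"liveness.heartbeatfailures": 1},
	}

	// The first call only primes the counter baselines; a second call with an
	// incremented counter (or a nonzero tracked gauge) yields alerts.
	_ = h.CheckHealth(ctx, status)
	status.Metrics["liveness.heartbeatfailures"] = 3
	result := h.CheckHealth(ctx, status)
	return result.Alerts // one METRICS alert: Description "liveness.heartbeatfailures", Value 2
}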