github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/status/health_check.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package status

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
)

type threshold struct {
	gauge bool
	min   int64
}

var (
	counterZero = threshold{}
	gaugeZero   = threshold{gauge: true}
)

// TODO(tschottdorf): I think we should just export the metric metadata from
// their respective packages and reference them here, instead of the
// duplication. It also seems useful to specify the metric type in the metadata
// so that we don't have to "guess" whether it's a gauge or counter. However
// there's some massaging for latency histograms that happens in NodeStatus,
// so the logic likely has to be moved up a bit. A thread not worth pulling on
// at the moment, I suppose.
//
// TODO(tschottdorf): there are some metrics that could be used in alerts but
// need special treatment. For example, we want to alert when compactions are
// queued but not processed over long periods of time, or when queues have a
// large backlog but show no sign of processing times.
var trackedMetrics = map[string]threshold{
	// Gauges.
	"ranges.unavailable":          gaugeZero,
	"ranges.underreplicated":      gaugeZero,
	"requests.backpressure.split": gaugeZero,
	"requests.slow.latch":         gaugeZero,
	"requests.slow.lease":         gaugeZero,
	"requests.slow.raft":          gaugeZero,
	// TODO(tbg): this fires too eagerly. On a large machine that can handle many
	// concurrent requests, we'll blow a limit that would be disastrous to a smaller
	// machine. This will be hard to fix. We could track the max goroutine count
	// seen or the growth in goroutine count (like the goroutine dumper does)
	// but it's unclear that this will ever make a good alert. CPU load might
	// work a lot better.
	// "sys.goroutines": {gauge: true, min: 5000},

	// Latencies (which are really histograms, but we get to see a fixed number
	// of percentiles as gauges).
	"raft.process.logcommit.latency-90": {gauge: true, min: int64(100 * time.Millisecond)},
	"round-trip-latency-p90":            {gauge: true, min: int64(time.Second)},

	// Counters.

	"liveness.heartbeatfailures": counterZero,
	"timeseries.write.errors":    counterZero,

	// Queue processing errors. This might be too aggressive. For example, if the
	// replicate queue is waiting for a split, does that generate an error? If so,
	// is that worth alerting about? We might need severities here at some point
	// or some other way to guard against "blips".
	//
	// TODO(tbg): as the comment above suspected, these were usually spammy and
	// not useful to the untrained eye.
	// "compactor.compactions.failure":       counterZero,
	// "queue.replicagc.process.failure":     counterZero,
	// "queue.raftlog.process.failure":       counterZero,
	// "queue.gc.process.failure":            counterZero,
	// "queue.split.process.failure":         counterZero,
	// "queue.replicate.process.failure":     counterZero,
	// "queue.raftsnapshot.process.failure":  counterZero,
	// "queue.tsmaintenance.process.failure": counterZero,
	// "queue.consistency.process.failure":   counterZero,

	// When there are more than 100 pending items in the Raft snapshot queue,
	// this is certainly worth pointing out.
	"queue.raftsnapshot.pending": {gauge: true, min: 100},
}

type metricsMap map[roachpb.StoreID]map[string]float64

// update takes a populated metrics map and extracts the tracked metrics. Gauges
// are returned verbatim, while for counters the diff between the last seen
// value is returned. Only values above the tracked threshold's min are
// reported, and the seen (non-relative) counter values are persisted for the
// next call.
func (d metricsMap) update(tracked map[string]threshold, m metricsMap) metricsMap {
	out := metricsMap{}
	for storeID := range m {
		for name, threshold := range tracked {
			val, ok := m[storeID][name]
			if !ok {
				continue
			}

			if !threshold.gauge {
				// For counters, report the delta relative to the previously
				// recorded value, and persist the absolute value for next time.
				prevVal, havePrev := d[storeID][name]
				if d[storeID] == nil {
					d[storeID] = map[string]float64{}
				}
				d[storeID][name] = val
				if havePrev {
					val -= prevVal
				} else {
					// Can't report the first time around if we don't know the previous
					// value of the counter.
					val = 0
				}
			}

			if val > float64(threshold.min) {
				if out[storeID] == nil {
					out[storeID] = map[string]float64{}
				}
				out[storeID][name] = val
			}
		}
	}
	return out
}

// A HealthChecker inspects the node metrics and optionally a NodeStatus for
// anomalous conditions that the operator should be alerted to.
type HealthChecker struct {
	mu struct {
		syncutil.Mutex
		metricsMap // the last recorded values of all counters
	}
	tracked map[string]threshold
}

// NewHealthChecker creates a new health checker that emits alerts whenever the
// given metrics exceed their thresholds. A threshold with gauge set marks a
// gauge, which is reported whenever it exceeds the threshold's min; otherwise
// the metric is treated as a counter and reports whenever it increases by more
// than min between consecutive calls of `CheckHealth`.
func NewHealthChecker(trackedMetrics map[string]threshold) *HealthChecker {
	h := &HealthChecker{tracked: trackedMetrics}
	h.mu.metricsMap = metricsMap{}
	return h
}

// CheckHealth performs a (cheap) health check.
func (h *HealthChecker) CheckHealth(
	ctx context.Context, nodeStatus statuspb.NodeStatus,
) statuspb.HealthCheckResult {
	h.mu.Lock()
	defer h.mu.Unlock()
	// Collect alerts for all tracked metrics that exceed their thresholds.
	var alerts []statuspb.HealthAlert

	// Node-level metrics are reported under the synthetic StoreID 0; store
	// metrics keep their own IDs.
	m := map[roachpb.StoreID]map[string]float64{
		0: nodeStatus.Metrics,
	}
	for _, storeStatus := range nodeStatus.StoreStatuses {
		m[storeStatus.Desc.StoreID] = storeStatus.Metrics
	}

	diffs := h.mu.update(h.tracked, m)

	for storeID, storeDiff := range diffs {
		for name, value := range storeDiff {
			alerts = append(alerts, statuspb.HealthAlert{
				StoreID:     storeID,
				Category:    statuspb.HealthAlert_METRICS,
				Description: name,
				Value:       value,
			})
		}
	}

	return statuspb.HealthCheckResult{Alerts: alerts}
}
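
// exampleCheckHealth is a hypothetical usage sketch and is not part of the
// original file: it shows how the HealthChecker above might be driven. Gauges
// such as round-trip-latency-p90 can alert on the very first call, while
// counters such as liveness.heartbeatfailures only alert once a previous value
// has been recorded, i.e. from the second call onward. The function name and
// the metric values below are assumptions made purely for illustration.
func exampleCheckHealth(ctx context.Context) []statuspb.HealthAlert {
	hc := NewHealthChecker(trackedMetrics)

	// First sample: the latency gauge is above its 1s threshold and alerts
	// immediately; the heartbeat-failure counter is merely recorded.
	first := statuspb.NodeStatus{
		Metrics: map[string]float64{
			"round-trip-latency-p90":     float64(2 * time.Second),
			"liveness.heartbeatfailures": 3,
		},
	}
	_ = hc.CheckHealth(ctx, first)

	// Second sample: the counter grew by 2 since the previous call and now
	// produces an alert with Value 2; the gauge has recovered and stays quiet.
	second := statuspb.NodeStatus{
		Metrics: map[string]float64{
			"round-trip-latency-p90":     float64(200 * time.Millisecond),
			"liveness.heartbeatfailures": 5,
		},
	}
	return hc.CheckHealth(ctx, second).Alerts
}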