# Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/cloud/kubernetes/prometheus/alert-rules.yaml
# GENERATED FILE - DO NOT EDIT
# NOTE(review): this file is marked as generated; any change made here
# (see InstanceFlapping below) must also be applied to the generator's input,
# otherwise it will be lost on the next regeneration.
#
# PrometheusRule resource holding three rule groups for CockroachDB:
#   * rules/dummy.rules        - a single unconditional alert
#   * rules/aggregation.rules  - recording rules (capacity roll-ups, latency quantiles)
#   * rules/alerts.rules       - the actual alerting rules
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app: cockroachdb
    prometheus: cockroachdb
    role: alert-rules
  name: prometheus-cockroachdb-rules
spec:
  groups:
  # vector(1) always yields a sample, so this alert is permanently firing.
  # Presumably used to verify the Alertmanager delivery path end to end.
  - name: rules/dummy.rules
    rules:
    - alert: TestAlertManager
      expr: vector(1)
  - name: rules/aggregation.rules
    rules:
    # Capacity roll-ups: per-store series are summed into per-node series
    # (dropping the `store` label), then into per-cluster series (dropping
    # the `instance` label).
    - expr: sum without(store) (capacity{job="cockroachdb"})
      record: node:capacity
    - expr: sum without(instance) (node:capacity{job="cockroachdb"})
      record: cluster:capacity
    - expr: sum without(store) (capacity_available{job="cockroachdb"})
      record: node:capacity_available
    - expr: sum without(instance) (node:capacity_available{job="cockroachdb"})
      record: cluster:capacity_available
    # Available/total ratios at store, node and cluster granularity.
    - expr: capacity_available{job="cockroachdb"} / capacity{job="cockroachdb"}
      record: capacity_available:ratio
    - expr: node:capacity_available{job="cockroachdb"} / node:capacity{job="cockroachdb"}
      record: node:capacity_available:ratio
    - expr: cluster:capacity_available{job="cockroachdb"} / cluster:capacity{job="cockroachdb"}
      record: cluster:capacity_available:ratio
    # For each latency histogram below: record the 1m per-bucket rate once,
    # then derive the 50/75/90/95/99th percentiles from that recorded rate.
    - expr: rate(txn_durations_bucket{job="cockroachdb"}[1m])
      record: txn_durations_bucket:rate1m
    - expr: histogram_quantile(0.5, txn_durations_bucket:rate1m)
      record: txn_durations:rate1m:quantile_50
    - expr: histogram_quantile(0.75, txn_durations_bucket:rate1m)
      record: txn_durations:rate1m:quantile_75
    - expr: histogram_quantile(0.9, txn_durations_bucket:rate1m)
      record: txn_durations:rate1m:quantile_90
    - expr: histogram_quantile(0.95, txn_durations_bucket:rate1m)
      record: txn_durations:rate1m:quantile_95
    - expr: histogram_quantile(0.99, txn_durations_bucket:rate1m)
      record: txn_durations:rate1m:quantile_99
    - expr: rate(exec_latency_bucket{job="cockroachdb"}[1m])
      record: exec_latency_bucket:rate1m
    - expr: histogram_quantile(0.5, exec_latency_bucket:rate1m)
      record: exec_latency:rate1m:quantile_50
    - expr: histogram_quantile(0.75, exec_latency_bucket:rate1m)
      record: exec_latency:rate1m:quantile_75
    - expr: histogram_quantile(0.9, exec_latency_bucket:rate1m)
      record: exec_latency:rate1m:quantile_90
    - expr: histogram_quantile(0.95, exec_latency_bucket:rate1m)
      record: exec_latency:rate1m:quantile_95
    - expr: histogram_quantile(0.99, exec_latency_bucket:rate1m)
      record: exec_latency:rate1m:quantile_99
    - expr: rate(round_trip_latency_bucket{job="cockroachdb"}[1m])
      record: round_trip_latency_bucket:rate1m
    - expr: histogram_quantile(0.5, round_trip_latency_bucket:rate1m)
      record: round_trip_latency:rate1m:quantile_50
    - expr: histogram_quantile(0.75, round_trip_latency_bucket:rate1m)
      record: round_trip_latency:rate1m:quantile_75
    - expr: histogram_quantile(0.9, round_trip_latency_bucket:rate1m)
      record: round_trip_latency:rate1m:quantile_90
    - expr: histogram_quantile(0.95, round_trip_latency_bucket:rate1m)
      record: round_trip_latency:rate1m:quantile_95
    - expr: histogram_quantile(0.99, round_trip_latency_bucket:rate1m)
      record: round_trip_latency:rate1m:quantile_99
    - expr: rate(sql_exec_latency_bucket{job="cockroachdb"}[1m])
      record: sql_exec_latency_bucket:rate1m
    - expr: histogram_quantile(0.5, sql_exec_latency_bucket:rate1m)
      record: sql_exec_latency:rate1m:quantile_50
    - expr: histogram_quantile(0.75, sql_exec_latency_bucket:rate1m)
      record: sql_exec_latency:rate1m:quantile_75
    - expr: histogram_quantile(0.9, sql_exec_latency_bucket:rate1m)
      record: sql_exec_latency:rate1m:quantile_90
    - expr: histogram_quantile(0.95, sql_exec_latency_bucket:rate1m)
      record: sql_exec_latency:rate1m:quantile_95
    - expr: histogram_quantile(0.99, sql_exec_latency_bucket:rate1m)
      record: sql_exec_latency:rate1m:quantile_99
    - expr: rate(raft_process_logcommit_latency_bucket{job="cockroachdb"}[1m])
      record: raft_process_logcommit_latency_bucket:rate1m
    - expr: histogram_quantile(0.5, raft_process_logcommit_latency_bucket:rate1m)
      record: raft_process_logcommit_latency:rate1m:quantile_50
    - expr: histogram_quantile(0.75, raft_process_logcommit_latency_bucket:rate1m)
      record: raft_process_logcommit_latency:rate1m:quantile_75
    - expr: histogram_quantile(0.9, raft_process_logcommit_latency_bucket:rate1m)
      record: raft_process_logcommit_latency:rate1m:quantile_90
    - expr: histogram_quantile(0.95, raft_process_logcommit_latency_bucket:rate1m)
      record: raft_process_logcommit_latency:rate1m:quantile_95
    - expr: histogram_quantile(0.99, raft_process_logcommit_latency_bucket:rate1m)
      record: raft_process_logcommit_latency:rate1m:quantile_99
    - expr: rate(raft_process_commandcommit_latency_bucket{job="cockroachdb"}[1m])
      record: raft_process_commandcommit_latency_bucket:rate1m
    - expr: histogram_quantile(0.5, raft_process_commandcommit_latency_bucket:rate1m)
      record: raft_process_commandcommit_latency:rate1m:quantile_50
    - expr: histogram_quantile(0.75, raft_process_commandcommit_latency_bucket:rate1m)
      record: raft_process_commandcommit_latency:rate1m:quantile_75
    - expr: histogram_quantile(0.9, raft_process_commandcommit_latency_bucket:rate1m)
      record: raft_process_commandcommit_latency:rate1m:quantile_90
    - expr: histogram_quantile(0.95, raft_process_commandcommit_latency_bucket:rate1m)
      record: raft_process_commandcommit_latency:rate1m:quantile_95
    - expr: histogram_quantile(0.99, raft_process_commandcommit_latency_bucket:rate1m)
      record: raft_process_commandcommit_latency:rate1m:quantile_99
  - name: rules/alerts.rules
    rules:
    # InstanceDown / InstanceDead share the same expression and differ only in
    # how long the target must stay unscraped (5m vs 15m) before firing.
    - alert: InstanceDown
      annotations:
        description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has
          been down for more than 5 minutes.'
        summary: Instance {{ $labels.instance }} down
      expr: up{job="cockroachdb"} == 0
      for: 5m
    - alert: InstanceDead
      annotations:
        description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} has
          been down for more than 15 minutes.'
        summary: Instance {{ $labels.instance }} dead
      expr: up{job="cockroachdb"} == 0
      for: 15m
    # InstanceRestart covers 1-4 uptime resets in 10m; InstanceFlapping covers
    # 5 or more. Together they partition every non-zero reset count.
    - alert: InstanceRestart
      annotations:
        description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
          {{ $value }} time(s) in 10m'
        summary: Instance {{ $labels.instance }} restarted
      expr: resets(sys_uptime{job="cockroachdb"}[10m]) > 0 and resets(sys_uptime{job="cockroachdb"}[10m])
        < 5
    # Threshold is `>= 5` (was `> 5`): with `> 5`, exactly five restarts in 10m
    # matched neither InstanceRestart (`< 5`) nor InstanceFlapping.
    - alert: InstanceFlapping
      annotations:
        description: '{{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
          {{ $value }} time(s) in 10m'
        summary: Instance {{ $labels.instance }} flapping
      expr: resets(sys_uptime{job="cockroachdb"}[10m]) >= 5
    # Compares each node's own live-node count against the number of targets
    # Prometheus currently scrapes successfully for the same cluster/job.
    - alert: LivenessMismatch
      annotations:
        description: Prometheus and {{ $labels.instance }} disagree on liveness
        summary: Liveness mismatch for {{ $labels.instance }}
      expr: (liveness_livenodes{job="cockroachdb"}) != ignoring(instance) group_left()
        (count by(cluster, job) (up{job="cockroachdb"} == 1))
      for: 5m
      labels:
        severity: testing
    # Counts the distinct (tag, build_timestamp value) combinations per cluster;
    # more than one means mixed binaries are running.
    - alert: VersionMismatch
      annotations:
        description: Cluster {{ $labels.cluster }} running {{ $value }} different
          versions
        summary: Binary version mismatch on {{ $labels.cluster }}
      expr: count by(cluster) (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"}))
        > 1
      for: 30m
    # Disk alerts use the recorded ratios: 15% free per store, 20% free per cluster.
    - alert: StoreDiskLow
      annotations:
        summary: Store {{ $labels.store }} on node {{ $labels.instance }} at {{ $value
          }} available disk fraction
      expr: capacity_available:ratio{job="cockroachdb"} < 0.15
    - alert: ClusterDiskLow
      annotations:
        summary: Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
      expr: cluster:capacity_available:ratio{job="cockroachdb"} < 0.2
    # Open SQL connections but a zero 5m query rate, sustained for 10m.
    - alert: ZeroSQLQps
      annotations:
        summary: Instance {{ $labels.instance }} has SQL connections but no queries
      expr: sql_conns{job="cockroachdb"} > 0 and rate(sql_query_count{job="cockroachdb"}[5m])
        == 0
      for: 10m
    - alert: UnavailableRanges
      annotations:
        summary: Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
      expr: (sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) >
        0
      for: 10m
      labels:
        severity: testing
    - alert: NoLeaseRanges
      annotations:
        summary: Instance {{ $labels.instance }} has {{ $value }} ranges without leases
      expr: (sum by(instance, cluster) (replicas_leaders_not_leaseholders{job="cockroachdb"}))
        > 0
      for: 10m
      labels:
        severity: testing
    # Certificate alerts: the `> 0` guard skips series reporting 0 (presumably
    # "no certificate loaded" - confirm); the second operand checks that the
    # expiration timestamp is less than 366 / 183 days (in seconds) from now.
    - alert: CACertificateExpiresSoon
      annotations:
        summary: CA certificate for {{ $labels.instance }} expires in less than a
          year
      expr: (security_certificate_expiration_ca{job="cockroachdb"} > 0) and (security_certificate_expiration_ca{job="cockroachdb"}
        - time()) < 86400 * 366
      labels:
        frequency: daily
    - alert: NodeCertificateExpiresSoon
      annotations:
        summary: Node certificate for {{ $labels.instance }} expires in less than
          six months
      expr: (security_certificate_expiration_node{job="cockroachdb"} > 0) and (security_certificate_expiration_node{job="cockroachdb"}
        - time()) < 86400 * 183
      labels:
        frequency: daily
    # More than 80% of the soft file-descriptor limit in use for 10m.
    - alert: HighOpenFDCount
      annotations:
        summary: 'Too many open file descriptors on {{ $labels.instance }}: {{ $value
          }} fraction used'
      expr: sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} >
        0.8
      for: 10m
      labels:
        severity: testing