# Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/monitoring/rules/alerts.rules.yml
# Prometheus alerting rules for the "cockroachdb" scrape job.
groups:
- name: rules/alerts.rules
  rules:
  # Alert for any instance that is unreachable for >5 minutes.
  - alert: InstanceDown
    expr: up{job="cockroachdb"} == 0
    for: 5m
    annotations:
      description: >-
        {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
        down for more than 5 minutes.
      summary: Instance {{ $labels.instance }} down
  # Alert for any instance that is unreachable for >15 minutes.
  # Same expression as InstanceDown; only the "for" duration differs.
  - alert: InstanceDead
    expr: up{job="cockroachdb"} == 0
    for: 15m
    annotations:
      description: >-
        {{ $labels.instance }} for cluster {{ $labels.cluster }} has been
        down for more than 15 minutes.
      summary: Instance {{ $labels.instance }} dead
  # Alert on instance restarts: 1 to 4 resets of sys_uptime within 10 minutes.
  # Five or more resets are reported by InstanceFlapping instead.
  - alert: InstanceRestart
    expr: >-
      resets(sys_uptime{job="cockroachdb"}[10m]) > 0
      and resets(sys_uptime{job="cockroachdb"}[10m]) < 5
    annotations:
      description: >-
        {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
        {{ $value }} time(s) in 10m
      summary: Instance {{ $labels.instance }} restarted
  # Alert on flapping instances (frequent restarts).
  # The threshold is ">= 5" so that it picks up exactly where InstanceRestart
  # ("< 5") stops; with "> 5" an instance that restarted exactly 5 times in
  # 10 minutes matched neither alert.
  - alert: InstanceFlapping
    expr: resets(sys_uptime{job="cockroachdb"}[10m]) >= 5
    annotations:
      description: >-
        {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
        {{ $value }} time(s) in 10m
      summary: Instance {{ $labels.instance }} flapping
  # Alert on mismatching "up" (from Prometheus) vs "liveness" (from CockroachDB). We do this at the node level.
  # This compares per-instance "liveness_livenodes" against the per-cluster count(up == 1).
# Cluster-health rules: liveness, version, disk capacity, SQL traffic, unavailable ranges.
  # Fires when a node's own live-node count differs from the number of
  # "cockroachdb" targets Prometheus currently sees as up in the same cluster.
  - alert: LivenessMismatch
    expr: >-
      (liveness_livenodes{job="cockroachdb"})
      != ignoring(instance) group_left()
      (count by(cluster, job) (up{job="cockroachdb"} == 1))
    for: 5m
    labels:
      severity: testing
    annotations:
      description: >-
        Prometheus and {{ $labels.instance }} disagree on liveness
      summary: >-
        Liveness mismatch for {{ $labels.instance }}
  # Alert on version mismatch.
  # This alert is intentionally loose (30 minutes) to allow for rolling upgrades.
  # This may need to be adjusted for large clusters.
  - alert: VersionMismatch
    expr: >-
      count by(cluster)
      (count_values by(tag, cluster) ("version", build_timestamp{job="cockroachdb"}))
      > 1
    for: 30m
    annotations:
      description: >-
        Cluster {{ $labels.cluster }} running {{ $value }} different versions
      summary: >-
        Binary version mismatch on {{ $labels.cluster }}
  # Available capacity alerts.
  # Per-store threshold: less than 0.15 of capacity available.
  - alert: StoreDiskLow
    expr: 'capacity_available:ratio{job="cockroachdb"} < 0.15'
    annotations:
      summary: >-
        Store {{ $labels.store }} on node {{ $labels.instance }} at
        {{ $value }} available disk fraction
  # Cluster-wide threshold: less than 0.2 of capacity available.
  - alert: ClusterDiskLow
    expr: 'cluster:capacity_available:ratio{job="cockroachdb"} < 0.2'
    annotations:
      summary: >-
        Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction
  # Zero SQL qps.
  # Only fires while there are open SQL connections, and the 5m query rate is zero.
  - alert: ZeroSQLQps
    expr: >-
      sql_conns{job="cockroachdb"} > 0
      and rate(sql_query_count{job="cockroachdb"}[5m]) == 0
    for: 10m
    annotations:
      summary: >-
        Instance {{ $labels.instance }} has SQL connections but no queries
  # Unavailable ranges.
  - alert: UnavailableRanges
    expr: >-
      (sum by(instance, cluster) (ranges_unavailable{job="cockroachdb"})) > 0
    for: 10m
    labels:
      severity: testing
    annotations:
      summary: >-
        Instance {{ $labels.instance }} has {{ $value }} unavailable ranges
  # Leader-not-leaseholder ranges.
# Lease, certificate-expiration and file-descriptor rules.
  # Sums replicas_leaders_not_leaseholders per instance and cluster.
  - alert: NoLeaseRanges
    expr: >-
      (sum by(instance, cluster) (replicas_leaders_not_leaseholders{job="cockroachdb"}))
      > 0
    for: 10m
    labels:
      severity: testing
    annotations:
      summary: >-
        Instance {{ $labels.instance }} has {{ $value }} ranges without leases
  # Certificate expiration. Alerts are per node.
  # Every rule below has the same shape: the expiration metric must be > 0,
  # and (expiration - now) must be below a threshold given in seconds
  # (86400 seconds per day times a number of days).
  # CA certificates: 366 days.
  - alert: CACertificateExpiresSoon
    expr: >-
      (security_certificate_expiration_ca{job="cockroachdb"} > 0)
      and (security_certificate_expiration_ca{job="cockroachdb"} - time())
      < 86400 * 366
    labels:
      frequency: daily
    annotations:
      summary: >-
        CA certificate for {{ $labels.instance }} expires in less than a year
  - alert: ClientCACertificateExpiresSoon
    expr: >-
      (security_certificate_expiration_client_ca{job="cockroachdb"} > 0)
      and (security_certificate_expiration_client_ca{job="cockroachdb"} - time())
      < 86400 * 366
    labels:
      frequency: daily
    annotations:
      summary: >-
        Client CA certificate for {{ $labels.instance }} expires in less than
        a year
  - alert: UICACertificateExpiresSoon
    expr: >-
      (security_certificate_expiration_ui_ca{job="cockroachdb"} > 0)
      and (security_certificate_expiration_ui_ca{job="cockroachdb"} - time())
      < 86400 * 366
    labels:
      frequency: daily
    annotations:
      summary: >-
        UI CA certificate for {{ $labels.instance }} expires in less than a year
  # Node and node-client certificates: 183 days.
  - alert: NodeCertificateExpiresSoon
    expr: >-
      (security_certificate_expiration_node{job="cockroachdb"} > 0)
      and (security_certificate_expiration_node{job="cockroachdb"} - time())
      < 86400 * 183
    labels:
      frequency: daily
    annotations:
      summary: >-
        Node certificate for {{ $labels.instance }} expires in less than six
        months
  - alert: NodeClientCertificateExpiresSoon
    expr: >-
      (security_certificate_expiration_node_client{job="cockroachdb"} > 0)
      and (security_certificate_expiration_node_client{job="cockroachdb"} - time())
      < 86400 * 183
    labels:
      frequency: daily
    annotations:
      summary: >-
        Client certificate for {{ $labels.instance }} expires in less than six
        months
  # UI certificate: 30 days.
  - alert: UICertificateExpiresSoon
    expr: >-
      (security_certificate_expiration_ui{job="cockroachdb"} > 0)
      and (security_certificate_expiration_ui{job="cockroachdb"} - time())
      < 86400 * 30
    labels:
      frequency: daily
    annotations:
      summary: >-
        UI certificate for {{ $labels.instance }} expires in less than 30 days
  # Getting close to open file descriptor limit.
  # Fires when open descriptors exceed 0.8 of the soft limit for 10 minutes.
  - alert: HighOpenFDCount
    expr: >-
      sys_fd_open{job="cockroachdb"} / sys_fd_softlimit{job="cockroachdb"} > 0.8
    for: 10m
    labels:
      severity: testing
    annotations:
      summary: >-
        Too many open file descriptors on {{ $labels.instance }}: {{ $value }}
        fraction used