{{- /*
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/main/contrib/mixin/mixin.libsonnet
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /*
PrometheusRule holding the "etcd" alert group.

Rendering gates (all must hold):
  - target Kubernetes version (.Values.kubeTargetVersionOverride, falling back to
    .Capabilities.KubeVersion.GitVersion) is >=1.14.0-0 and <9.9.9-9
  - .Values.defaultRules.create
  - .Values.kubeEtcd.enabled
  - .Values.defaultRules.rules.etcd

Each alert can be switched off individually with .Values.defaultRules.disabled.<alertName>.
.Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels are appended to every
alert; .Values.defaultRules.labels / annotations are applied to the PrometheusRule object itself.

The description strings wrap Prometheus placeholders in {{`{{`}} ... {{`}}`}} so that Helm emits
them literally and they are expanded later by Prometheus, not at chart render time.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: etcd
    rules:
{{- /* critical: a member target is down, or peer send failures exceed 0.01/s, for 10m */}}
{{- if not (.Values.defaultRules.disabled.etcdMembersDown | default false) }}
    - alert: etcdMembersDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
        summary: etcd cluster members are down.
      expr: |-
        max without (endpoint) (
          sum without (instance) (up{job=~".*etcd.*"} == bool 0)
        or
          count without (To) (
            sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
          )
        )
        > 0
      for: 10m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* critical: number of members that are up is below (count + 1) / 2 for 3m */}}
{{- if not (.Values.defaultRules.disabled.etcdInsufficientMembers | default false) }}
    - alert: etcdInsufficientMembers
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
        summary: etcd cluster has insufficient number of members.
      expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
      for: 3m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* critical: a member reports etcd_server_has_leader == 0 for 1m */}}
{{- if not (.Values.defaultRules.disabled.etcdNoLeader | default false) }}
    - alert: etcdNoLeader
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
        summary: etcd cluster has no leader.
      expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
      for: 1m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* warning: 4 or more leader changes over a 15m subquery window, sustained for 5m */}}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfLeaderChanges | default false) }}
    - alert: etcdHighNumberOfLeaderChanges
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
        summary: etcd cluster has high number of leader changes.
      expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
      for: 5m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
etcdHighNumberOfFailedGRPCRequests is emitted twice under the same alert name:
a warning tier (> 1% for 10m) and a critical tier (> 5% for 5m).
Both tiers are controlled by the single disabled.etcdHighNumberOfFailedGRPCRequests toggle.
*/}}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
    - alert: etcdHighNumberOfFailedGRPCRequests
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster has high number of failed grpc requests.
      expr: |-
        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
          /
        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
          > 1
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
    - alert: etcdHighNumberOfFailedGRPCRequests
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster has high number of failed grpc requests.
      expr: |-
        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
          /
        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
          > 5
      for: 5m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* critical: p99 of unary gRPC handling time (Defragment excluded) above 0.15s for 10m */}}
{{- if not (.Values.defaultRules.disabled.etcdGRPCRequestsSlow | default false) }}
    - alert: etcdGRPCRequestsSlow
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
        summary: etcd grpc requests are slow
      expr: |-
        histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
        > 0.15
      for: 10m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* warning: p99 peer round-trip time above 0.15s for 10m */}}
{{- if not (.Values.defaultRules.disabled.etcdMemberCommunicationSlow | default false) }}
    - alert: etcdMemberCommunicationSlow
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster member communication is slow.
      expr: |-
        histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.15
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
warning: failed-proposal rate over a 15m window above 5, sustained for 15m.
NOTE(review): the description says "within the last 30 minutes" while the expression uses a
15m rate window plus for: 15m, and $value is the result of rate() rather than a count.
Left as generated; confirm the intended wording with the upstream mixin.
*/}}
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedProposals | default false) }}
    - alert: etcdHighNumberOfFailedProposals
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster has high number of proposal failures.
      expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
etcdHighFsyncDurations is emitted twice under the same alert name:
a warning tier (p99 WAL fsync > 0.5s) and a critical tier (p99 WAL fsync > 1s), both for 10m.
Both tiers are controlled by the single disabled.etcdHighFsyncDurations toggle.
*/}}
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
    - alert: etcdHighFsyncDurations
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster 99th percentile fsync durations are too high.
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.5
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
    - alert: etcdHighFsyncDurations
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster 99th percentile fsync durations are too high.
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 1
      for: 10m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* warning: p99 backend commit duration above 0.25s for 10m */}}
{{- if not (.Values.defaultRules.disabled.etcdHighCommitDurations | default false) }}
    - alert: etcdHighCommitDurations
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
        summary: etcd cluster 99th percentile commit durations are too high.
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.25
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
critical: database size above 95% of the backend quota for 10m.
NOTE(review): this rule and the two that follow select their metrics without a job matcher,
unlike the rules above, although their descriptions interpolate $labels.job. Left as generated.
*/}}
{{- if not (.Values.defaultRules.disabled.etcdDatabaseQuotaLowSpace | default false) }}
    - alert: etcdDatabaseQuotaLowSpace
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
        summary: etcd cluster database is running full.
      expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
      for: 10m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* warning: linear projection of the last 4h of database size exceeds the quota 4h ahead, for 10m */}}
{{- if not (.Values.defaultRules.disabled.etcdExcessiveDatabaseGrowth | default false) }}
    - alert: etcdExcessiveDatabaseGrowth
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
        summary: etcd cluster database growing very fast.
      expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* warning: in-use size is under 50% of allocated size while in-use size exceeds 104857600 bytes, for 10m */}}
{{- if not (.Values.defaultRules.disabled.etcdDatabaseHighFragmentationRatio | default false) }}
    - alert: etcdDatabaseHighFragmentationRatio
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
        runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
        summary: etcd database size in use is less than 50% of the actual allocated storage.
      expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes > 104857600
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}