{{- /*
Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md

Renders a PrometheusRule named <fullname>-ruler holding the "thanos-rule" alert group.

The whole manifest is emitted only when ALL of the following hold:
  - .Values.metrics.enabled is set
  - .Values.metrics.prometheusRule.default.create OR .Values.metrics.prometheusRule.default.ruler is set
  - .Values.ruler.enabled is set
  - the cluster advertises the monitoring.coreos.com/v1 API

Conventions used by every alert below:
  - An alert is skipped when .Values.metrics.prometheusRule.default.disabled.<AlertName> is truthy.
  - .Values.commonAnnotations, when set, are rendered into the alert annotations.
  - .Values.metrics.prometheusRule.additionalLabels, when set, are rendered into the alert labels.
  - Brace pairs wrapped in backtick string literals are emitted verbatim by Helm, so the
    $labels / $value references stay in the rendered rule text instead of being evaluated here.
  - All aggregations keep the verrazzano_cluster label.
*/ -}}
{{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.ruler ) .Values.ruler.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "common.names.fullname" . }}-ruler
  {{- /* Falls back to the release namespace when no rule namespace is configured. */}}
  namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
    {{- if .Values.metrics.prometheusRule.additionalLabels }}
    {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    {{- end }}
    {{- if .Values.commonLabels }}
    {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    {{- end }}
  {{- if .Values.commonAnnotations }}
  annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: thanos-rule
    rules:
    {{- /* critical: dropped-alert rate from the alert queue is above zero for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleQueueIsDroppingAlerts | default false) }}
    - alert: ThanosRuleQueueIsDroppingAlerts
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} is failing to queue alerts.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts
        summary: Thanos Rule is failing to queue alerts.
      expr: |
        sum by (job, instance, verrazzano_cluster) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* critical: dropped-alert rate from the alert sender is above zero for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleSenderIsFailingAlerts | default false) }}
    - alert: ThanosRuleSenderIsFailingAlerts
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} is failing to send alerts to alertmanager.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts
        summary: Thanos Rule is failing to send alerts to alertmanager.
      expr: |
        sum by (job, instance, verrazzano_cluster) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* critical: more than 5 percent of rule evaluations fail (5m rates) for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleHighRuleEvaluationFailures | default false) }}
    - alert: ThanosRuleHighRuleEvaluationFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} is failing to evaluate rules.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures
        summary: Thanos Rule is failing to evaluate rules.
      expr: |
        (
          sum by (job, instance, verrazzano_cluster) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))
        /
          sum by (job, instance, verrazzano_cluster) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 5
        )
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* info: rate of evaluations that returned warnings is above zero for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleHighRuleEvaluationWarnings | default false) }}
    - alert: ThanosRuleHighRuleEvaluationWarnings
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} has high number of evaluation warnings.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings
        summary: Thanos Rule has high number of evaluation warnings.
      expr: |
        sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0
      for: 15m
      labels:
        severity: info
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* warning: a rule group's last evaluation duration exceeds its interval for 5m (per rule_group). */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleRuleEvaluationLatencyHigh | default false) }}
    - alert: ThanosRuleRuleEvaluationLatencyHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} has higher evaluation latency than interval for {{`{{`}} $labels.rule_group {{`}}`}}.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh
        summary: Thanos Rule has high rule evaluation latency.
      expr: |
        (
          sum by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})
        >
          sum by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
        )
      for: 5m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* warning: more than 5 percent of started gRPC requests end with one of the listed error codes for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleGrpcErrorRate | default false) }}
    - alert: ThanosRuleGrpcErrorRate
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate
        summary: Thanos Rule is failing to handle grpc requests.
      expr: |
        (
          sum by (job, instance, verrazzano_cluster) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))
        /
          sum by (job, instance, verrazzano_cluster) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 5
        )
      for: 5m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* info: the averaged last-reload-successful gauge differs from 1 for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleConfigReloadFailure | default false) }}
    - alert: ThanosRuleConfigReloadFailure
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has not been able to reload its configuration.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure
        summary: Thanos Rule has not been able to reload configuration.
      expr: avg by (job, instance, verrazzano_cluster) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1
      for: 5m
      labels:
        severity: info
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* warning: more than 1 percent of DNS lookups for query API endpoints fail for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleQueryHighDNSFailures | default false) }}
    - alert: ThanosRuleQueryHighDNSFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has {{`{{`}} $value | humanize{{`}}`}}% of failing DNS queries for query endpoints.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures
        summary: Thanos Rule is having high number of DNS failures.
      expr: |
        (
          sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
        /
          sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 1
        )
      for: 15m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* warning: more than 1 percent of DNS lookups for Alertmanager endpoints fail for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleAlertmanagerHighDNSFailures | default false) }}
    - alert: ThanosRuleAlertmanagerHighDNSFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance{{`}}`}} has {{`{{`}} $value | humanize {{`}}`}}% of failing DNS queries for Alertmanager endpoints.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures
        summary: Thanos Rule is having high number of DNS failures.
      expr: |
        (
          sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
        /
          sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
        * 100 > 1
        )
      for: 15m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /*
    info: time since a rule group's last evaluation exceeds 10x its interval for 5m.
    Both sides aggregate by rule_group, the label that ThanosRuleRuleEvaluationLatencyHigh
    above already uses for prometheus_rule_group_interval_seconds. The earlier "group" key
    did not match that usage; NOTE(review): if the metric carries no "group" label, the
    old form merged every rule group of an instance into one series and could hide a
    single stalled group - confirm against the scraped label set.
    */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleNoEvaluationFor10Intervals | default false) }}
    - alert: ThanosRuleNoEvaluationFor10Intervals
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has rule groups that did not evaluate for at least 10x of their expected interval.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals
        summary: Thanos Rule has rule groups that did not evaluate for 10 intervals.
      expr: |
        time() - max by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})
        >
        10 * max by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
      for: 5m
      labels:
        severity: info
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* critical: evaluation rate is zero or below while loaded rules exist, for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosNoRuleEvaluations | default false) }}
    - alert: ThanosNoRuleEvaluations
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} did not perform any rule evaluations in the past 10 minutes.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations
        summary: Thanos Rule did not perform any rule evaluations.
      expr: |
        sum by (job, instance, verrazzano_cluster) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0
          and
        sum by (job, instance, verrazzano_cluster) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
{{- end }}