github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/ruler.yml (about)

     1  {{- /*
     2  Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
     3  */ -}}
     4  {{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.ruler) .Values.ruler.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }}
     5  apiVersion: monitoring.coreos.com/v1
     6  kind: PrometheusRule  # rendered only when metrics and the ruler are enabled, default or ruler rules are requested, and monitoring.coreos.com/v1 is available
     7  metadata:
     8    name: {{ include "common.names.fullname" . }}-ruler
     9    namespace: {{ .Values.metrics.prometheusRule.namespace | default .Release.Namespace | quote }}
    10    labels: {{- include "common.labels.standard" . | nindent 4 }}
    11      {{- with .Values.metrics.prometheusRule.additionalLabels }}
    12      {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    13      {{- end }}
    14      {{- with .Values.commonLabels }}
    15      {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    16      {{- end }}
    17    {{- with .Values.commonAnnotations }}
    18    annotations: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    19    {{- end }}
    20  spec:
    21    groups:
    22    - name: thanos-rule  # rule group that holds the Thanos Ruler alerts
    23      rules:
    24      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleQueueIsDroppingAlerts) }}
    25      - alert: ThanosRuleQueueIsDroppingAlerts  # dropped-alert rate of the ruler queue is above 0
    26        expr: |
    27          sum by (job, instance, verrazzano_cluster) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
    28        for: 5m
    29        labels:
    30          severity: critical
    31          {{- with .Values.metrics.prometheusRule.additionalLabels }}
    32          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    33          {{- end }}
    34        annotations:
    35          {{- with .Values.commonAnnotations }}
    36          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    37          {{- end }}
    38          description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} is failing to queue alerts.
    39          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeueisdroppingalerts
    40          summary: Thanos Rule is failing to queue alerts.
    41      {{- end }}
    42      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleSenderIsFailingAlerts) }}
    43      - alert: ThanosRuleSenderIsFailingAlerts  # dropped-alert rate of the alert sender is above 0
    44        expr: |
    45          sum by (job, instance, verrazzano_cluster) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0
    46        for: 5m
    47        labels:
    48          severity: critical
    49          {{- with .Values.metrics.prometheusRule.additionalLabels }}
    50          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    51          {{- end }}
    52        annotations:
    53          {{- with .Values.commonAnnotations }}
    54          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    55          {{- end }}
    56          description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} is failing to send alerts to alertmanager.
    57          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulesenderisfailingalerts
    58          summary: Thanos Rule is failing to send alerts to alertmanager.
    59      {{- end }}
    60      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleHighRuleEvaluationFailures) }}
    61      - alert: ThanosRuleHighRuleEvaluationFailures  # more than 5% of rule evaluations fail
    62        expr: |
    63          (
    64            sum by (job, instance, verrazzano_cluster) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m]))
    65          /
    66            sum by (job, instance, verrazzano_cluster) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m]))
    67          * 100 > 5
    68          )
    69        for: 5m
    70        labels:
    71          severity: critical
    72          {{- with .Values.metrics.prometheusRule.additionalLabels }}
    73          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    74          {{- end }}
    75        annotations:
    76          {{- with .Values.commonAnnotations }}
    77          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    78          {{- end }}
    79          description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} is failing to evaluate rules.
    80          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationfailures
    81          summary: Thanos Rule is failing to evaluate rules.
    82      {{- end }}
    83      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleHighRuleEvaluationWarnings) }}
    84      - alert: ThanosRuleHighRuleEvaluationWarnings  # rate of rule evaluations that finished with warnings is above 0
    85        expr: |
    86          sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0
    87        for: 15m
    88        labels:
    89          severity: info
    90          {{- with .Values.metrics.prometheusRule.additionalLabels }}
    91          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    92          {{- end }}
    93        annotations:
    94          {{- with .Values.commonAnnotations }}
    95          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    96          {{- end }}
    97          description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} has high number of evaluation warnings.
    98          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulehighruleevaluationwarnings
    99          summary: Thanos Rule has high number of evaluation warnings.
   100      {{- end }}
   101      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleRuleEvaluationLatencyHigh) }}
   102      - alert: ThanosRuleRuleEvaluationLatencyHigh  # a rule group's last evaluation took longer than its interval
   103        expr: |
   104          (
   105            sum by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"})
   106          >
   107            sum by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
   108          )
   109        for: 5m
   110        labels:
   111          severity: warning
   112          {{- with .Values.metrics.prometheusRule.additionalLabels }}
   113          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   114          {{- end }}
   115        annotations:
   116          {{- with .Values.commonAnnotations }}
   117          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   118          {{- end }}
   119          description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} has higher evaluation latency than interval for {{`{{`}} $labels.rule_group {{`}}`}}.
   120          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleruleevaluationlatencyhigh
   121          summary: Thanos Rule has high rule evaluation latency.
   122      {{- end }}
   123      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleGrpcErrorRate) }}
   124      - alert: ThanosRuleGrpcErrorRate  # more than 5% of started gRPC requests end with one of the listed error codes
   125        expr: |
   126          (
   127            sum by (job, instance, verrazzano_cluster) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))
   128          /
   129            sum by (job, instance, verrazzano_cluster) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m]))
   130          * 100 > 5
   131          )
   132        for: 5m
   133        labels:
   134          severity: warning
   135          {{- with .Values.metrics.prometheusRule.additionalLabels }}
   136          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   137          {{- end }}
   138        annotations:
   139          {{- with .Values.commonAnnotations }}
   140          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   141          {{- end }}
   142          description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
   143          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulegrpcerrorrate
   144          summary: Thanos Rule is failing to handle grpc requests.
   145      {{- end }}
   146      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleConfigReloadFailure) }}
   147      - alert: ThanosRuleConfigReloadFailure  # average of the last-reload-successful metric is not 1
   148        expr: avg by (job, instance, verrazzano_cluster) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1
   149        for: 5m
   150        labels:
   151          severity: info
   152          {{- with .Values.metrics.prometheusRule.additionalLabels }}
   153          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   154          {{- end }}
   155        annotations:
   156          {{- with .Values.commonAnnotations }}
   157          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   158          {{- end }}
   159          description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has not been able to reload its configuration.
   160          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosruleconfigreloadfailure
   161          summary: Thanos Rule has not been able to reload configuration.
   162      {{- end }}
   163      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleQueryHighDNSFailures) }}
   164      - alert: ThanosRuleQueryHighDNSFailures  # more than 1% of query API DNS lookups fail
   165        expr: |
   166          (
   167            sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
   168          /
   169            sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
   170          * 100 > 1
   171          )
   172        for: 15m
   173        labels:
   174          severity: warning
   175          {{- with .Values.metrics.prometheusRule.additionalLabels }}
   176          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   177          {{- end }}
   178        annotations:
   179          {{- with .Values.commonAnnotations }}
   180          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   181          {{- end }}
   182          description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has {{`{{`}} $value | humanize{{`}}`}}% of failing DNS queries for query endpoints.
   183          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulequeryhighdnsfailures
   184          summary: Thanos Rule is having high number of DNS failures.
   185      {{- end }}
   186      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosRuleAlertmanagerHighDNSFailures) }}
   187      - alert: ThanosRuleAlertmanagerHighDNSFailures  # more than 1% of Alertmanager DNS lookups fail
   188        expr: |
   189          (
   190            sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m]))
   191          /
   192            sum by (job, instance, verrazzano_cluster) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m]))
   193          * 100 > 1
   194          )
   195        for: 15m
   196        labels:
   197          severity: warning
   198          {{- with .Values.metrics.prometheusRule.additionalLabels }}
   199          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   200          {{- end }}
   201        annotations:
   202          {{- with .Values.commonAnnotations }}
   203          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   204          {{- end }}
   205          description: Thanos Rule {{`{{`}} $labels.instance{{`}}`}} has {{`{{`}} $value | humanize {{`}}`}}% of failing DNS queries for Alertmanager endpoints.
   206          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulealertmanagerhighdnsfailures
   207          summary: Thanos Rule is having high number of DNS failures.
   208      {{- end }}
   209      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosRuleNoEvaluationFor10Intervals | default false) }}
   210      - alert: ThanosRuleNoEvaluationFor10Intervals  # checked per rule group: time since last evaluation exceeds 10x the group's interval
   211        annotations:
   212          {{- if .Values.commonAnnotations }}
   213          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   214          {{- end }}
   215          description: Thanos Rule {{`{{`}} $labels.job {{`}}`}} has rule groups that did not evaluate for at least 10x of their expected interval.
   216          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosrulenoevaluationfor10intervals
   217          summary: Thanos Rule has rule groups that did not evaluate for 10 intervals.
   218        expr: |  # aggregate by rule_group, the label ThanosRuleRuleEvaluationLatencyHigh uses for the same prometheus_rule_group_* series, so each group is compared on its own
   219          time() -  max by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})
   220          >
   221          10 * max by (job, instance, rule_group, verrazzano_cluster) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})
   222        for: 5m
   223        labels:
   224          severity: info
   225          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   226          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   227          {{- end }}
   228      {{- end }}
   229      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosNoRuleEvaluations) }}
   230      - alert: ThanosNoRuleEvaluations  # rules are loaded but the evaluation rate is not above 0
   231        expr: |
   232          sum by (job, instance, verrazzano_cluster) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0
   233            and
   234          sum by (job, instance, verrazzano_cluster) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0
   235        for: 5m
   236        labels:
   237          severity: critical
   238          {{- with .Values.metrics.prometheusRule.additionalLabels }}
   239          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   240          {{- end }}
   241        annotations:
   242          {{- with .Values.commonAnnotations }}
   243          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
   244          {{- end }}
   245          description: Thanos Rule {{`{{`}} $labels.instance {{`}}`}} did not perform any rule evaluations in the past 10 minutes.
   246          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosnoruleevaluations
   247          summary: Thanos Rule did not perform any rule evaluations.
   248      {{- end }}
   249  {{- end }}