github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/query.yml

github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/query.yml (about)

     1  {{- /*
     2  Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
     3  */ -}}
     4  {{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.query ) .Values.query.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
        # PrometheusRule holding the Thanos Query alerts. It is rendered only when metrics and the
        # query component are enabled, either the default rules as a whole or the query rules are
        # requested, and the cluster serves the monitoring.coreos.com/v1 API.
     5  apiVersion: monitoring.coreos.com/v1
     6  kind: PrometheusRule
     7  metadata:
     8    name: {{ template "common.names.fullname" . }}-query
          # Uses the release namespace unless metrics.prometheusRule.namespace is set.
     9    namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
          # Standard chart labels, followed by metrics.prometheusRule.additionalLabels and
          # commonLabels when those values are set.
    10    labels: {{- include "common.labels.standard" . | nindent 4 }}
    11      {{- if .Values.metrics.prometheusRule.additionalLabels }}
    12      {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    13      {{- end }}
    14      {{- if .Values.commonLabels }}
    15      {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    16      {{- end }}
    17    {{- if .Values.commonAnnotations }}
    18    annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
    19    {{- end }}
    20  spec:
    21    groups:
          # Single rule group. Every rule in it is wrapped in its own guard and is omitted when
          # metrics.prometheusRule.default.disabled.<AlertName> is set to a true value.
    22    - name: thanos-query
    23      rules:
    24      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryHttpRequestQueryErrorRateHigh | default false) }}
            # Critical alert: more than 5% of HTTP requests to the "query" handler returned a 5xx
            # code, measured as 5m rates per job and verrazzano_cluster, sustained for 5 minutes.
            # The backtick-quoted brace sequences in the description emit literal double braces in
            # the rendered manifest, leaving $labels and $value for alert-time templating.
    25      - alert: ThanosQueryHttpRequestQueryErrorRateHigh
    26        annotations:
    27          {{- if .Values.commonAnnotations }}
    28          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    29          {{- end }}
    30          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of "query" requests.
    31          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh
    32          summary: Thanos Query is failing to handle requests.
    33        expr: |
    34          (
    35            sum by (job, verrazzano_cluster) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))
    36          /
    37            sum by (job, verrazzano_cluster) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))
    38          ) * 100 > 5
    39        for: 5m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
    40        labels:
    41          severity: critical
    42          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    43          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    44          {{- end }}
    45      {{- end }}
    46      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryHttpRequestQueryRangeErrorRateHigh | default false) }}
            # Critical alert: more than 5% of HTTP requests to the "query_range" handler returned a
            # 5xx code, measured as 5m rates per job and verrazzano_cluster, sustained for 5 minutes.
            # commonAnnotations, when set, are rendered into this rule's annotations as well.
    47      - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
    48        annotations:
    49          {{- if .Values.commonAnnotations }}
    50          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    51          {{- end }}
    52          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of "query_range" requests.
    53          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh
    54          summary: Thanos Query is failing to handle requests.
    55        expr: |
    56          (
    57            sum by (job, verrazzano_cluster) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))
    58          /
    59            sum by (job, verrazzano_cluster) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))
    60          ) * 100 > 5
    61        for: 5m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
    62        labels:
    63          severity: critical
    64          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    65          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    66          {{- end }}
    67      {{- end }}
    68      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryGrpcServerErrorRate | default false) }}
            # Warning alert: more than 5% of gRPC server calls ended with one of the listed error
            # codes, measured as the 5m rate of grpc_server_handled_total over the 5m rate of
            # grpc_server_started_total per job and verrazzano_cluster, sustained for 5 minutes.
            # The ratio is parenthesised on its own, as in the other error-rate rules of this
            # group, so the percentage scaling and the threshold read as separate steps.
    69      - alert: ThanosQueryGrpcServerErrorRate
    70        annotations:
    71          {{- if .Values.commonAnnotations }}
    72          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    73          {{- end }}
    74          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
    75          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate
    76          summary: Thanos Query is failing to handle requests.
    77        expr: |
    78          (
    79            sum by (job, verrazzano_cluster) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))
    80          /
    81            sum by (job, verrazzano_cluster) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))
    82          ) * 100 > 5
    84        for: 5m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
    85        labels:
    86          severity: warning
    87          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    88          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    89          {{- end }}
    90      {{- end }}
    91      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryGrpcClientErrorRate | default false) }}
            # Warning alert: more than 5% of gRPC client calls finished with a code other than OK,
            # measured as the 5m rate of grpc_client_handled_total (grpc_code != "OK") over the 5m
            # rate of grpc_client_started_total per job and verrazzano_cluster, for 5 minutes.
    92      - alert: ThanosQueryGrpcClientErrorRate
    93        annotations:
    94          {{- if .Values.commonAnnotations }}
    95          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    96          {{- end }}
    97          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to send {{`{{`}} $value | humanize {{`}}`}}% of requests.
    98          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate
    99          summary: Thanos Query is failing to send requests.
   100        expr: |
   101          (
   102            sum by (job, verrazzano_cluster) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))
   103          /
   104            sum by (job, verrazzano_cluster) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))
   105          ) * 100 > 5
   106        for: 5m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
   107        labels:
   108          severity: warning
   109          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   110          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   111          {{- end }}
   112      {{- end }}
   113      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryHighDNSFailures | default false) }}
            # Warning alert: more than 1% of store-endpoint DNS lookups failed, measured as the 5m
            # rate of thanos_query_store_apis_dns_failures_total over the 5m rate of
            # thanos_query_store_apis_dns_lookups_total per job and verrazzano_cluster, sustained
            # for 15 minutes.
   114      - alert: ThanosQueryHighDNSFailures
   115        annotations:
   116          {{- if .Values.commonAnnotations }}
   117          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   118          {{- end }}
   119          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has {{`{{`}} $value | humanize {{`}}`}}% of failing DNS queries for store endpoints.
   120          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures
   121          summary: Thanos Query is having high number of DNS failures.
   122        expr: |
   123          (
   124            sum by (job, verrazzano_cluster) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
   125          /
   126            sum by (job, verrazzano_cluster) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
   127          ) * 100 > 1
   128        for: 15m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
   129        labels:
   130          severity: warning
   131          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   132          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   133          {{- end }}
   134      {{- end }}
   135      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryInstantLatencyHigh | default false) }}
            # Critical alert: the 99th-percentile latency of "query" requests, derived from the 5m
            # rate of the request-duration histogram buckets per job and verrazzano_cluster, stays
            # above 40 for 10 minutes. The second operand of "and" only requires a non-zero request
            # rate; it reads the histogram's _count series, as ThanosQueryRangeLatencyHigh does,
            # rather than summing every le bucket.
   136      - alert: ThanosQueryInstantLatencyHigh
   137        annotations:
   138          {{- if .Values.commonAnnotations }}
   139          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   140          {{- end }}
   141          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for instant queries.
   142          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh
   143          summary: Thanos Query has high latency for queries.
   144        expr: |
   145          (
   146            histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40
   147          and
   148            sum by (job, verrazzano_cluster) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0
   149          )
   150        for: 10m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
   151        labels:
   152          severity: critical
   153          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   154          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   155          {{- end }}
   156      {{- end }}
   157      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryRangeLatencyHigh | default false) }}
            # Critical alert: the 99th-percentile latency of "query_range" requests, derived from
            # the 5m rate of the request-duration histogram buckets per job and verrazzano_cluster,
            # stays above 90 for 10 minutes. The second operand of "and" additionally requires a
            # non-zero 5m request rate for the same job and verrazzano_cluster.
   158      - alert: ThanosQueryRangeLatencyHigh
   159        annotations:
   160          {{- if .Values.commonAnnotations }}
   161          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   162          {{- end }}
   163          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for range queries.
   164          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh
   165          summary: Thanos Query has high latency for queries.
   166        expr: |
   167          (
   168            histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90
   169          and
   170            sum by (job, verrazzano_cluster) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0
   171          )
   172        for: 10m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
   173        labels:
   174          severity: critical
   175          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   176          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   177          {{- end }}
   178      {{- end }}
   179      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryOverload | default false) }}
            # Warning alert: over a 5m window, the maximum of thanos_query_concurrent_gate_queries_max
            # minus the average of thanos_query_concurrent_gate_queries_in_flight is below 1, and
            # this holds for 15 minutes. The expression has no job selector or aggregation, so it
            # is evaluated for every series of these two metrics.
   180      - alert: ThanosQueryOverload
   181        annotations:
   182          {{- if .Values.commonAnnotations }}
   183          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   184          {{- end }}
   185          description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support.
   186          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
   187          summary: Thanos query reaches its maximum capacity serving concurrent requests.
   188        expr: |
   189          (
   190            max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
   191          )
   192        for: 15m
              # Fixed severity, plus metrics.prometheusRule.additionalLabels when that value is set.
   193        labels:
   194          severity: warning
   195          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   196          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   197          {{- end }}
   198      {{- end }}
   199  {{- end }}