github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/receive.yml (about)

     1  {{- /*
     2  Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
     3  */ -}}
     4  {{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.receive ) .Values.receive.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
        {{- /*
          PrometheusRule holding the "thanos-receive" alert group.

          The whole manifest is rendered only when all of the following hold:
            - .Values.metrics.enabled is set,
            - .Values.metrics.prometheusRule.default.create or
              .Values.metrics.prometheusRule.default.receive is set,
            - .Values.receive.enabled is set,
            - the cluster advertises the monitoring.coreos.com/v1 API.

          Metadata notes:
            - namespace falls back to the release namespace when
              .Values.metrics.prometheusRule.namespace is empty;
            - .Values.metrics.prometheusRule.additionalLabels and .Values.commonLabels
              are appended to the standard labels when present;
            - the annotations key is emitted only when .Values.commonAnnotations is set.
        */}}
     5  apiVersion: monitoring.coreos.com/v1
     6  kind: PrometheusRule
     7  metadata:
     8    name: {{ template "common.names.fullname" . }}-receive
     9    namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
    10    labels: {{- include "common.labels.standard" . | nindent 4 }}
    11      {{- if .Values.metrics.prometheusRule.additionalLabels }}
    12      {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    13      {{- end }}
    14      {{- if .Values.commonLabels }}
    15      {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    16      {{- end }}
    17    {{- if .Values.commonAnnotations }}
    18    annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
    19    {{- end }}
    20  spec:
    21    groups:
    22    - name: thanos-receive
    23      rules:
    24      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHttpRequestErrorRateHigh | default false) }}
            {{- /*
              Critical alert: per (job, verrazzano_cluster), more than 5% of requests to
              the "receive" handler were answered with a 5xx code, computed from 5m rate
              windows and sustained for 5m.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveHttpRequestErrorRateHigh
              is truthy. commonAnnotations and prometheusRule.additionalLabels, when set,
              are merged into the alert's annotations and labels respectively.
            */}}
    25      - alert: ThanosReceiveHttpRequestErrorRateHigh
    26        annotations:
    27          {{- if .Values.commonAnnotations }}
    28          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    29          {{- end }}
    30          description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
    31          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh
    32          summary: Thanos Receive is failing to handle requests.
    33        expr: |
    34          (
    35            sum by (job, verrazzano_cluster) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))
    36          /
    37            sum by (job, verrazzano_cluster) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))
    38          ) * 100 > 5
    39        for: 5m
    40        labels:
    41          severity: critical
    42          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    43          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    44          {{- end }}
    45      {{- end }}
    46      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHttpRequestLatencyHigh | default false) }}
            {{- /*
              Critical alert: per (job, verrazzano_cluster), the 0.99 quantile of
              http_request_duration_seconds for the "receive" handler is above 10 while
              the handler is actually receiving requests (request rate > 0), sustained
              for 10m.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveHttpRequestLatencyHigh
              is truthy.
            */}}
    47      - alert: ThanosReceiveHttpRequestLatencyHigh
    48        annotations:
    49          {{- if .Values.commonAnnotations }}
    50          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    51          {{- end }}
    52          description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for requests.
    53          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh
    54          summary: Thanos Receive has high HTTP requests latency.
    55        expr: |
    56          (
    57            histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10
    58          and
    59            sum by (job, verrazzano_cluster) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0
    60          )
    61        for: 10m
    62        labels:
    63          severity: critical
    64          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    65          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    66          {{- end }}
    67      {{- end }}
    68      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighReplicationFailures | default false) }}
            {{- /*
              Warning alert: per (job, verrazzano_cluster), the share of failed
              replications (5m rate windows) exceeds
              floor((replication_factor + 1) / 2) / hashring_nodes, sustained for 5m,
              and only where the replication factor is greater than 1.

              The failure percentage is deliberately the LEFT operand of "and" so that
              $value is the percentage printed by the description. The replication
              factor guard is aggregated to the same (job, verrazzano_cluster) labels
              and joined with "and on (...)": a bare "and" requires identical label
              sets on both sides, which the raw per-instance gauge never has against
              the aggregated ratio, so the alert could not fire.

              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighReplicationFailures
              is truthy.
            */}}
    69      - alert: ThanosReceiveHighReplicationFailures
    70        annotations:
    71          {{- if .Values.commonAnnotations }}
    72          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    73          {{- end }}
    74          description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to replicate {{`{{`}} $value | humanize {{`}}`}}% of requests.
    75          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures
    76          summary: Thanos Receive is having high number of replication failures.
    77        expr: |
    78          (
    79            (
    80              sum by (job, verrazzano_cluster) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m]))
    81            /
    82              sum by (job, verrazzano_cluster) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))
    83            )
    84            >
    85            (
    86              max by (job, verrazzano_cluster) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) / 2))
    87            /
    88              max by (job, verrazzano_cluster) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})
    89            )
    90          ) * 100
    91          and on (job, verrazzano_cluster)
    92            max by (job, verrazzano_cluster) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1
    93        for: 5m
    94        labels:
    95          severity: warning
    96          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    97          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    98          {{- end }}
    99      {{- end }}
   100      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighForwardRequestFailures | default false) }}
            {{- /*
              Info-level alert: per (job, verrazzano_cluster), more than 20% of forward
              requests ended with result="error", computed from 5m rate windows and
              sustained for 5m.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighForwardRequestFailures
              is truthy.
            */}}
   101      - alert: ThanosReceiveHighForwardRequestFailures
   102        annotations:
   103          {{- if .Values.commonAnnotations }}
   104          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   105          {{- end }}
   106          description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to forward {{`{{`}} $value | humanize {{`}}`}}% of requests.
   107          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures
   108          summary: Thanos Receive is failing to forward requests.
   109        expr: |
   110          (
   111            sum by (job, verrazzano_cluster) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))
   112          /
   113            sum by (job, verrazzano_cluster) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))
   114          ) * 100 > 20
   115        for: 5m
   116        labels:
   117          severity: info
   118          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   119          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   120          {{- end }}
   121      {{- end }}
   122      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighHashringFileRefreshFailures | default false) }}
            {{- /*
              Warning alert: per (job, verrazzano_cluster), the ratio of hashring file
              errors to hashring file refreshes (5m rate windows) is above 0, i.e. at
              least some refresh attempts are failing, sustained for 15m. $value is a
              ratio, not a percentage.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighHashringFileRefreshFailures
              is truthy.
            */}}
   123      - alert: ThanosReceiveHighHashringFileRefreshFailures
   124        annotations:
   125          {{- if .Values.commonAnnotations }}
   126          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   127          {{- end }}
   128          description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to refresh hashring file, {{`{{`}} $value | humanize {{`}}`}} of attempts failed.
   129          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures
   130          summary: Thanos Receive is failing to refresh hashring file.
   131        expr: |
   132          (
   133            sum by (job, verrazzano_cluster) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))
   134          /
   135            sum by (job, verrazzano_cluster) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))
   136          > 0
   137          )
   138        for: 15m
   139        labels:
   140          severity: warning
   141          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   142          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   143          {{- end }}
   144      {{- end }}
   145      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveConfigReloadFailure | default false) }}
            {{- /*
              Warning alert: per (job, verrazzano_cluster), the average of
              thanos_receive_config_last_reload_successful differs from 1 for 5m, i.e.
              at least one matching series reports a value other than 1.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveConfigReloadFailure
              is truthy.
            */}}
   146      - alert: ThanosReceiveConfigReloadFailure
   147        annotations:
   148          {{- if .Values.commonAnnotations }}
   149          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   150          {{- end }}
   151          description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} has not been able to reload hashring configurations.
   152          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure
   153          summary: Thanos Receive has not been able to reload configuration.
   154        expr: avg by (job, verrazzano_cluster) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1
   155        for: 5m
   156        labels:
   157          severity: warning
   158          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   159          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   160          {{- end }}
   161      {{- end }}
   162      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveNoUpload | default false) }}
            {{- /*
              Critical alert: per (job, instance, verrazzano_cluster),
              thanos_shipper_uploads_total did not increase over the last 3h, sustained
              for 3h.
              The addition with "up - 1" joins the zero-upload series to the instances
              that currently have an "up" series; it only affects the sample value
              (0 when the target is up, -1 when it is down), not whether a series is
              returned.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveNoUpload
              is truthy.
            */}}
   163      - alert: ThanosReceiveNoUpload
   164        annotations:
   165          {{- if .Values.commonAnnotations }}
   166          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   167          {{- end }}
   168          description: Thanos Receive {{`{{`}} $labels.instance {{`}}`}} has not uploaded latest data to object storage.
   169          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload
   170          summary: Thanos Receive has not uploaded latest data to object storage.
   171        expr: |
   172          (up{job=~".*thanos-receive.*"} - 1)
   173          + on (job, instance, verrazzano_cluster) # filters to only alert on current instance last 3h
   174          (sum by (job, instance, verrazzano_cluster) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)
   175        for: 3h
   176        labels:
   177          severity: critical
   178          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   179          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   180          {{- end }}
   181      {{- end }}
   182      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveTrafficBelowThreshold | default false) }}
            {{- /*
              Warning alert: the 1h average of the 5m rate of 2xx requests to the
              "receive" handler has dropped below 50% of the same rate averaged over
              12h (both sampled at 5m resolution), sustained for 1h.
              The expression has no "by" aggregation, so the comparison is evaluated
              separately for every matching http_requests_total series.
              The rule is omitted when
              .Values.metrics.prometheusRule.default.disabled.ThanosReceiveTrafficBelowThreshold
              is truthy.
            */}}
   183      - alert: ThanosReceiveTrafficBelowThreshold
   184        annotations:
   185          {{- if .Values.commonAnnotations }}
   186          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   187          {{- end }}
   188          description: At Thanos Receive {{`{{`}} $labels.job {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} , the average 1-hr avg. metrics ingestion rate  is {{`{{`}} $value | humanize {{`}}`}}% of 12-hr avg. ingestion rate.
   189          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold
   190          summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.
   191        expr: |
   192          (
   193            avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m])
   194          /
   195            avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m])
   196          ) * 100 < 50
   197        for: 1h
   198        labels:
   199          severity: warning
   200          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   201          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   202          {{- end }}
   203      {{- end }}
   204  {{- end }}