github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/replicate.yml (about)

     1  {{- /*
     2  Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
     3  */ -}}
     4  {{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.replicate) (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }}
        {{- /*
        Header of the "<fullname>-replicate" PrometheusRule. The manifest is rendered
        only when metrics are enabled, at least one of prometheusRule.default.create or
        prometheusRule.default.replicate is set, and .Capabilities reports the
        monitoring.coreos.com/v1 API version.
        */}}
     5  apiVersion: monitoring.coreos.com/v1
     6  kind: PrometheusRule
     7  metadata:
     8    name: {{ include "common.names.fullname" . }}-replicate
          {{- /* Falls back to the release namespace when no rule namespace is configured. */}}
     9    namespace: {{ .Values.metrics.prometheusRule.namespace | default .Release.Namespace | quote }}
    10    labels: {{- include "common.labels.standard" . | nindent 4 }}
            {{- /* Optional label sets are rendered directly below the standard labels. */}}
    11      {{- with .Values.metrics.prometheusRule.additionalLabels }}
    12      {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    13      {{- end }}
    14      {{- with .Values.commonLabels }}
    15      {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    16      {{- end }}
          {{- /* The annotations key is emitted only when commonAnnotations is non-empty. */}}
    17    {{- with .Values.commonAnnotations }}
    18    annotations: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    19    {{- end }}
    20  spec:
    21    groups:
    22    - name: thanos-bucket-replicate
    23      rules:
    24      {{- if not (default false .Values.metrics.prometheusRule.default.disabled.ThanosBucketReplicateErrorRate) }}
            {{- /*
            ThanosBucketReplicateErrorRate: per (job, verrazzano_cluster), the 5m rate of
            replication runs with result="error" divided by the 5m rate of all runs, as a
            percentage. Fires at >= 10 held for 5m. Skipped when the matching flag under
            prometheusRule.default.disabled is truthy.
            */}}
    25      - alert: ThanosBucketReplicateErrorRate
    26        annotations:
    27          {{- with .Values.commonAnnotations }}
    28          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    29          {{- end }}
                {{- /* The backtick-quoted actions emit literal double braces, so the $value expression is not evaluated by Helm. */}}
    30          description: Thanos Replicate is failing to run, {{`{{`}} $value | humanize {{`}}`}}% of attempts failed.
    31          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate
    32          summary: Thanos Replicate is failing to run.
    33        expr: |
    34          (
    35            sum by (job, verrazzano_cluster) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))
    36          / on (job, verrazzano_cluster) group_left
    37            sum by (job, verrazzano_cluster) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))
    38          ) * 100 >= 10
    39        for: 5m
    40        labels:
    41          severity: critical
    42          {{- with .Values.metrics.prometheusRule.additionalLabels }}
    43          {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
    44          {{- end }}
    45      {{- end }}
    46      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosBucketReplicateRunLatency | default false) }}
            {{- /*
            ThanosBucketReplicateRunLatency: per (job, verrazzano_cluster), fires when the
            99th percentile of thanos_replicate_replication_run_duration_seconds over 5m is
            above 20 and the bucket rate is above 0, held for 5m. Skipped when the matching
            flag under prometheusRule.default.disabled is truthy.
            */}}
    47      - alert: ThanosBucketReplicateRunLatency
    48        annotations:
    49          {{- if .Values.commonAnnotations }}
    50          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    51          {{- end }}
                {{- /* The backtick-quoted actions emit literal double braces, so $labels and $value are not evaluated by Helm. */}}
    52          description: Thanos Replicate {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for the replicate operations.
    53          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency
    54          summary: Thanos Replicate has a high latency for replicate operations.
              {{- /*
              The aggregation fed to histogram_quantile keeps the "le" bucket label:
              histogram_quantile ignores samples without "le", so grouping it away leaves
              the quantile empty and the alert can never fire. The right-hand side of "and"
              keeps (job, verrazzano_cluster) because histogram_quantile drops "le" from
              its result, so both sides still match on identical label sets.
              NOTE(review): this differs from the generated text referenced in the file
              header; re-apply the "le" grouping if the rules are regenerated.
              */}}
    55        expr: |
    56          (
    57            histogram_quantile(0.99, sum by (job, verrazzano_cluster, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20
    58          and
    59            sum by (job, verrazzano_cluster) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0
    60          )
    61        for: 5m
    62        labels:
    63          severity: critical
    64          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    65          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    66          {{- end }}
    67      {{- end }}
    68  {{- end }}