github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/compaction.yml (about)

     1  {{- /*
     2  Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
     3  */ -}}
     4  {{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.compaction ) .Values.compactor.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
     5  apiVersion: monitoring.coreos.com/v1
     6  kind: PrometheusRule
     7  metadata:
     8    name: {{ template "common.names.fullname" . }}-compact
     9    namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
    10    labels: {{- include "common.labels.standard" . | nindent 4 }}
    11      {{- if .Values.metrics.prometheusRule.additionalLabels }}
    12      {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    13      {{- end }}
    14      {{- if .Values.commonLabels }}
    15      {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    16      {{- end }}
    17    {{- if .Values.commonAnnotations }}
    18    annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
    19    {{- end }}
    20  spec:
    21    groups:
    22    - name: thanos-compact
    23      rules:
    24      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosCompactMultipleRunning | default false) }}
    25      - alert: ThanosCompactMultipleRunning
    26        annotations:
    27          {{- if .Values.commonAnnotations }}
    28          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    29          {{- end }}
    30          description: No more than one Thanos Compact instance should be running at once. There are {{`{{`}} $value {{`}}`}} instances running.
    31          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning
    32          summary: Thanos Compact has multiple instances running.
    33        expr: sum by (job, verrazzano_cluster) (up{job=~".*thanos-compact.*"}) > 1
    34        for: 5m
    35        labels:
    36          severity: warning
    37          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    38          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    39          {{- end }}
    40      {{- end }}
    41      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosCompactHalted | default false) }}
    42      - alert: ThanosCompactHalted
    43        annotations:
    44          {{- if .Values.commonAnnotations }}
    45          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    46          {{- end }}
    47          description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} has failed to run and now is halted.
    48          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted
    49          summary: Thanos Compact has failed to run and is now halted.
    50        expr: thanos_compact_halted{job=~".*thanos-compact.*"} == 1
    51        for: 5m
    52        labels:
    53          severity: warning
    54          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    55          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    56          {{- end }}
    57      {{- end }}
    58      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosCompactHighCompactionFailures | default false) }}
    59      - alert: ThanosCompactHighCompactionFailures
    60        annotations:
    61          {{- if .Values.commonAnnotations }}
    62          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    63          {{- end }}
    64          description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of compactions.
    65          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures
    66          summary: Thanos Compact is failing to execute compactions.
    67        expr: |
    68          (
    69            sum by (job, verrazzano_cluster) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))
    70          /
    71            sum by (job, verrazzano_cluster) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))
    72          * 100 > 5
    73          )
    74        for: 15m
    75        labels:
    76          severity: warning
    77          {{- if .Values.metrics.prometheusRule.additionalLabels }}
    78          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
    79          {{- end }}
    80      {{- end }}
    81      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosCompactBucketHighOperationFailures | default false) }}
    82      - alert: ThanosCompactBucketHighOperationFailures
    83        annotations:
    84          {{- if .Values.commonAnnotations }}
    85          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
    86          {{- end }}
    87          description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} Bucket is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of operations.
    88          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures
    89          summary: Thanos Compact Bucket is having a high number of operation failures.
    90        expr: |
    91          (
    92            sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))
    93          /
    94            sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))
    95          * 100 > 5
    96          )
    97        for: 15m
    98        labels:
    99          severity: warning
   100          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   101          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   102          {{- end }}
   103      {{- end }}
   104      {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosCompactHasNotRun | default false) }}
   105      - alert: ThanosCompactHasNotRun
   106        annotations:
   107          {{- if .Values.commonAnnotations }}
   108          {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
   109          {{- end }}
   110          description: Thanos Compact {{`{{`}} $labels.job {{`}}`}} has not uploaded anything for 24 hours.
   111          runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun
   112          summary: Thanos Compact has not uploaded anything for last 24 hours.
   113        expr: (time() - max by (job, verrazzano_cluster) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24
   114        labels:
   115          severity: warning
   116          {{- if .Values.metrics.prometheusRule.additionalLabels }}
   117          {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
   118          {{- end }}
   119      {{- end }}
   120  {{- end }}