{{- /*
Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md
Source: github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/compaction.yml

Renders a PrometheusRule holding the "thanos-compact" alert group.
The manifest is emitted only when all of the following hold:
  - .Values.metrics.enabled is set
  - .Values.metrics.prometheusRule.default.create or .default.compaction is set
  - .Values.compactor.enabled is set
  - the cluster advertises the monitoring.coreos.com/v1 API
Each alert is skipped when .Values.metrics.prometheusRule.default.disabled.<AlertName> is truthy.
Aggregating expressions group by job and verrazzano_cluster.
*/ -}}
{{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.compaction) .Values.compactor.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }}
{{- /* Short aliases for the value paths that are repeated throughout the manifest. */}}
{{- $ruleCfg := .Values.metrics.prometheusRule }}
{{- $disabled := $ruleCfg.default.disabled }}
{{- $sharedAnnotations := .Values.commonAnnotations }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "common.names.fullname" . }}-compact
  namespace: {{ $ruleCfg.namespace | default .Release.Namespace | quote }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
    {{- with $ruleCfg.additionalLabels }}
    {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    {{- end }}
    {{- with .Values.commonLabels }}
    {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
    {{- end }}
  {{- with $sharedAnnotations }}
  annotations: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: thanos-compact
    rules:
    {{- /* More than one compactor target reported up per job and cluster for 5m. */}}
    {{- if not (default false $disabled.ThanosCompactMultipleRunning) }}
    - alert: ThanosCompactMultipleRunning
      annotations:
        {{- with $sharedAnnotations }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
        description: No more than one Thanos Compact instance should be running at once. There are {{`{{ $value }}`}} instances running.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactmultiplerunning
        summary: Thanos Compact has multiple instances running.
      expr: sum by (job, verrazzano_cluster) (up{job=~".*thanos-compact.*"}) > 1
      for: 5m
      labels:
        severity: warning
        {{- with $ruleCfg.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* thanos_compact_halted equal to 1 for 5m. */}}
    {{- if not (default false $disabled.ThanosCompactHalted) }}
    - alert: ThanosCompactHalted
      annotations:
        {{- with $sharedAnnotations }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
        description: Thanos Compact {{`{{ $labels.job }}`}} has failed to run and now is halted.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthalted
        summary: Thanos Compact has failed to run and is now halted.
      expr: thanos_compact_halted{job=~".*thanos-compact.*"} == 1
      for: 5m
      labels:
        severity: warning
        {{- with $ruleCfg.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Failed compactions exceed 5 percent of all compactions (5m rates) for 15m. */}}
    {{- if not (default false $disabled.ThanosCompactHighCompactionFailures) }}
    - alert: ThanosCompactHighCompactionFailures
      annotations:
        {{- with $sharedAnnotations }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
        description: Thanos Compact {{`{{ $labels.job }}`}} is failing to execute {{`{{ $value | humanize }}`}}% of compactions.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthighcompactionfailures
        summary: Thanos Compact is failing to execute compactions.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m]))
        * 100 > 5
        )
      for: 15m
      labels:
        severity: warning
        {{- with $ruleCfg.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Failed bucket operations exceed 5 percent of all bucket operations (5m rates) for 15m. */}}
    {{- if not (default false $disabled.ThanosCompactBucketHighOperationFailures) }}
    - alert: ThanosCompactBucketHighOperationFailures
      annotations:
        {{- with $sharedAnnotations }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
        description: Thanos Compact {{`{{ $labels.job }}`}} Bucket is failing to execute {{`{{ $value | humanize }}`}}% of operations.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompactbuckethighoperationfailures
        summary: Thanos Compact Bucket is having a high number of operation failures.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m]))
        * 100 > 5
        )
      for: 15m
      labels:
        severity: warning
        {{- with $ruleCfg.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Hours since the newest successful upload timestamp (24h window) exceed 24; no "for" clause is set on this rule. */}}
    {{- if not (default false $disabled.ThanosCompactHasNotRun) }}
    - alert: ThanosCompactHasNotRun
      annotations:
        {{- with $sharedAnnotations }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
        description: Thanos Compact {{`{{ $labels.job }}`}} has not uploaded anything for 24 hours.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanoscompacthasnotrun
        summary: Thanos Compact has not uploaded anything for last 24 hours.
      expr: (time() - max by (job, verrazzano_cluster) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24
      labels:
        severity: warning
        {{- with $ruleCfg.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
{{- end }}