{{- /*
Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md

File: github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/thanos/templates/alert-rule/replicate.yml

Renders a PrometheusRule holding the "thanos-bucket-replicate" alert group.
The resource is emitted only when all of the following hold:
  - .Values.metrics.enabled is set,
  - either .Values.metrics.prometheusRule.default.create or
    .Values.metrics.prometheusRule.default.replicate is set,
  - the cluster advertises the monitoring.coreos.com/v1 API.
Each alert can be switched off individually through
.Values.metrics.prometheusRule.default.disabled.<AlertName>.
*/ -}}
{{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.replicate ) ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "common.names.fullname" . }}-replicate
  namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
    {{- if .Values.metrics.prometheusRule.additionalLabels }}
    {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    {{- end }}
    {{- if .Values.commonLabels }}
    {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    {{- end }}
  {{- if .Values.commonAnnotations }}
  annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: thanos-bucket-replicate
    rules:
    {{- /*
    ThanosBucketReplicateErrorRate: percentage of replication runs with
    result="error" over the last 5m, per (job, verrazzano_cluster); fires at >= 10%.
    The backtick-quoted brace pairs in the description emit literal double braces
    so that they survive Helm rendering and reach the alert template unchanged.
    */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosBucketReplicateErrorRate | default false) }}
    - alert: ThanosBucketReplicateErrorRate
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Replicate is failing to run, {{`{{`}} $value | humanize {{`}}`}}% of attempts failed.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicateerrorrate
        summary: Thanos Replicate is failing to run.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))
        / on (job, verrazzano_cluster) group_left
          sum by (job, verrazzano_cluster) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))
        ) * 100 >= 10
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /*
    ThanosBucketReplicateRunLatency: 99th percentile of the replication run
    duration over the last 5m, per (job, verrazzano_cluster); fires above 20 seconds,
    and only while the same histogram shows a non-zero rate (second clause).
    The quantile input keeps the "le" bucket label in its "sum by" clause:
    histogram_quantile() reads bucket boundaries from "le", so aggregating that
    label away leaves it nothing to compute from and the alert could never fire.
    */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosBucketReplicateRunLatency | default false) }}
    - alert: ThanosBucketReplicateRunLatency
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Replicate {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for the replicate operations.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosbucketreplicaterunlatency
        summary: Thanos Replicate has a high latency for replicate operations.
      expr: |
        (
          histogram_quantile(0.99, sum by (job, verrazzano_cluster, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20
        and
          sum by (job, verrazzano_cluster) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0
        )
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
{{- end }}