{{- /*
Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md

Renders a PrometheusRule named "<fullname>-store-gateway" containing the
"thanos-store" alert group.

The resource is rendered only when ALL of the following hold:
  - .Values.metrics.enabled
  - .Values.metrics.prometheusRule.default.create OR
    .Values.metrics.prometheusRule.default.store_gateway
  - .Values.storegateway.enabled
  - the cluster advertises the "monitoring.coreos.com/v1" API

Per-alert switches: every alert is wrapped in
  .Values.metrics.prometheusRule.default.disabled.<AlertName>
and is omitted when that value is truthy.

Values merged into the output:
  - .Values.metrics.prometheusRule.additionalLabels -> resource labels and each alert's labels
  - .Values.commonLabels                            -> resource labels
  - .Values.commonAnnotations                       -> resource annotations and each alert's annotations

Alert descriptions use {{`{{`}} ... {{`}}`}} so that the Prometheus-side
template expressions ($labels, $value) pass through Helm unrendered.
*/ -}}
{{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.store_gateway ) .Values.storegateway.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "common.names.fullname" . }}-store-gateway
  namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
    {{- if .Values.metrics.prometheusRule.additionalLabels }}
    {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    {{- end }}
    {{- if .Values.commonLabels }}
    {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    {{- end }}
  {{- if .Values.commonAnnotations }}
  annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: thanos-store
    rules:
    {{- /* ThanosStoreGrpcErrorRate: per (job, verrazzano_cluster), the 5m rate of gRPC requests handled with one of the listed error codes, as a percentage of the 5m rate of started requests, is above 5 for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreGrpcErrorRate | default false) }}
    - alert: ThanosStoreGrpcErrorRate
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Store {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoregrpcerrorrate
        summary: Thanos Store is failing to handle gRPC requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m]))
        * 100 > 5
        )
      for: 5m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* ThanosStoreSeriesGateLatencyHigh: the 0.99 quantile of the series-gate duration histogram is above 2 while the matching _count rate is above 0 (i.e. samples are actually being recorded), for 10m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreSeriesGateLatencyHigh | default false) }}
    - alert: ThanosStoreSeriesGateLatencyHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Store {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for store series gate requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreseriesgatelatencyhigh
        summary: Thanos Store has high latency for store series gate requests.
      expr: |
        (
          histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2
        and
          sum by (job, verrazzano_cluster) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* ThanosStoreBucketHighOperationFailures: per (job, verrazzano_cluster), the 5m rate of failed bucket operations as a percentage of the 5m rate of all bucket operations is above 5 for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreBucketHighOperationFailures | default false) }}
    - alert: ThanosStoreBucketHighOperationFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Store {{`{{`}} $labels.job {{`}}`}} Bucket is failing to execute {{`{{`}} $value | humanize {{`}}`}}% of operations.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstorebuckethighoperationfailures
        summary: Thanos Store Bucket is failing to execute operations.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m]))
        * 100 > 5
        )
      for: 15m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* ThanosStoreObjstoreOperationLatencyHigh: the 0.99 quantile of the bucket-operation duration histogram is above 2 while the matching _count rate is above 0, for 10m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosStoreObjstoreOperationLatencyHigh | default false) }}
    - alert: ThanosStoreObjstoreOperationLatencyHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Store {{`{{`}} $labels.job {{`}}`}} Bucket has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for the bucket operations.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosstoreobjstoreoperationlatencyhigh
        summary: Thanos Store is having high latency for bucket operations.
      expr: |
        (
          histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2
        and
          sum by (job, verrazzano_cluster) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
{{- end }}