{{- /*
Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md

Chart template: platform-operator/thirdparty/charts/thanos/templates/alert-rule/query.yml
(github.com/verrazzano/verrazzano@v1.7.0)

Renders a PrometheusRule holding the "thanos-query" alert group.

The resource is rendered only when all of the following hold:
  - .Values.metrics.enabled
  - .Values.metrics.prometheusRule.default.create OR .Values.metrics.prometheusRule.default.query
  - .Values.query.enabled
  - the cluster advertises the monitoring.coreos.com/v1 API

Each alert can be switched off individually through
.Values.metrics.prometheusRule.default.disabled.<AlertName>.

The {{`{{`}} ... {{`}}`}} sequences emit literal "{{" / "}}" so that Prometheus,
not Helm, expands $labels and $value when the alert fires.
*/ -}}
{{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.query ) .Values.query.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "common.names.fullname" . }}-query
  namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
    {{- if .Values.metrics.prometheusRule.additionalLabels }}
    {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    {{- end }}
    {{- if .Values.commonLabels }}
    {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    {{- end }}
  {{- if .Values.commonAnnotations }}
  annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: thanos-query
    rules:
    {{- /* More than 5% of "query" handler requests answered with a 5xx code, per job and verrazzano_cluster, held for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryHttpRequestQueryErrorRateHigh | default false) }}
    - alert: ThanosQueryHttpRequestQueryErrorRateHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of "query" requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryerrorratehigh
        summary: Thanos Query is failing to handle requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Same ratio as above, for the "query_range" handler. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryHttpRequestQueryRangeErrorRateHigh | default false) }}
    - alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of "query_range" requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhttprequestqueryrangeerrorratehigh
        summary: Thanos Query is failing to handle requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* gRPC server calls finishing with one of the listed error codes exceed 5% of started calls for 5m.
           "/" and "*" bind tighter than ">", so the comparison applies to the whole percentage. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryGrpcServerErrorRate | default false) }}
    - alert: ThanosQueryGrpcServerErrorRate
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcservererrorrate
        summary: Thanos Query is failing to handle requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m]))
        * 100 > 5
        )
      for: 5m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* gRPC client calls finishing with any code other than OK exceed 5% of started calls for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryGrpcClientErrorRate | default false) }}
    - alert: ThanosQueryGrpcClientErrorRate
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} is failing to send {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosquerygrpcclienterrorrate
        summary: Thanos Query is failing to send requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Store-API DNS failures exceed 1% of DNS lookups for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryHighDNSFailures | default false) }}
    - alert: ThanosQueryHighDNSFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} have {{`{{`}} $value | humanize{{`}}`}}% of failing DNS queries for store endpoints.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryhighdnsfailures
        summary: Thanos Query is having high number of DNS failures.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))
        ) * 100 > 1
      for: 15m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* 99th percentile latency of the "query" handler above 40 for 10m, only while requests are being observed.
           The traffic guard reads the histogram's _count series, matching ThanosQueryRangeLatencyHigh;
           as a "> 0" check it is equivalent to summing every _bucket series. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryInstantLatencyHigh | default false) }}
    - alert: ThanosQueryInstantLatencyHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for instant queries.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryinstantlatencyhigh
        summary: Thanos Query has high latency for queries.
      expr: |
        (
          histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40
        and
          sum by (job, verrazzano_cluster) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* 99th percentile latency of the "query_range" handler above 90 for 10m, only while requests are being observed. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryRangeLatencyHigh | default false) }}
    - alert: ThanosQueryRangeLatencyHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for range queries.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryrangelatencyhigh
        summary: Thanos Query has high latency for queries.
      expr: |
        (
          histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90
        and
          sum by (job, verrazzano_cluster) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Fewer than one free slot: 5m max of the gate limit minus 5m average of in-flight queries stays below 1 for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosQueryOverload | default false) }}
    - alert: ThanosQueryOverload
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Query {{`{{`}} $labels.job {{`}}`}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultanous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connnected Prometheus instances, look for potential senders of these requests and then contact support.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosqueryoverload
        summary: Thanos query reaches its maximum capacity serving concurrent requests.
      expr: |
        (
          max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
        )
      for: 15m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
{{- end }}