github.com/thanos-io/thanos@v0.32.5/mixin/alerts/query.libsonnet

{
  local thanos = self,
  query+:: {
    selector: error 'must provide selector for Thanos Query alerts',
    httpErrorThreshold: 5,
    grpcErrorThreshold: 5,
    dnsErrorThreshold: 1,
    p99QueryLatencyThreshold: 40,
    p99QueryRangeLatencyThreshold: 90,
    dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),
  },
  prometheusAlerts+:: {
    groups+: if thanos.query == null then [] else [
      local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '';
      {
        name: 'thanos-query',
        rules: [
          {
            alert: 'ThanosQueryHttpRequestQueryErrorRateHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of "query" requests.' % location,
              summary: 'Thanos Query is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m]))
              /
                sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="query"}[5m]))
              ) * 100 > %(httpErrorThreshold)s
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryHttpRequestQueryRangeErrorRateHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of "query_range" requests.' % location,
              summary: 'Thanos Query is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m]))
              /
                sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="query_range"}[5m]))
              ) * 100 > %(httpErrorThreshold)s
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryGrpcServerErrorRate',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Query is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
              * 100 > %(grpcErrorThreshold)s
              )
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosQueryGrpcClientErrorRate',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to send {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Query is failing to send requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(grpc_client_started_total{%(selector)s}[5m]))
              ) * 100 > %(grpcErrorThreshold)s
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosQueryHighDNSFailures',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing {{$value | humanize}}%% of DNS queries for store endpoints.' % location,
              summary: 'Thanos Query is experiencing a high number of DNS failures.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_failures_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_lookups_total{%(selector)s}[5m]))
              ) * 100 > %(dnsErrorThreshold)s
            ||| % thanos.query,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosQueryInstantLatencyHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for instant queries.' % location,
              summary: 'Thanos Query has high latency for queries.',
            },
            expr: |||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > %(p99QueryLatencyThreshold)s
              and
                sum by (%(dimensions)s) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) > 0
              )
            ||| % thanos.query,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryRangeLatencyHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for range queries.' % location,
              summary: 'Thanos Query has high latency for queries.',
            },
            expr: |||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > %(p99QueryRangeLatencyThreshold)s
              and
                sum by (%(dimensions)s) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0
              )
            ||| % thanos.query,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryOverload',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.' % location,
              summary: 'Thanos Query has reached its maximum capacity for serving concurrent requests.',
            },
            expr: |||
              (
                max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
              )
            ||| % thanos.query,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}
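Usage sketch (not part of the upstream file): the object above only renders once a caller supplies the required selector (and, in the full mixin, targetGroups). The snippet below is a minimal, assumption-laden example of evaluating just this file on its own; the import path 'alerts/query.libsonnet', the selector value 'job=~"thanos-query.*"', and the file name example.jsonnet are illustrative, not values taken from the repository.

// example.jsonnet (hypothetical): render the 'thanos-query' alert group from
// this file alone, supplying the fields the full mixin normally provides.
local queryAlerts = import 'alerts/query.libsonnet';  // assumed import path

(queryAlerts {
  targetGroups:: {},                    // normally provided by the mixin's defaults
  query+:: {
    selector: 'job=~"thanos-query.*"',  // required; hypothetical label matcher
    httpErrorThreshold: 10,             // optional threshold override
  },
}).prometheusAlerts

Evaluating this with `jsonnet example.jsonnet` prints a JSON object whose groups array contains the 'thanos-query' rules defined above; that output can then be converted to YAML and loaded as a Prometheus rule file.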