github.com/thanos-io/thanos@v0.32.5/mixin/alerts/query.libsonnet

{
  local thanos = self,
  query+:: {
    selector: error 'must provide selector for Thanos Query alerts',
    httpErrorThreshold: 5,
    grpcErrorThreshold: 5,
    dnsErrorThreshold: 1,
    p99QueryLatencyThreshold: 40,
    p99QueryRangeLatencyThreshold: 90,
    dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),
  },
  prometheusAlerts+:: {
    groups+: if thanos.query == null then [] else [
      local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '';
      {
        name: 'thanos-query',
        rules: [
          {
            alert: 'ThanosQueryHttpRequestQueryErrorRateHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of "query" requests.' % location,
              summary: 'Thanos Query is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="query"}[5m]))
              /
                sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="query"}[5m]))
              ) * 100 > %(httpErrorThreshold)s
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryHttpRequestQueryRangeErrorRateHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of "query_range" requests.' % location,
              summary: 'Thanos Query is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="query_range"}[5m]))
              /
                sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="query_range"}[5m]))
              ) * 100 > %(httpErrorThreshold)s
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryGrpcServerErrorRate',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Query is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
              * 100 > %(grpcErrorThreshold)s
              )
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosQueryGrpcClientErrorRate',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing to send {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Query is failing to send requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(grpc_client_handled_total{grpc_code!="OK", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(grpc_client_started_total{%(selector)s}[5m]))
              ) * 100 > %(grpcErrorThreshold)s
            ||| % thanos.query,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosQueryHighDNSFailures',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s is failing {{$value | humanize}}%% of DNS queries for store endpoints.' % location,
              summary: 'Thanos Query is experiencing a high number of DNS failures.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_failures_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_query_store_apis_dns_lookups_total{%(selector)s}[5m]))
              ) * 100 > %(dnsErrorThreshold)s
            ||| % thanos.query,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosQueryInstantLatencyHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for instant queries.' % location,
              summary: 'Thanos Query has high latency for queries.',
            },
            expr: |||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m]))) > %(p99QueryLatencyThreshold)s
              and
                sum by (%(dimensions)s) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query"}[5m])) > 0
              )
            ||| % thanos.query,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryRangeLatencyHigh',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s has a 99th percentile latency of {{$value}} seconds for range queries.' % location,
              summary: 'Thanos Query has high latency for queries.',
            },
            expr: |||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="query_range"}[5m]))) > %(p99QueryRangeLatencyThreshold)s
              and
                sum by (%(dimensions)s) (rate(http_request_duration_seconds_count{%(selector)s, handler="query_range"}[5m])) > 0
              )
            ||| % thanos.query,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosQueryOverload',
            annotations: {
              description: 'Thanos Query {{$labels.job}}%s has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos Query instances and the connected Prometheus instances, look for potential senders of these requests, and then contact support.' % location,
              summary: 'Thanos Query has reached its maximum capacity for serving concurrent requests.',
            },
            expr: |||
              (
                max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1
              )
            ||| % thanos.query,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}
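Usage sketch (not part of the upstream file): the object above only renders once a caller supplies the required selector (and, in the full mixin, targetGroups). The snippet below is a minimal, assumption-laden example of evaluating just this file on its own; the import path 'alerts/query.libsonnet', the selector value 'job=~"thanos-query.*"', and the file name example.jsonnet are illustrative, not values taken from the repository.

// example.jsonnet (hypothetical): render the 'thanos-query' alert group from
// this file alone, supplying the fields the full mixin normally provides.
local queryAlerts = import 'alerts/query.libsonnet';  // assumed import path

(queryAlerts {
  targetGroups:: {},                    // normally provided by the mixin's defaults
  query+:: {
    selector: 'job=~"thanos-query.*"',  // required; hypothetical label matcher
    httpErrorThreshold: 10,             // optional threshold override
  },
}).prometheusAlerts

Evaluating this with `jsonnet example.jsonnet` prints a JSON object whose groups array contains the 'thanos-query' rules defined above; that output can then be converted to YAML and loaded as a Prometheus rule file.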