github.com/thanos-io/thanos@v0.32.5/mixin/alerts/receive.libsonnet (about)

     1  {
     2    local thanos = self,
     3    receive+:: {
     4      selector: error 'must provide selector for Thanos Receive alerts',
     5      httpErrorThreshold: 5,
     6      ingestionThreshold: 50,
     7      forwardErrorThreshold: 20,
     8      metaMonitoringErrorThreshold: 20,
     9      refreshErrorThreshold: 0,
    10      p99LatencyThreshold: 10,
    11      dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),
    12    },
    13    prometheusAlerts+:: {
    14      groups+: if thanos.receive == null then [] else [
    15        local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '';
    16        {
    17          name: 'thanos-receive',
    18          rules: [
    19            {
    20              alert: 'ThanosReceiveHttpRequestErrorRateHigh',
    21              annotations: {
    22                description: 'Thanos Receive {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,
    23                summary: 'Thanos Receive is failing to handle requests.',
    24              },
    25              expr: |||
    26                (
    27                  sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m]))
    28                /
    29                  sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="receive"}[5m]))
    30                ) * 100 > %(httpErrorThreshold)s
    31              ||| % thanos.receive,
    32              'for': '5m',
    33              labels: {
    34                severity: 'critical',
    35              },
    36            },
    37            {
    38              alert: 'ThanosReceiveHttpRequestLatencyHigh',
    39              annotations: {
    40                description: 'Thanos Receive {{$labels.job}}%s has a 99th percentile latency of {{ $value }} seconds for requests.' % location,
    41                summary: 'Thanos Receive has high HTTP requests latency.',
    42              },
    43              expr: |||
    44                (
    45                  histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > %(p99LatencyThreshold)s
    46                and
    47                  sum by (%(dimensions)s) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0
    48                )
    49              ||| % thanos.receive,
    50              'for': '10m',
    51              labels: {
    52                severity: 'critical',
    53              },
    54            },
    55            {
    56              alert: 'ThanosReceiveHighReplicationFailures',
    57              annotations: {
    58                description: 'Thanos Receive {{$labels.job}}%s is failing to replicate {{$value | humanize}}%% of requests.' % location,
    59                summary: 'Thanos Receive is having high number of replication failures.',
    60              },
    61              expr: |||
    62                thanos_receive_replication_factor > 1
    63                  and
    64                (
    65                  (
    66                    sum by (%(dimensions)s) (rate(thanos_receive_replications_total{result="error", %(selector)s}[5m]))
    67                  /
    68                    sum by (%(dimensions)s) (rate(thanos_receive_replications_total{%(selector)s}[5m]))
    69                  )
    70                  >
    71                  (
    72                    max by (%(dimensions)s) (floor((thanos_receive_replication_factor{%(selector)s}+1) / 2))
    73                  /
    74                    max by (%(dimensions)s) (thanos_receive_hashring_nodes{%(selector)s})
    75                  )
    76                ) * 100
    77              ||| % thanos.receive,
    78              'for': '5m',
    79              labels: {
    80                severity: 'warning',
    81              },
    82            },
    83            {
    84              alert: 'ThanosReceiveHighForwardRequestFailures',
    85              annotations: {
    86                description: 'Thanos Receive {{$labels.job}}%s is failing to forward {{$value | humanize}}%% of requests.' % location,
    87                summary: 'Thanos Receive is failing to forward requests.',
    88              },
    89              expr: |||
    90                (
    91                  sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m]))
    92                /
    93                  sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m]))
    94                ) * 100 > %(forwardErrorThreshold)s
    95              ||| % thanos.receive,
    96              'for': '5m',
    97              labels: {
    98                severity: 'info',
    99              },
   100            },
   101            {
   102              alert: 'ThanosReceiveHighHashringFileRefreshFailures',
   103              annotations: {
   104                description: 'Thanos Receive {{$labels.job}}%s is failing to refresh hashring file, {{$value | humanize}} of attempts failed.' % location,
   105                summary: 'Thanos Receive is failing to refresh hasring file.',
   106              },
   107              expr: |||
   108                (
   109                  sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m]))
   110                /
   111                  sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m]))
   112                > %(refreshErrorThreshold)s
   113                )
   114              ||| % thanos.receive,
   115              'for': '15m',
   116              labels: {
   117                severity: 'warning',
   118              },
   119            },
   120            {
   121              alert: 'ThanosReceiveConfigReloadFailure',
   122              annotations: {
   123                description: 'Thanos Receive {{$labels.job}}%s has not been able to reload hashring configurations.' % location,
   124                summary: 'Thanos Receive has not been able to reload configuration.',
   125              },
   126              expr: 'avg by (%(dimensions)s) (thanos_receive_config_last_reload_successful{%(selector)s}) != 1' % thanos.receive,
   127              'for': '5m',
   128              labels: {
   129                severity: 'warning',
   130              },
   131            },
   132            {
   133              alert: 'ThanosReceiveNoUpload',
   134              annotations: {
   135                description: 'Thanos Receive {{$labels.instance}}%s has not uploaded latest data to object storage.' % location,
   136                summary: 'Thanos Receive has not uploaded latest data to object storage.',
   137              },
   138              expr: |||
   139                (up{%(selector)s} - 1)
   140                + on (%(dimensions)s, instance) # filters to only alert on current instance last 3h
   141                (sum by (%(dimensions)s, instance) (increase(thanos_shipper_uploads_total{%(selector)s}[3h])) == 0)
   142              ||| % thanos.receive,
   143              'for': '3h',
   144              labels: {
   145                severity: 'critical',
   146              },
   147            },
   148            {
   149              alert: 'ThanosReceiveLimitsConfigReloadFailure',
   150              annotations: {
   151                description: 'Thanos Receive {{$labels.job}}%s has not been able to reload the limits configuration.' % location,
   152                summary: 'Thanos Receive has not been able to reload the limits configuration.',
   153              },
   154              expr: 'sum by(%(dimensions)s) (increase(thanos_receive_limits_config_reload_err_total{%(selector)s}[5m])) > 0' % thanos.receive,
   155              'for': '5m',
   156              labels: {
   157                severity: 'warning',
   158              },
   159            },
   160            {
   161              alert: 'ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate',
   162              annotations: {
   163                description: 'Thanos Receive {{$labels.job}}%s is failing for {{$value | humanize}}%% of meta monitoring queries.' % location,
   164                summary: 'Thanos Receive has not been able to update the number of head series.',
   165              },
   166              // Values are updated every 15s, 20 times over 5 minutes.
   167              expr: '(sum by(%(dimensions)s) (increase(thanos_receive_metamonitoring_failed_queries_total{%(selector)s}[5m])) / 20) * 100 > %(metaMonitoringErrorThreshold)s' % thanos.receive,
   168              'for': '5m',
   169              labels: {
   170                severity: 'warning',
   171              },
   172            },
   173            {
   174              alert: 'ThanosReceiveTenantLimitedByHeadSeries',
   175              annotations: {
   176                description: 'Thanos Receive tenant {{$labels.tenant}}%s is limited by head series.' % location,
   177                summary: 'A Thanos Receive tenant is limited by head series.',
   178              },
   179              expr: 'sum by(%(dimensions)s, tenant) (increase(thanos_receive_head_series_limited_requests_total{%(selector)s}[5m])) > 0' % thanos.receive,
   180              'for': '5m',
   181              labels: {
   182                severity: 'warning',
   183              },
   184            },
   185          ],
   186        },
   187      ],
   188    },
   189  }