github.com/thanos-io/thanos@v0.32.5/mixin/alerts/rule.libsonnet (about)

     1  {
     2    local thanos = self,
     3    rule+:: {  // Settings read by the Thanos Rule alert templates in this file; the *Threshold values are compared against `ratio * 100`.
     4      selector: error 'must provide selector for Thanos Rule alerts',  // must be overridden; spliced inside the `{...}` of every metric selector
     5      dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job', 'instance']),  // label list used in the `by (...)` clauses
     6      evalErrorThreshold: 5,  // percent, used by ThanosRuleHighRuleEvaluationFailures
     7      grpcErrorThreshold: 5,  // percent, used by ThanosRuleGrpcErrorRate
     8      rulerDnsErrorThreshold: 1,  // percent, used by ThanosRuleQueryHighDNSFailures
     9      alertManagerDnsErrorThreshold: 1,  // percent, used by ThanosRuleAlertmanagerHighDNSFailures
    10    },
    11    prometheusAlerts+:: {
    12      groups+: if thanos.rule == null then [] else [
    13        local location = if std.length(std.objectFields(thanos.targetGroups)) == 0 then '' else std.format(' in %s', std.join('/', [std.format('{{$labels.%s}}', name) for name in std.objectFields(thanos.targetGroups)]));  // '' without target groups, else ' in {{$labels.<field>}}/...' built from the targetGroups field names
    14        {
    15          name: 'thanos-rule',
    16          rules: [
    17            {
    18              // Critical alert: the 5m rate of thanos_alert_queue_alerts_dropped_total stays above zero.
    19              alert: 'ThanosRuleQueueIsDroppingAlerts',
    20              expr: std.format(|||
    21                sum by (%(dimensions)s) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0
    22              |||, thanos.rule),
    23              'for': '5m',
    24              labels: { severity: 'critical' },
    25              annotations: {
    26                summary: 'Thanos Rule is failing to queue alerts.',
    27                // `location` is substituted for %s, directly after the instance label.
    28                description: std.format('Thanos Rule {{$labels.instance}}%s is failing to queue alerts.', location),
    29              },
    30            },
    31            {
    32              // Critical alert: the 5m rate of thanos_alert_sender_alerts_dropped_total stays above zero.
    33              alert: 'ThanosRuleSenderIsFailingAlerts',
    34              expr: std.format(|||
    35                sum by (%(dimensions)s) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0
    36              |||, thanos.rule),
    37              'for': '5m',
    38              labels: { severity: 'critical' },
    39              annotations: {
    40                summary: 'Thanos Rule is failing to send alerts to alertmanager.',
    41                // `location` is substituted for %s, directly after the instance label.
    42                description: std.format('Thanos Rule {{$labels.instance}}%s is failing to send alerts to alertmanager.', location),
    43              },
    44            },
    45            {
    46              // Critical alert: failed rule evaluations as a percentage of all evaluations (5m rates)
    47              // exceed `evalErrorThreshold`; the ratio is computed per `dimensions` label set.
    48              alert: 'ThanosRuleHighRuleEvaluationFailures',
    49              expr: std.format(|||
    50                (
    51                  sum by (%(dimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m]))
    52                /
    53                  sum by (%(dimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m]))
    54                * 100 > %(evalErrorThreshold)s
    55                )
    56              |||, thanos.rule),
    57              'for': '5m',
    58              labels: { severity: 'critical' },
    59              annotations: {
    60                summary: 'Thanos Rule is failing to evaluate rules.',
    61                // `location` is substituted for %s, directly after the instance label.
    62                description: std.format('Thanos Rule {{$labels.instance}}%s is failing to evaluate rules.', location),
    63              },
    64            },
    65            {
    66              // Informational alert: the 5m rate of thanos_rule_evaluation_with_warnings_total
    67              // stays above zero for 15 minutes.
    68              alert: 'ThanosRuleHighRuleEvaluationWarnings',
    69              expr: std.format(|||
    70                sum by (%(dimensions)s) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0
    71              |||, thanos.rule),
    72              'for': '15m',
    73              labels: { severity: 'info' },
    74              annotations: {
    75                summary: 'Thanos Rule has high number of evaluation warnings.',
    76                // `location` is substituted for %s, directly after the instance label.
    77                description: std.format('Thanos Rule {{$labels.instance}}%s has high number of evaluation warnings.', location),
    78              },
    79            },
    80            {
    81              // Warning alert: per rule_group, the last evaluation duration exceeds the group's interval.
    82              alert: 'ThanosRuleRuleEvaluationLatencyHigh',
    83              expr: std.format(|||
    84                (
    85                  sum by (%(dimensions)s, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
    86                >
    87                  sum by (%(dimensions)s, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
    88                )
    89              |||, thanos.rule),
    90              'for': '5m',
    91              labels: { severity: 'warning' },
    92              annotations: {
    93                summary: 'Thanos Rule has high rule evaluation latency.',
    94                // `location` is substituted for %s; the rule_group label survives the `by` clause of the expression.
    95                description: std.format('Thanos Rule {{$labels.instance}}%s has higher evaluation latency than interval for {{$labels.rule_group}}.', location),
    96              },
    97            },
    98            {
    99              // Warning alert: gRPC requests handled with one of the listed error codes, as a percentage of started requests, exceed `grpcErrorThreshold`.
   100              alert: 'ThanosRuleGrpcErrorRate',
   101              expr: std.format(|||
   102                (
   103                  sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
   104                /
   105                  sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
   106                * 100 > %(grpcErrorThreshold)s
   107                )
   108              |||, thanos.rule),
   109              'for': '5m',
   110              labels: { severity: 'warning' },
   111              annotations: {
   112                summary: 'Thanos Rule is failing to handle grpc requests.',
   113                // `location` is substituted for %s; `%%` formats to a literal percent sign.
   114                description: std.format('Thanos Rule {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.', location),
   115              },
   116            },
   117            {
   118              // Informational alert: the averaged thanos_rule_config_last_reload_successful gauge is not 1.
   119              alert: 'ThanosRuleConfigReloadFailure',
   120              expr: std.format('avg by (%(dimensions)s) (thanos_rule_config_last_reload_successful{%(selector)s}) != 1', thanos.rule),
   121              'for': '5m',
   122              labels: { severity: 'info' },
   123              annotations: {
   124                summary: 'Thanos Rule has not been able to reload configuration.',
   125                // `location` is substituted for %s, directly after the job label.
   126                description: std.format('Thanos Rule {{$labels.job}}%s has not been able to reload its configuration.', location),
   127              },
   128            },
   129            {
   130              // Warning alert: failed DNS lookups for query APIs, as a percentage of all such lookups (5m rates), exceed `rulerDnsErrorThreshold`.
   131              alert: 'ThanosRuleQueryHighDNSFailures',
   132              expr: std.format(|||
   133                (
   134                  sum by (%(dimensions)s) (rate(thanos_rule_query_apis_dns_failures_total{%(selector)s}[5m]))
   135                /
   136                  sum by (%(dimensions)s) (rate(thanos_rule_query_apis_dns_lookups_total{%(selector)s}[5m]))
   137                * 100 > %(rulerDnsErrorThreshold)s
   138                )
   139              |||, thanos.rule),
   140              'for': '15m',
   141              labels: { severity: 'warning' },
   142              annotations: {
   143                summary: 'Thanos Rule is having high number of DNS failures.',
   144                // `location` is substituted for %s; `%%` formats to a literal percent sign.
   145                description: std.format('Thanos Rule {{$labels.job}}%s has {{$value | humanize}}%% of failing DNS queries for query endpoints.', location),
   146              },
   147            },
   148            {
   149              // Warning alert: failed DNS lookups for Alertmanagers, as a percentage of all such lookups (5m rates), exceed `alertManagerDnsErrorThreshold`.
   150              alert: 'ThanosRuleAlertmanagerHighDNSFailures',
   151              expr: std.format(|||
   152                (
   153                  sum by (%(dimensions)s) (rate(thanos_rule_alertmanagers_dns_failures_total{%(selector)s}[5m]))
   154                /
   155                  sum by (%(dimensions)s) (rate(thanos_rule_alertmanagers_dns_lookups_total{%(selector)s}[5m]))
   156                * 100 > %(alertManagerDnsErrorThreshold)s
   157                )
   158              |||, thanos.rule),
   159              'for': '15m',
   160              labels: { severity: 'warning' },
   161              annotations: {
   162                summary: 'Thanos Rule is having high number of DNS failures.',
   163                // `location` is substituted for %s; `%%` formats to a literal percent sign.
   164                description: std.format('Thanos Rule {{$labels.instance}}%s has {{$value | humanize}}%% of failing DNS queries for Alertmanager endpoints.', location),
   165              },
   166            },
   167            {
   168              // NOTE: This alert will give false positive if no rules are configured.
   169              // Fires when a rule group's last evaluation is older than 10x that group's interval.
   170              alert: 'ThanosRuleNoEvaluationFor10Intervals',
   171              annotations: {
   172                description: 'Thanos Rule {{$labels.job}}%s has rule groups that did not evaluate for at least 10x of their expected interval.' % location,
   173                summary: 'Thanos Rule has rule groups that did not evaluate for 10 intervals.',
   174              },
   175              // Group by `rule_group`, the label ThanosRuleRuleEvaluationLatencyHigh uses for these metrics, so each group is checked against its own interval.
   176              expr: |||
   177                time() -  max by (%(dimensions)s, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{%(selector)s})
   178                >
   179                10 * max by (%(dimensions)s, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
   180              ||| % thanos.rule,
   181              'for': '5m',
   182              // TODO(bwplotka): Move to critical once we gain more confidence in this, it's not trivial as it looks.
   183              labels: { severity: 'info' },
   184            },
   185            {
   186              // Critical alert: rules are loaded (thanos_rule_loaded_rules > 0) while the 5m evaluation rate is not above zero.
   187              alert: 'ThanosNoRuleEvaluations',
   188              expr: std.format(|||
   189                sum by (%(dimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m])) <= 0
   190                  and
   191                sum by (%(dimensions)s) (thanos_rule_loaded_rules{%(selector)s}) > 0
   192              |||, thanos.rule),
   193              'for': '5m',
   194              labels: { severity: 'critical' },
   195              annotations: {
   196                summary: 'Thanos Rule did not perform any rule evaluations.',
   197                // `location` is substituted for %s, directly after the instance label.
   198                description: std.format('Thanos Rule {{$labels.instance}}%s did not perform any rule evaluations in the past 10 minutes.', location),
   199              },
   200            },
   201          ],
   202        },
   203      ],
   204    },
   205  }