{
  local thanos = self,

  // Tunables for the Thanos Rule alert group. Consumers override these via
  // `rule+::`; `selector` has no usable default and must be provided.
  rule+:: {
    selector: error 'must provide selector for Thanos Rule alerts',
    grpcErrorThreshold: 5,
    rulerDnsErrorThreshold: 1,
    alertManagerDnsErrorThreshold: 1,
    evalErrorThreshold: 5,
    // Aggregation labels: any configured target-group labels plus job/instance.
    dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job', 'instance']),
  },

  prometheusAlerts+:: {
    groups+: if thanos.rule == null then [] else [
      // Optional " in <group>/<group>" suffix for alert descriptions, built
      // from the labels declared in thanos.targetGroups (empty when none).
      local groupLabels = std.objectFields(thanos.targetGroups);
      local locationSuffix =
        if std.length(groupLabels) > 0
        then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in groupLabels])
        else '';
      {
        name: 'thanos-rule',
        rules: [
          {
            alert: 'ThanosRuleQueueIsDroppingAlerts',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s is failing to queue alerts.' % locationSuffix,
              summary: 'Thanos Rule is failing to queue alerts.',
            },
            expr: |||
              sum by (%(dimensions)s) (rate(thanos_alert_queue_alerts_dropped_total{%(selector)s}[5m])) > 0
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosRuleSenderIsFailingAlerts',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s is failing to send alerts to alertmanager.' % locationSuffix,
              summary: 'Thanos Rule is failing to send alerts to alertmanager.',
            },
            expr: |||
              sum by (%(dimensions)s) (rate(thanos_alert_sender_alerts_dropped_total{%(selector)s}[5m])) > 0
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosRuleHighRuleEvaluationFailures',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s is failing to evaluate rules.' % locationSuffix,
              summary: 'Thanos Rule is failing to evaluate rules.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(prometheus_rule_evaluation_failures_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m]))
              * 100 > %(evalErrorThreshold)s
              )
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosRuleHighRuleEvaluationWarnings',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s has high number of evaluation warnings.' % locationSuffix,
              summary: 'Thanos Rule has high number of evaluation warnings.',
            },
            expr: |||
              sum by (%(dimensions)s) (rate(thanos_rule_evaluation_with_warnings_total{%(selector)s}[5m])) > 0
            ||| % thanos.rule,
            'for': '15m',
            labels: {
              severity: 'info',
            },
          },
          {
            alert: 'ThanosRuleRuleEvaluationLatencyHigh',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s has higher evaluation latency than interval for {{$labels.rule_group}}.' % locationSuffix,
              summary: 'Thanos Rule has high rule evaluation latency.',
            },
            expr: |||
              (
                sum by (%(dimensions)s, rule_group) (prometheus_rule_group_last_duration_seconds{%(selector)s})
              >
                sum by (%(dimensions)s, rule_group) (prometheus_rule_group_interval_seconds{%(selector)s})
              )
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosRuleGrpcErrorRate',
            annotations: {
              description: 'Thanos Rule {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % locationSuffix,
              summary: 'Thanos Rule is failing to handle grpc requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(grpc_server_started_total{%(selector)s}[5m]))
              * 100 > %(grpcErrorThreshold)s
              )
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosRuleConfigReloadFailure',
            annotations: {
              description: 'Thanos Rule {{$labels.job}}%s has not been able to reload its configuration.' % locationSuffix,
              summary: 'Thanos Rule has not been able to reload configuration.',
            },
            expr: 'avg by (%(dimensions)s) (thanos_rule_config_last_reload_successful{%(selector)s}) != 1' % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'info',
            },
          },
          {
            alert: 'ThanosRuleQueryHighDNSFailures',
            annotations: {
              description: 'Thanos Rule {{$labels.job}}%s has {{$value | humanize}}%% of failing DNS queries for query endpoints.' % locationSuffix,
              summary: 'Thanos Rule is having high number of DNS failures.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(thanos_rule_query_apis_dns_failures_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_rule_query_apis_dns_lookups_total{%(selector)s}[5m]))
              * 100 > %(rulerDnsErrorThreshold)s
              )
            ||| % thanos.rule,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosRuleAlertmanagerHighDNSFailures',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s has {{$value | humanize}}%% of failing DNS queries for Alertmanager endpoints.' % locationSuffix,
              summary: 'Thanos Rule is having high number of DNS failures.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(thanos_rule_alertmanagers_dns_failures_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_rule_alertmanagers_dns_lookups_total{%(selector)s}[5m]))
              * 100 > %(alertManagerDnsErrorThreshold)s
              )
            ||| % thanos.rule,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
          {
            // NOTE: This alert will give false positive if no rules are configured.
            alert: 'ThanosRuleNoEvaluationFor10Intervals',
            annotations: {
              description: 'Thanos Rule {{$labels.job}}%s has rule groups that did not evaluate for at least 10x of their expected interval.' % locationSuffix,
              summary: 'Thanos Rule has rule groups that did not evaluate for 10 intervals.',
            },
            expr: |||
              time() - max by (%(dimensions)s, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{%(selector)s})
              >
              10 * max by (%(dimensions)s, group) (prometheus_rule_group_interval_seconds{%(selector)s})
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              // TODO(bwplotka): Move to critical once we gain more confidence in this, it's not trivial as it looks.
              severity: 'info',
            },
          },
          {
            alert: 'ThanosNoRuleEvaluations',
            annotations: {
              description: 'Thanos Rule {{$labels.instance}}%s did not perform any rule evaluations in the past 10 minutes.' % locationSuffix,
              summary: 'Thanos Rule did not perform any rule evaluations.',
            },
            expr: |||
              sum by (%(dimensions)s) (rate(prometheus_rule_evaluations_total{%(selector)s}[5m])) <= 0
              and
              sum by (%(dimensions)s) (thanos_rule_loaded_rules{%(selector)s}) > 0
            ||| % thanos.rule,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
        ],
      },
    ],
  },
}