// Source: github.com/thanos-io/thanos@v0.32.5/mixin/alerts/receive.libsonnet
//
// Prometheus alerting rules for Thanos Receive.
//
// This object is meant to be merged into a larger mixin object: it reads
// `targetGroups` from the enclosing object (via `thanos`) and contributes the
// hidden fields `receive` (tunable configuration) and `prometheusAlerts`
// (the generated rule group).
{
  local thanos = self,
  receive+:: {
    // Label matcher(s) spliced into every metric selector below. Evaluating any
    // rule raises this error unless the field is overridden.
    selector: error 'must provide selector for Thanos Receive alerts',
    // Percentage of 5xx responses on the "receive" handler above which
    // ThanosReceiveHttpRequestErrorRateHigh fires.
    httpErrorThreshold: 5,
    // Not referenced by any rule in this file; kept because it is part of the
    // configuration object that other files may read.
    ingestionThreshold: 50,
    // Percentage of failed forward requests above which
    // ThanosReceiveHighForwardRequestFailures fires.
    forwardErrorThreshold: 20,
    // Percentage of failed meta-monitoring queries above which
    // ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate fires.
    metaMonitoringErrorThreshold: 20,
    // Ratio (not a percentage) of hashring file errors to refreshes above which
    // ThanosReceiveHighHashringFileRefreshFailures fires.
    refreshErrorThreshold: 0,
    // Threshold compared against the p99 of http_request_duration_seconds.
    p99LatencyThreshold: 10,
    // Comma-separated aggregation labels: every field name of
    // `targetGroups`, followed by 'job'.
    dimensions: std.join(', ', std.objectFields(thanos.targetGroups) + ['job']),
  },
  prometheusAlerts+:: {
    groups+: if thanos.receive == null then [] else [
      // Suffix appended to descriptions, e.g. ' in {{$labels.a}}/{{$labels.b}}',
      // built from the target group label names; empty when there are none.
      local location = if std.length(std.objectFields(thanos.targetGroups)) > 0 then ' in %s' % std.join('/', ['{{$labels.%s}}' % level for level in std.objectFields(thanos.targetGroups)]) else '';
      {
        name: 'thanos-receive',
        rules: [
          {
            alert: 'ThanosReceiveHttpRequestErrorRateHigh',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s is failing to handle {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Receive is failing to handle requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(http_requests_total{code=~"5..", %(selector)s, handler="receive"}[5m]))
              /
                sum by (%(dimensions)s) (rate(http_requests_total{%(selector)s, handler="receive"}[5m]))
              ) * 100 > %(httpErrorThreshold)s
            ||| % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosReceiveHttpRequestLatencyHigh',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s has a 99th percentile latency of {{ $value }} seconds for requests.' % location,
              summary: 'Thanos Receive has high HTTP requests latency.',
            },
            // The second operand restricts the alert to series that are
            // actually receiving requests in the window.
            expr: |||
              (
                histogram_quantile(0.99, sum by (%(dimensions)s, le) (rate(http_request_duration_seconds_bucket{%(selector)s, handler="receive"}[5m]))) > %(p99LatencyThreshold)s
              and
                sum by (%(dimensions)s) (rate(http_request_duration_seconds_count{%(selector)s, handler="receive"}[5m])) > 0
              )
            ||| % thanos.receive,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosReceiveHighReplicationFailures',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s is failing to replicate {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Receive is having high number of replication failures.',
            },
            // The failure percentage is the LEFT operand of `and` so that
            // $value is the percentage printed in the description (`and`
            // keeps the left-hand samples). The replication-factor guard is
            // aggregated to the same dimensions, uses the configured selector,
            // and is matched with `on (...)`: a bare `and` requires identical
            // label sets, which a raw per-target gauge and an aggregated
            // ratio do not share.
            expr: |||
              (
                (
                  (
                    sum by (%(dimensions)s) (rate(thanos_receive_replications_total{result="error", %(selector)s}[5m]))
                  /
                    sum by (%(dimensions)s) (rate(thanos_receive_replications_total{%(selector)s}[5m]))
                  )
                  >
                  (
                    max by (%(dimensions)s) (floor((thanos_receive_replication_factor{%(selector)s}+1) / 2))
                  /
                    max by (%(dimensions)s) (thanos_receive_hashring_nodes{%(selector)s})
                  )
                ) * 100
              )
              and on (%(dimensions)s)
              (
                max by (%(dimensions)s) (thanos_receive_replication_factor{%(selector)s}) > 1
              )
            ||| % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosReceiveHighForwardRequestFailures',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s is failing to forward {{$value | humanize}}%% of requests.' % location,
              summary: 'Thanos Receive is failing to forward requests.',
            },
            expr: |||
              (
                sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{result="error", %(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_receive_forward_requests_total{%(selector)s}[5m]))
              ) * 100 > %(forwardErrorThreshold)s
            ||| % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'info',
            },
          },
          {
            alert: 'ThanosReceiveHighHashringFileRefreshFailures',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s is failing to refresh hashring file, {{$value | humanize}} of attempts failed.' % location,
              summary: 'Thanos Receive is failing to refresh hashring file.',
            },
            // Unlike the percentage-based rules, this compares the raw
            // errors/refreshes ratio against the threshold.
            expr: |||
              (
                sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_errors_total{%(selector)s}[5m]))
              /
                sum by (%(dimensions)s) (rate(thanos_receive_hashrings_file_refreshes_total{%(selector)s}[5m]))
              > %(refreshErrorThreshold)s
              )
            ||| % thanos.receive,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosReceiveConfigReloadFailure',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s has not been able to reload hashring configurations.' % location,
              summary: 'Thanos Receive has not been able to reload configuration.',
            },
            expr: 'avg by (%(dimensions)s) (thanos_receive_config_last_reload_successful{%(selector)s}) != 1' % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosReceiveNoUpload',
            annotations: {
              description: 'Thanos Receive {{$labels.instance}}%s has not uploaded latest data to object storage.' % location,
              summary: 'Thanos Receive has not uploaded latest data to object storage.',
            },
            // The addition joins `up` with the per-instance zero-upload series,
            // so a result exists only for instances present on both sides.
            expr: |||
              (up{%(selector)s} - 1)
              + on (%(dimensions)s, instance) # filters to only alert on current instance last 3h
              (sum by (%(dimensions)s, instance) (increase(thanos_shipper_uploads_total{%(selector)s}[3h])) == 0)
            ||| % thanos.receive,
            'for': '3h',
            labels: {
              severity: 'critical',
            },
          },
          {
            alert: 'ThanosReceiveLimitsConfigReloadFailure',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s has not been able to reload the limits configuration.' % location,
              summary: 'Thanos Receive has not been able to reload the limits configuration.',
            },
            expr: 'sum by(%(dimensions)s) (increase(thanos_receive_limits_config_reload_err_total{%(selector)s}[5m])) > 0' % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosReceiveLimitsHighMetaMonitoringQueriesFailureRate',
            annotations: {
              description: 'Thanos Receive {{$labels.job}}%s is failing for {{$value | humanize}}%% of meta monitoring queries.' % location,
              summary: 'Thanos Receive has not been able to update the number of head series.',
            },
            // Values are updated every 15s, 20 times over 5 minutes.
            expr: '(sum by(%(dimensions)s) (increase(thanos_receive_metamonitoring_failed_queries_total{%(selector)s}[5m])) / 20) * 100 > %(metaMonitoringErrorThreshold)s' % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
          {
            alert: 'ThanosReceiveTenantLimitedByHeadSeries',
            annotations: {
              description: 'Thanos Receive tenant {{$labels.tenant}}%s is limited by head series.' % location,
              summary: 'A Thanos Receive tenant is limited by head series.',
            },
            expr: 'sum by(%(dimensions)s, tenant) (increase(thanos_receive_head_series_limited_requests_total{%(selector)s}[5m])) > 0' % thanos.receive,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
          },
        ],
      },
    ],
  },
}