{{- /*
Generated from https://github.com/thanos-io/thanos/blob/main/examples/alerts/alerts.md

Chart template: platform-operator/thirdparty/charts/thanos/templates/alert-rule/receive.yml

Renders a PrometheusRule holding the "thanos-receive" alert group. The resource
is emitted only when all of the following hold:
  - .Values.metrics.enabled is set,
  - default rules are requested, either globally (prometheusRule.default.create)
    or for this component (prometheusRule.default.receive),
  - .Values.receive.enabled is set,
  - the cluster serves the monitoring.coreos.com/v1 API.

Each alert can be switched off individually through
.Values.metrics.prometheusRule.default.disabled.<AlertName>.

Local change: the summary of ThanosReceiveHighHashringFileRefreshFailures has its
"hasring" typo corrected to "hashring"; re-apply after regenerating this file.
*/ -}}
{{- if and .Values.metrics.enabled (or .Values.metrics.prometheusRule.default.create .Values.metrics.prometheusRule.default.receive ) .Values.receive.enabled ( .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ template "common.names.fullname" . }}-receive
  namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
  labels: {{- include "common.labels.standard" . | nindent 4 }}
    {{- if .Values.metrics.prometheusRule.additionalLabels }}
    {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
    {{- end }}
    {{- if .Values.commonLabels }}
    {{- include "common.tplvalues.render" ( dict "value" .Values.commonLabels "context" $ ) | nindent 4 }}
    {{- end }}
  {{- if .Values.commonAnnotations }}
  annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: thanos-receive
    rules:
    {{- /* Share of "receive" handler requests answered with a 5xx code exceeds 5% (5m rate), held for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHttpRequestErrorRateHigh | default false) }}
    - alert: ThanosReceiveHttpRequestErrorRateHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to handle {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequesterrorratehigh
        summary: Thanos Receive is failing to handle requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))
        ) * 100 > 5
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* 99th percentile latency of "receive" handler requests is above 10s while requests are still arriving, held for 10m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHttpRequestLatencyHigh | default false) }}
    - alert: ThanosReceiveHttpRequestLatencyHigh
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehttprequestlatencyhigh
        summary: Thanos Receive has high HTTP requests latency.
      expr: |
        (
          histogram_quantile(0.99, sum by (job, le, verrazzano_cluster) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10
        and
          sum by (job, verrazzano_cluster) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0
        )
      for: 10m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /*
    Replication error ratio compared against floor((replication_factor + 1) / 2) / hashring_nodes, held for 5m.
    NOTE(review): the "and" has no on(...) clause, so it matches on the full label set; the left side is
    unaggregated while the right side only carries job and verrazzano_cluster. Confirm the two sides can
    actually match, and that $value is the percentage the description expects ("and" keeps left-hand values).
    */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighReplicationFailures | default false) }}
    - alert: ThanosReceiveHighReplicationFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to replicate {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighreplicationfailures
        summary: Thanos Receive is having high number of replication failures.
      expr: |
        thanos_receive_replication_factor > 1
          and
        (
          (
            sum by (job, verrazzano_cluster) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m]))
          /
            sum by (job, verrazzano_cluster) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))
          )
          >
          (
            max by (job, verrazzano_cluster) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1) / 2))
          /
            max by (job, verrazzano_cluster) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})
          )
        ) * 100
      for: 5m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* More than 20% of forward requests end with result="error" (5m rate), held for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighForwardRequestFailures | default false) }}
    - alert: ThanosReceiveHighForwardRequestFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to forward {{`{{`}} $value | humanize {{`}}`}}% of requests.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighforwardrequestfailures
        summary: Thanos Receive is failing to forward requests.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))
        ) * 100 > 20
      for: 5m
      labels:
        severity: info
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Any non-zero ratio of hashring file errors to hashring file refreshes (5m rate), held for 15m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveHighHashringFileRefreshFailures | default false) }}
    - alert: ThanosReceiveHighHashringFileRefreshFailures
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} is failing to refresh hashring file, {{`{{`}} $value | humanize {{`}}`}} of attempts failed.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivehighhashringfilerefreshfailures
        summary: Thanos Receive is failing to refresh hashring file.
      expr: |
        (
          sum by (job, verrazzano_cluster) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m]))
        /
          sum by (job, verrazzano_cluster) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m]))
        > 0
        )
      for: 15m
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Average of the "last config reload successful" gauge per job differs from 1, held for 5m. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveConfigReloadFailure | default false) }}
    - alert: ThanosReceiveConfigReloadFailure
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.job {{`}}`}} has not been able to reload hashring configurations.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceiveconfigreloadfailure
        summary: Thanos Receive has not been able to reload configuration.
      expr: avg by (job, verrazzano_cluster) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1
      for: 5m
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* Per instance: the shipper upload counter did not increase over the last 3h, held for 3h. The "up" term restricts the result to instances that currently have an "up" series. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveNoUpload | default false) }}
    - alert: ThanosReceiveNoUpload
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: Thanos Receive {{`{{`}} $labels.instance {{`}}`}} has not uploaded latest data to object storage.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivenoupload
        summary: Thanos Receive has not uploaded latest data to object storage.
      expr: |
        (up{job=~".*thanos-receive.*"} - 1)
        + on (job, instance, verrazzano_cluster) # filters to only alert on current instance last 3h
        (sum by (job, instance, verrazzano_cluster) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)
      for: 3h
      labels:
        severity: critical
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
    {{- /* 1h average of the successful (2xx) "receive" request rate is below 50% of its 12h average, held for 1h. */}}
    {{- if not (.Values.metrics.prometheusRule.default.disabled.ThanosReceiveTrafficBelowThreshold | default false) }}
    - alert: ThanosReceiveTrafficBelowThreshold
      annotations:
        {{- if .Values.commonAnnotations }}
        {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 8 }}
        {{- end }}
        description: At Thanos Receive {{`{{`}} $labels.job {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} , the average 1-hr avg. metrics ingestion rate is {{`{{`}} $value | humanize {{`}}`}}% of 12-hr avg. ingestion rate.
        runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanosreceivetrafficbelowthreshold
        summary: Thanos Receive is experiencing low avg. 1-hr ingestion rate relative to avg. 12-hr ingestion rate.
      expr: |
        (
          avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[1h:5m])
        /
          avg_over_time(rate(http_requests_total{job=~".*thanos-receive.*", code=~"2..", handler="receive"}[5m])[12h:5m])
        ) * 100 < 50
      for: 1h
      labels:
        severity: warning
        {{- if .Values.metrics.prometheusRule.additionalLabels }}
        {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 8 }}
        {{- end }}
    {{- end }}
{{- end }}