{{- /*
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /* NOTE(review): unlike the upstream chart, every `by (...)`/`on (...)` clause below
also aggregates on `verrazzano_cluster` — presumably an external label used by Verrazzano
for multi-cluster federation; confirm against the Verrazzano Prometheus configuration. */ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: alertmanager.rules
    rules:
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedReload | default false) }}
    - alert: AlertmanagerFailedReload
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerfailedreload
        summary: Reloading an Alertmanager configuration has failed.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerMembersInconsistent | default false) }}
    - alert: AlertmanagerMembersInconsistent
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagermembersinconsistent
        summary: A member of an Alertmanager cluster has not found all other cluster members.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
        < on (namespace,service, verrazzano_cluster) group_left
        count by (namespace,service, verrazzano_cluster) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]))
      for: 15m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerFailedToSendAlerts | default false) }}
    - alert: AlertmanagerFailedToSendAlerts
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerfailedtosendalerts
        summary: An Alertmanager instance failed to send notifications.
      expr: |-
        (
          rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
        /
          rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])
        )
        > 0.01
      for: 5m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
    - alert: AlertmanagerClusterFailedToSendAlerts
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
      expr: |-
        min by (namespace,service, integration, verrazzano_cluster) (
          rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
        /
          rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m])
        )
        > 0.01
      for: 5m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterFailedToSendAlerts | default false) }}
    - alert: AlertmanagerClusterFailedToSendAlerts
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterfailedtosendalerts
        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
      expr: |-
        min by (namespace,service, integration, verrazzano_cluster) (
          rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
        /
          rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m])
        )
        > 0.01
      for: 5m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerConfigInconsistent | default false) }}
    - alert: AlertmanagerConfigInconsistent
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerconfiginconsistent
        summary: Alertmanager instances within the same cluster have different configurations.
      expr: |-
        count by (namespace,service, verrazzano_cluster) (
          count_values by (namespace,service, verrazzano_cluster) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
        )
        != 1
      for: 20m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterDown | default false) }}
    - alert: AlertmanagerClusterDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclusterdown
        summary: Half or more of the Alertmanager instances within the same cluster are down.
      expr: |-
        (
          count by (namespace,service, verrazzano_cluster) (
            avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5
          )
        /
          count by (namespace,service, verrazzano_cluster) (
            up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
          )
        )
        >= 0.5
      for: 5m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.AlertmanagerClusterCrashlooping | default false) }}
    - alert: AlertmanagerClusterCrashlooping
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/alertmanager/alertmanagerclustercrashlooping
        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
      expr: |-
        (
          count by (namespace,service, verrazzano_cluster) (
            changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4
          )
        /
          count by (namespace,service, verrazzano_cluster) (
            up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
          )
        )
        >= 0.5
      for: 5m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}