github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/prometheus-community/kube-prometheus-stack/templates/prometheus/rules-1.14/prometheus.yaml

{{- /*
Generated from 'prometheus' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml
Do not change in-place! In order to change this file, first read the following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }}
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: prometheus
    rules:
{{- if not (.Values.defaultRules.disabled.PrometheusBadConfig | default false) }}
    - alert: PrometheusBadConfig
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to reload its configuration.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusbadconfig
        summary: Failed Prometheus configuration reload.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusNotificationQueueRunningFull | default false) }}
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Alert notification queue of Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is running full.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotificationqueuerunningfull
        summary: Prometheus alert notification queue predicted to run full in less than 30m.
      expr: |-
        # Without min_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30)
        >
          min_over_time(prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusErrorSendingAlertsToSomeAlertmanagers | default false) }}
    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.alertmanager{{`}}`}}.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuserrorsendingalertstosomealertmanagers
        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
      expr: |-
        (
          rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusNotConnectedToAlertmanagers | default false) }}
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not connected to any Alertmanagers.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotconnectedtoalertmanagers
        summary: Prometheus is not connected to any Alertmanagers.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) < 1
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTSDBReloadsFailing | default false) }}
    - alert: PrometheusTSDBReloadsFailing
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} reload failures over the last 3h.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbreloadsfailing
        summary: Prometheus has issues reloading blocks from disk.
      expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTSDBCompactionsFailing | default false) }}
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has detected {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last 3h.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustsdbcompactionsfailing
        summary: Prometheus has issues compacting blocks.
      expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusNotIngestingSamples | default false) }}
    - alert: PrometheusNotIngestingSamples
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is not ingesting samples.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusnotingestingsamples
        summary: Prometheus is not ingesting samples.
      expr: |-
        (
          rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
        and
          (
            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
          or
            sum without(rule_group) (prometheus_rule_group_rules{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}) > 0
          )
        )
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusDuplicateTimestamps | default false) }}
    - alert: PrometheusDuplicateTimestamps
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value  {{`}}`}} samples/s with different values but duplicated timestamp.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusduplicatetimestamps
        summary: Prometheus is dropping samples with duplicate timestamps.
      expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusOutOfOrderTimestamps | default false) }}
    - alert: PrometheusOutOfOrderTimestamps
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} is dropping {{`{{`}} printf "%.4g" $value  {{`}}`}} samples/s with timestamps arriving out of order.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusoutofordertimestamps
        summary: Prometheus drops samples with out-of-order timestamps.
      expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteStorageFailures | default false) }}
    - alert: PrometheusRemoteStorageFailures
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} failed to send {{`{{`}} printf "%.1f" $value {{`}}`}}% of the samples to {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotestoragefailures
        summary: Prometheus fails to send samples to remote storage.
      expr: |-
        (
          (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
        /
          (
            (rate(prometheus_remote_storage_failed_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
          +
            (rate(prometheus_remote_storage_succeeded_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]))
          )
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteWriteBehind | default false) }}
    - alert: PrometheusRemoteWriteBehind
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write is {{`{{`}} printf "%.1f" $value {{`}}`}}s behind for {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotewritebehind
        summary: Prometheus remote write is behind.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        - ignoring(remote_name, url) group_right
          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
        > 120
      for: 15m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRemoteWriteDesiredShards | default false) }}
    - alert: PrometheusRemoteWriteDesiredShards
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} remote write desired shards calculation wants to run {{`{{`}} $value {{`}}`}} shards for queue {{`{{`}} $labels.remote_name{{`}}`}}:{{`{{`}} $labels.url {{`}}`}}, which is more than the max of {{`{{`}} printf `prometheus_remote_storage_shards_max{instance="%s",job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}` $labels.instance | query | first | value {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusremotewritedesiredshards
        summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_shards_desired{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        >
          max_over_time(prometheus_remote_storage_shards_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m])
        )
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusRuleFailures | default false) }}
    - alert: PrometheusRuleFailures
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed to evaluate {{`{{`}} printf "%.0f" $value {{`}}`}} rules in the last 5m.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusrulefailures
        summary: Prometheus is failing rule evaluations.
      expr: increase(prometheus_rule_evaluation_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 15m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusMissingRuleEvaluations | default false) }}
    - alert: PrometheusMissingRuleEvaluations
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has missed {{`{{`}} printf "%.0f" $value {{`}}`}} rule group evaluations in the last 5m.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusmissingruleevaluations
        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
      expr: increase(prometheus_rule_group_iterations_missed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTargetLimitHit | default false) }}
    - alert: PrometheusTargetLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because the number of targets exceeded the configured target_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetlimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
      expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusLabelLimitHit | default false) }}
    - alert: PrometheusLabelLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has dropped {{`{{`}} printf "%.0f" $value {{`}}`}} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuslabellimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
      expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusScrapeBodySizeLimitHit | default false) }}
    - alert: PrometheusScrapeBodySizeLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapebodysizelimithit
        summary: Prometheus has dropped some targets that exceeded body size limit.
      expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusScrapeSampleLimitHit | default false) }}
    - alert: PrometheusScrapeSampleLimitHit
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has failed {{`{{`}} printf "%.0f" $value {{`}}`}} scrapes in the last 5m because some targets exceeded the configured sample_limit.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheusscrapesamplelimithit
        summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusTargetSyncFailure | default false) }}
    - alert: PrometheusTargetSyncFailure
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} printf "%.0f" $value {{`}}`}} targets in Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} have failed to sync because invalid configuration was supplied.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheustargetsyncfailure
        summary: Prometheus has failed to sync targets.
      expr: increase(prometheus_target_sync_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[30m]) > 0
      for: 5m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusHighQueryLoad | default false) }}
    - alert: PrometheusHighQueryLoad
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheushighqueryload
        summary: Prometheus is reaching its maximum capacity serving concurrent requests.
      expr: avg_over_time(prometheus_engine_queries{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.8
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.PrometheusErrorSendingAlertsToAnyAlertmanager | default false) }}
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} printf "%.1f" $value {{`}}`}}% minimum errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} to any Alertmanager.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus/prometheuserrorsendingalertstoanyalertmanager
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: |-
        min without (alertmanager) (
          rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
        /
          rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}",alertmanager!~``}[5m])
        )
        * 100
        > 3
      for: 15m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
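{{- /*
Illustration only, kept as a template comment so it does not affect rendering: a minimal
values.yaml sketch of the knobs this template reads. The keys are exactly the ones referenced
above (.Values.defaultRules.* and .Values.kubeTargetVersionOverride); the example values
themselves are hypothetical and not part of the vendored chart.

  kubeTargetVersionOverride: ""       # empty: fall back to .Capabilities.KubeVersion.GitVersion
  defaultRules:
    create: true                      # render the default PrometheusRule resources at all
    rules:
      prometheus: true                # render this 'prometheus' rule group
    disabled:
      PrometheusHighQueryLoad: true   # drop a single alert from the group
    runbookUrl: https://runbooks.prometheus-operator.dev/runbooks
    labels: {}                        # extra labels on the PrometheusRule object
    annotations: {}                   # extra annotations on the PrometheusRule object
    additionalRuleLabels: {}          # extra labels added to every alert rule
    additionalRuleAnnotations: {}     # extra annotations added to every alert rule
*/ -}}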