github.com/m3db/m3@v1.5.0/scripts/vagrant/provision/manifests/kube-prometheus/prometheus-rules.yaml

     1  apiVersion: monitoring.coreos.com/v1
     2  kind: PrometheusRule
     3  metadata:
     4    labels:
     5      prometheus: k8s
     6      role: alert-rules
     7    name: prometheus-k8s-rules
     8    namespace: monitoring
     9  spec:
    10    groups:
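          # Recording rules that aggregate container CPU/memory usage and resource requests per namespace, pod, and workload, joining in pod labels from kube-state-metrics.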
    11    - name: k8s.rules
    12      rules:
    13      - expr: |
    14          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
    15        record: namespace:container_cpu_usage_seconds_total:sum_rate
    16      - expr: |
    17          sum by (namespace, pod_name, container_name) (
    18            rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
    19          )
    20        record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
    21      - expr: |
    22          sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
    23        record: namespace:container_memory_usage_bytes:sum
    24      - expr: |
    25          sum by (namespace, label_name) (
    26             sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
    27           * on (namespace, pod_name) group_left(label_name)
    28             label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
    29          )
    30        record: namespace_name:container_cpu_usage_seconds_total:sum_rate
    31      - expr: |
    32          sum by (namespace, label_name) (
    33            sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
    34          * on (namespace, pod_name) group_left(label_name)
    35            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
    36          )
    37        record: namespace_name:container_memory_usage_bytes:sum
    38      - expr: |
    39          sum by (namespace, label_name) (
    40            sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
    41          * on (namespace, pod) group_left(label_name)
    42            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
    43          )
    44        record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
    45      - expr: |
    46          sum by (namespace, label_name) (
    47            sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
    48          * on (namespace, pod) group_left(label_name)
    49            label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
    50          )
    51        record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
    52      - expr: |
    53          sum(
    54            label_replace(
    55              label_replace(
    56                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
    57                "replicaset", "$1", "owner_name", "(.*)"
    58              ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
    59              "workload", "$1", "owner_name", "(.*)"
    60            )
    61          ) by (namespace, workload, pod)
    62        labels:
    63          workload_type: deployment
    64        record: mixin_pod_workload
    65      - expr: |
    66          sum(
    67            label_replace(
    68              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
    69              "workload", "$1", "owner_name", "(.*)"
    70            )
    71          ) by (namespace, workload, pod)
    72        labels:
    73          workload_type: daemonset
    74        record: mixin_pod_workload
    75      - expr: |
    76          sum(
    77            label_replace(
    78              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
    79              "workload", "$1", "owner_name", "(.*)"
    80            )
    81          ) by (namespace, workload, pod)
    82        labels:
    83          workload_type: statefulset
    84        record: mixin_pod_workload
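          # Scheduler latency quantiles (p50/p90/p99) for end-to-end scheduling, the scheduling algorithm, and binding, converted from microseconds to seconds.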
    85    - name: kube-scheduler.rules
    86      rules:
    87      - expr: |
    88          histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    89        labels:
    90          quantile: "0.99"
    91        record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    92      - expr: |
    93          histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    94        labels:
    95          quantile: "0.99"
    96        record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    97      - expr: |
    98          histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    99        labels:
   100          quantile: "0.99"
   101        record: cluster_quantile:scheduler_binding_latency:histogram_quantile
   102      - expr: |
   103          histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
   104        labels:
   105          quantile: "0.9"
   106        record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
   107      - expr: |
   108          histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
   109        labels:
   110          quantile: "0.9"
   111        record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
   112      - expr: |
   113          histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
   114        labels:
   115          quantile: "0.9"
   116        record: cluster_quantile:scheduler_binding_latency:histogram_quantile
   117      - expr: |
   118          histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
   119        labels:
   120          quantile: "0.5"
   121        record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
   122      - expr: |
   123          histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
   124        labels:
   125          quantile: "0.5"
   126        record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
   127      - expr: |
   128          histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
   129        labels:
   130          quantile: "0.5"
   131        record: cluster_quantile:scheduler_binding_latency:histogram_quantile
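          # API server request latency quantiles (p50/p90/p99), converted from microseconds to seconds.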
   132    - name: kube-apiserver.rules
   133      rules:
   134      - expr: |
   135          histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
   136        labels:
   137          quantile: "0.99"
   138        record: cluster_quantile:apiserver_request_latencies:histogram_quantile
   139      - expr: |
   140          histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
   141        labels:
   142          quantile: "0.9"
   143        record: cluster_quantile:apiserver_request_latencies:histogram_quantile
   144      - expr: |
   145          histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
   146        labels:
   147          quantile: "0.5"
   148        record: cluster_quantile:apiserver_request_latencies:histogram_quantile
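          # Per-node and cluster-wide utilisation/saturation recording rules for CPU, memory, disk, filesystem, and network, mapped to nodes via node_namespace_pod:kube_pod_info:.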
   149    - name: node.rules
   150      rules:
   151      - expr: sum(min(kube_pod_info) by (node))
   152        record: ':kube_pod_info_node_count:'
   153      - expr: |
   154          max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
   155        record: 'node_namespace_pod:kube_pod_info:'
   156      - expr: |
   157          count by (node) (sum by (node, cpu) (
   158            node_cpu_seconds_total{job="node-exporter"}
   159          * on (namespace, pod) group_left(node)
   160            node_namespace_pod:kube_pod_info:
   161          ))
   162        record: node:node_num_cpu:sum
   163      - expr: |
   164          1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
   165        record: :node_cpu_utilisation:avg1m
   166      - expr: |
   167          1 - avg by (node) (
   168            rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
   169          * on (namespace, pod) group_left(node)
   170            node_namespace_pod:kube_pod_info:)
   171        record: node:node_cpu_utilisation:avg1m
   172      - expr: |
   173          node:node_cpu_utilisation:avg1m
   174            *
   175          node:node_num_cpu:sum
   176            /
   177          scalar(sum(node:node_num_cpu:sum))
   178        record: node:cluster_cpu_utilisation:ratio
   179      - expr: |
   180          sum(node_load1{job="node-exporter"})
   181          /
   182          sum(node:node_num_cpu:sum)
   183        record: ':node_cpu_saturation_load1:'
   184      - expr: |
   185          sum by (node) (
   186            node_load1{job="node-exporter"}
   187          * on (namespace, pod) group_left(node)
   188            node_namespace_pod:kube_pod_info:
   189          )
   190          /
   191          node:node_num_cpu:sum
   192        record: 'node:node_cpu_saturation_load1:'
   193      - expr: |
   194          1 -
   195          sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
   196          /
   197          sum(node_memory_MemTotal_bytes{job="node-exporter"})
   198        record: ':node_memory_utilisation:'
   199      - expr: |
   200          sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
   201        record: :node_memory_MemFreeCachedBuffers_bytes:sum
   202      - expr: |
   203          sum(node_memory_MemTotal_bytes{job="node-exporter"})
   204        record: :node_memory_MemTotal_bytes:sum
   205      - expr: |
   206          sum by (node) (
   207            (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
   208            * on (namespace, pod) group_left(node)
   209              node_namespace_pod:kube_pod_info:
   210          )
   211        record: node:node_memory_bytes_available:sum
   212      - expr: |
   213          sum by (node) (
   214            node_memory_MemTotal_bytes{job="node-exporter"}
   215            * on (namespace, pod) group_left(node)
   216              node_namespace_pod:kube_pod_info:
   217          )
   218        record: node:node_memory_bytes_total:sum
   219      - expr: |
   220          (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
   221          /
   222          node:node_memory_bytes_total:sum
   223        record: node:node_memory_utilisation:ratio
   224      - expr: |
   225          (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
   226          /
   227          scalar(sum(node:node_memory_bytes_total:sum))
   228        record: node:cluster_memory_utilisation:ratio
   229      - expr: |
   230          1e3 * sum(
   231            (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
   232           + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
   233          )
   234        record: :node_memory_swap_io_bytes:sum_rate
   235      - expr: |
   236          1 -
   237          sum by (node) (
   238            (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
   239          * on (namespace, pod) group_left(node)
   240            node_namespace_pod:kube_pod_info:
   241          )
   242          /
   243          sum by (node) (
   244            node_memory_MemTotal_bytes{job="node-exporter"}
   245          * on (namespace, pod) group_left(node)
   246            node_namespace_pod:kube_pod_info:
   247          )
   248        record: 'node:node_memory_utilisation:'
   249      - expr: |
   250          1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
   251        record: 'node:node_memory_utilisation_2:'
   252      - expr: |
   253          1e3 * sum by (node) (
   254            (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
   255           + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
   256           * on (namespace, pod) group_left(node)
   257             node_namespace_pod:kube_pod_info:
   258          )
   259        record: node:node_memory_swap_io_bytes:sum_rate
   260      - expr: |
   261          avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
   262        record: :node_disk_utilisation:avg_irate
   263      - expr: |
   264          avg by (node) (
   265            irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
   266          * on (namespace, pod) group_left(node)
   267            node_namespace_pod:kube_pod_info:
   268          )
   269        record: node:node_disk_utilisation:avg_irate
   270      - expr: |
   271          avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
   272        record: :node_disk_saturation:avg_irate
   273      - expr: |
   274          avg by (node) (
   275            irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
   276          * on (namespace, pod) group_left(node)
   277            node_namespace_pod:kube_pod_info:
   278          )
   279        record: node:node_disk_saturation:avg_irate
   280      - expr: |
   281          max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
   282          - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
   283          / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
   284        record: 'node:node_filesystem_usage:'
   285      - expr: |
   286          max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
   287        record: 'node:node_filesystem_avail:'
   288      - expr: |
   289          sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
   290          sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
   291        record: :node_net_utilisation:sum_irate
   292      - expr: |
   293          sum by (node) (
   294            (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
   295            irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
   296          * on (namespace, pod) group_left(node)
   297            node_namespace_pod:kube_pod_info:
   298          )
   299        record: node:node_net_utilisation:sum_irate
   300      - expr: |
   301          sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
   302          sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
   303        record: :node_net_saturation:sum_irate
   304      - expr: |
   305          sum by (node) (
   306            (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
   307            irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
   308          * on (namespace, pod) group_left(node)
   309            node_namespace_pod:kube_pod_info:
   310          )
   311        record: node:node_net_saturation:sum_irate
   312      - expr: |
   313          max(
   314            max(
   315              kube_pod_info{job="kube-state-metrics", host_ip!=""}
   316            ) by (node, host_ip)
   317            * on (host_ip) group_right (node)
   318            label_replace(
   319              (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
   320            )
   321          ) by (node)
   322        record: 'node:node_inodes_total:'
   323      - expr: |
   324          max(
   325            max(
   326              kube_pod_info{job="kube-state-metrics", host_ip!=""}
   327            ) by (node, host_ip)
   328            * on (host_ip) group_right (node)
   329            label_replace(
   330              (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
   331            )
   332          ) by (node)
   333        record: 'node:node_inodes_free:'
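          # Per-instance and cluster-wide node CPU, filesystem usage, and network throughput recording rules.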
   334    - name: kube-prometheus-node-recording.rules
   335      rules:
   336      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
   337          (instance)
   338        record: instance:node_cpu:rate:sum
   339      - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
   340          BY (instance)
   341        record: instance:node_filesystem_usage:sum
   342      - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
   343        record: instance:node_network_receive_bytes:rate:sum
   344      - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
   345        record: instance:node_network_transmit_bytes:rate:sum
   346      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
   347          (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
   348          BY (instance, cpu)) BY (instance)
   349        record: instance:node_cpu:ratio
   350      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
   351        record: cluster:node_cpu:sum_rate5m
   352      - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total)
   353          BY (instance, cpu))
   354        record: cluster:node_cpu:ratio
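          # Alerts that fire when a monitored component (Alertmanager, CoreDNS, API server, kubelet, etc.) disappears from Prometheus target discovery.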
   355    - name: kubernetes-absent
   356      rules:
   357      - alert: AlertmanagerDown
   358        annotations:
   359          message: Alertmanager has disappeared from Prometheus target discovery.
   360          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
   361        expr: |
   362          absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
   363        for: 15m
   364        labels:
   365          severity: critical
   366      - alert: CoreDNSDown
   367        annotations:
   368          message: CoreDNS has disappeared from Prometheus target discovery.
   369          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
   370        expr: |
   371          absent(up{job="kube-dns"} == 1)
   372        for: 15m
   373        labels:
   374          severity: critical
   375      - alert: KubeAPIDown
   376        annotations:
   377          message: KubeAPI has disappeared from Prometheus target discovery.
   378          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
   379        expr: |
   380          absent(up{job="apiserver"} == 1)
   381        for: 15m
   382        labels:
   383          severity: critical
   384      - alert: KubeControllerManagerDown
   385        annotations:
   386          message: KubeControllerManager has disappeared from Prometheus target discovery.
   387          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
   388        expr: |
   389          absent(up{job="kube-controller-manager"} == 1)
   390        for: 15m
   391        labels:
   392          severity: critical
   393      - alert: KubeSchedulerDown
   394        annotations:
   395          message: KubeScheduler has disappeared from Prometheus target discovery.
   396          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
   397        expr: |
   398          absent(up{job="kube-scheduler"} == 1)
   399        for: 15m
   400        labels:
   401          severity: critical
   402      - alert: KubeStateMetricsDown
   403        annotations:
   404          message: KubeStateMetrics has disappeared from Prometheus target discovery.
   405          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
   406        expr: |
   407          absent(up{job="kube-state-metrics"} == 1)
   408        for: 15m
   409        labels:
   410          severity: critical
   411      - alert: KubeletDown
   412        annotations:
   413          message: Kubelet has disappeared from Prometheus target discovery.
   414          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
   415        expr: |
   416          absent(up{job="kubelet"} == 1)
   417        for: 15m
   418        labels:
   419          severity: critical
   420      - alert: NodeExporterDown
   421        annotations:
   422          message: NodeExporter has disappeared from Prometheus target discovery.
   423          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
   424        expr: |
   425          absent(up{job="node-exporter"} == 1)
   426        for: 15m
   427        labels:
   428          severity: critical
   429      - alert: PrometheusDown
   430        annotations:
   431          message: Prometheus has disappeared from Prometheus target discovery.
   432          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
   433        expr: |
   434          absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
   435        for: 15m
   436        labels:
   437          severity: critical
   438      - alert: PrometheusOperatorDown
   439        annotations:
   440          message: PrometheusOperator has disappeared from Prometheus target discovery.
   441          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
   442        expr: |
   443          absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
   444        for: 15m
   445        labels:
   446          severity: critical
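          # Workload health alerts: crash-looping pods, pods stuck non-ready, replica and generation mismatches, stalled rollouts, and slow or failed jobs.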
   447    - name: kubernetes-apps
   448      rules:
   449      - alert: KubePodCrashLooping
   450        annotations:
   451          message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
   452            }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
   453          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
   454        expr: |
   455          rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
   456        for: 1h
   457        labels:
   458          severity: critical
   459      - alert: KubePodNotReady
   460        annotations:
   461          message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
   462            state for longer than an hour.
   463          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
   464        expr: |
   465          sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
   466        for: 1h
   467        labels:
   468          severity: critical
   469      - alert: KubeDeploymentGenerationMismatch
   470        annotations:
   471          message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
   472            }} does not match, which indicates that the Deployment has failed but has
   473            not been rolled back.
   474          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
   475        expr: |
   476          kube_deployment_status_observed_generation{job="kube-state-metrics"}
   477            !=
   478          kube_deployment_metadata_generation{job="kube-state-metrics"}
   479        for: 15m
   480        labels:
   481          severity: critical
   482      - alert: KubeDeploymentReplicasMismatch
   483        annotations:
   484          message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
   485            matched the expected number of replicas for longer than an hour.
   486          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
   487        expr: |
   488          kube_deployment_spec_replicas{job="kube-state-metrics"}
   489            !=
   490          kube_deployment_status_replicas_available{job="kube-state-metrics"}
   491        for: 1h
   492        labels:
   493          severity: critical
   494      - alert: KubeStatefulSetReplicasMismatch
   495        annotations:
   496          message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
   497            not matched the expected number of replicas for longer than 15 minutes.
   498          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
   499        expr: |
   500          kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
   501            !=
   502          kube_statefulset_status_replicas{job="kube-state-metrics"}
   503        for: 15m
   504        labels:
   505          severity: critical
   506      - alert: KubeStatefulSetGenerationMismatch
   507        annotations:
   508          message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
   509            }} does not match, which indicates that the StatefulSet has failed but has
   510            not been rolled back.
   511          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
   512        expr: |
   513          kube_statefulset_status_observed_generation{job="kube-state-metrics"}
   514            !=
   515          kube_statefulset_metadata_generation{job="kube-state-metrics"}
   516        for: 15m
   517        labels:
   518          severity: critical
   519      - alert: KubeStatefulSetUpdateNotRolledOut
   520        annotations:
   521          message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
   522            has not been rolled out.
   523          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
   524        expr: |
   525          max without (revision) (
   526            kube_statefulset_status_current_revision{job="kube-state-metrics"}
   527              unless
   528            kube_statefulset_status_update_revision{job="kube-state-metrics"}
   529          )
   530            *
   531          (
   532            kube_statefulset_replicas{job="kube-state-metrics"}
   533              !=
   534            kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
   535          )
   536        for: 15m
   537        labels:
   538          severity: critical
   539      - alert: KubeDaemonSetRolloutStuck
   540        annotations:
   541          message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
   542            }}/{{ $labels.daemonset }} are scheduled and ready.
   543          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
   544        expr: |
   545          kube_daemonset_status_number_ready{job="kube-state-metrics"}
   546            /
   547          kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
   548        for: 15m
   549        labels:
   550          severity: critical
   551      - alert: KubeDaemonSetNotScheduled
   552        annotations:
   553          message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
   554            }} are not scheduled.'
   555          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
   556        expr: |
   557          kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
   558            -
   559          kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
   560        for: 10m
   561        labels:
   562          severity: warning
   563      - alert: KubeDaemonSetMisScheduled
   564        annotations:
   565          message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
   566            }} are running where they are not supposed to run.'
   567          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
   568        expr: |
   569          kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
   570        for: 10m
   571        labels:
   572          severity: warning
   573      - alert: KubeCronJobRunning
   574        annotations:
   575          message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
   576            than 1h to complete.
   577          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
   578        expr: |
   579          time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
   580        for: 1h
   581        labels:
   582          severity: warning
   583      - alert: KubeJobCompletion
   584        annotations:
   585          message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
   586            than one hour to complete.
   587          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
   588        expr: |
   589          kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"}  > 0
   590        for: 1h
   591        labels:
   592          severity: warning
   593      - alert: KubeJobFailed
   594        annotations:
   595          message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
   596          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
   597        expr: |
   598          kube_job_status_failed{job="kube-state-metrics"}  > 0
   599        for: 1h
   600        labels:
   601          severity: warning
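          # Capacity alerts: cluster-wide CPU/memory overcommitment, namespace quota usage, and CPU throttling.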
   602    - name: kubernetes-resources
   603      rules:
   604      - alert: KubeCPUOvercommit
   605        annotations:
   606          message: Cluster has overcommitted CPU resource requests for Pods and cannot
   607            tolerate node failure.
   608          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
   609        expr: |
   610          sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
   611            /
   612          sum(node:node_num_cpu:sum)
   613            >
   614          (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
   615        for: 5m
   616        labels:
   617          severity: warning
   618      - alert: KubeMemOvercommit
   619        annotations:
   620          message: Cluster has overcommitted memory resource requests for Pods and cannot
   621            tolerate node failure.
   622          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
   623        expr: |
   624          sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
   625            /
   626          sum(node_memory_MemTotal_bytes)
   627            >
   628          (count(node:node_num_cpu:sum)-1)
   629            /
   630          count(node:node_num_cpu:sum)
   631        for: 5m
   632        labels:
   633          severity: warning
   634      - alert: KubeCPUOvercommit
   635        annotations:
   636          message: Cluster has overcommitted CPU resource requests for Namespaces.
   637          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
   638        expr: |
   639          sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
   640            /
   641          sum(node:node_num_cpu:sum)
   642            > 1.5
   643        for: 5m
   644        labels:
   645          severity: warning
   646      - alert: KubeMemOvercommit
   647        annotations:
   648          message: Cluster has overcommitted memory resource requests for Namespaces.
   649          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
   650        expr: |
   651          sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
   652            /
   653          sum(node_memory_MemTotal_bytes{job="node-exporter"})
   654            > 1.5
   655        for: 5m
   656        labels:
   657          severity: warning
   658      - alert: KubeQuotaExceeded
   659        annotations:
   660          message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
   661            }}% of its {{ $labels.resource }} quota.
   662          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
   663        expr: |
   664          100 * kube_resourcequota{job="kube-state-metrics", type="used"}
   665            / ignoring(instance, job, type)
   666          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
   667            > 90
   668        for: 15m
   669        labels:
   670          severity: warning
   671      - alert: CPUThrottlingHigh
   672        annotations:
   673          message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
   674            }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
   675            }}.'
   676          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
   677        expr: |
   678          100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=""}[5m])) by (container_name, pod_name, namespace)
   679            / sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace) > 25
   680        for: 15m
   681        labels:
   682          severity: warning
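          # Storage alerts: PersistentVolumes that are nearly full, predicted to fill within four days, or in a Failed/Pending phase.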
   683    - name: kubernetes-storage
   684      rules:
   685      - alert: KubePersistentVolumeUsageCritical
   686        annotations:
   687          message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
   688            }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
   689            }}% free.
   690          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
   691        expr: |
   692          100 * kubelet_volume_stats_available_bytes{job="kubelet"}
   693            /
   694          kubelet_volume_stats_capacity_bytes{job="kubelet"}
   695            < 3
   696        for: 1m
   697        labels:
   698          severity: critical
   699      - alert: KubePersistentVolumeFullInFourDays
   700        annotations:
   701          message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
   702            }} in Namespace {{ $labels.namespace }} is expected to fill up within four
   703            days. Currently {{ printf "%0.2f" $value }}% is available.
   704          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
   705        expr: |
   706          100 * (
   707            kubelet_volume_stats_available_bytes{job="kubelet"}
   708              /
   709            kubelet_volume_stats_capacity_bytes{job="kubelet"}
   710          ) < 15
   711          and
   712          predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
   713        for: 5m
   714        labels:
   715          severity: critical
   716      - alert: KubePersistentVolumeErrors
   717        annotations:
   718          message: The persistent volume {{ $labels.persistentvolume }} has status {{
   719            $labels.phase }}.
   720          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
   721        expr: |
   722          kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
   723        for: 5m
   724        labels:
   725          severity: critical
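          # Control-plane and node alerts: node readiness, version skew, API client errors, kubelet pod counts, API server latency/error rates, and client certificate expiry.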
   726    - name: kubernetes-system
   727      rules:
   728      - alert: KubeNodeNotReady
   729        annotations:
   730          message: '{{ $labels.node }} has been unready for more than an hour.'
   731          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
   732        expr: |
   733          kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
   734        for: 1h
   735        labels:
   736          severity: warning
   737      - alert: KubeVersionMismatch
   738        annotations:
   739          message: There are {{ $value }} different semantic versions of Kubernetes
   740            components running.
   741          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
   742        expr: |
   743          count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
   744        for: 1h
   745        labels:
   746          severity: warning
   747      - alert: KubeClientErrors
   748        annotations:
   749          message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
   750            }}' is experiencing {{ printf "%0.0f" $value }}% errors.
   751          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
   752        expr: |
   753          (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
   754            /
   755          sum(rate(rest_client_requests_total[5m])) by (instance, job))
   756          * 100 > 1
   757        for: 15m
   758        labels:
   759          severity: warning
   760      - alert: KubeClientErrors
   761        annotations:
   762          message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
   763            }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
   764          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
   765        expr: |
   766          sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
   767        for: 15m
   768        labels:
   769          severity: warning
   770      - alert: KubeletTooManyPods
   771        annotations:
   772          message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
   773            to the limit of 110.
   774          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
   775        expr: |
   776          kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
   777        for: 15m
   778        labels:
   779          severity: warning
   780      - alert: KubeAPILatencyHigh
   781        annotations:
   782          message: The API server has a 99th percentile latency of {{ $value }} seconds
   783            for {{ $labels.verb }} {{ $labels.resource }}.
   784          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
   785        expr: |
   786          cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
   787        for: 10m
   788        labels:
   789          severity: warning
   790      - alert: KubeAPILatencyHigh
   791        annotations:
   792          message: The API server has a 99th percentile latency of {{ $value }} seconds
   793            for {{ $labels.verb }} {{ $labels.resource }}.
   794          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
   795        expr: |
   796          cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
   797        for: 10m
   798        labels:
   799          severity: critical
   800      - alert: KubeAPIErrorsHigh
   801        annotations:
   802          message: API server is returning errors for {{ $value }}% of requests.
   803          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
   804        expr: |
   805          sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
   806            /
   807          sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
   808        for: 10m
   809        labels:
   810          severity: critical
   811      - alert: KubeAPIErrorsHigh
   812        annotations:
   813          message: API server is returning errors for {{ $value }}% of requests.
   814          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
   815        expr: |
   816          sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
   817            /
   818          sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
   819        for: 10m
   820        labels:
   821          severity: warning
   822      - alert: KubeAPIErrorsHigh
   823        annotations:
   824          message: API server is returning errors for {{ $value }}% of requests for
   825            {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
   826          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
   827        expr: |
   828          sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
   829            /
   830          sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
   831        for: 10m
   832        labels:
   833          severity: critical
   834      - alert: KubeAPIErrorsHigh
   835        annotations:
   836          message: API server is returning errors for {{ $value }}% of requests for
   837            {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
   838          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
   839        expr: |
   840          sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
   841            /
   842          sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
   843        for: 10m
   844        labels:
   845          severity: warning
   846      - alert: KubeClientCertificateExpiration
   847        annotations:
   848          message: A client certificate used to authenticate to the apiserver is expiring
   849            in less than 7.0 days.
   850          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
   851        expr: |
   852          apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
   853        labels:
   854          severity: warning
   855      - alert: KubeClientCertificateExpiration
   856        annotations:
   857          message: A client certificate used to authenticate to the apiserver is expiring
   858            in less than 24.0 hours.
   859          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
   860        expr: |
   861          apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
   862        labels:
   863          severity: critical
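          # Alertmanager alerts: configuration drift across the cluster, failed config reloads, and missing cluster members.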
   864    - name: alertmanager.rules
   865      rules:
   866      - alert: AlertmanagerConfigInconsistent
   867        annotations:
   868          message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
   869            is out of sync.
   870        expr: |
   871          count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
   872        for: 5m
   873        labels:
   874          severity: critical
   875      - alert: AlertmanagerFailedReload
   876        annotations:
   877          message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
   878            }}/{{ $labels.pod}}.
   879        expr: |
   880          alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
   881        for: 10m
   882        labels:
   883          severity: warning
   884      - alert: AlertmanagerMembersInconsistent
   885        annotations:
   886          message: Alertmanager has not found all other members of the cluster.
   887        expr: |
   888          alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
   889            != on (service) GROUP_LEFT()
   890          count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
   891        for: 5m
   892        labels:
   893          severity: critical
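          # Generic alerts: scrape targets down, plus the always-firing Watchdog used to verify the alerting pipeline end to end.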
   894    - name: general.rules
   895      rules:
   896      - alert: TargetDown
   897        annotations:
   898          message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
   899        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
   900        for: 10m
   901        labels:
   902          severity: warning
   903      - alert: Watchdog
   904        annotations:
   905          message: |
   906            This is an alert meant to ensure that the entire alerting pipeline is functional.
   907            This alert is always firing, therefore it should always be firing in Alertmanager
   908            and always fire against a receiver. There are integrations with various notification
   909            mechanisms that send a notification when this alert is not firing. For example the
   910            "DeadMansSnitch" integration in PagerDuty.
   911        expr: vector(1)
   912        labels:
   913          severity: none
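          # Node disk alerts: filesystems predicted to fill within the next 24 hours or 2 hours.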
   914    - name: kube-prometheus-node-alerting.rules
   915      rules:
   916      - alert: NodeDiskRunningFull
   917        annotations:
   918          message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
   919            }}/{{ $labels.pod }} will be full within the next 24 hours.
   920        expr: |
   921          (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
   922        for: 30m
   923        labels:
   924          severity: warning
   925      - alert: NodeDiskRunningFull
   926        annotations:
   927          message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
   928            }}/{{ $labels.pod }} will be full within the next 2 hours.
   929        expr: |
   930          (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
   931        for: 10m
   932        labels:
   933          severity: critical
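          # Clock skew alert based on node_timex_offset_seconds from node-exporter.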
   934    - name: node-time
   935      rules:
   936      - alert: ClockSkewDetected
   937        annotations:
   938          message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
   939            }}. Ensure NTP is configured correctly on this host.
   940        expr: |
   941          abs(node_timex_offset_seconds{job="node-exporter"}) > 0.03
   942        for: 2m
   943        labels:
   944          severity: warning
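          # Network interface alerts: receive/transmit errors and flapping link state on non-veth devices.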
   945    - name: node-network
   946      rules:
   947      - alert: NetworkReceiveErrors
   948        annotations:
   949          message: Network interface "{{ $labels.device }}" showing receive errors on
   950            node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
   951        expr: |
   952          rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
   953        for: 2m
   954        labels:
   955          severity: warning
   956      - alert: NetworkTransmitErrors
   957        annotations:
   958          message: Network interface "{{ $labels.device }}" showing transmit errors
   959            on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
   960        expr: |
   961          rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
   962        for: 2m
   963        labels:
   964          severity: warning
   965      - alert: NodeNetworkInterfaceFlapping
   966        annotations:
   967          message: Network interface "{{ $labels.device }}" changing its up status
   968            often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
   969        expr: |
   970          changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
   971        for: 2m
   972        labels:
   973          severity: warning
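          # Prometheus self-monitoring alerts: config reloads, notification queue and delivery, Alertmanager connectivity, TSDB reloads/compactions/WAL, and sample ingestion.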
   974    - name: prometheus.rules
   975      rules:
   976      - alert: PrometheusConfigReloadFailed
   977        annotations:
   978          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
   979          summary: Reloading Prometheus' configuration failed
   980        expr: |
   981          prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
   982        for: 10m
   983        labels:
   984          severity: warning
   985      - alert: PrometheusNotificationQueueRunningFull
   986        annotations:
   987          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
   988            $labels.pod}}
   989          summary: Prometheus' alert notification queue is running full
   990        expr: |
   991          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
   992        for: 10m
   993        labels:
   994          severity: warning
   995      - alert: PrometheusErrorSendingAlerts
   996        annotations:
   997          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
   998            $labels.pod}} to Alertmanager {{$labels.alertmanager}}
   999          summary: Errors while sending alerts from Prometheus
  1000        expr: |
  1001          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
  1002        for: 10m
  1003        labels:
  1004          severity: warning
  1005      - alert: PrometheusErrorSendingAlerts
  1006        annotations:
  1007          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
  1008            $labels.pod}} to Alertmanager {{$labels.alertmanager}}
  1009          summary: Errors while sending alerts from Prometheus
  1010        expr: |
  1011          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
  1012        for: 10m
  1013        labels:
  1014          severity: critical
  1015      - alert: PrometheusNotConnectedToAlertmanagers
  1016        annotations:
  1017          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
  1018            to any Alertmanagers
  1019          summary: Prometheus is not connected to any Alertmanagers
  1020        expr: |
  1021          prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
  1022        for: 10m
  1023        labels:
  1024          severity: warning
  1025      - alert: PrometheusTSDBReloadsFailing
  1026        annotations:
  1027          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
  1028            reload failures over the last two hours.'
  1029          summary: Prometheus has issues reloading data blocks from disk
  1030        expr: |
  1031          increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
  1032        for: 12h
  1033        labels:
  1034          severity: warning
  1035      - alert: PrometheusTSDBCompactionsFailing
  1036        annotations:
  1037          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
  1038            compaction failures over the last two hours.'
  1039          summary: Prometheus has issues compacting sample blocks
  1040        expr: |
  1041          increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
  1042        for: 12h
  1043        labels:
  1044          severity: warning
  1045      - alert: PrometheusTSDBWALCorruptions
  1046        annotations:
  1047          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
  1048            log (WAL).'
  1049          summary: Prometheus write-ahead log is corrupted
  1050        expr: |
  1051          prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
  1052        for: 4h
  1053        labels:
  1054          severity: warning
  1055      - alert: PrometheusNotIngestingSamples
  1056        annotations:
  1057          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
  1058            samples.
  1059          summary: Prometheus isn't ingesting samples
  1060        expr: |
  1061          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
  1062        for: 10m
  1063        labels:
  1064          severity: warning
  1065      - alert: PrometheusTargetScrapesDuplicate
  1066        annotations:
  1067          description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
  1068            due to duplicate timestamps but different values'
  1069          summary: Prometheus has many samples rejected
  1070        expr: |
  1071          increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
  1072        for: 10m
  1073        labels:
  1074          severity: warning
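          # prometheus-operator alerts: reconcile errors and node address lookup failures.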
  1075    - name: prometheus-operator
  1076      rules:
  1077      - alert: PrometheusOperatorReconcileErrors
  1078        annotations:
  1079          message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
  1080            }} Namespace.
  1081        expr: |
  1082          rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
  1083        for: 10m
  1084        labels:
  1085          severity: warning
  1086      - alert: PrometheusOperatorNodeLookupErrors
  1087        annotations:
  1088          message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
  1089        expr: |
  1090          rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
  1091        for: 10m
  1092        labels:
  1093          severity: warning