# github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/scripts/vagrant/provision/manifests/kube-prometheus/prometheus-rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
spec:
  groups:
  - name: k8s.rules
    rules:
    - expr: |
        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
      record: namespace:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, pod_name, container_name) (
          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
        )
      record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
      record: namespace:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, label_name) (
          sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
          * on (namespace, pod_name) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
          * on (namespace, pod) group_left(label_name)
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
        )
      record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
    - expr: |
        sum(
          label_replace(
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
              "replicaset", "$1", "owner_name", "(.*)"
            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
            "workload", "$1", "owner_name", "(.*)"
          )
        ) by (namespace, workload, pod)
      labels:
        workload_type: deployment
      record: mixin_pod_workload
    - expr: |
        sum(
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
        ) by (namespace, workload, pod)
      labels:
        workload_type: daemonset
      record: mixin_pod_workload
    - expr: |
        sum(
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
        ) by (namespace, workload, pod)
      labels:
        workload_type: statefulset
      record: mixin_pod_workload
  - name: kube-scheduler.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_binding_latency:histogram_quantile
  - name: kube-apiserver.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.99"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.9"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
      labels:
        quantile: "0.5"
      record: cluster_quantile:apiserver_request_latencies:histogram_quantile
  - name: node.rules
    rules:
    - expr: sum(min(kube_pod_info) by (node))
      record: ':kube_pod_info_node_count:'
    - expr: |
        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (node) (sum by (node, cpu) (
          node_cpu_seconds_total{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
      record: :node_cpu_utilisation:avg1m
    - expr: |
        1 - avg by (node) (
          rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:)
      record: node:node_cpu_utilisation:avg1m
    - expr: |
        node:node_cpu_utilisation:avg1m
        *
        node:node_num_cpu:sum
        /
        scalar(sum(node:node_num_cpu:sum))
      record: node:cluster_cpu_utilisation:ratio
    - expr: |
        sum(node_load1{job="node-exporter"})
        /
        sum(node:node_num_cpu:sum)
      record: ':node_cpu_saturation_load1:'
    - expr: |
        sum by (node) (
          node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        node:node_num_cpu:sum
      record: 'node:node_cpu_saturation_load1:'
    - expr: |
        1 -
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: ':node_memory_utilisation:'
    - expr: |
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
      record: :node_memory_MemFreeCachedBuffers_bytes:sum
    - expr: |
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: :node_memory_MemTotal_bytes:sum
    - expr: |
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_available:sum
    - expr: |
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_total:sum
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        node:node_memory_bytes_total:sum
      record: node:node_memory_utilisation:ratio
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        scalar(sum(node:node_memory_bytes_total:sum))
      record: node:cluster_memory_utilisation:ratio
    - expr: |
        1e3 * sum(
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
        )
      record: :node_memory_swap_io_bytes:sum_rate
    - expr: |
        1 -
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: 'node:node_memory_utilisation:'
    - expr: |
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
      record: 'node:node_memory_utilisation_2:'
    - expr: |
        1e3 * sum by (node) (
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_swap_io_bytes:sum_rate
    - expr: |
        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
      record: :node_disk_utilisation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    - expr: |
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
      record: :node_disk_saturation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    - expr: |
        max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    - expr: |
        max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    - expr: |
        sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
      record: :node_net_utilisation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
          irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_utilisation:sum_irate
    - expr: |
        sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
        sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
      record: :node_net_saturation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
          irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_saturation:sum_irate
    - expr: |
        max(
          max(
            kube_pod_info{job="kube-state-metrics", host_ip!=""}
          ) by (node, host_ip)
          * on (host_ip) group_right (node)
          label_replace(
            (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
          )
        ) by (node)
      record: 'node:node_inodes_total:'
    - expr: |
        max(
          max(
            kube_pod_info{job="kube-state-metrics", host_ip!=""}
          ) by (node, host_ip)
          * on (host_ip) group_right (node)
          label_replace(
            (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
          )
        ) by (node)
      record: 'node:node_inodes_free:'
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
      record: instance:node_filesystem_usage:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
      record: cluster:node_cpu:ratio
  - name: kubernetes-absent
    rules:
    - alert: AlertmanagerDown
      annotations:
        message: Alertmanager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
      expr: |
        absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: CoreDNSDown
      annotations:
        message: CoreDNS has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
      expr: |
        absent(up{job="kube-dns"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeAPIDown
      annotations:
        message: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
      expr: |
        absent(up{job="apiserver"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeControllerManagerDown
      annotations:
        message: KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
      expr: |
        absent(up{job="kube-controller-manager"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeSchedulerDown
      annotations:
        message: KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
      expr: |
        absent(up{job="kube-scheduler"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsDown
      annotations:
        message: KubeStateMetrics has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
      expr: |
        absent(up{job="kube-state-metrics"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeletDown
      annotations:
        message: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
      expr: |
        absent(up{job="kubelet"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: NodeExporterDown
      annotations:
        message: NodeExporter has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
      expr: |
        absent(up{job="node-exporter"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusDown
      annotations:
        message: Prometheus has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
      expr: |
        absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusOperatorDown
      annotations:
        message: PrometheusOperator has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
      expr: |
        absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
      for: 15m
      labels:
        severity: critical
  - name: kubernetes-apps
    rules:
    - alert: KubePodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubePodNotReady
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubeDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      expr: |
        kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_deployment_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeDeploymentReplicasMismatch
      annotations:
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      expr: |
        kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
      for: 1h
      labels:
        severity: critical
    - alert: KubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      expr: |
        kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
        !=
        kube_statefulset_status_replicas{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      expr: |
        kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_statefulset_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      expr: |
        max without (revision) (
          kube_statefulset_status_current_revision{job="kube-state-metrics"}
            unless
          kube_statefulset_status_update_revision{job="kube-state-metrics"}
        )
        *
        (
          kube_statefulset_replicas{job="kube-state-metrics"}
            !=
          kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
        )
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetRolloutStuck
      annotations:
        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      expr: |
        kube_daemonset_status_number_ready{job="kube-state-metrics"}
        /
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetNotScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
      expr: |
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
        -
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeDaemonSetMisScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
      expr: |
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeCronJobRunning
      annotations:
        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
      expr: |
        time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobCompletion
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
      expr: |
        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobFailed
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
      expr: |
        kube_job_status_failed{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
  - name: kubernetes-resources
    rules:
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
        /
        sum(node:node_num_cpu:sum)
        >
        (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
        /
        sum(node_memory_MemTotal_bytes)
        >
        (count(node:node_num_cpu:sum)-1)
        /
        count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
        /
        sum(node:node_num_cpu:sum)
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeQuotaExceeded
      annotations:
        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
      expr: |
        100 * kube_resourcequota{job="kube-state-metrics", type="used"}
        / ignoring(instance, job, type)
        (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
        > 90
      for: 15m
      labels:
        severity: warning
    - alert: CPUThrottlingHigh
      annotations:
        message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\", }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 25 \n"
      for: 15m
      labels:
        severity: warning
  - name: kubernetes-storage
    rules:
    - alert: KubePersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
      expr: |
        100 * kubelet_volume_stats_available_bytes{job="kubelet"}
        /
        kubelet_volume_stats_capacity_bytes{job="kubelet"}
        < 3
      for: 1m
      labels:
        severity: critical
    - alert: KubePersistentVolumeFullInFourDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        100 * (
          kubelet_volume_stats_available_bytes{job="kubelet"}
          /
          kubelet_volume_stats_capacity_bytes{job="kubelet"}
        ) < 15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
    - alert: KubePersistentVolumeErrors
      annotations:
        message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical
  - name: kubernetes-system
    rules:
    - alert: KubeNodeNotReady
      annotations:
        message: '{{ $labels.node }} has been unready for more than an hour.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeVersionMismatch
      annotations:
        message: There are {{ $value }} different semantic versions of Kubernetes components running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
      expr: |
        count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
      for: 1h
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
          /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
      expr: |
        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
        /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
      for: 10m
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerConfigInconsistent
      annotations:
        message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` is out of sync.
      expr: |
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
      for: 5m
      labels:
        severity: critical
    - alert: AlertmanagerFailedReload
      annotations:
        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.
      expr: |
        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: AlertmanagerMembersInconsistent
      annotations:
        message: Alertmanager has not found all other members of the cluster.
      expr: |
        alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
          != on (service) GROUP_LEFT()
        count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
      for: 5m
      labels:
        severity: critical
  - name: general.rules
    rules:
    - alert: TargetDown
      annotations:
        message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
      for: 10m
      labels:
        severity: warning
    - alert: Watchdog
      annotations:
        message: |
          This is an alert meant to ensure that the entire alerting pipeline is functional.
          This alert is always firing, therefore it should always be firing in Alertmanager
          and always fire against a receiver. There are integrations with various notification
          mechanisms that send a notification when this alert is not firing. For example the
          "DeadMansSnitch" integration in PagerDuty.
      expr: vector(1)
      labels:
        severity: none
  - name: kube-prometheus-node-alerting.rules
    rules:
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 24 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
      for: 30m
      labels:
        severity: warning
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 2 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
      for: 10m
      labels:
        severity: critical
  - name: node-time
    rules:
    - alert: ClockSkewDetected
      annotations:
        message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host.
      expr: |
        abs(node_timex_offset_seconds{job="node-exporter"}) > 0.03
      for: 2m
      labels:
        severity: warning
  - name: node-network
    rules:
    - alert: NetworkReceiveErrors
      annotations:
        message: Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
      expr: |
        rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
      for: 2m
      labels:
        severity: warning
    - alert: NetworkTransmitErrors
      annotations:
        message: Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
      expr: |
        rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
      for: 2m
      labels:
        severity: warning
    - alert: NodeNetworkInterfaceFlapping
      annotations:
        message: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
      expr: |
        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
      for: 2m
      labels:
        severity: warning
  - name: prometheus.rules
    rules:
    - alert: PrometheusConfigReloadFailed
      annotations:
        description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
        summary: Reloading Prometheus' configuration failed
      expr: |
        prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}
        summary: Prometheus' alert notification queue is running full
      expr: |
        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
        summary: Errors while sending alerts from Prometheus
      expr: |
        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlerts
      annotations:
        description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
        summary: Errors while sending alerts from Prometheus
      expr: |
        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers
        summary: Prometheus is not connected to any Alertmanagers
      expr: |
        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.'
        summary: Prometheus has issues reloading data blocks from disk
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.'
        summary: Prometheus has issues compacting sample blocks
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
      for: 12h
      labels:
        severity: warning
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).'
        summary: Prometheus write-ahead log is corrupted
      expr: |
        prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.
        summary: Prometheus isn't ingesting samples
      expr: |
        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTargetScrapesDuplicate
      annotations:
        description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values'
        summary: Prometheus has many samples rejected
      expr: |
        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning