github.com/verrazzano/verrazzano@v1.7.0/platform-operator/thirdparty/charts/prometheus-community/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml (about)

     1  {{- /*
     2  Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
     3  Do not change in-place! To change this file, first read the following link:
     4  https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
     5  */ -}}
     6  {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
     7  {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
        # PrometheusRule carrying the kubernetes-system-kubelet alert group.
        # The gate above renders this manifest only when the target Kubernetes version
        # (.Values.kubeTargetVersionOverride if set, otherwise the cluster GitVersion)
        # satisfies both semver constraints and .Values.defaultRules.create and
        # .Values.defaultRules.rules.kubernetesSystem are both truthy.
        # The gate is closed by the final end action of this template.
     8  apiVersion: monitoring.coreos.com/v1
     9  kind: PrometheusRule
    10  metadata:
          # Chart fullname joined with "-kubernetes-system-kubelet", truncated to
          # 63 characters, with a trailing "-" removed if present.
    11    name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
    12    namespace: {{ template "kube-prometheus-stack.namespace" . }}
    13    labels:
    14      app: {{ template "kube-prometheus-stack.name" . }}
            # Output of the kube-prometheus-stack.labels helper, then any extra labels
            # supplied in .Values.defaultRules.labels.
    15  {{ include "kube-prometheus-stack.labels" . | indent 4 }}
    16  {{- if .Values.defaultRules.labels }}
    17  {{ toYaml .Values.defaultRules.labels | indent 4 }}
    18  {{- end }}
          # metadata.annotations is emitted only when .Values.defaultRules.annotations is non-empty.
    19  {{- if .Values.defaultRules.annotations }}
    20    annotations:
    21  {{ toYaml .Values.defaultRules.annotations | indent 4 }}
    22  {{- end }}
    23  spec:
    24    groups:
    25    - name: kubernetes-system-kubelet
            # Each rule in this group is wrapped in its own toggle,
            # .Values.defaultRules.disabled.ALERT_NAME, and is omitted when that value is truthy.
    26      rules:
    27  {{- if not (.Values.defaultRules.disabled.KubeNodeNotReady | default false) }}
            # KubeNodeNotReady (severity: warning): fires after the kube-state-metrics Ready
            # condition series with status="true" has been 0 for 15m.
            # Omitted when .Values.defaultRules.disabled.KubeNodeNotReady is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
    28      - alert: KubeNodeNotReady
    29        annotations:
    30  {{- if .Values.defaultRules.additionalRuleAnnotations }}
    31  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
    32  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels placeholder reaches the rendered rule unevaluated.
    33          description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
    34          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready
    35          summary: Node is not ready.
    36        expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
    37        for: 15m
    38        labels:
    39          severity: warning
    40  {{- if .Values.defaultRules.additionalRuleLabels }}
    41  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
    42  {{- end }}
    43  {{- end }}
    44  {{- if not (.Values.defaultRules.disabled.KubeNodeUnreachable | default false) }}
            # KubeNodeUnreachable (severity: warning): fires when a node has carried the
            # node.kubernetes.io/unreachable taint with effect NoSchedule for 15m.
            # Omitted when .Values.defaultRules.disabled.KubeNodeUnreachable is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
    45      - alert: KubeNodeUnreachable
    46        annotations:
    47  {{- if .Values.defaultRules.additionalRuleAnnotations }}
    48  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
    49  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels placeholder reaches the rendered rule unevaluated.
    50          description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
    51          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable
    52          summary: Node is unreachable.
              # The unless clause (matching with key and value ignored) removes series that also
              # have a taint whose key matches the regex alternatives ToBeDeletedByClusterAutoscaler,
              # cloud.google.com/impending-node-termination or aws-node-termination-handler/spot-itn.
    53        expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
    54        for: 15m
    55        labels:
    56          severity: warning
    57  {{- if .Values.defaultRules.additionalRuleLabels }}
    58  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
    59  {{- end }}
    60  {{- end }}
    61  {{- if not (.Values.defaultRules.disabled.KubeletTooManyPods | default false) }}
            # KubeletTooManyPods (severity: info): fires when, for 15m, the count of Running
            # pods on a node divided by that node's pod capacity is above 0.95.
            # Omitted when .Values.defaultRules.disabled.KubeletTooManyPods is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
    62      - alert: KubeletTooManyPods
    63        annotations:
    64  {{- if .Values.defaultRules.additionalRuleAnnotations }}
    65  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
    66  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
    67          description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
    68          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
    69          summary: Kubelet is running at capacity.
              # Numerator: Running-phase pod series joined to kube_pod_info (reduced to one series
              # per instance/pod/namespace/cluster/verrazzano_cluster via topk 1) to pick up the
              # node label, then counted per cluster/node/verrazzano_cluster.
              # Denominator: max pod capacity over the same grouping; capacity series equal
              # to 1 are filtered out by the != 1 comparison.
    70        expr: |-
    71          count by (cluster, node, verrazzano_cluster) (
    72            (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster, verrazzano_cluster) group_left(node) topk by (instance,pod,namespace,cluster, verrazzano_cluster) (1, kube_pod_info{job="kube-state-metrics"})
    73          )
    74          /
    75          max by (cluster, node, verrazzano_cluster) (
    76            kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
    77          ) > 0.95
    78        for: 15m
    79        labels:
    80          severity: info
    81  {{- if .Values.defaultRules.additionalRuleLabels }}
    82  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
    83  {{- end }}
    84  {{- end }}
    85  {{- if not (.Values.defaultRules.disabled.KubeNodeReadinessFlapping | default false) }}
            # KubeNodeReadinessFlapping (severity: warning): fires when the Ready condition
            # series with status="true" changed value more than 2 times inside a 15m range,
            # summed per cluster/node/verrazzano_cluster, and that holds for 15m.
            # Omitted when .Values.defaultRules.disabled.KubeNodeReadinessFlapping is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
    86      - alert: KubeNodeReadinessFlapping
    87        annotations:
    88  {{- if .Values.defaultRules.additionalRuleAnnotations }}
    89  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
    90  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
    91          description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
    92          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
    93          summary: Node readiness status is flapping.
    94        expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node, verrazzano_cluster) > 2
    95        for: 15m
    96        labels:
    97          severity: warning
    98  {{- if .Values.defaultRules.additionalRuleLabels }}
    99  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   100  {{- end }}
   101  {{- end }}
   102  {{- if not (.Values.defaultRules.disabled.KubeletPlegDurationHigh | default false) }}
            # KubeletPlegDurationHigh (severity: warning): fires when the 0.99-quantile series
            # of the PLEG relist duration is at or above 10 (seconds, per the description) for 5m.
            # Omitted when .Values.defaultRules.disabled.KubeletPlegDurationHigh is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   103      - alert: KubeletPlegDurationHigh
   104        annotations:
   105  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   106  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   107  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   108          description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
   109          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh
   110          summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
              # NOTE(review): the colon-separated series name is presumably produced by a
              # recording rule defined outside this template - confirm it is deployed alongside.
   111        expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
   112        for: 5m
   113        labels:
   114          severity: warning
   115  {{- if .Values.defaultRules.additionalRuleLabels }}
   116  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   117  {{- end }}
   118  {{- end }}
   119  {{- if not (.Values.defaultRules.disabled.KubeletPodStartUpLatencyHigh | default false) }}
            # KubeletPodStartUpLatencyHigh (severity: warning): fires when the 0.99 quantile of
            # the kubelet pod worker duration histogram (5m rate) stays above 60
            # (seconds, per the description) for 15m.
            # Omitted when .Values.defaultRules.disabled.KubeletPodStartUpLatencyHigh is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   120      - alert: KubeletPodStartUpLatencyHigh
   121        annotations:
   122  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   123  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   124  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   125          description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
   126          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
   127          summary: Kubelet Pod startup latency is too high.
              # The quantile is computed per cluster/instance/verrazzano_cluster; the
              # group_left(node) join with kubelet_node_name attaches the node label that the
              # description refers to.
   128        expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le, verrazzano_cluster)) * on (cluster, instance, verrazzano_cluster) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
   129        for: 15m
   130        labels:
   131          severity: warning
   132  {{- if .Values.defaultRules.additionalRuleLabels }}
   133  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   134  {{- end }}
   135  {{- end }}
   136  {{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
            # KubeletClientCertificateExpiration (severity: warning): fires as soon as the
            # kubelet client certificate TTL drops below 604800; there is no "for" hold time.
            # Omitted when .Values.defaultRules.disabled.KubeletClientCertificateExpiration is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   137      - alert: KubeletClientCertificateExpiration
   138        annotations:
   139  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   140  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   141  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   142          description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
   143          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
   144          summary: Kubelet client certificate is about to expire.
              # 604800 = 7 * 86400, i.e. 7 days with the metric in seconds as its name indicates.
   145        expr: kubelet_certificate_manager_client_ttl_seconds < 604800
   146        labels:
   147          severity: warning
   148  {{- if .Values.defaultRules.additionalRuleLabels }}
   149  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   150  {{- end }}
   151  {{- end }}
   152  {{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
            # KubeletClientCertificateExpiration (severity: critical): fires as soon as the
            # kubelet client certificate TTL drops below 86400; there is no "for" hold time.
            # Omitted when .Values.defaultRules.disabled.KubeletClientCertificateExpiration is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   153      - alert: KubeletClientCertificateExpiration
   154        annotations:
   155  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   156  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   157  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   158          description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
   159          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
   160          summary: Kubelet client certificate is about to expire.
              # 86400 = 24 * 3600, i.e. 1 day with the metric in seconds as its name indicates.
   161        expr: kubelet_certificate_manager_client_ttl_seconds < 86400
   162        labels:
   163          severity: critical
   164  {{- if .Values.defaultRules.additionalRuleLabels }}
   165  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   166  {{- end }}
   167  {{- end }}
   168  {{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
            # KubeletServerCertificateExpiration (severity: warning): fires as soon as the
            # kubelet server certificate TTL drops below 604800; there is no "for" hold time.
            # Omitted when .Values.defaultRules.disabled.KubeletServerCertificateExpiration is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   169      - alert: KubeletServerCertificateExpiration
   170        annotations:
   171  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   172  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   173  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   174          description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
   175          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
   176          summary: Kubelet server certificate is about to expire.
              # 604800 = 7 * 86400, i.e. 7 days with the metric in seconds as its name indicates.
   177        expr: kubelet_certificate_manager_server_ttl_seconds < 604800
   178        labels:
   179          severity: warning
   180  {{- if .Values.defaultRules.additionalRuleLabels }}
   181  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   182  {{- end }}
   183  {{- end }}
   184  {{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
            # KubeletServerCertificateExpiration (severity: critical): fires as soon as the
            # kubelet server certificate TTL drops below 86400; there is no "for" hold time.
            # Omitted when .Values.defaultRules.disabled.KubeletServerCertificateExpiration is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   185      - alert: KubeletServerCertificateExpiration
   186        annotations:
   187  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   188  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   189  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   190          description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
   191          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
   192          summary: Kubelet server certificate is about to expire.
              # 86400 = 24 * 3600, i.e. 1 day with the metric in seconds as its name indicates.
   193        expr: kubelet_certificate_manager_server_ttl_seconds < 86400
   194        labels:
   195          severity: critical
   196  {{- if .Values.defaultRules.additionalRuleLabels }}
   197  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   198  {{- end }}
   199  {{- end }}
   200  {{- if not (.Values.defaultRules.disabled.KubeletClientCertificateRenewalErrors | default false) }}
            # KubeletClientCertificateRenewalErrors (severity: warning): fires when the client
            # certificate renew-error counter has increased within the trailing 5m window,
            # and that has been the case for 15m.
            # Omitted when .Values.defaultRules.disabled.KubeletClientCertificateRenewalErrors is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   201      - alert: KubeletClientCertificateRenewalErrors
   202        annotations:
   203  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   204  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   205  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   206          description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
   207          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors
   208          summary: Kubelet has failed to renew its client certificate.
   209        expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
   210        for: 15m
   211        labels:
   212          severity: warning
   213  {{- if .Values.defaultRules.additionalRuleLabels }}
   214  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   215  {{- end }}
   216  {{- end }}
   217  {{- if not (.Values.defaultRules.disabled.KubeletServerCertificateRenewalErrors | default false) }}
            # KubeletServerCertificateRenewalErrors (severity: warning): fires when the server
            # certificate renew-error counter has increased within the trailing 5m window,
            # and that has been the case for 15m.
            # Omitted when .Values.defaultRules.disabled.KubeletServerCertificateRenewalErrors is truthy.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   218      - alert: KubeletServerCertificateRenewalErrors
   219        annotations:
   220  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   221  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   222  {{- end }}
                # The backtick-quoted brace pairs make Helm emit literal double braces, so the
                # $labels / $value placeholders reach the rendered rule unevaluated.
   223          description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
   224          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors
   225          summary: Kubelet has failed to renew its server certificate.
   226        expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
   227        for: 15m
   228        labels:
   229          severity: warning
   230  {{- if .Values.defaultRules.additionalRuleLabels }}
   231  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   232  {{- end }}
   233  {{- end }}
   234  {{- if .Values.prometheusOperator.kubeletService.enabled }}
   235  {{- if not (.Values.defaultRules.disabled.KubeletDown | default false) }}
            # KubeletDown (severity: critical): fires when, for 15m, no "up" series for
            # job="kubelet" with metrics_path="/metrics" has the value 1.
            # Rendered only when .Values.prometheusOperator.kubeletService.enabled is truthy
            # and .Values.defaultRules.disabled.KubeletDown is not.
            # Optional .Values.defaultRules.additionalRuleAnnotations / additionalRuleLabels
            # are rendered into the annotations / labels maps.
   236      - alert: KubeletDown
   237        annotations:
   238  {{- if .Values.defaultRules.additionalRuleAnnotations }}
   239  {{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
   240  {{- end }}
   241          description: Kubelet has disappeared from Prometheus target discovery.
   242          runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown
   243          summary: Target disappeared from Prometheus target discovery.
              # absent() yields a value only when the inner selector (filtered to == 1)
              # returns no series at all.
   244        expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
   245        for: 15m
   246        labels:
   247          severity: critical
   248  {{- if .Values.defaultRules.additionalRuleLabels }}
   249  {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
   250  {{- end }}
   251  {{- end }}
   252  {{- end }}
        # The end action below closes the version / defaultRules gate opened at the top of this template.
   253  {{- end }}