{{- /*
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /*
Rendering guard for the whole manifest.
$kubeTargetVersion is .Values.kubeTargetVersionOverride when that value is set,
otherwise the cluster's .Capabilities.KubeVersion.GitVersion.
The PrometheusRule is emitted only when that version satisfies both
">=1.14.0-0" and "<9.9.9-9" AND .Values.defaultRules.create AND
.Values.defaultRules.rules.kubernetesSystem are truthy.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
{{- /* Name is "<chart fullname>-kubernetes-system-kubelet", cut to 63 characters with a trailing "-" removed. */}}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
{{- /* Optional extra metadata labels / annotations, inserted only when the corresponding value is set. */}}
{{- if .Values.defaultRules.labels }}
{{ toYaml .Values.defaultRules.labels | indent 4 }}
{{- end }}
{{- if .Values.defaultRules.annotations }}
  annotations:
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
{{- end }}
spec:
  groups:
  - name: kubernetes-system-kubelet
    rules:
{{- /*
Conventions shared by every rule below:
  - Each alert sits inside its own guard and is omitted when
    .Values.defaultRules.disabled.<AlertName> is truthy (missing key counts as false).
  - .Values.defaultRules.additionalRuleAnnotations and
    .Values.defaultRules.additionalRuleLabels, when set, are inserted at
    indent 8 into the rule's annotations / labels maps.
  - In descriptions, the backtick-quoted delimiter actions make Helm print
    literal double braces, so the $labels / $value expressions are left in
    the rendered rule instead of being evaluated by Helm.
*/}}
{{- if not (.Values.defaultRules.disabled.KubeNodeNotReady | default false) }}
    - alert: KubeNodeNotReady
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready
        summary: Node is not ready.
      expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* The "unless" clause drops nodes that also carry one of the listed autoscaler / termination taints. */}}
{{- if not (.Values.defaultRules.disabled.KubeNodeUnreachable | default false) }}
    - alert: KubeNodeUnreachable
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable
        summary: Node is unreachable.
      expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
Ratio of running pods per node (count) to the node's pod capacity (max),
firing above 0.95. Capacity series equal to 1 are filtered out by "!= 1".
*/}}
{{- if not (.Values.defaultRules.disabled.KubeletTooManyPods | default false) }}
    - alert: KubeletTooManyPods
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
        summary: Kubelet is running at capacity.
      expr: |-
        count by (cluster, node, verrazzano_cluster) (
          (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster, verrazzano_cluster) group_left(node) topk by (instance,pod,namespace,cluster, verrazzano_cluster) (1, kube_pod_info{job="kube-state-metrics"})
        )
        /
        max by (cluster, node, verrazzano_cluster) (
          kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
        ) > 0.95
      for: 15m
      labels:
        severity: info
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* Fires when the Ready condition changed more than 2 times within the 15m range window. */}}
{{- if not (.Values.defaultRules.disabled.KubeNodeReadinessFlapping | default false) }}
    - alert: KubeNodeReadinessFlapping
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
        summary: Node readiness status is flapping.
      expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node, verrazzano_cluster) > 2
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
Reads the series node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile,
which is not defined in this file (presumably a recording rule shipped elsewhere in the chart; verify).
*/}}
{{- if not (.Values.defaultRules.disabled.KubeletPlegDurationHigh | default false) }}
    - alert: KubeletPlegDurationHigh
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh
        summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
      expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
      for: 5m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* The join with kubelet_node_name (group_left(node)) attaches the node label used in the description. */}}
{{- if not (.Values.defaultRules.disabled.KubeletPodStartUpLatencyHigh | default false) }}
    - alert: KubeletPodStartUpLatencyHigh
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
        summary: Kubelet Pod startup latency is too high.
      expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le, verrazzano_cluster)) * on (cluster, instance, verrazzano_cluster) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
KubeletClientCertificateExpiration is defined twice under the same alert name
and the same disabled flag: severity "warning" when
kubelet_certificate_manager_client_ttl_seconds < 604800 (7 days) and
severity "critical" when it is < 86400 (1 day). Neither rule has a "for" clause.
*/}}
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
    - alert: KubeletClientCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 604800
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
    - alert: KubeletClientCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 86400
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
KubeletServerCertificateExpiration follows the same two-tier pattern on
kubelet_certificate_manager_server_ttl_seconds: "warning" below 604800 (7 days),
"critical" below 86400 (1 day), both under one disabled flag and without "for".
*/}}
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
    - alert: KubeletServerCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 604800
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
    - alert: KubeletServerCertificateExpiration
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 86400
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /* Renewal-error alerts: any increase of the error counter over a 5m window, sustained for 15m. */}}
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateRenewalErrors | default false) }}
    - alert: KubeletClientCertificateRenewalErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors
        summary: Kubelet has failed to renew its client certificate.
      expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateRenewalErrors | default false) }}
    - alert: KubeletServerCertificateRenewalErrors
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors
        summary: Kubelet has failed to renew its server certificate.
      expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
      for: 15m
      labels:
        severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- /*
KubeletDown has an extra outer guard: it is emitted only when
.Values.prometheusOperator.kubeletService.enabled is truthy, in addition to
the per-alert disabled flag. It fires when no kubelet target with
metrics_path="/metrics" reports up == 1 for 15m.
*/}}
{{- if .Values.prometheusOperator.kubeletService.enabled }}
{{- if not (.Values.defaultRules.disabled.KubeletDown | default false) }}
    - alert: KubeletDown
      annotations:
{{- if .Values.defaultRules.additionalRuleAnnotations }}
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
{{- end }}
        description: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
      for: 15m
      labels:
        severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}