#!/usr/bin/env bash

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is for master and node instance health monitoring, which is
# packed in kube-manifest tarball. It is executed through a systemd service
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
# file provided by the systemd service.

set -o nounset
set -o pipefail

# We simply kill the process when there is a failure. Another systemd service will
# automatically restart the process.

#######################################
# Monitors the container runtime by periodically running "crictl pods".
# During startup it makes up to max_attempts probes with an exponentially
# growing pause (2, 4, 8, ... seconds) before it starts killing the runtime.
# Afterwards, every failed probe kills the main process of the runtime's
# systemd unit.
# Globals:
#   KUBE_HOME              (read) directory containing bin/crictl
#   CONTAINER_RUNTIME_NAME (read) systemd unit to kill; defaults to containerd
#   SLEEP_SECONDS          (read) pause between successful probes
# Arguments:
#   None
# Returns:
#   Never returns; loops forever.
#######################################
function container_runtime_monitoring {
  local -r max_attempts=5
  local attempt=1
  local delay
  local -r crictl="${KUBE_HOME}/bin/crictl"
  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-containerd}"
  local -r healthcheck_command=("${crictl}" pods)
  # Container runtime startup takes time. Make initial attempts before starting
  # killing the container runtime.
  until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    if (( attempt == max_attempts )); then
      echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
      break
    fi
    # Compute the backoff once so the logged delay matches the actual sleep.
    delay=$(( 2 ** attempt ))
    echo "$attempt initial attempt \"${healthcheck_command[*]}\"! Trying again in ${delay} seconds..."
    sleep "${delay}"
    attempt=$(( attempt + 1 ))
  done
  while true; do
    if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
      echo "Container runtime ${container_runtime_name} failed!"
      systemctl kill --kill-who=main "${container_runtime_name}"
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 120
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}

#######################################
# Monitors the kubelet by polling http://127.0.0.1:10248/healthz with curl.
# After an initial 2 minute grace period, every failed probe prints curl's
# output and kills the kubelet systemd unit.
# Globals:
#   SLEEP_SECONDS (read) pause between successful probes
# Arguments:
#   None
# Returns:
#   Never returns; loops forever.
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local output=""
  while true; do
    # -f makes curl fail on HTTP errors; -s -S silences progress but keeps
    # error messages, which are captured together with the response body.
    if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
      # Print the response and/or errors.
      echo "${output}"
      echo "Kubelet is unhealthy!"
      systemctl kill kubelet
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}


############## Main Function ################
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <container-runtime/kubelet>"
  exit 1
fi

KUBE_HOME="/home/kubernetes"
KUBE_ENV="${KUBE_HOME}/kube-env"
if [[ ! -e "${KUBE_ENV}" ]]; then
  echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring"
  exit 1
fi

# Default probe interval. It is assigned before kube-env is sourced, so any
# value kube-env sets for SLEEP_SECONDS takes precedence.
SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
source "${KUBE_ENV}"
if [[ "${component}" == "container-runtime" ]]; then
  container_runtime_monitoring
elif [[ "${component}" == "kubelet" ]]; then
  kubelet_monitoring
else
  echo "Health monitoring for component \"${component}\" is not supported!"
  # Report failure, consistent with the other argument/environment errors above.
  exit 1
fi