k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/gce/gci/health-monitor.sh (about)

     1  #!/usr/bin/env bash
     2  
     3  # Copyright 2016 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # This script is for master and node instance health monitoring, which is
    18  # packed in kube-manifest tarball. It is executed through a systemd service
    19  # in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
    20  # file provided by the systemd service.
    21  
    22  set -o nounset
    23  set -o pipefail
    24  
    25  # We simply kill the process when there is a failure. Another systemd service will
    26  # automatically restart the process.
    27  function container_runtime_monitoring {
    28    local -r max_attempts=5
    29    local attempt=1
    30    local -r crictl="${KUBE_HOME}/bin/crictl"
    31    local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-containerd}"
    32    local -r healthcheck_command=("${crictl}" pods)
    33    # Container runtime startup takes time. Make initial attempts before starting
    34    # killing the container runtime.
    35    until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    36      if (( attempt == max_attempts )); then
    37        echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
    38        break
    39      fi
    40      echo "$attempt initial attempt \"${healthcheck_command[*]}\"! Trying again in $attempt seconds..."
    41      sleep "$(( 2 ** attempt++ ))"
    42    done
    43    while true; do
    44      if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
    45        echo "Container runtime ${container_runtime_name} failed!"
    46        systemctl kill --kill-who=main "${container_runtime_name}"
    47        # Wait for a while, as we don't want to kill it again before it is really up.
    48        sleep 120
    49      else
    50        sleep "${SLEEP_SECONDS}"
    51      fi
    52    done
    53  }
    54  
    55  function kubelet_monitoring {
    56    echo "Wait for 2 minutes for kubelet to be functional"
    57    # TODO(andyzheng0831): replace it with a more reliable method if possible.
    58    sleep 120
    59    local -r max_seconds=10
    60    local output=""
    61    while true; do
    62      if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
    63        # Print the response and/or errors.
    64        echo "${output}"
    65        echo "Kubelet is unhealthy!"
    66        systemctl kill kubelet
    67        # Wait for a while, as we don't want to kill it again before it is really up.
    68        sleep 60
    69      else
    70        sleep "${SLEEP_SECONDS}"
    71      fi
    72    done
    73  }
    74  
    75  
    76  ############## Main Function ################
    77  if [[ "$#" -ne 1 ]]; then
    78    echo "Usage: health-monitor.sh <container-runtime/kubelet>"
    79    exit 1
    80  fi
    81  
    82  KUBE_HOME="/home/kubernetes"
    83  KUBE_ENV="${KUBE_HOME}/kube-env"
    84  if [[ ! -e "${KUBE_ENV}" ]]; then
    85    echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring"
    86    exit 1
    87  fi
    88  
    89  SLEEP_SECONDS=10
    90  component=$1
    91  echo "Start kubernetes health monitoring for ${component}"
    92  source "${KUBE_ENV}"
    93  if [[ "${component}" == "container-runtime" ]]; then
    94    container_runtime_monitoring
    95  elif [[ "${component}" == "kubelet" ]]; then
    96    kubelet_monitoring
    97  else
    98    echo "Health monitoring for component \"${component}\" is not supported!"
    99  fi