k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/validate-cluster.sh (about)

     1  #!/usr/bin/env bash
     2  
     3  # Copyright 2014 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # Validates that the cluster is healthy.
    18  # Error codes are:
    19  # 0 - success
    20  # 1 - fatal (cluster is unlikely to work)
    21  # 2 - non-fatal (encountered some errors, but cluster should be working correctly)
    22  
    23  set -o errexit
    24  set -o nounset
    25  set -o pipefail
    26  
    27  KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/..
    28  
    29  if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then
    30    source "${KUBE_ROOT}/cluster/env.sh"
    31  fi
    32  
    33  source "${KUBE_ROOT}/hack/lib/util.sh"
    34  source "${KUBE_ROOT}/cluster/kube-util.sh"
    35  
    36  # Run kubectl and retry upon failure.
    37  function kubectl_retry() {
    38    tries=3
    39    while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    40      tries=$((tries-1))
    41      if [[ ${tries} -le 0 ]]; then
    42        echo "('kubectl $*' failed, giving up)" >&2
    43        return 1
    44      fi
    45      echo "(kubectl failed, will retry ${tries} times)" >&2
    46      sleep 1
    47    done
    48  }
    49  
    50  ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
    51  CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-30}"
    52  
    53  if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
    54    if [[ "${KUBE_CREATE_NODES}" == "true" ]]; then
    55      EXPECTED_NUM_NODES="$(get-num-nodes)"
    56    else
    57      EXPECTED_NUM_NODES="0"
    58    fi
    59    echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
    60    # In multizone mode we need to add instances for all nodes in the region.
    61    if [[ "${MULTIZONE:-}" == "true" ]]; then
    62      EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format="[no-heading]" \
    63        --filter="(name ~ '${NODE_INSTANCE_PREFIX}.*' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}.*') AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region="${REGION}" --format="csv[no-heading](name)" | tr "\n" "," | sed  "s/,$//"))" | wc -l)
    64      echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
    65    fi
    66  else
    67    EXPECTED_NUM_NODES="${NUM_NODES}"
    68  fi
    69  
    70  if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
    71    if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
    72      NUM_MASTERS=$(get-master-replicas-count)
    73    else
    74      NUM_MASTERS=1
    75    fi
    76    EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+NUM_MASTERS))
    77  fi
    78  
    79  REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
    80  # Make several attempts to deal with slow cluster birth.
    81  return_value=0
    82  attempt=0
    83  # Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
    84  PAUSE_BETWEEN_ITERATIONS_SECONDS=15
    85  MAX_ATTEMPTS=100
    86  ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS))
    87  while true; do
    88    # Pause between iterations of this large outer loop.
    89    if [[ ${attempt} -gt 0 ]]; then
    90      sleep 15
    91    fi
    92    attempt=$((attempt+1))
    93  
    94    # The "kubectl get nodes -o template" exports node information.
    95    #
    96    # Echo the output and gather 2 counts:
    97    #  - Total number of nodes.
    98    #  - Number of "ready" nodes.
    99    #
   100    # Suppress errors from kubectl output because during cluster bootstrapping
   101    # for clusters where the master node is registered, the apiserver will become
   102    # available and then get restarted as the kubelet configures the docker bridge.
   103    #
   104    # We are assigning the result of kubectl_retry get nodes operation to the res
   105    # variable in that way, to prevent stopping the whole script on an error.
   106    #
   107    # Bash command substitution $(kubectl_...) removes all trailing whitespaces
   108    # which are important for line counting.
   109    # Use trick from https://unix.stackexchange.com/a/383411 to avoid
   110    # newline truncation.
   111    node=$(kubectl_retry get nodes --no-headers; ret=$?; echo .; exit "$ret") && res="$?" || res="$?"
   112    node="${node%.}"
   113    if [ "${res}" -ne "0" ]; then
   114      if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
   115        echo -e "${color_red:-} Failed to get nodes.${color_norm:-}"
   116        exit 1
   117      else
   118        continue
   119      fi
   120    fi
   121    found=$(echo -n "${node}" | wc -l)
   122    # Use grep || true so that empty result doesn't return nonzero exit code.
   123    ready=$(echo -n "${node}" | grep -c -v "NotReady" || true)
   124  
   125    if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
   126      break
   127    elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
   128      if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
   129        echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
   130      fi
   131      break
   132    elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
   133      echo -e "${color_red}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
   134      break
   135    else
   136      if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
   137        echo -e "${color_green:-}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}"
   138        last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
   139      fi
   140      if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
   141        echo -e "${color_yellow:-}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
   142        kubectl_retry get nodes
   143        if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
   144          exit 1
   145        else
   146          return_value=2
   147          break
   148        fi
   149      else
   150        echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
   151      fi
   152    fi
   153  done
   154  echo "Found ${found} node(s)."
   155  kubectl_retry get nodes
   156  
   157  attempt=0
   158  while true; do
   159    # The "kubectl componentstatuses -o template" exports components health information.
   160    #
   161    # Echo the output and gather 2 counts:
   162    #  - Total number of componentstatuses.
   163    #  - Number of "healthy" components.
   164    cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
   165    componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
   166    healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true
   167  
   168    if ((componentstatuses > healthy)) || ((componentstatuses == 0)); then
   169      if ((attempt < 5)); then
   170        echo -e "${color_yellow}Cluster not working yet.${color_norm}"
   171        attempt=$((attempt+1))
   172        sleep 30
   173      else
   174        echo -e " ${color_yellow}Validate output:${color_norm}"
   175        kubectl_retry get cs
   176        echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
   177        exit 1
   178      fi
   179    else
   180      break
   181    fi
   182  done
   183  
   184  echo "Validate output:"
   185  kubectl_retry get cs || true
   186  if [ "${return_value}" == "0" ]; then
   187    echo -e "${color_green}Cluster validation succeeded${color_norm}"
   188  else
   189    echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
   190  fi
   191  
   192  exit "${return_value}"