#!/usr/bin/env bash

# Copyright 2014 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Origin: k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/validate-cluster.sh

# Validates that the cluster is healthy.
# Error codes are:
# 0 - success
# 1 - fatal (cluster is unlikely to work)
# 2 - non-fatal (encountered some errors, but cluster should be working correctly)

set -o errexit
set -o nounset
set -o pipefail

KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/..

# env.sh is optional; it is only sourced when present.
if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then
  source "${KUBE_ROOT}/cluster/env.sh"
fi

source "${KUBE_ROOT}/hack/lib/util.sh"
source "${KUBE_ROOT}/cluster/kube-util.sh"

# Run kubectl and retry upon failure.
#
# Arguments: "$@" - passed through unchanged to ${KUBE_ROOT}/cluster/kubectl.sh
# Outputs:   kubectl's own output; retry/give-up notices go to stderr
# Returns:   0 as soon as one invocation succeeds, 1 after 3 failed invocations
function kubectl_retry() {
  # Keep the counter local so it cannot clobber (or be clobbered by) a global
  # of the same name coming from the sourced libraries.
  local tries=3
  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    tries=$((tries-1))
    if [[ ${tries} -le 0 ]]; then
      echo "('kubectl $*' failed, giving up)" >&2
      return 1
    fi
    echo "(kubectl failed, will retry ${tries} times)" >&2
    sleep 1
  done
}

# Both knobs may be overridden from the environment.
ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-30}"

# Work out how many nodes the cluster is expected to register.
# NOTE(review): KUBE_CREATE_NODES, NUM_NODES, PROJECT, REGION and the
# *_INSTANCE_PREFIX variables are expanded without defaults, so under nounset
# they must already be set (presumably by the sourced files) - confirm.
if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
  if [[ "${KUBE_CREATE_NODES}" == "true" ]]; then
    EXPECTED_NUM_NODES="$(get-num-nodes)"
  else
    EXPECTED_NUM_NODES="0"
  fi
  echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
  # In multizone mode we need to add instances for all nodes in the region.
  if [[ "${MULTIZONE:-}" == "true" ]]; then
    # The inner gcloud call builds a comma-separated list of the region's
    # zones; the outer call lists matching instances, one per line, for wc -l.
    EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format="[no-heading]" \
      --filter="(name ~ '${NODE_INSTANCE_PREFIX}.*' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}.*') AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region="${REGION}" --format="csv[no-heading](name)" | tr "\n" "," | sed "s/,$//"))" | wc -l)
    echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
  fi
else
  EXPECTED_NUM_NODES="${NUM_NODES}"
fi

# When the master registers its own kubelet, it shows up in the node list too.
if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
  if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
    NUM_MASTERS=$(get-master-replicas-count)
  else
    NUM_MASTERS=1
  fi
  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+NUM_MASTERS))
fi

# Minimum number of ready nodes for the validation not to be fatal.
REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
# Make several attempts to deal with slow cluster birth.
return_value=0
attempt=0
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
# Polling cadence and bounds for the node-readiness loop below.
PAUSE_BETWEEN_ITERATIONS_SECONDS=15
MAX_ATTEMPTS=100
# Number of extra iterations granted once enough nodes are ready: the
# additional time divided by the pause, rounded up.
ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS))
while true; do
  # Pause between iterations of this large outer loop.
  if [[ ${attempt} -gt 0 ]]; then
    # Use the named constant so the sleep stays in sync with
    # ADDITIONAL_ITERATIONS, which is derived from the same value.
    sleep "${PAUSE_BETWEEN_ITERATIONS_SECONDS}"
  fi
  attempt=$((attempt+1))

  # The "kubectl get nodes -o template" exports node information.
  #
  # Echo the output and gather 2 counts:
  #  - Total number of nodes.
  #  - Number of "ready" nodes.
  #
  # Suppress errors from kubectl output because during cluster bootstrapping
  # for clusters where the master node is registered, the apiserver will become
  # available and then get restarted as the kubelet configures the docker bridge.
  #
  # We are assigning the result of kubectl_retry get nodes operation to the res
  # variable in that way, to prevent stopping the whole script on an error.
  #
  # Bash command substitution $(kubectl_...) removes all trailing whitespaces
  # which are important for line counting.
  # Use trick from https://unix.stackexchange.com/a/383411 to avoid
  # newline truncation.
  node=$(kubectl_retry get nodes --no-headers; ret=$?; echo .; exit "$ret") && res="$?" || res="$?"
  # Drop the sentinel "." appended inside the substitution above.
  node="${node%.}"
  if [ "${res}" -ne "0" ]; then
    # last_run is only set once enough nodes were seen; until then the
    # overall MAX_ATTEMPTS budget applies.
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_red:-} Failed to get nodes.${color_norm:-}"
      exit 1
    else
      continue
    fi
  fi
  found=$(echo -n "${node}" | wc -l)
  # Use grep || true so that empty result doesn't return nonzero exit code.
  ready=$(echo -n "${node}" | grep -c -v "NotReady" || true)

  if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
    break
  elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
    if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
      echo -e "${color_red:-}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm:-}"
    fi
    break
  elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
    echo -e "${color_red:-}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm:-}"
    break
  else
    if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
      echo -e "${color_green:-}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm:-}"
      # Fixed the first time the threshold is reached; later iterations keep
      # the already-computed deadline.
      last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
    fi
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_yellow:-}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm:-}"
      kubectl_retry get nodes
      if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
        # Fewer ready nodes than the tolerated minimum: fatal.
        exit 1
      else
        # Enough nodes to work with, but not all of them: non-fatal.
        return_value=2
        break
      fi
    else
      echo -e "${color_yellow:-}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm:-}"
    fi
  fi
done
echo "Found ${found} node(s)."
kubectl_retry get nodes

attempt=0
while true; do
  # The "kubectl componentstatuses -o template" exports components health information.
  #
  # Echo the output and gather 2 counts:
  #  - Total number of componentstatuses.
  #  - Number of "healthy" components.
  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
  # grep -c exits non-zero when it counts 0 matches, hence the || true guards.
  componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
  healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true

  if ((componentstatuses > healthy)) || ((componentstatuses == 0)); then
    if ((attempt < 5)); then
      echo -e "${color_yellow:-}Cluster not working yet.${color_norm:-}"
      attempt=$((attempt+1))
      sleep 30
    else
      echo -e " ${color_yellow:-}Validate output:${color_norm:-}"
      # Best-effort dump: without the guard, errexit would abort on a kubectl
      # failure before the diagnostic below is printed. The exit code is 1
      # either way.
      kubectl_retry get cs || true
      echo -e "${color_red:-}Validation returned one or more failed components. Cluster is probably broken.${color_norm:-}"
      exit 1
    fi
  else
    break
  fi
done

echo "Validate output:"
kubectl_retry get cs || true
if [ "${return_value}" == "0" ]; then
  echo -e "${color_green:-}Cluster validation succeeded${color_norm:-}"
else
  echo -e "${color_yellow:-}Cluster validation encountered some problems, but cluster should be in working order${color_norm:-}"
fi

exit "${return_value}"