#!/bin/bash
#
# Copyright (c) 2018 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#

set -o errexit
set -o nounset
set -o pipefail

SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../../.ci/lib.sh"
source "${SCRIPT_PATH}/../../lib/common.bash"
source "/etc/os-release" || source "/usr/lib/os-release"

# Whether running in bare-metal mode or not.
BAREMETAL="${BAREMETAL:-false}"
CI=${CI:-""}
CI_JOB=${CI_JOB:-}
# Path to the cluster network configuration file. Some architectures will
# overwrite this variable.
network_plugin_config="${network_plugin_config:-}"
RUNTIME=${RUNTIME:-containerd-shim-kata-v2}
RUNTIME_PATH=${RUNTIME_PATH:-$(command -v $RUNTIME)}
CRI_RUNTIME="${CRI_RUNTIME:-crio}"

untaint_node() {
	# Enable the master node to be able to schedule pods.
	local node_name="$(hostname | awk '{print tolower($0)}')"
	local get_taints="kubectl get 'node/${node_name}' -o jsonpath='{.spec.taints}'"
	if eval $get_taints | grep -q 'NoSchedule'; then
		info "Taint 'NoSchedule' is found. Untaint the node so pods can be scheduled."
		kubectl taint nodes "${node_name}" \
			node-role.kubernetes.io/master:NoSchedule-
	fi
	if eval $get_taints | grep -q 'control-plane'; then
		info "Taint 'control-plane' is found. Untaint the node so pods can be scheduled."
		kubectl taint nodes "${node_name}" \
			node-role.kubernetes.io/control-plane-
	fi
}

wait_pods_ready()
{
	# Master components provide the cluster's control plane, including kube-apiserver,
	# etcd, kube-scheduler, kube-controller-manager, etc.
	# We need to ensure their readiness before we run any container tests.
	local pods_status="kubectl get pods --all-namespaces"
	local apiserver_pod="kube-apiserver"
	local controller_pod="kube-controller-manager"
	local etcd_pod="etcd"
	local scheduler_pod="kube-scheduler"
	local dns_pod="coredns"
	local system_pod=($apiserver_pod $controller_pod $etcd_pod $scheduler_pod $dns_pod)

	local system_pod_wait_time=120
	local sleep_time=5
	local running_pattern=""
	for pod_entry in "${system_pod[@]}"
	do
		running_pattern="${pod_entry}.*1/1.*Running"
		if ! waitForProcess "$system_pod_wait_time" "$sleep_time" \
			"$pods_status | grep \"${running_pattern}\""; then
			info "Some expected Pods aren't running after ${system_pod_wait_time} seconds." 1>&2
			${pods_status} 1>&2
			# Print debug information for the problematic pods.
			for pod in $(kubectl get pods --all-namespaces \
				-o jsonpath='{.items[*].metadata.name}'); do
				if [[ "$pod" =~ ${pod_entry} ]]; then
					echo "[DEBUG] Pod ${pod}:" 1>&2
					kubectl describe -n kube-system \
						pod "$pod" 1>&2 || true
				fi
			done
			die "Kubernetes is not fully ready. Bailing out..."
		fi
	done
}
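# Illustrative only: with a healthy control plane, the running_pattern above
# matches "kubectl get pods --all-namespaces" output lines such as (the node
# suffix and ages below are made-up examples):
#   kube-system   kube-apiserver-node1   1/1   Running   0   3m12s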

build_custom_stress_image()
{
	if [ "${CI_JOB}" != "METRICS" ]; then
		info "Build custom stress image"
		image_version=$(get_test_version "docker_images.registry.version")
		registry_image=$(get_test_version "docker_images.registry.registry_url"):"${image_version}"
		arch=$("${SCRIPT_PATH}/../../.ci/kata-arch.sh")
		if [[ "${arch}" == "ppc64le" || "${arch}" == "s390x" ]]; then
			# That image is not built for these architectures.
			image_version=$(get_test_version "docker_images.registry_ibm.version")
			registry_image=$(get_test_version "docker_images.registry_ibm.registry_url"):"${image_version}"
		fi
	fi

	runtimeclass_files_path="${SCRIPT_PATH}/runtimeclass_workloads"

	if [ "${CI_JOB}" != "METRICS" ]; then
		pushd "${runtimeclass_files_path}/stress"
		[ "${container_engine}" == "docker" ] && restart_docker_service
		sudo -E "${container_engine}" build . -t "${stress_image}"
		popd

		if [ "${stress_image_pull_policy}" == "Always" ]; then
			info "Store custom stress image in registry"
			sudo -E "${container_engine}" run -d -p ${registry_port}:5000 --restart=always --name "${registry_name}" "${registry_image}"
			# Wait for the registry container.
			waitForProcess 15 3 "curl http://localhost:${registry_port}"
			sudo -E "${container_engine}" push "${stress_image}"
		fi
		if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] && [ "$(uname -m)" != "aarch64" ] && [ "$ID" != "fedora" ]; then
			pushd "${GOPATH}/src/github.com/kata-containers/tests/metrics/density/sysbench-dockerfile"
			registry_port="5000"
			sysbench_image="localhost:${registry_port}/sysbench-kata:latest"
			sudo -E "${container_engine}" build . -t "${sysbench_image}"
			sudo -E "${container_engine}" push "${sysbench_image}"
			popd
		fi
	fi
}

# Delete the CNI configuration files and delete the interface.
# That's needed because `kubeadm reset` (run on clean up) won't clean up the
# CNI configuration and we must ensure a fresh environment before starting
# Kubernetes.
cleanup_cni_configuration() {
	# Remove existing CNI configurations:
	local cni_config_dir="/etc/cni"
	local cni_interface="cni0"
	sudo rm -rf /var/lib/cni/networks/*
	sudo rm -rf "${cni_config_dir}"/*
	if ip a show "$cni_interface"; then
		sudo ip link set dev "$cni_interface" down
		sudo ip link del "$cni_interface"
	fi
}

# Configure the cluster network.
#
# Parameters:
#	$1 - path to the network plugin configuration file (optional).
#	     Defaults to flannel.
#
configure_network() {
	local network_plugin_config="${1:-}"
	local issue="https://github.com/kata-containers/tests/issues/4381"

	if [ -z "${network_plugin_config}" ]; then
		# The default network plugin is flannel; its config file is taken
		# from the k8s 1.12 documentation.
		local flannel_version="$(get_test_version "externals.flannel.version")"
		local flannel_url="$(get_test_version "externals.flannel.kube-flannel_url")"
		info "Use flannel ${flannel_version}"
		network_plugin_config="$flannel_url"
	fi
	info "Use configuration file from ${network_plugin_config}"
	kubectl apply -f "$network_plugin_config"

	if [ -n "${flannel_version:-}" ]; then
		# There is an issue hitting some CI jobs due to a bug in CRI-O that
		# sometimes doesn't realize a new CNI configuration was installed.
		# Here we try a simple workaround which consists of restarting the
		# CRI-O service.
		if [ "${CRI_RUNTIME:-}" = "crio" ]; then
			info "Restart the CRI-O service due to $issue"
			sudo systemctl restart crio
		fi
		local list_pods="kubectl get -n kube-flannel --selector app=flannel pods"
		info "Wait for Flannel pods to show up"
		waitForProcess "60" "10" \
			"[ \$($list_pods 2>/dev/null | wc -l) -gt 0 ]"
		local flannel_p
		for flannel_p in $($list_pods \
			-o jsonpath='{.items[*].metadata.name}'); do
			info "Wait for pod $flannel_p to be ready"
			if ! kubectl wait -n kube-flannel --for=condition=Ready \
				"pod/$flannel_p"; then
				info "Flannel pod $flannel_p failed to start"
				echo "[DEBUG] Pod ${flannel_p}:" 1>&2
				kubectl describe -n kube-flannel "pod/$flannel_p"
			fi
		done
	fi
}
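# Usage sketch for configure_network() (illustrative; the URL below is a
# made-up placeholder, the real flannel URL comes from versions.yaml via
# get_test_version):
#   configure_network                                    # default: flannel
#   configure_network "https://example.com/my-cni.yaml"  # custom CNI config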

# Save the current iptables configuration.
#
# Global variables:
#	KATA_TESTS_DATADIR - directory where to save the configuration (mandatory).
#
save_iptables() {
	[ -n "${KATA_TESTS_DATADIR:-}" ] || \
		die "\$KATA_TESTS_DATADIR is empty, unable to save the iptables configuration"

	local iptables_cache="${KATA_TESTS_DATADIR}/iptables_cache"
	[ -d "${KATA_TESTS_DATADIR}" ] || sudo mkdir -p "${KATA_TESTS_DATADIR}"
	# Clean up iptables before saving.
	iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
	iptables-save > "$iptables_cache"
}

# Start Kubernetes.
#
# Parameters:
#	$1 - Kubernetes version as in versions.yaml (mandatory).
#	$2 - CRI runtime socket path (mandatory).
#	$3 - cgroup driver name (mandatory).
# Global variables:
#	SCRIPT_PATH - path to this script.
#	kubeadm_config_file - the kubeadm configuration file created in
#	  this function.
#	KUBECONFIG - exported by this function.
#
start_kubernetes() {
	local k8s_version="$1"
	local cri_socket_path="$2"
	local cgroup_driver="$3"
	local kubeadm_config_template="${SCRIPT_PATH}/kubeadm/config.yaml"
	local kubelet_wait="240"
	local kubelet_sleep="10"

	info "Init cluster using ${cri_socket_path}"

	# This must be global, otherwise the clean up fails.
	kubeadm_config_file="$(mktemp --tmpdir kubeadm_config.XXXXXX.yaml)"
	trap 'sudo -E sh -c "rm -r \"${kubeadm_config_file}\""' EXIT

	sed -e "s|CRI_RUNTIME_SOCKET|${cri_socket_path}|" "${kubeadm_config_template}" > "${kubeadm_config_file}"
	sed -i "s|KUBERNETES_VERSION|v${k8s_version/-*}|" "${kubeadm_config_file}"
	sed -i "s|CGROUP_DRIVER|${cgroup_driver}|" "${kubeadm_config_file}"

	if [ "${CI}" == true ] && [[ $(wc -l /proc/swaps | awk '{print $1}') -gt 1 ]]; then
		grep -q zram /proc/swaps && echo "# zram swap disabled" | sudo tee /etc/systemd/zram-generator.conf
		sudo swapoff -a || true
	fi

	# Reinstall kubelet to do a deep cleanup.
	if [ "${BAREMETAL}" == true ] && [ -n "$(command -v kubelet)" ]; then
		info "Reinstall kubeadm and kubelet before initializing k8s"
		bash -f "${SCRIPT_PATH}/../../.ci/install_kubernetes.sh"
	fi

	sudo -E kubeadm init --config "${kubeadm_config_file}"

	mkdir -p "$HOME/.kube"
	sudo cp "/etc/kubernetes/admin.conf" "$HOME/.kube/config"
	sudo chown $(id -u):$(id -g) "$HOME/.kube/config"
	export KUBECONFIG="$HOME/.kube/config"

	info "Probing kubelet (timeout=${kubelet_wait}s)"
	waitForProcess "$kubelet_wait" "$kubelet_sleep" \
		"kubectl get nodes"
}
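# start_kubernetes() assumes the template ${SCRIPT_PATH}/kubeadm/config.yaml
# carries the literal placeholders CRI_RUNTIME_SOCKET, KUBERNETES_VERSION and
# CGROUP_DRIVER; those placeholders are the only thing the sed calls above
# guarantee. Illustrative excerpt of such a template (the surrounding YAML
# keys are an assumption, not a copy of the real file):
#
#   apiVersion: kubeadm.k8s.io/v1beta3
#   kind: InitConfiguration
#   nodeRegistration:
#     criSocket: CRI_RUNTIME_SOCKET
#   ---
#   apiVersion: kubeadm.k8s.io/v1beta3
#   kind: ClusterConfiguration
#   kubernetesVersion: KUBERNETES_VERSION
#   ---
#   apiVersion: kubelet.config.k8s.io/v1beta1
#   kind: KubeletConfiguration
#   cgroupDriver: CGROUP_DRIVER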

# Start the CRI runtime service.
#
# Arguments:
#	$1 - the CRI service name (mandatory).
#	$2 - the expected service socket path (mandatory).
#
start_cri_runtime_service() {
	local cri=$1
	local socket_path=$2
	# Number of check attempts.
	local max_cri_socket_check=5
	# Sleep time between checks.
	local wait_time_cri_socket_check=5

	# Stop the service first and then restart it.
	if systemctl is-active --quiet ${cri}; then
		info "Stop ${cri} service"
		sudo systemctl stop ${cri}
	fi

	sudo systemctl enable --now ${cri}

	for i in $(seq ${max_cri_socket_check}); do
		# When the test runs twice in the CI, the second time CRI-O takes
		# some time to be ready.
		sleep "${wait_time_cri_socket_check}"
		[ -e "${socket_path}" ] && break
		info "Waiting for CRI socket ${socket_path} (try ${i})"
	done

	sudo systemctl status "${cri}" --no-pager || \
		die "Unable to start the ${cri} service"
}

main() {
	local arch="$("${SCRIPT_PATH}/../../.ci/kata-arch.sh")"
	local kubernetes_version=$(get_version "externals.kubernetes.version")
	local cri_runtime_socket=""
	local cgroup_driver=""

	case "${CRI_RUNTIME}" in
	containerd)
		cri_runtime_socket="unix:///run/containerd/containerd.sock"
		cgroup_driver="cgroupfs"
		;;
	crio)
		cri_runtime_socket="unix:///var/run/crio/crio.sock"
		cgroup_driver="systemd"
		;;
	*)
		die "Runtime ${CRI_RUNTIME} not supported"
		;;
	esac

	# Load the arch-specific configuration file.
	if [ -f "${SCRIPT_PATH}/../../.ci/${arch}/kubernetes/init.sh" ]; then
		source "${SCRIPT_PATH}/../../.ci/${arch}/kubernetes/init.sh"
	fi

	# Save iptables if the CI is running on bare-metal. The configuration
	# should be restored after the tests finish.
	if [ "${BAREMETAL}" == true ]; then
		info "Save the iptables configuration"
		save_iptables
	fi

	info "Check there aren't dangling processes from previous tests"
	check_processes

	# Build and store the custom stress image.
	build_custom_stress_image

	info "Clean up any leftover CNI configuration"
	cleanup_cni_configuration

	if [ "$CRI_RUNTIME" == crio ]; then
		crio_repository="github.com/cri-o/cri-o"
		crio_repository_path="$GOPATH/src/${crio_repository}"
		cni_directory="/etc/cni/net.d"
		if [ ! -d "${cni_directory}" ]; then
			sudo mkdir -p "${cni_directory}"
		fi
		# The CNI config file in cri-o v1.26 and higher has the extension
		# "conflist". Use a wildcard here to keep support for both names.
		# To be removed and replaced with "conflist" when support for older
		# cri-o versions is abandoned.
		sudo cp "${crio_repository_path}/contrib/cni/10-crio-bridge.conf"* "${cni_directory}"
	fi

	info "Start ${CRI_RUNTIME} service"
	start_cri_runtime_service "${CRI_RUNTIME}" "${cri_runtime_socket}"

	info "Start Kubernetes"
	start_kubernetes "${kubernetes_version}" "${cri_runtime_socket}" "${cgroup_driver}"

	info "Configure the cluster network"
	configure_network "${network_plugin_config}"

	# We need to ensure a few specific pods are ready and running.
	info "Wait for the system's pods to be ready and running"
	wait_pods_ready

	info "Create kata RuntimeClass resource"
	kubectl create -f "${runtimeclass_files_path}/kata-runtimeclass.yaml"

	untaint_node
}

main "$@"
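# Example invocation (illustrative; assumes the kata-containers tests repo
# layout, a supported CRI runtime already installed, and GOPATH set):
#   CRI_RUNTIME=containerd BAREMETAL=false ./init.sh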