github.com/kata-containers/tests@v0.0.0-20240307153542-772105b56064/integration/kubernetes/init.sh

#!/bin/bash
#
# Copyright (c) 2018 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#

set -o errexit
set -o nounset
set -o pipefail


SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../../.ci/lib.sh"
source "${SCRIPT_PATH}/../../lib/common.bash"
source "/etc/os-release" || source "/usr/lib/os-release"

# Whether running in bare-metal mode or not.
BAREMETAL="${BAREMETAL:-false}"
CI=${CI:-""}
CI_JOB=${CI_JOB:-}
# Path to the cluster network configuration file. Some architectures will
# overwrite this variable.
network_plugin_config="${network_plugin_config:-}"
RUNTIME=${RUNTIME:-containerd-shim-kata-v2}
RUNTIME_PATH=${RUNTIME_PATH:-$(command -v $RUNTIME)}
CRI_RUNTIME="${CRI_RUNTIME:-crio}"

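# Remove the NoSchedule taints from the node so that pods can be scheduled
# on a single-node (master/control-plane) cluster.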
untaint_node() {
	# Enable the master node to be able to schedule pods.
	local node_name="$(hostname | awk '{print tolower($0)}')"
	local get_taints="kubectl get 'node/${node_name}' -o jsonpath='{.spec.taints}'"
	if eval "$get_taints" | grep -q 'NoSchedule'; then
		info "Taint 'NoSchedule' is found. Untaint the node so pods can be scheduled."
		kubectl taint nodes "${node_name}" \
			node-role.kubernetes.io/master:NoSchedule-
	fi
	if eval "$get_taints" | grep -q 'control-plane'; then
		info "Taint 'control-plane' is found. Untaint the node so pods can be scheduled."
		kubectl taint nodes "${node_name}" \
			node-role.kubernetes.io/control-plane-
	fi
}

wait_pods_ready()
{
	# Master components provide the cluster's control plane, including kube-apiserver,
	# etcd, kube-scheduler, kube-controller-manager, etc.
	# We need to ensure their readiness before we run any container tests.
	local pods_status="kubectl get pods --all-namespaces"
	local apiserver_pod="kube-apiserver"
	local controller_pod="kube-controller-manager"
	local etcd_pod="etcd"
	local scheduler_pod="kube-scheduler"
	local dns_pod="coredns"
	local system_pod=($apiserver_pod $controller_pod $etcd_pod $scheduler_pod $dns_pod)

	local system_pod_wait_time=120
	local sleep_time=5
	local running_pattern=""
	for pod_entry in "${system_pod[@]}"
	do
		running_pattern="${pod_entry}.*1/1.*Running"
		if ! waitForProcess "$system_pod_wait_time" "$sleep_time" \
			"$pods_status | grep \"${running_pattern}\""; then
			info "Some expected Pods aren't running after ${system_pod_wait_time} seconds." 1>&2
			${pods_status} 1>&2
			# Print debug information for the problematic pods.
			for pod in $(kubectl get pods --all-namespaces \
				-o jsonpath='{.items[*].metadata.name}'); do
				if [[ "$pod" =~ ${pod_entry} ]]; then
					echo "[DEBUG] Pod ${pod}:" 1>&2
					kubectl describe -n kube-system \
						pod "$pod" 1>&2 || true
				fi
			done
			die "Kubernetes is not fully ready. Bailing out..."
		fi
	done
}

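# Build the custom stress image used by the tests and optionally push it to a
# local registry.
#
# Global variables (assumed to be provided by the sourced helper scripts or
# the arch-specific init):
#	container_engine - the container engine to use (e.g. docker).
#	stress_image - tag of the custom stress image.
#	stress_image_pull_policy - when "Always", the image is pushed to the
#				   local registry started here.
#	registry_port, registry_name - local registry settings.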
build_custom_stress_image()
{
	if [ "${CI_JOB}" != "METRICS" ]; then
		info "Build custom stress image"
		image_version=$(get_test_version "docker_images.registry.version")
		registry_image=$(get_test_version "docker_images.registry.registry_url"):"${image_version}"
		arch=$("${SCRIPT_PATH}/../../.ci/kata-arch.sh")
		if [[ "${arch}" == "ppc64le" || "${arch}" == "s390x" ]]; then
			# That image is not built for these architectures.
			image_version=$(get_test_version "docker_images.registry_ibm.version")
			registry_image=$(get_test_version "docker_images.registry_ibm.registry_url"):"${image_version}"
		fi
	fi

	runtimeclass_files_path="${SCRIPT_PATH}/runtimeclass_workloads"

	if [ "${CI_JOB}" != "METRICS" ]; then
		pushd "${runtimeclass_files_path}/stress"
		[ "${container_engine}" == "docker" ] && restart_docker_service
		sudo -E "${container_engine}" build . -t "${stress_image}"
		popd

		if [ "${stress_image_pull_policy}" == "Always" ]; then
			info "Store custom stress image in registry"
			sudo -E "${container_engine}" run -d -p ${registry_port}:5000 --restart=always --name "${registry_name}" "${registry_image}"
			# Wait for the registry container to be up.
			waitForProcess 15 3 "curl http://localhost:${registry_port}"
			sudo -E "${container_engine}" push "${stress_image}"
		fi
		if [ "$(uname -m)" != "s390x" ] && [ "$(uname -m)" != "ppc64le" ] && [ "$(uname -m)" != "aarch64" ] && [ "$ID" != "fedora" ]; then
			pushd "${GOPATH}/src/github.com/kata-containers/tests/metrics/density/sysbench-dockerfile"
			registry_port="5000"
			sysbench_image="localhost:${registry_port}/sysbench-kata:latest"
			sudo -E "${container_engine}" build . -t "${sysbench_image}"
			sudo -E "${container_engine}" push "${sysbench_image}"
			popd
		fi
	fi
}

# Delete the CNI configuration files and delete the interface.
# That's needed because `kubeadm reset` (run on cleanup) won't clean up the
# CNI configuration and we must ensure a fresh environment before starting
# Kubernetes.
cleanup_cni_configuration() {
	# Remove existing CNI configurations:
	local cni_config_dir="/etc/cni"
	local cni_interface="cni0"
	sudo rm -rf /var/lib/cni/networks/*
	sudo rm -rf "${cni_config_dir}"/*
	if ip a show "$cni_interface"; then
		sudo ip link set dev "$cni_interface" down
		sudo ip link del "$cni_interface"
	fi
}

# Configure the cluster network.
#
# Parameters:
#	$1 - path to the network plugin configuration file (optional).
#	     Defaults to flannel.
#
configure_network() {
	local network_plugin_config="${1:-}"
	local issue="https://github.com/kata-containers/tests/issues/4381"

	if [ -z "${network_plugin_config}" ]; then
		# The default network plugin is flannel; its config file is taken
		# from the k8s 1.12 documentation.
		local flannel_version="$(get_test_version "externals.flannel.version")"
		local flannel_url="$(get_test_version "externals.flannel.kube-flannel_url")"
		info "Use flannel ${flannel_version}"
		network_plugin_config="$flannel_url"
	fi
	info "Use configuration file from ${network_plugin_config}"
	kubectl apply -f "$network_plugin_config"

	if [ -n "${flannel_version:-}" ]; then
		# There is an issue hitting some CI jobs due to a bug in CRI-O that
		# sometimes doesn't realize a new CNI configuration was installed.
		# Here we try a simple workaround which consists of restarting the
		# CRI-O service.
		if [ "${CRI_RUNTIME:-}" = "crio" ]; then
			info "Restart the CRI-O service due to $issue"
			sudo systemctl restart crio
		fi
		local list_pods="kubectl get -n kube-flannel --selector app=flannel pods"
		info "Wait for Flannel pods to show up"
		waitForProcess "60" "10" \
			"[ \$($list_pods 2>/dev/null | wc -l) -gt 0 ]"
		local flannel_p
		for flannel_p in $($list_pods \
			-o jsonpath='{.items[*].metadata.name}'); do
			info "Wait for pod $flannel_p to be ready"
			if ! kubectl wait -n kube-flannel --for=condition=Ready \
				"pod/$flannel_p"; then
				info "Flannel pod $flannel_p failed to start"
				echo "[DEBUG] Pod ${flannel_p}:" 1>&2
				kubectl describe -n kube-flannel "pod/$flannel_p"
			fi
		done
	fi
}

# Save the current iptables configuration.
#
# Global variables:
# 	KATA_TESTS_DATADIR - directory where to save the configuration (mandatory).
#
save_iptables() {
	[ -n "${KATA_TESTS_DATADIR:-}" ] || \
		die "\$KATA_TESTS_DATADIR is empty, unable to save the iptables configuration"

	local iptables_cache="${KATA_TESTS_DATADIR}/iptables_cache"
	[ -d "${KATA_TESTS_DATADIR}" ] || sudo mkdir -p "${KATA_TESTS_DATADIR}"
	# Clean up iptables before saving.
	sudo iptables -F && sudo iptables -t nat -F && sudo iptables -t mangle -F && sudo iptables -X
	sudo iptables-save | sudo tee "$iptables_cache" > /dev/null
}
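
# Note: the saved rules can later be restored with, e.g.:
#   sudo iptables-restore < "${KATA_TESTS_DATADIR}/iptables_cache"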

# Start Kubernetes.
#
# Parameters:
#	$1 - Kubernetes version as in versions.yaml (mandatory).
#	$2 - CRI runtime socket path (mandatory).
#	$3 - cgroup driver name (mandatory).
# Global variables:
#	SCRIPT_PATH - path to this script.
#	kubeadm_config_file - the kubeadm configuration file created in
#			      this function.
#	KUBECONFIG - exported by this function.
#
start_kubernetes() {
	local k8s_version="$1"
	local cri_socket_path="$2"
	local cgroup_driver="$3"
	local kubeadm_config_template="${SCRIPT_PATH}/kubeadm/config.yaml"
	local kubelet_wait="240"
	local kubelet_sleep="10"

	info "Init cluster using ${cri_socket_path}"

	# This should be global otherwise the clean up fails.
	kubeadm_config_file="$(mktemp --tmpdir kubeadm_config.XXXXXX.yaml)"
	trap 'sudo -E sh -c "rm -r \"${kubeadm_config_file}\""' EXIT

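	# Fill in the placeholders in the kubeadm config template.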
	sed -e "s|CRI_RUNTIME_SOCKET|${cri_socket_path}|" "${kubeadm_config_template}" > "${kubeadm_config_file}"
	sed -i "s|KUBERNETES_VERSION|v${k8s_version/-*}|" "${kubeadm_config_file}"
	sed -i "s|CGROUP_DRIVER|${cgroup_driver}|" "${kubeadm_config_file}"

	if [ "${CI}" == true ] && [[ $(wc -l /proc/swaps | awk '{print $1}') -gt 1 ]]; then
		grep -q zram /proc/swaps && echo "# zram swap disabled" | sudo tee /etc/systemd/zram-generator.conf
		sudo swapoff -a || true
	fi

	# Reinstall kubeadm and kubelet to do a deep cleanup.
	if [ "${BAREMETAL}" == true ] && [ -n "$(command -v kubelet)" ]; then
		info "Reinstall kubeadm and kubelet before initializing k8s"
		bash -f "${SCRIPT_PATH}/../../.ci/install_kubernetes.sh"
	fi

	sudo -E kubeadm init --config "${kubeadm_config_file}"

	mkdir -p "$HOME/.kube"
	sudo cp "/etc/kubernetes/admin.conf" "$HOME/.kube/config"
	sudo chown $(id -u):$(id -g) "$HOME/.kube/config"
	export KUBECONFIG="$HOME/.kube/config"

	info "Probing kubelet (timeout=${kubelet_wait}s)"
	waitForProcess "$kubelet_wait" "$kubelet_sleep" \
		"kubectl get nodes"
}

# Start the CRI runtime service.
#
# Arguments:
#	$1 - the CRI service name (mandatory).
#	$2 - the expected service socket path (mandatory).
#
start_cri_runtime_service() {
	local cri=$1
	local socket_path=$2
	# Number of check attempts.
	local max_cri_socket_check=5
	# Sleep time between checks.
	local wait_time_cri_socket_check=5

	# Stop the service first and then restart it.
	if systemctl is-active --quiet ${cri}; then
		info "Stop ${cri} service"
		sudo systemctl stop ${cri}
	fi

	sudo systemctl enable --now ${cri}

	for i in $(seq ${max_cri_socket_check}); do
		# When the test runs twice in the CI, the second time CRI-O
		# takes some time to be ready.
		sleep "${wait_time_cri_socket_check}"
		[ -e "${socket_path}" ] && break
		info "Waiting for CRI socket ${socket_path} (try ${i})"
	done

	sudo systemctl status "${cri}" --no-pager || \
		die "Unable to start the ${cri} service"
}

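# Prepare the host and bring up a single-node Kubernetes cluster: start the
# CRI runtime, run kubeadm init, configure the cluster network, and make the
# node able to schedule the test pods.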
main() {
	local arch="$("${SCRIPT_PATH}/../../.ci/kata-arch.sh")"
	local kubernetes_version=$(get_version "externals.kubernetes.version")
	local cri_runtime_socket=""
	local cgroup_driver=""

	case "${CRI_RUNTIME}" in
	containerd)
		cri_runtime_socket="unix:///run/containerd/containerd.sock"
		cgroup_driver="cgroupfs"
		;;
	crio)
		cri_runtime_socket="unix:///var/run/crio/crio.sock"
		cgroup_driver="systemd"
		;;
	*)
		die "Runtime ${CRI_RUNTIME} not supported"
		;;
	esac

	# Load the arch-specific configuration file.
	if [ -f "${SCRIPT_PATH}/../../.ci/${arch}/kubernetes/init.sh" ]; then
		source "${SCRIPT_PATH}/../../.ci/${arch}/kubernetes/init.sh"
	fi

	# Store iptables if the CI is running on bare-metal. The configuration
	# should be restored after the tests finish.
	if [ "${BAREMETAL}" == true ]; then
		info "Save the iptables configuration"
		save_iptables
	fi

	info "Check there aren't dangling processes from previous tests"
	check_processes

	# Build and store the custom stress image.
	build_custom_stress_image

	info "Clean up any leftover CNI configuration"
	cleanup_cni_configuration

	if [ "$CRI_RUNTIME" == crio ]; then
		crio_repository="github.com/cri-o/cri-o"
		crio_repository_path="$GOPATH/src/${crio_repository}"
		cni_directory="/etc/cni/net.d"
		if [ ! -d "${cni_directory}" ]; then
			sudo mkdir -p "${cni_directory}"
		fi
		# The config file for CNI in CRI-O v1.26 and higher has the extension "conflist".
		# Using a wildcard here to keep support for both names.
		# To be removed and replaced with "conflist" when support for older CRI-O versions is abandoned.
		sudo cp "${crio_repository_path}/contrib/cni/10-crio-bridge.conf"* "${cni_directory}"
	fi

	info "Start ${CRI_RUNTIME} service"
	start_cri_runtime_service "${CRI_RUNTIME}" "${cri_runtime_socket}"

	info "Start Kubernetes"
	start_kubernetes "${kubernetes_version}" "${cri_runtime_socket}" "${cgroup_driver}"

	info "Configure the cluster network"
	configure_network "${network_plugin_config}"

	# We need to ensure a few specific pods are ready and running.
	info "Wait for the system pods to be ready and running"
	wait_pods_ready

	info "Create kata RuntimeClass resource"
	kubectl create -f "${runtimeclass_files_path}/kata-runtimeclass.yaml"

	untaint_node
}

main "$@"
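
# Example invocation (a sketch; assumes the kata-containers/tests repository
# layout and a sudo-capable user on the test node):
#   CRI_RUNTIME=containerd ./integration/kubernetes/init.sh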