github.com/kata-containers/tests@v0.0.0-20240307153542-772105b56064/functional/tracing/tracing-test.sh

github.com/kata-containers/tests@v0.0.0-20240307153542-772105b56064/functional/tracing/tracing-test.sh (about)

     1  #!/bin/bash
     2  # Copyright (c) 2019-2022 Intel Corporation
     3  #
     4  # SPDX-License-Identifier: Apache-2.0
     5  #
     6  
     7  set -o errexit
     8  set -o nounset
     9  set -o pipefail
    10  set -o errtrace
    11  
    12  script_name=${0##*/}
    13  source "/etc/os-release" || source "/usr/lib/os-release"
    14  
    15  # Set to true if all tests pass
    16  success="false"
    17  
    18  DEBUG=${DEBUG:-}
    19  
    20  # If set to any value, do not shut down the Jaeger service.
    21  DEBUG_KEEP_JAEGER=${DEBUG_KEEP_JAEGER:-}
    22  # If set to any value, do not shut down the trace forwarder.
    23  DEBUG_KEEP_FORWARDER=${DEBUG_KEEP_FORWARDER:-}
    24  
    25  [ -n "$DEBUG" ] && set -o xtrace
    26  
    27  SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
    28  source "${SCRIPT_PATH}/../../lib/common.bash"
    29  
    30  RUNTIME="io.containerd.kata.v2"
    31  CONTAINER_IMAGE="quay.io/prometheus/busybox:latest"
    32  
    33  TRACE_LOG_DIR=${TRACE_LOG_DIR:-${KATA_TESTS_LOGDIR}/traces}
    34  
    35  KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}"
    36  
    37  # files for output
    38  formatted_traces_file="kata-traces-formatted.json"
    39  trace_summary_file="span-summary.txt"
    40  
    41  # tmux(1) session to run the trace forwarder in
    42  KATA_TMUX_FORWARDER_SESSION="kata-trace-forwarder-session"
    43  
    44  katacontainers_repo_dir="$GOPATH/src/github.com/kata-containers/kata-containers"
    45  
    46  forwarder_dir="${katacontainers_repo_dir}/src/tools/trace-forwarder"
    47  forwarder_binary_name="kata-trace-forwarder"
    48  
    49  # path prefix for CLH socket path
    50  socket_path_prefix="/run/vc/vm/"
    51  
    52  container_id="tracing-test"
    53  
    54  jaeger_server=${jaeger_server:-localhost}
    55  jaeger_ui_port=${jaeger_ui_port:-16686}
    56  jaeger_docker_container_name="jaeger"
    57  
    58  # Span data for testing:
    59  #   1. Existence of spans in jaeger output
    60  #   2. That the relative ordering in the data occurs
    61  #      in the jaeger output.
    62  # This is tested to make sure specific spans exist in the output and
    63  # that the order of spans is preserved.
    64  # Ordered in latest in sequence to earliest.
    65  #
    66  # Fields are all span existing span names in relative order from latest
    67  # to earliest call in a sequence of calls. Example (pseudocode):
    68  # func1() {
    69  #	span = trace("func1")
    70  #	func2()
    71  #	end span
    72  # }
    73  # func2() {
    74  #	span = trace("func2")
    75  #	func3()
    76  #	end span
    77  # }
    78  # func3() {
    79  #	span = trace("func3")
    80  #	end span
    81  # }
    82  # The following data should result in a passing test:
    83  #	'func3:func2:func1'
    84  #	'func3:func2'
    85  #	'func3:func1'
    86  #	'func2:func1'
    87  span_ordering_data=(
    88  	'StartVM:createSandboxFromConfig:create:rootSpan'
    89  	'setup_shared_namespaces:StartVM:createSandboxFromConfig:create:rootSpan'
    90  	'start_container:Start:rootSpan'
    91  	'stopSandbox:Stop:Start:rootSpan'
    92  )
    93  
    94  # Cleanup will remove Jaeger container and
    95  # disable tracing.
    96  cleanup()
    97  {
    98  	local fp="die"
    99  	local result="failed"
   100  	local dest="$logdir"
   101  
   102  	if [ "$success" = "true" ]; then
   103  		local fp="info"
   104  		result="passed"
   105  
   106  		[ -z "$DEBUG_KEEP_JAEGER" ] && stop_jaeger 2>/dev/null || true
   107  
   108  		[ -z "$DEBUG_KEEP_FORWARDER" ] && kill_trace_forwarder
   109  
   110  		# The tests worked so remove the logs
   111  		if [ -n "$DEBUG" ]; then
   112  			eval "$fp" "test $result - logs left in '$dest'"
   113  		else
   114  			"${SCRIPT_PATH}/../../.ci/configure_tracing_for_kata.sh" disable
   115  
   116  			[ -d "$logdir" ] && rm -rf "$logdir" || true
   117  		fi
   118  
   119  		return 0
   120  	fi
   121  
   122  	if [ -n "${CI:-}" ]; then
   123  		# Running under the CI, so copy the logs to allow them
   124  		# to be added as test artifacts.
   125  		sudo mkdir -p "$TRACE_LOG_DIR"
   126  		sudo cp -a "$logdir"/* "$TRACE_LOG_DIR"
   127  
   128  		dest="$TRACE_LOG_DIR"
   129  	fi
   130  
   131  	eval "$fp" "test $result - logs left in '$dest'"
   132  }
   133  
   134  # Run an operation to generate Jaeger trace spans
   135  create_traces()
   136  {
   137  	sudo ctr image pull "$CONTAINER_IMAGE"
   138  	sudo ctr run --runtime "$RUNTIME" --rm "$CONTAINER_IMAGE" "$container_id" true
   139  }
   140  
   141  start_jaeger()
   142  {
   143  	local jaeger_docker_image="jaegertracing/all-in-one:latest"
   144  
   145  	docker rm -f "${jaeger_docker_container_name}"
   146  
   147  	# Defaults - see https://www.jaegertracing.io/docs/getting-started/
   148  	docker run -d --runtime runc --name "${jaeger_docker_container_name}" \
   149  		-e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \
   150  		-p 5775:5775/udp \
   151  		-p 6831:6831/udp \
   152  		-p 6832:6832/udp \
   153  		-p 5778:5778 \
   154  		-p "${jaeger_ui_port}:${jaeger_ui_port}" \
   155  		-p 14268:14268 \
   156  		-p 9411:9411 \
   157  		"$jaeger_docker_image"
   158  
   159  	sudo mkdir -m 0750 -p "$TRACE_LOG_DIR"
   160  }
   161  
   162  stop_jaeger()
   163  {
   164  	docker stop "${jaeger_docker_container_name}"
   165  	docker rm -f "${jaeger_docker_container_name}"
   166  }
   167  
   168  get_jaeger_traces()
   169  {
   170  	local service="$1"
   171  	[ -z "$service" ] && die "need jaeger service name"
   172  
   173  	local traces_url="http://${jaeger_server}:${jaeger_ui_port}/api/traces?service=${service}"
   174  	curl -s "${traces_url}" 2>/dev/null
   175  }
   176  
   177  get_trace_summary()
   178  {
   179  	local status="$1"
   180  	[ -z "$status" ] && die "need jaeger status JSON"
   181  
   182  	echo "${status}" | jq -S '.data[].spans[] | [.spanID, .operationName] | @sh'
   183  }
   184  
   185  get_span_count()
   186  {
   187  	local status="$1"
   188  	[ -z "$status" ] && die "need jaeger status JSON"
   189  
   190  	# This could be simplified but creating a variable holding the
   191  	# summary is useful in debug mode as the summary is displayed.
   192  	local trace_summary=$(get_trace_summary "$status" || true)
   193  
   194  	[ -z "$trace_summary" ] && die "failed to get trace summary"
   195  
   196  	local count=$(echo "${trace_summary}" | wc -l)
   197  
   198  	[ -z "$count" ] && count=0
   199  
   200  	echo "$count"
   201  }
   202  
   203  # Returns status from Jaeger web UI
   204  get_jaeger_status()
   205  {
   206  	local service="$1"
   207  	local logdir="$2"
   208  
   209  	[ -z "$service" ] && die "need jaeger service name"
   210  	[ -z "$logdir" ] && die "need logdir"
   211  
   212  	local status=""
   213  	local span_count=0
   214  
   215  	# Find spans
   216  	status=$(get_jaeger_traces "$service" || true)
   217  	if [ -n "$status" ]; then
   218  		echo "$status" | tee "$logdir/${service}-status.json"
   219  		span_count=$(get_span_count "$status")
   220  	fi
   221  
   222  	[ -z "$status" ] && die "failed to query Jaeger for status"
   223  	[ "$span_count" -eq 0 ] && die "failed to find any trace spans"
   224  	[ "$span_count" -le 0 ] && die "invalid span count"
   225  
   226  	get_trace_summary "$status" > "$logdir/$trace_summary_file"
   227  }
   228  
   229  # Check Jaeger spans for the specified service.
   230  check_jaeger_output()
   231  {
   232  	local service="$1"
   233  	local min_spans="$2"
   234  	local logdir="$3"
   235  
   236  	[ -z "$service" ] && die "need jaeger service name"
   237  	[ -z "$min_spans" ] && die "need minimum trace span count"
   238  	[ -z "$logdir" ] && die "need logdir"
   239  
   240  	local status
   241  	local errors=0
   242  
   243  	info "Checking Jaeger status"
   244  
   245  	status=$(get_jaeger_status "$service" "$logdir")
   246  
   247  	#------------------------------
   248  	# Basic sanity checks
   249  	[ -z "$status" ] && die "failed to query status via HTTP"
   250  
   251  	local span_lines=$(echo "$status"|jq -S '.data[].spans | length')
   252  	[ -z "$span_lines" ] && die "no span status"
   253  
   254  	# Log the spans to allow for analysis in case the test fails
   255  	echo "$status"|jq -S . > "$logdir/${service}-traces-formatted.json"
   256  
   257  	local span_lines_count=$(echo "$span_lines"|wc -l)
   258  
   259  	# Total up all span counts
   260  	local spans=$(echo "$span_lines"|paste -sd+ -|bc)
   261  	[ -z "$spans" ] && die "no spans"
   262  
   263  	# Ensure total span count is numeric
   264  	echo "$spans"|grep -q "^[0-9][0-9]*$" || die "invalid span count: '$spans'"
   265  
   266  	info "found $spans spans (across $span_lines_count traces)"
   267  
   268  	# Validate
   269  	[ "$spans" -lt "$min_spans" ] && die "expected >= $min_spans spans, got $spans"
   270  
   271  	# Look for common errors in span data
   272  	local error_msg=$(echo "$status"|jq -S . 2>/dev/null|grep "invalid parent span" || true)
   273  
   274  	if [ -n "$error_msg" ]; then
   275  		errors=$((errors+1))
   276  		warn "Found invalid parent span errors: $error_msg"
   277  	else
   278  		errors=$((errors-1))
   279  		[ "$errors" -lt 0 ] && errors=0
   280  	fi
   281  
   282  	# Crude but it works
   283  	error_or_warning_msgs=$(echo "$status" |\
   284  		jq -S . 2>/dev/null |\
   285  		jq '.data[].spans[].warnings' |\
   286  		grep -E -v "\<null\>" |\
   287  		grep -E -v "\[" |\
   288  		grep -E -v "\]" |\
   289  		grep -E -v "clock skew" || true) # ignore clock skew error
   290  
   291  	if [ -n "$error_or_warning_msgs" ]; then
   292  		errors=$((errors+1))
   293  		warn "Found errors/warnings: $error_or_warning_msgs"
   294  	else
   295  		errors=$((errors-1))
   296  		[ "$errors" -lt 0 ] && errors=0
   297  	fi
   298  
   299  	[ "$errors" -eq 0 ] || die "errors detected"
   300  }
   301  
   302  # Check output for spans in span_ordering_data
   303  check_spans()
   304  {
   305  	local logdir="$1"
   306  	[ -z "$logdir" ] && die "need logdir"
   307  
   308  	local errors=0
   309  
   310  	# Check for existence of spans in output so we do not do the more
   311  	# time consuming test of checking span ordering if it will fail
   312  	info "Checking spans: ${span_ordering_data[@]}"
   313  	local missing_spans=()
   314  	for span_ordering in "${span_ordering_data[@]}"; do
   315  		local test_spans=(`echo $span_ordering | tr ':' ' '`)
   316  		for s in "${test_spans[@]}"; do
   317  			grep -q \'$s\' "$logdir/$trace_summary_file" || missing_spans+=( "$s" )
   318  		done
   319  	done
   320  	if [ "${#missing_spans[@]}" -gt 0 ]; then
   321  	       die "Fail: Missing spans: ${missing_spans[@]}"
   322  	fi
   323  
   324  	# Check relative ordering of spans. We are not checking full trace, just
   325  	# that known calls are not out of order based on the test input.
   326  	for span_ordering in "${span_ordering_data[@]}"; do # runs maximum length of span_ordering_data
   327  		local test_spans=(`echo $span_ordering | tr ':' ' '`)
   328  
   329  		# create array for span IDs that match span string
   330  		local span_ids=()
   331  		for span in "${test_spans[@]}"; do
   332  			grep -q \'$span\' "$logdir/$trace_summary_file" || die "Fail: Missing span: $span"
   333  			id=$(cat "$logdir/$formatted_traces_file" | jq ".data[].spans[] | select(.operationName==\"$span\") | .spanID") || die "Fail: error with span $span retrieved from traces"
   334  			id_formatted=$(echo $id | tr -d '\"' | tr '\n' ':') # format to a string for parsing later, not an array
   335  			span_ids+=("$id_formatted")
   336  		done
   337  
   338  		# We now have 2 parallel arrays where test_spans[n] is the string name and
   339  		# span_ids[n] has all possible span IDs for that string separated by a colon
   340  
   341  		# Since functions can be called multiple times, we may have multiple results
   342  		# for span IDs.
   343  		initial_span_ids=(`echo ${span_ids[0]} | tr ':' ' '`)
   344  		for initial in "${initial_span_ids[@]}"; do # test parents for all initial spans
   345  			# construct array of all parents of first span
   346  			local retrieved_spans=()
   347  			local current_span="$initial"
   348  			[ "$current_span" != "" ] || break
   349  
   350  			MAX_DEPTH=20 # to prevent infinite loop due to unforeseen errors
   351  			for i in `seq 1 $MAX_DEPTH`; do
   352  				retrieved_spans+=("$current_span")
   353  				current_span=$(cat "$logdir/$formatted_traces_file" | jq ".data[].spans[] | select(.spanID==\"$current_span\") | .references[].spanID") || die "Fail: error with current_span $current_span retrieved from formatted traces"
   354  				[ "$current_span" != "" ] || break
   355  				current_span=$(echo $current_span | tr -d '"')
   356  				[ $i -lt $MAX_DEPTH ] || die "Fail: max depth reached, error in jq or adjust test depth"
   357  			done
   358  
   359  			# Keep track of this index so we can ensure we are testing the constructed array in order
   360  			# Increment when there is a match between test case and constructed path
   361  			local retrieved_ids_index=0
   362  
   363  			local matches=0
   364  			local index=0
   365  
   366  			# TODO: Optimize
   367  			for ((index=0; index<${#span_ids[@]}; index++)); do
   368  				for ((r_index=$retrieved_ids_index; r_index<${#retrieved_spans[@]}; r_index++)); do
   369  					grep -q "${retrieved_spans[$r_index]}" <<< ${span_ids[$index]} && (( retrieved_ids_index=$r_index+1 )) && (( matches+=1 )) && break
   370  				done
   371  			done
   372  
   373  			local last_initial_span_index=${#initial_span_ids[@]}-1
   374  			if [ $matches -eq ${#span_ids[@]} ]; then
   375  				info "Pass: spans \"${test_spans[@]}\" found in jaeger output"
   376  				break
   377  			elif [ $matches -lt ${#span_ids[@]} ] && [ "$initial" = "${initial_span_ids[$last_initial_span_index]}" ]; then
   378  				die "Fail: spans \"${test_spans[@]}\" NOT in jaeger output"
   379  			fi
   380  			# else repeat test for next initial span ID
   381  		done
   382  	done
   383  
   384  
   385  }
   386  
   387  run_trace_forwarder()
   388  {
   389  	source "$HOME/.cargo/env"
   390  	command -v "$forwarder_binary_name" &>/dev/null || (cd "$forwarder_dir" && cargo install --locked --path .)
   391  
   392  	if [ $KATA_HYPERVISOR = "qemu" ]; then
   393  		tmux new-session -d -s "$KATA_TMUX_FORWARDER_SESSION" "$forwarder_binary_name -l trace"
   394  	elif [ $KATA_HYPERVISOR = "cloud-hypervisor" ]; then
   395  		# CLH uses hybrid VSOCK which uses a local UNIX socket that we need to specify
   396  		socket_path_template=$socket_path_prefix$(sudo kata-runtime env --json | jq '.Hypervisor.SocketPath')
   397  		socket_path=$(echo "$socket_path_template" | sed "s/{ID}/${container_id}/g" | tr -d '"')
   398  		sudo mkdir -p $(dirname "$socket_path")
   399  
   400  		tmux new-session -d -s "$KATA_TMUX_FORWARDER_SESSION" "$forwarder_binary_name -l trace --socket-path $socket_path"
   401  	else
   402  		die "Unsupported hypervisor $KATA_HYPERVISOR"
   403  	fi
   404  
   405  	info "Verifying trace forwarder in tmux session $KATA_TMUX_FORWARDER_SESSION"
   406  
   407  	local cmd="tmux capture-pane -pt $KATA_TMUX_FORWARDER_SESSION | tr -d '\n' | tr -d '\"' | grep -q \"source:kata-trace-forwarder\""
   408  	waitForProcess 10 1 "$cmd"
   409  }
   410  
   411  kill_trace_forwarder()
   412  {
   413  	tmux kill-session -t "$KATA_TMUX_FORWARDER_SESSION"
   414  }
   415  
   416  setup()
   417  {
   418  	# containerd must be running in order to use ctr to generate traces
   419  	restart_containerd_service
   420  
   421  	local cmds=()
   422  	# For container manager (containerd)
   423  	cmds+=('ctr')
   424  	# For jaeger
   425  	cmds+=('docker')
   426  	# For launching processes
   427  	cmds+=('tmux')
   428  
   429  	local cmd
   430  	for cmd in "${cmds[@]}"
   431          do
   432                  local result
   433                  result=$(command -v "$cmd" || true)
   434                  [ -n "$result" ] || die "need $cmd"
   435          done
   436  
   437  	run_trace_forwarder
   438  
   439  	start_jaeger
   440  
   441  	"${SCRIPT_PATH}/../../.ci/configure_tracing_for_kata.sh" enable
   442  }
   443  
   444  run_test()
   445  {
   446  	local service="$1"
   447  	local min_spans="$2"
   448  	local logdir="$3"
   449  
   450  	[ -z "$service" ] && die "need service name"
   451  	[ -z "$min_spans" ] && die "need minimum span count"
   452  	[ -z "$logdir" ] && die "need logdir"
   453  
   454  	info "Running test for service '$service'"
   455  
   456  	logdir="$logdir/$service"
   457  	mkdir -p "$logdir"
   458  
   459  	check_jaeger_output "$service" "$min_spans" "$logdir"
   460  	check_spans "$logdir"
   461  
   462  	info "test passed"
   463  }
   464  
   465  run_tests()
   466  {
   467  	# List of services to check
   468  	#
   469  	# Format: "name:min-spans"
   470  	#
   471  	# Where:
   472  	#
   473  	# - 'name' is the Jaeger service name.
   474  	# - 'min-spans' is an integer representing the minimum number of
   475  	#   trace spans this service should generate.
   476  	#
   477  	# Notes:
   478  	#
   479  	# - Uses an array to ensure predictable ordering.
   480  	# - All services listed are expected to generate traces
   481  	#   when create_traces() is called a single time.
   482  	local -a services
   483  
   484  	services+=("kata:125")
   485  
   486  	create_traces
   487  
   488  	logdir=$(mktemp -d)
   489  
   490  	for service in "${services[@]}"
   491  	do
   492  		local name=$(echo "${service}"|cut -d: -f1)
   493  		local min_spans=$(echo "${service}"|cut -d: -f2)
   494  
   495  		run_test "${name}" "${min_spans}" "${logdir}"
   496  	done
   497  
   498  	info "all tests passed"
   499  	success="true"
   500  }
   501  
   502  usage()
   503  {
   504  	cat <<EOF
   505  
   506  Usage: $script_name [<command>]
   507  
   508  Commands:
   509  
   510    clean  - Perform cleanup phase only.
   511    help   - Show usage.
   512    run    - Only run tests (no setup or cleanup).
   513    setup  - Perform setup phase only.
   514  
   515  Environment variables:
   516  
   517    CI    - if set, save logs of all tests to ${TRACE_LOG_DIR}.
   518    DEBUG - if set, enable tracing and do not cleanup after tests.
   519    DEBUG_KEEP_JAEGER - if set, do not shut down the Jaeger service.
   520    DEBUG_KEEP_FORWARDER - if set, do not shut down the trace forwarder.
   521  
   522  Notes:
   523    - Runs all test phases if no arguments are specified.
   524  
   525  EOF
   526  }
   527  
   528  main()
   529  {
   530  	local cmd="${1:-}"
   531  
   532  	source /etc/os-release || source /usr/lib/os-release
   533  
   534          # Only run on Ubuntu LTS for now
   535          [ "${ID:-}" = ubuntu ] && grep -q LTS <<< "${VERSION:-}" || {
   536                  info "Exiting as not running on Ubuntu LTS"
   537                  exit 0
   538          }
   539  
   540  	# Only run on x86_64 for now
   541          [ "$(uname -m)" != "x86_64" ] && {
   542                  info "Exiting, only run on x86_64"
   543                  exit 0
   544          }
   545  
   546          [ "${CI_JOB:-}" = "METRICS" ] && {
   547                  info "Exiting as not running on metrics CI"
   548                  exit 0
   549          }
   550  
   551  	[ "${CI_JOB:-}" = "CRI_CONTAINERD_K8S_DEVMAPPER" ] && {
   552  		info "Exiting to limit number of jobs that run tests related to tracing."
   553  		exit 0
   554  	}
   555  
   556  	case "$cmd" in
   557  		clean) success="true"; cleanup; exit 0;;
   558  		help|-h|-help|--help) usage; exit 0;;
   559  		run) run_tests; exit 0;;
   560  		setup) setup; exit 0;;
   561  	esac
   562  
   563  	trap cleanup EXIT
   564  
   565  	setup
   566  
   567  	run_tests
   568  }
   569  
   570  main "$@"