#!/usr/bin/env bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Call this to dump all master and node logs into the folder specified in $1
# (defaults to _artifacts). Only works if the provider supports SSH.

# TODO(shyamjvs): This script should be moved to test/e2e which is where it ideally belongs.
set -o errexit
set -o nounset
set -o pipefail

readonly report_dir="${1:-_artifacts}"
readonly gcs_artifacts_dir="${2:-}"
readonly logexporter_namespace="${3:-logexporter}"

# To make log-dump easier to extend for custom deployments, check for a
# function named log_dump_custom_get_instances. If it's defined, we assume
# the function can be called with one argument, the role, which is either
# "master" or "node". An illustrative sketch of such a function appears
# after the variable declarations below.
echo 'Checking for custom logdump instances, if any'
if [[ $(type -t log_dump_custom_get_instances) == "function" ]]; then
  readonly use_custom_instance_list=yes
else
  readonly use_custom_instance_list=
fi

readonly master_ssh_supported_providers="gce aws"
readonly node_ssh_supported_providers="gce gke aws"
readonly gcloud_supported_providers="gce gke"

readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log kube-controller-manager.log cloud-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov kube-network-policies.log"
readonly node_systemd_services="node-problem-detector"
readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
readonly aws_logfiles="cloud-init-output.log"
readonly gce_logfiles="startupscript.log"
readonly kern_logfile="kern.log"
readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
readonly extra_log_files="${LOG_DUMP_EXTRA_FILES:-}"
readonly extra_systemd_services="${LOG_DUMP_SAVE_SERVICES:-}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
# Log files found in WINDOWS_LOGS_DIR on Windows nodes:
readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log docker_images.log csi-proxy.log"
# Log files found in other directories on Windows nodes:
readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"
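
# For illustration only: a custom deployment might define the
# log_dump_custom_get_instances hook roughly like this before invoking
# log-dump.sh (a hypothetical sketch; the instance source is
# deployment-specific and not part of this script):
#
#   function log_dump_custom_get_instances() {
#     local -r role="$1"  # "master" or "node"
#     # Print one SSH-reachable instance name or IP per line for the role.
#     cat "/etc/my-deployment/${role}-instances.txt"  # hypothetical source
#   }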

# Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters.
readonly max_dump_processes=25

# Indicates whether we experienced a significant failure during logexporter
# creation or execution.
logexporter_failed=0

# Percentage of nodes that must be logexported successfully (otherwise the
# process will exit with a non-zero exit code).
readonly log_dump_expected_success_percentage="${LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE:-0}"

function print-deprecation-note() {
  local -r dashline=$(printf -- '-%.0s' {1..100})
  echo "${dashline}"
  echo "k/k version of the log-dump.sh script is deprecated!"
  echo "Please migrate your test job to use test-infra's repo version of log-dump.sh!"
  echo "Migration steps can be found in the readme file."
  echo "${dashline}"
}

# TODO: Get rid of all the sourcing of bash dependencies eventually.
function setup() {
  KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../..
  if [[ -z "${use_custom_instance_list}" ]]; then
    : "${KUBE_CONFIG_FILE:=config-test.sh}"
    echo 'Sourcing kube-util.sh'
    source "${KUBE_ROOT}/cluster/kube-util.sh"
    echo 'Detecting project'
    detect-project 2>&1
  elif [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    echo "Using 'use_custom_instance_list' with gke, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
    # Source the below script for the ssh-to-node utility function.
    # Hack to save and restore the value of the ZONE env as the script overwrites it.
    local gke_zone="${ZONE:-}"
    source "${KUBE_ROOT}/cluster/gce/util.sh"
    ZONE="${gke_zone}"
  elif [[ -z "${LOG_DUMP_SSH_KEY:-}" ]]; then
    echo 'LOG_DUMP_SSH_KEY not set, but required when using log_dump_custom_get_instances'
    exit 1
  elif [[ -z "${LOG_DUMP_SSH_USER:-}" ]]; then
    echo 'LOG_DUMP_SSH_USER not set, but required when using log_dump_custom_get_instances'
    exit 1
  fi
  source "${KUBE_ROOT}/hack/lib/util.sh"
}

# Run command $2 on node $1 over SSH, using the provider's ssh-to-node helper
# where available and plain ssh otherwise.
function log-dump-ssh() {
  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    ssh-to-node "$@"
    return
  fi

  local host="$1"
  local cmd="$2"

  ssh -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${host}" "${cmd}"
}
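
# A worked example of the filename munging in copy-logs-from-node below:
# for files=(kubelet.log kube-proxy.log), the suffix and prefix expansions
# plus the printf join produce
#   scp_files="{/var/log/kubelet.log*,/var/log/kube-proxy.log*,}"
# and the remote shell's brace expansion turns that back into per-file globs.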

# Copy files /var/log/<file>* from node $1 into local dir $2.
# Arguments $3 and onward are the file names, relative to /var/log/.
# This function shouldn't ever trigger errexit, but doesn't block stderr.
function copy-logs-from-node() {
  local -r node="${1}"
  local -r dir="${2}"
  shift
  shift
  local files=("$@")
  # Append "*". The * at the end is needed to also copy rotated logs (which
  # happens in large clusters and long runs).
  files=( "${files[@]/%/*}" )
  # Prepend "/var/log/"
  files=( "${files[@]/#/\/var\/log\/}" )
  # Comma delimit (even the singleton, or scp does the wrong thing), surround by braces.
  local -r scp_files="{$(printf "%s," "${files[@]}")}"

  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    # get-serial-port-output lets you ask for ports 1-4, but currently (11/21/2016) only port 1 contains useful information
    gcloud compute instances get-serial-port-output --project "${PROJECT}" --zone "${ZONE}" --port 1 "${node}" > "${dir}/serial-1.log" || true
    source_file_args=()
    for single_file in "${files[@]}"; do
      source_file_args+=( "${node}:${single_file}" )
    done
    gcloud compute scp --recurse --project "${PROJECT}" --zone "${ZONE}" "${source_file_args[@]}" "${dir}" > /dev/null || true
  elif [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
    local ip
    ip=$(get_ssh_hostname "${node}")
    scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" "${SSH_USER}@${ip}:${scp_files}" "${dir}" > /dev/null || true
  elif [[ -n "${use_custom_instance_list}" ]]; then
    scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${node}:${scp_files}" "${dir}" > /dev/null || true
  else
    echo "Unknown cloud-provider '${KUBERNETES_PROVIDER}' and use_custom_instance_list is unset too - skipping logdump for '${node}'"
  fi
}

# Save logs for node $1 into directory $2. Pass any non-common files in $3 and
# any non-common systemd services in $4; both are space-separated lists.
# Set $5 to true to indicate the node is a master. Defaults to false.
# This function shouldn't ever trigger errexit.
function save-logs() {
  local -r node_name="${1}"
  local -r dir="${2}"
  local files=()
  IFS=' ' read -r -a files <<< "$3"
  local opt_systemd_services="${4:-""}"
  local on_master="${5:-"false"}"

  local extra=()
  IFS=' ' read -r -a extra <<< "$extra_log_files"
  files+=("${extra[@]}")
  if [[ -n "${use_custom_instance_list}" ]]; then
    if [[ -n "${LOG_DUMP_SAVE_LOGS:-}" ]]; then
      local dump=()
      IFS=' ' read -r -a dump <<< "${LOG_DUMP_SAVE_LOGS:-}"
      files+=("${dump[@]}")
    fi
  else
    local providerlogs=()
    case "${KUBERNETES_PROVIDER}" in
      gce|gke)
        IFS=' ' read -r -a providerlogs <<< "${gce_logfiles}"
        ;;
      aws)
        IFS=' ' read -r -a providerlogs <<< "${aws_logfiles}"
        ;;
    esac
    files+=("${providerlogs[@]}")
  fi
  local services
  read -r -a services <<< "${systemd_services} ${opt_systemd_services} ${extra_systemd_services}"

  if log-dump-ssh "${node_name}" "command -v journalctl" &> /dev/null; then
    if [[ "${on_master}" == "true" ]]; then
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-installation.service" > "${dir}/kube-master-installation.log" || true
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-configuration.service" > "${dir}/kube-master-configuration.log" || true
    else
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-installation.service" > "${dir}/kube-node-installation.log" || true
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-configuration.service" > "${dir}/kube-node-configuration.log" || true
    fi
    log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true

    for svc in "${services[@]}"; do
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u ${svc}.service" > "${dir}/${svc}.log" || true
    done

    if [[ "$dump_systemd_journal" == "true" ]]; then
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true
    fi
  else
    local tmpfiles=()
    for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do
      IFS=' ' read -r -a tmpfiles <<< "$f"
      files+=("${tmpfiles[@]}")
    done
  fi
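
  # With the defaults above and no extra services configured, the journalctl
  # branch dumps one log per unit: for a regular node, services expands to
  # (kubelet kubelet-monitor kube-container-runtime-monitor docker
  # node-problem-detector), yielding kubelet.log, docker.log, etc. in ${dir}.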
"${dir}/${svc}.log" || true 207 done 208 209 if [[ "$dump_systemd_journal" == "true" ]]; then 210 log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true 211 fi 212 else 213 local tmpfiles=() 214 for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do 215 IFS=' ' read -r -a tmpfiles <<< "$f" 216 files+=("${tmpfiles[@]}") 217 done 218 fi 219 220 # log where we pull the images from 221 log-dump-ssh "${node_name}" "sudo ctr -n k8s.io images ls" > "${dir}/images-containerd.log" || true 222 log-dump-ssh "${node_name}" "sudo docker images --all" > "${dir}/images-docker.log" || true 223 224 # Try dumping coverage profiles, if it looks like coverage is enabled in the first place. 225 if log-dump-ssh "${node_name}" "stat /var/log/kubelet.cov" &> /dev/null; then 226 if log-dump-ssh "${node_name}" "command -v docker" &> /dev/null; then 227 if [[ "${on_master}" == "true" ]]; then 228 run-in-docker-container "${node_name}" "kube-apiserver" "cat /tmp/k8s-kube-apiserver.cov" > "${dir}/kube-apiserver.cov" || true 229 run-in-docker-container "${node_name}" "kube-scheduler" "cat /tmp/k8s-kube-scheduler.cov" > "${dir}/kube-scheduler.cov" || true 230 run-in-docker-container "${node_name}" "kube-controller-manager" "cat /tmp/k8s-kube-controller-manager.cov" > "${dir}/kube-controller-manager.cov" || true 231 else 232 run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true 233 fi 234 else 235 echo 'Coverage profiles seem to exist, but cannot be retrieved from inside containers.' 236 fi 237 fi 238 239 echo 'Changing logfiles to be world-readable for download' 240 log-dump-ssh "${node_name}" "sudo chmod -R a+r /var/log" || true 241 242 echo "Copying '${files[*]}' from ${node_name}" 243 copy-logs-from-node "${node_name}" "${dir}" "${files[@]}" 244 } 245 246 # Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log 247 # on node $1. 248 function export-windows-docker-event-log() { 249 local -r node="${1}" 250 251 local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker.log'\"" 252 253 # Retry up to 3 times to allow ssh keys to be properly propagated and 254 # stored. 255 for retry in {1..3}; do 256 if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \ 257 --command "$powershell_cmd"; then 258 break 259 else 260 sleep 10 261 fi 262 done 263 } 264 265 # Saves prepulled Windows Docker images list to ${WINDOWS_LOGS_DIR}\docker_images.log 266 # on node $1. 267 function export-windows-docker-images-list() { 268 local -r node="${1}" 269 270 local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\"" 271 272 # Retry up to 3 times to allow ssh keys to be properly propagated and 273 # stored. 

# Saves the list of prepulled Windows Docker images to
# ${WINDOWS_LOGS_DIR}\docker_images.log on node $1.
function export-windows-docker-images-list() {
  local -r node="${1}"

  local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\""

  # Retry up to 3 times to allow ssh keys to be properly propagated and
  # stored.
  for retry in {1..3}; do
    if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
      --command "$powershell_cmd"; then
      break
    else
      sleep 10
    fi
  done
}

# Saves log files collected by the diagnostics tool
# (https://github.com/GoogleCloudPlatform/compute-image-tools/tree/master/cli_tools/diagnostics).
function save-windows-logs-via-diagnostics-tool() {
  local node="${1}"
  local dest_dir="${2}"

  gcloud compute instances add-metadata "${node}" --metadata enable-diagnostics=true --project="${PROJECT}" --zone="${ZONE}"
  local logs_archive_in_gcs
  logs_archive_in_gcs=$(gcloud alpha compute diagnose export-logs "${node}" "--zone=${ZONE}" "--project=${PROJECT}" | tail -n 1)
  local temp_local_path="${node}.zip"
  for retry in {1..20}; do
    if gsutil mv "${logs_archive_in_gcs}" "${temp_local_path}" > /dev/null 2>&1; then
      echo "Downloaded diagnostics log from ${logs_archive_in_gcs}"
      break
    else
      sleep 10
    fi
  done

  if [[ -f "${temp_local_path}" ]]; then
    unzip "${temp_local_path}" -d "${dest_dir}" > /dev/null
    rm -f "${temp_local_path}"
  fi
}

# Saves log files from node $1 into local directory $2 over SSH.
function save-windows-logs-via-ssh() {
  local node="${1}"
  local dest_dir="${2}"

  export-windows-docker-event-log "${node}"
  export-windows-docker-images-list "${node}"

  local remote_files=()
  for file in "${windows_node_logfiles[@]}"; do
    remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
  done
  remote_files+=( "${windows_node_otherfiles[@]}" )

  # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
  # same time.
  for remote_file in "${remote_files[@]}"; do
    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute scp --recurse --project "${PROJECT}" \
        --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
        > /dev/null; then
        break
      else
        sleep 10
      fi
    done
  done
}

# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
function save-logs-windows() {
  local -r node="${1}"
  local -r dest_dir="${2}"

  if [[ ! "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
    return
  fi

  if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    save-windows-logs-via-diagnostics-tool "${node}" "${dest_dir}"
  else
    save-windows-logs-via-ssh "${node}" "${dest_dir}"
  fi

  # Serial port 1 contains the Windows console output.
  gcloud compute instances get-serial-port-output --project "${PROJECT}" \
    --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
}

# Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution.
function run-in-docker-container() {
  local node_name="$1"
  local container="$2"
  shift 2
  log-dump-ssh "${node_name}" "docker exec \"\$(docker ps -f label=io.kubernetes.container.name=${container} --format \"{{.ID}}\")\" $*"
}
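
# Example invocation (with a hypothetical node name), mirroring how save-logs
# above retrieves a coverage profile from the kube-proxy container:
#
#   run-in-docker-container "e2e-test-node-1" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov"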
"${master_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then 376 echo "Master SSH not supported for ${KUBERNETES_PROVIDER}" 377 return 378 elif [[ -n "${KUBEMARK_MASTER_NAME:-}" ]]; then 379 master_names=( "${KUBEMARK_MASTER_NAME}" ) 380 else 381 if ! (detect-master); then 382 echo 'Master not detected. Is the cluster up?' 383 return 384 fi 385 master_names=( "${MASTER_NAME}" ) 386 fi 387 388 if [[ "${#master_names[@]}" == 0 ]]; then 389 echo 'No masters found?' 390 return 391 fi 392 393 proc=${max_dump_processes} 394 for master_name in "${master_names[@]}"; do 395 master_dir="${report_dir}/${master_name}" 396 mkdir -p "${master_dir}" 397 save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true" & 398 399 # We don't want to run more than ${max_dump_processes} at a time, so 400 # wait once we hit that many nodes. This isn't ideal, since one might 401 # take much longer than the others, but it should help. 402 proc=$((proc - 1)) 403 if [[ proc -eq 0 ]]; then 404 proc=${max_dump_processes} 405 wait 406 fi 407 done 408 # Wait for any remaining processes. 409 if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then 410 wait 411 fi 412 } 413 414 # Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be 415 # specified via $1 or $use_custom_instance_list. If not specified then the nodes 416 # to dump logs for will be detected using detect-node-names(); if Windows nodes 417 # are present then they will be detected and their logs will be dumped too. 418 function dump_nodes() { 419 local node_names=() 420 local windows_node_names=() 421 if [[ -n "${1:-}" ]]; then 422 echo 'Dumping logs for nodes provided as args to dump_nodes() function' 423 node_names=( "$@" ) 424 elif [[ -n "${use_custom_instance_list}" ]]; then 425 echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function' 426 while IFS='' read -r line; do node_names+=("$line"); done < <(log_dump_custom_get_instances node) 427 elif [[ ! "${node_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then 428 echo "Node SSH not supported for ${KUBERNETES_PROVIDER}" 429 return 430 else 431 echo 'Detecting nodes in the cluster' 432 detect-node-names &> /dev/null 433 if [[ -n "${NODE_NAMES:-}" ]]; then 434 node_names=( "${NODE_NAMES[@]}" ) 435 fi 436 if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then 437 windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" ) 438 fi 439 fi 440 441 if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then 442 echo 'No nodes found!' 443 return 444 fi 445 446 node_logfiles_all="${node_logfiles}" 447 if [[ "${ENABLE_HOLLOW_NODE_LOGS:-}" == "true" ]]; then 448 node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}" 449 fi 450 451 linux_nodes_selected_for_logs=() 452 if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then 453 # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs. 454 for index in $(shuf -i 0-$(( ${#node_names[*]} - 1 )) -n "${LOGDUMP_ONLY_N_RANDOM_NODES}") 455 do 456 linux_nodes_selected_for_logs+=("${node_names[$index]}") 457 done 458 else 459 linux_nodes_selected_for_logs=( "${node_names[@]}" ) 460 fi 461 all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" ) 462 all_selected_nodes+=( "${windows_node_names[@]}" ) 463 464 proc=${max_dump_processes} 465 start="$(date +%s)" 466 # log_dump_ssh_timeout is the maximal number of seconds the log dumping over 467 # SSH operation can take. Please note that the logic enforcing the timeout 468 # is only a best effort. 

  proc=${max_dump_processes}
  start="$(date +%s)"
  # log_dump_ssh_timeout_seconds is the maximal number of seconds the log
  # dumping over SSH operation can take. Please note that the logic enforcing
  # the timeout is only a best effort. The actual time of the operation may be
  # longer due to waiting for all the child processes below.
  log_dump_ssh_timeout_seconds="${LOG_DUMP_SSH_TIMEOUT_SECONDS:-}"
  for i in "${!all_selected_nodes[@]}"; do
    node_name="${all_selected_nodes[$i]}"
    node_dir="${report_dir}/${node_name}"
    mkdir -p "${node_dir}"
    if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
      # Save logs in the background. This speeds things up when there are
      # many nodes.
      save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
    else
      save-logs-windows "${node_name}" "${node_dir}" &
    fi

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
      now="$(date +%s)"
      if [[ -n "${log_dump_ssh_timeout_seconds}" && $((now - start)) -gt ${log_dump_ssh_timeout_seconds} ]]; then
        echo "WARNING: Hit timeout after ${log_dump_ssh_timeout_seconds} seconds, finishing log dumping over SSH shortly"
        break
      fi
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}

# Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes:
#   NODE_NAMES
# Sets:
#   NON_LOGEXPORTED_NODES
function find_non_logexported_nodes() {
  local file="${gcs_artifacts_dir}/logexported-nodes-registry"
  echo "Listing marker files ($file) for successful nodes..."
  succeeded_nodes=$(gsutil ls "${file}") || return 1
  echo 'Successfully listed marker files for successful nodes'
  NON_LOGEXPORTED_NODES=()
  for node in "${NODE_NAMES[@]}"; do
    if [[ ! "${succeeded_nodes}" =~ ${node} ]]; then
      NON_LOGEXPORTED_NODES+=("${node}")
    fi
  done
}

# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
function dump_nodes_with_logexporter() {
  if [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    NODE_NAMES=()
    while IFS='' read -r line; do NODE_NAMES+=("$line"); done < <(log_dump_custom_get_instances node)
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
  fi

  if [[ -z "${NODE_NAMES:-}" ]]; then
    echo 'No nodes found!'
    return
  fi

  # Obtain parameters required by logexporter.
  local -r service_account_credentials="$(base64 "${GOOGLE_APPLICATION_CREDENTIALS}" | tr -d '\n')"
  local -r cloud_provider="${KUBERNETES_PROVIDER}"
  local -r enable_hollow_node_logs="${ENABLE_HOLLOW_NODE_LOGS:-false}"
  local -r logexport_sleep_seconds="$(( 90 + NUM_NODES / 3 ))"
  if [[ -z "${ZONE_NODE_SELECTOR_DISABLED:-}" ]]; then
    local -r node_selector="${ZONE_NODE_SELECTOR_LABEL:-topology.kubernetes.io/zone}: ${ZONE}"
  fi
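
  # Each sed call below rewrites one {{.Placeholder}} token in a copy of the
  # daemonset manifest. For example, with a hypothetical
  # gcs_artifacts_dir=gs://my-bucket/artifacts, the {{.GCSPath}} substitution
  # points the logexporter uploads at that GCS prefix.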
  # Fill in the parameters in the logexporter daemonset template.
  local -r tmp="${KUBE_TEMP}/logexporter"
  local -r manifest_yaml="${tmp}/logexporter-daemonset.yaml"
  mkdir -p "${tmp}"
  cp "${KUBE_ROOT}/cluster/log-dump/logexporter-daemonset.yaml" "${manifest_yaml}"

  sed -i'' -e "s@{{.NodeSelector}}@${node_selector:-}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.LogexporterNamespace}}@${logexporter_namespace}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ServiceAccountCredentials}}@${service_account_credentials}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.CloudProvider}}@${cloud_provider}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.GCSPath}}@${gcs_artifacts_dir}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.EnableHollowNodeLogs}}@${enable_hollow_node_logs}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.DumpSystemdJournal}}@${dump_systemd_journal}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraLogFiles}}@${extra_log_files}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraSystemdServices}}@${extra_systemd_services}@g" "${manifest_yaml}"

  # Create the logexporter namespace, service-account secret and the logexporter daemonset within that namespace.
  KUBECTL="${KUBE_ROOT}/cluster/kubectl.sh"
  if ! "${KUBECTL}" create -f "${manifest_yaml}"; then
    echo 'Failed to create logexporter daemonset... falling back to logdump through SSH'
    "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
    dump_nodes "${NODE_NAMES[@]}"
    logexporter_failed=1
    return
  fi

  # Periodically fetch the list of already-logexported nodes to check
  # whether we are already done.
  start="$(date +%s)"
  while true; do
    now="$(date +%s)"
    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
      echo 'Waiting for all nodes to be logexported timed out.'
      break
    fi
    if find_non_logexported_nodes; then
      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
        break
      fi
    fi
    sleep 15
  done

  # Store logs from the logexporter pods themselves, to allow debugging the
  # log exporting process.
  proc=${max_dump_processes}
  "${KUBECTL}" get pods -n "${logexporter_namespace}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | (while read -r pod node; do
    echo "Fetching logs from ${pod} running on ${node}"
    mkdir -p "${report_dir}/${node}"
    "${KUBECTL}" logs -n "${logexporter_namespace}" "${pod}" > "${report_dir}/${node}/${pod}.log" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  # Wait for any remaining processes.
  done; wait)

  # List the registry of marker files (for nodes whose logexporter succeeded) from GCS.
  for retry in {1..10}; do
    if find_non_logexported_nodes; then
      break
    else
      echo "Attempt ${retry} failed to list marker files for successful nodes"
      if [[ "${retry}" == 10 ]]; then
        echo 'Final attempt to list marker files failed... falling back to logdump through SSH'
        "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
        dump_nodes "${NODE_NAMES[@]}"
        logexporter_failed=1
        return
      fi
      sleep 2
    fi
  done
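
  # A worked example of the success-ratio check below: with 100 nodes, 7 of
  # them failed, and LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=95, we compare
  # (100 - 7) * 100 = 9300 against 100 * 95 = 9500; since 9300 < 9500, the
  # run is marked as failed.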

  failed_nodes=()
  # The following if is needed because, under nounset, expanding an empty
  # array with "${arr[@]:-}" yields a single empty string rather than nothing.
  if [[ -n "${NON_LOGEXPORTED_NODES:-}" ]]; then
    for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
      failed_nodes+=("${node}")
    done
  fi

  # If less than a certain ratio of the nodes got logexported, report an error.
  if [[ $(((${#NODE_NAMES[@]} - ${#failed_nodes[@]}) * 100)) -lt $((${#NODE_NAMES[@]} * log_dump_expected_success_percentage )) ]]; then
    logexporter_failed=1
  fi

  # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
  "${KUBECTL}" get pods --namespace "${logexporter_namespace}" || true
  "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
  if [[ "${#failed_nodes[@]}" != 0 ]]; then
    echo -e "Dumping logs through SSH for the following nodes:\n${failed_nodes[*]}"
    dump_nodes "${failed_nodes[@]}"
  fi
}

function detect_node_failures() {
  if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    return
  fi

  detect-node-names
  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
    local all_instance_groups=("${INSTANCE_GROUPS[@]}" "${WINDOWS_INSTANCE_GROUPS[@]}")
  else
    local all_instance_groups=("${INSTANCE_GROUPS[@]}")
  fi

  if [ -z "${all_instance_groups:-}" ]; then
    return
  fi
  for group in "${all_instance_groups[@]}"; do
    local creation_timestamp
    creation_timestamp=$(gcloud compute instance-groups managed describe \
      "${group}" \
      --project "${PROJECT}" \
      --zone "${ZONE}" \
      --format='value(creationTimestamp)')
    echo "Failures for ${group} (if any):"
    gcloud logging read --order=asc \
      --format='table(timestamp,jsonPayload.resource.name,jsonPayload.event_subtype)' \
      --project "${PROJECT}" \
      "resource.type=\"gce_instance\"
      logName=\"projects/${PROJECT}/logs/compute.googleapis.com%2Factivity_log\"
      (jsonPayload.event_subtype=\"compute.instances.hostError\" OR jsonPayload.event_subtype=\"compute.instances.automaticRestart\")
      jsonPayload.resource.name:\"${group}\"
      timestamp >= \"${creation_timestamp}\""
  done
}

function main() {
  print-deprecation-note
  setup
  kube::util::ensure-temp-dir
  # Copy master logs to artifacts dir locally (through SSH).
  echo "Dumping logs from master locally to '${report_dir}'"
  dump_masters
  if [[ "${DUMP_ONLY_MASTER_LOGS:-}" == "true" ]]; then
    echo 'Skipping dumping of node logs'
    return
  fi

  # Copy logs from nodes to GCS directly or to artifacts dir locally (through SSH).
  if [[ -n "${gcs_artifacts_dir}" ]]; then
    echo "Dumping logs from nodes to GCS directly at '${gcs_artifacts_dir}' using logexporter"
    dump_nodes_with_logexporter
  else
    echo "Dumping logs from nodes locally to '${report_dir}'"
    dump_nodes
  fi

  detect_node_failures
  if [[ ${logexporter_failed} -ne 0 && ${log_dump_expected_success_percentage} -gt 0 ]]; then
    return 1
  fi
}

main