#!/usr/bin/env bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Call this to dump all master and node logs into the folder specified in $1
# (defaults to _artifacts). Only works if the provider supports SSH.

set -o errexit
set -o nounset
set -o pipefail

readonly local_report_dir="${1:-_artifacts}"
report_dir=""
readonly gcs_artifacts_dir="${2:-}"
readonly logexporter_namespace="${3:-logexporter}"

# In order to more trivially extend log-dump for custom deployments,
# check for a function named log_dump_custom_get_instances. If it's
# defined, we assume the function can be called with one argument, the
# role, which is either "master" or "node". A sketch of such a hook
# appears after this check.
echo 'Checking for custom logdump instances, if any'
if [[ $(type -t log_dump_custom_get_instances) == "function" ]]; then
  readonly use_custom_instance_list=yes
else
  readonly use_custom_instance_list=
fi
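
# A minimal sketch of such a hook, for illustration only (the names are
# hypothetical; real deployments define this before this script runs):
#
#   function log_dump_custom_get_instances() {
#     local -r role="$1"  # either "master" or "node"
#     # Print one SSH-able instance name per line.
#     if [[ "${role}" == "master" ]]; then
#       echo "my-cluster-master"
#     else
#       printf '%s\n' "my-cluster-node-1" "my-cluster-node-2"
#     fi
#   }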

readonly master_ssh_supported_providers="gce aws"
readonly node_ssh_supported_providers="gce gke aws"
readonly gcloud_supported_providers="gce gke"

readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log cloud-controller-manager.log kube-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov"
readonly node_systemd_services="node-problem-detector"
readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
readonly aws_logfiles="cloud-init-output.log"
readonly gce_logfiles="startupscript.log"
readonly kern_logfile="kern.log"
readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
readonly extra_log_files="${LOG_DUMP_EXTRA_FILES:-}"
readonly extra_systemd_services="${LOG_DUMP_SAVE_SERVICES:-}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"

# Root directory for Kubernetes files on Windows nodes.
WINDOWS_K8S_DIR="C:\\etc\\kubernetes"
# Directory where Kubernetes log files will be stored on Windows nodes.
export WINDOWS_LOGS_DIR="${WINDOWS_K8S_DIR}\\logs"
# Log files found in WINDOWS_LOGS_DIR on Windows nodes (declared as an array,
# since save-windows-logs-via-ssh iterates over it element by element):
readonly windows_node_logfiles=("kubelet.log" "kube-proxy.log" "docker.log" "docker_images.log" "csi-proxy.log")
# Log files found in other directories on Windows nodes:
readonly windows_node_otherfiles=("C:\\Windows\\MEMORY.dmp")

# Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters.
readonly max_dump_processes=25

# Indicator variable for whether we experienced a significant failure during
# logexporter creation or execution.
logexporter_failed=0

# Percentage of nodes that must be logexported successfully (otherwise the
# process will exit with a non-zero exit code).
readonly log_dump_expected_success_percentage="${LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE:-0}"

# Example: kube::util::trap_add 'echo "in trap DEBUG"' DEBUG
# See: http://stackoverflow.com/questions/3338030/multiple-bash-traps-for-the-same-signal
kube::util::trap_add() {
  local trap_add_cmd
  trap_add_cmd=$1
  shift

  for trap_add_name in "$@"; do
    local existing_cmd
    local new_cmd

    # Grab the currently defined trap commands for this trap.
    existing_cmd=$(trap -p "${trap_add_name}" | awk -F"'" '{print $2}')

    if [[ -z "${existing_cmd}" ]]; then
      new_cmd="${trap_add_cmd}"
    else
      new_cmd="${trap_add_cmd};${existing_cmd}"
    fi

    # Assign the trap. Disable the shellcheck warning that trap commands
    # should be single quoted to avoid evaluating them at this point
    # instead of at run time. The logic of adding new commands to a
    # single trap requires them to be evaluated right away.
    # shellcheck disable=SC2064
    trap "${new_cmd}" "${trap_add_name}"
  done
}

# Opposite of kube::util::ensure-temp-dir()
kube::util::cleanup-temp-dir() {
  rm -rf "${KUBE_TEMP}"
}

# Create a temp dir that'll be deleted at the end of this bash session.
#
# Vars set:
#   KUBE_TEMP
kube::util::ensure-temp-dir() {
  if [[ -z ${KUBE_TEMP-} ]]; then
    KUBE_TEMP=$(mktemp -d 2>/dev/null || mktemp -d -t kubernetes.XXXXXX)
    kube::util::trap_add kube::util::cleanup-temp-dir EXIT
  fi
}
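
# How the two helpers compose, for illustration (a sketch, not executed here):
#
#   kube::util::trap_add 'echo cleanup-a' EXIT
#   kube::util::ensure-temp-dir   # registers cleanup-temp-dir on EXIT
#   trap -p EXIT   # -> trap -- 'kube::util::cleanup-temp-dir;echo cleanup-a' EXIT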

# Use the gcloud defaults to find the project. If it is already set in the
# environment then go with that.
#
# Vars set:
#   PROJECT
#   NETWORK_PROJECT
#   PROJECT_REPORTED
function detect-project() {
  if [[ -z "${PROJECT-}" ]]; then
    PROJECT=$(gcloud config list project --format 'value(core.project)')
  fi

  NETWORK_PROJECT=${NETWORK_PROJECT:-${PROJECT}}

  if [[ -z "${PROJECT-}" ]]; then
    echo "Could not detect Google Cloud Platform project. Set the default project using " >&2
    echo "'gcloud config set project <PROJECT>'" >&2
    exit 1
  fi
  if [[ -z "${PROJECT_REPORTED-}" ]]; then
    echo "Project: ${PROJECT}" >&2
    echo "Network Project: ${NETWORK_PROJECT}" >&2
    echo "Zone: ${ZONE}" >&2
    PROJECT_REPORTED=true
  fi
}

# Detect Linux and Windows nodes in the cluster.
#
# If a custom get-instances function has been set, this function will use it
# to set the NODE_NAMES array.
#
# Otherwise this function will attempt to detect the nodes based on the GCP
# instance group information. If Windows nodes are present they will be
# detected separately. The following arrays will be set:
#   NODE_NAMES
#   INSTANCE_GROUPS
#   WINDOWS_NODE_NAMES
#   WINDOWS_INSTANCE_GROUPS
function detect-node-names() {
  NODE_NAMES=()
  INSTANCE_GROUPS=()
  WINDOWS_INSTANCE_GROUPS=()
  WINDOWS_NODE_NAMES=()

  if [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Detecting node names using log_dump_custom_get_instances() function'
    while IFS='' read -r line; do NODE_NAMES+=("$line"); done < <(log_dump_custom_get_instances node)
    echo "NODE_NAMES=${NODE_NAMES[*]:-}" >&2
    return
  fi

  if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "gcloud not supported for ${KUBERNETES_PROVIDER}, can't detect node names"
    return
  fi

  # These prefixes must not be prefixes of each other, so that they can be used to
  # detect mutually exclusive sets of nodes.
  local -r NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-minion"}
  local -r WINDOWS_NODE_INSTANCE_PREFIX=${WINDOWS_NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-windows-node"}
  detect-project
  echo 'Detecting nodes in the cluster'
  INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \
    --project "${PROJECT}" \
    --filter "name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
    --format='value(name)' || true))
  WINDOWS_INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \
    --project "${PROJECT}" \
    --filter "name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
    --format='value(name)' || true))

  if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
    for group in "${INSTANCE_GROUPS[@]}"; do
      NODE_NAMES+=($(gcloud compute instance-groups managed list-instances \
        "${group}" --zone "${ZONE}" --project "${PROJECT}" \
        --format='value(name)'))
    done
  fi
  # Add heapster node name to the list too (if it exists).
  if [[ -n "${HEAPSTER_MACHINE_TYPE:-}" ]]; then
    NODE_NAMES+=("${NODE_INSTANCE_PREFIX}-heapster")
  fi
  if [[ -n "${WINDOWS_INSTANCE_GROUPS[@]:-}" ]]; then
    for group in "${WINDOWS_INSTANCE_GROUPS[@]}"; do
      WINDOWS_NODE_NAMES+=($(gcloud compute instance-groups managed \
        list-instances "${group}" --zone "${ZONE}" --project "${PROJECT}" \
        --format='value(name)'))
    done
  fi

  echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]:-}" >&2
  echo "NODE_NAMES=${NODE_NAMES[*]:-}" >&2
  echo "WINDOWS_INSTANCE_GROUPS=${WINDOWS_INSTANCE_GROUPS[*]:-}" >&2
  echo "WINDOWS_NODE_NAMES=${WINDOWS_NODE_NAMES[*]:-}" >&2
}
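
# For illustration (hypothetical names): with INSTANCE_PREFIX=e2e-test, the
# defaults above match managed instance groups such as "e2e-test-minion-group"
# and "e2e-test-windows-node-group", and NODE_NAMES then collects their member
# instances, e.g. "e2e-test-minion-group-abcd".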

# Detect the IP for the master.
#
# Assumed vars:
#   MASTER_NAME
#   ZONE
#   REGION
# Vars set:
#   KUBE_MASTER
#   KUBE_MASTER_IP
function detect-master() {
  detect-project
  KUBE_MASTER=${MASTER_NAME}
  echo "Trying to find master named '${MASTER_NAME}'" >&2
  if [[ -z "${KUBE_MASTER_IP-}" ]]; then
    local master_address_name="${MASTER_NAME}-ip"
    echo "Looking for address '${master_address_name}'" >&2
    if ! KUBE_MASTER_IP=$(gcloud compute addresses describe "${master_address_name}" \
          --project "${PROJECT}" --region "${REGION}" -q --format='value(address)') || \
        [[ -z "${KUBE_MASTER_IP-}" ]]; then
      echo "Could not detect Kubernetes master node. Make sure you've launched a cluster with 'kube-up.sh'" >&2
      exit 1
    fi
  fi
  if [[ -z "${KUBE_MASTER_INTERNAL_IP-}" ]] && [[ ${GCE_PRIVATE_CLUSTER:-} == "true" ]]; then
    local master_address_name="${MASTER_NAME}-internal-ip"
    echo "Looking for address '${master_address_name}'" >&2
    if ! KUBE_MASTER_INTERNAL_IP=$(gcloud compute addresses describe "${master_address_name}" \
          --project "${PROJECT}" --region "${REGION}" -q --format='value(address)') || \
        [[ -z "${KUBE_MASTER_INTERNAL_IP-}" ]]; then
      echo "Could not detect Kubernetes master node. Make sure you've launched a cluster with 'kube-up.sh'" >&2
      exit 1
    fi
  fi
  echo "Using master: $KUBE_MASTER (external IP: $KUBE_MASTER_IP; internal IP: ${KUBE_MASTER_INTERNAL_IP:-(not set)})" >&2
}

# Set up provider-specific variables (project, zone, master name), or verify
# that the SSH settings required by a custom instance list are present.
function setup() {
  if [[ -z "${use_custom_instance_list}" ]]; then
    echo "Using gce provider, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
    ZONE="${KUBE_GCE_ZONE:-us-central1-b}"
    REGION="${ZONE%-*}"
    INSTANCE_PREFIX="${KUBE_GCE_INSTANCE_PREFIX:-kubernetes}"
    CLUSTER_NAME="${CLUSTER_NAME:-${INSTANCE_PREFIX}}"
    MASTER_NAME="${INSTANCE_PREFIX}-master"
    GCE_PRIVATE_CLUSTER="${KUBE_GCE_PRIVATE_CLUSTER:-false}"
    detect-project 2>&1
  elif [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    NUM_NODES=${NUM_NODES:-3}
    echo "Using 'use_custom_instance_list' with gke, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
  elif [[ -z "${LOG_DUMP_SSH_KEY:-}" ]]; then
    echo 'LOG_DUMP_SSH_KEY not set, but required when using log_dump_custom_get_instances'
    exit 1
  elif [[ -z "${LOG_DUMP_SSH_USER:-}" ]]; then
    echo 'LOG_DUMP_SSH_USER not set, but required when using log_dump_custom_get_instances'
    exit 1
  fi
}

# SSH to a node by name ($1) and run a command ($2).
function log-dump-ssh() {
  local host="$1"
  local cmd="$2"

  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    # Warm up the connection first: retry a no-op up to 5 times, since fresh
    # instances may take a while to accept SSH.
    for (( i=0; i<5; i++)); do
      if gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --ssh-flag="-o ConnectTimeout=30" --project "${PROJECT}" --zone="${ZONE}" "${host}" --command "echo test > /dev/null"; then
        break
      fi
      sleep 5
    done
    # Then actually try the command.
    gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --ssh-flag="-o ConnectTimeout=30" --project "${PROJECT}" --zone="${ZONE}" "${host}" --command "${cmd}"
    return
  fi

  ssh -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${host}" "${cmd}"
}
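
# Example (as used in save-logs below):
#   log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true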

# Copy all files /var/log/{$3}.log on node $1 into local dir $2.
# $3 should be a string array of file names.
# This function shouldn't ever trigger errexit, but doesn't block stderr.
function copy-logs-from-node() {
  local -r node="${1}"
  local -r dir="${2}"
  shift
  shift
  local files=("$@")
  # Append "*"
  # The * at the end is needed to also copy rotated logs (which happens
  # in large clusters and long runs).
  files=( "${files[@]/%/*}" )
  # Prepend "/var/log/"
  files=( "${files[@]/#/\/var\/log\/}" )
  # Comma delimit (even the singleton, or scp does the wrong thing), surround by braces.
  local -r scp_files="{$(printf "%s," "${files[@]}")}"

  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    # get-serial-port-output lets you ask for ports 1-4, but currently (11/21/2016) only port 1 contains useful information.
    gcloud compute instances get-serial-port-output --project "${PROJECT}" --zone "${ZONE}" --port 1 "${node}" > "${dir}/serial-1.log" || true
    # FIXME(dims): a bug in gcloud prevents multiple source files specified using curly braces, so we just loop through for now.
    for single_file in "${files[@]}"; do
      # gcloud scp doesn't work very well when trying to fetch constantly changing files such as logs, as it sometimes blocks forever.
      # We set ConnectTimeout to 5s to avoid blocking for the default of 2m (tested on 2023-11-17).
      gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" --command "tar -zcvf - ${single_file}" -- -o ConnectTimeout=5 | tar -zxf - --strip-components=2 -C "${dir}" || true
    done
  elif [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
    local ip
    ip=$(get_ssh_hostname "${node}")
    scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" "${SSH_USER}@${ip}:${scp_files}" "${dir}" > /dev/null || true
  elif [[ -n "${use_custom_instance_list}" ]]; then
    scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${node}:${scp_files}" "${dir}" > /dev/null || true
  else
    echo "Unknown cloud-provider '${KUBERNETES_PROVIDER}' and use_custom_instance_list is unset too - skipping logdump for '${node}'"
  fi
}
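
# For illustration, with files=(kube-proxy.log fluentd.log) the expansions
# above produce (note the trailing comma from printf):
#   after appending "*":   kube-proxy.log* fluentd.log*
#   after prepending:      /var/log/kube-proxy.log* /var/log/fluentd.log*
#   scp_files:             {/var/log/kube-proxy.log*,/var/log/fluentd.log*,}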

# Save logs for node $1 into directory $2. Pass in any non-common files in $3.
# Pass in any non-common systemd services in $4.
# $3 and $4 should be space-separated lists (of files and services, respectively).
# Set $5 to true to indicate it is on master. Defaults to false.
# This function shouldn't ever trigger errexit.
function save-logs() {
  local -r node_name="${1}"
  local -r dir="${2}"
  local files=()
  IFS=' ' read -r -a files <<< "$3"
  local opt_systemd_services="${4:-""}"
  local on_master="${5:-"false"}"

  local extra=()
  IFS=' ' read -r -a extra <<< "$extra_log_files"
  files+=("${extra[@]}")
  if [[ -n "${use_custom_instance_list}" ]]; then
    if [[ -n "${LOG_DUMP_SAVE_LOGS:-}" ]]; then
      local dump=()
      IFS=' ' read -r -a dump <<< "${LOG_DUMP_SAVE_LOGS:-}"
      files+=("${dump[@]}")
    fi
  else
    local providerlogs=()
    case "${KUBERNETES_PROVIDER}" in
      gce|gke)
        IFS=' ' read -r -a providerlogs <<< "${gce_logfiles}"
        ;;
      aws)
        IFS=' ' read -r -a providerlogs <<< "${aws_logfiles}"
        ;;
    esac
    files+=("${providerlogs[@]}")
  fi
  local services
  read -r -a services <<< "${systemd_services} ${opt_systemd_services} ${extra_systemd_services}"

  if log-dump-ssh "${node_name}" "command -v journalctl" &> /dev/null; then
    if [[ "${on_master}" == "true" ]]; then
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-installation.service" > "${dir}/kube-master-installation.log" || true
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-configuration.service" > "${dir}/kube-master-configuration.log" || true
    else
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-installation.service" > "${dir}/kube-node-installation.log" || true
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-configuration.service" > "${dir}/kube-node-configuration.log" || true
    fi
    log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true

    for svc in "${services[@]}"; do
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u ${svc}.service" > "${dir}/${svc}.log" || true
    done

    if [[ "$dump_systemd_journal" == "true" ]]; then
      log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true
    fi
  else
    local tmpfiles=()
    for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do
      IFS=' ' read -r -a tmpfiles <<< "$f"
      files+=("${tmpfiles[@]}")
    done
  fi

  # Try dumping coverage profiles, if it looks like coverage is enabled in the first place.
  if log-dump-ssh "${node_name}" "stat /var/log/kubelet.cov" &> /dev/null; then
    if log-dump-ssh "${node_name}" "command -v docker" &> /dev/null; then
      if [[ "${on_master}" == "true" ]]; then
        run-in-docker-container "${node_name}" "kube-apiserver" "cat /tmp/k8s-kube-apiserver.cov" > "${dir}/kube-apiserver.cov" || true
        run-in-docker-container "${node_name}" "kube-scheduler" "cat /tmp/k8s-kube-scheduler.cov" > "${dir}/kube-scheduler.cov" || true
        run-in-docker-container "${node_name}" "kube-controller-manager" "cat /tmp/k8s-kube-controller-manager.cov" > "${dir}/kube-controller-manager.cov" || true
      else
        run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true
      fi
    else
      echo 'Coverage profiles seem to exist, but cannot be retrieved from inside containers.'
    fi
  fi

  echo 'Changing logfiles to be world-readable for download'
  log-dump-ssh "${node_name}" "sudo chmod -R a+r /var/log" || true

  echo "Copying '${files[*]}' from ${node_name}"
  copy-logs-from-node "${node_name}" "${dir}" "${files[@]}"
}
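
# Example (as invoked from dump_masters below):
#   save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true"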

# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
# on node $1.
function export-windows-docker-event-log() {
  local -r node="${1}"

  local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker.log'\""

  # Retry up to 3 times to allow ssh keys to be properly propagated and
  # stored.
  for retry in {1..3}; do
    if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
      --command "$powershell_cmd"; then
      break
    else
      sleep 10
    fi
  done
}
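
# For illustration, after shell expansion the remote command above looks like:
#   powershell.exe -Command "$logs=$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); $logs | Out-File -FilePath 'C:\etc\kubernetes\logs\docker.log'"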

# Saves the list of prepulled Windows Docker images to ${WINDOWS_LOGS_DIR}\docker_images.log
# on node $1.
function export-windows-docker-images-list() {
  local -r node="${1}"

  local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\""

  # Retry up to 3 times to allow ssh keys to be properly propagated and
  # stored.
  for retry in {1..3}; do
    if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
      --command "$powershell_cmd"; then
      break
    else
      sleep 10
    fi
  done
}

# Saves log files from the diagnostics tool
# (https://github.com/GoogleCloudPlatform/compute-image-tools/tree/master/cli_tools/diagnostics).
function save-windows-logs-via-diagnostics-tool() {
  local node="${1}"
  local dest_dir="${2}"

  gcloud compute instances add-metadata "${node}" --metadata enable-diagnostics=true --project="${PROJECT}" --zone="${ZONE}"
  local logs_archive_in_gcs
  logs_archive_in_gcs=$(gcloud alpha compute diagnose export-logs "${node}" "--zone=${ZONE}" "--project=${PROJECT}" | tail -n 1)
  local temp_local_path="${node}.zip"
  for retry in {1..20}; do
    if gsutil mv "${logs_archive_in_gcs}" "${temp_local_path}" > /dev/null 2>&1; then
      echo "Downloaded diagnostics log from ${logs_archive_in_gcs}"
      break
    else
      sleep 10
    fi
  done

  if [[ -f "${temp_local_path}" ]]; then
    unzip "${temp_local_path}" -d "${dest_dir}" > /dev/null
    rm -f "${temp_local_path}"
  fi
}

# Saves log files via SSH.
function save-windows-logs-via-ssh() {
  local node="${1}"
  local dest_dir="${2}"

  export-windows-docker-event-log "${node}"
  export-windows-docker-images-list "${node}"

  local remote_files=()
  for file in "${windows_node_logfiles[@]}"; do
    remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
  done
  remote_files+=( "${windows_node_otherfiles[@]}" )

  # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
  # same time.
  for remote_file in "${remote_files[@]}"; do
    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute scp --recurse --project "${PROJECT}" \
        --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
        > /dev/null; then
        break
      else
        sleep 10
      fi
    done
  done
}

# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
function save-logs-windows() {
  local -r node="${1}"
  local -r dest_dir="${2}"

  if [[ ! "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
    return
  fi

  if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    save-windows-logs-via-diagnostics-tool "${node}" "${dest_dir}"
  else
    save-windows-logs-via-ssh "${node}" "${dest_dir}"
  fi

  # Serial port 1 contains the Windows console output.
  gcloud compute instances get-serial-port-output --project "${PROJECT}" \
    --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
}

# Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution.
function run-in-docker-container() {
  local node_name="$1"
  local container="$2"
  shift 2
  log-dump-ssh "${node_name}" "docker exec \"\$(docker ps -f label=io.kubernetes.container.name=${container} --format \"{{.ID}}\")\" $*"
}
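
# Example (as used in save-logs above):
#   run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true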

function dump_masters() {
  local master_names=()
  if [[ -n "${use_custom_instance_list}" ]]; then
    while IFS='' read -r line; do master_names+=("$line"); done < <(log_dump_custom_get_instances master)
  elif [[ ! "${master_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Master SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  elif [[ -n "${KUBEMARK_MASTER_NAME:-}" ]]; then
    master_names=( "${KUBEMARK_MASTER_NAME}" )
  else
    if ! (detect-master); then
      echo 'Master not detected. Is the cluster up?'
      return
    fi
    master_names=( "${MASTER_NAME}" )
  fi

  if [[ "${#master_names[@]}" == 0 ]]; then
    echo 'No masters found?'
    return
  fi

  proc=${max_dump_processes}
  for master_name in "${master_names[@]}"; do
    master_dir="${report_dir}/${master_name}"
    mkdir -p "${master_dir}"
    save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}

# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
# specified via $1 or $use_custom_instance_list. If not specified then the nodes
# to dump logs for will be detected using detect-node-names(); if Windows nodes
# are present then they will be detected and their logs will be dumped too.
function dump_nodes() {
  local node_names=()
  local windows_node_names=()
  if [[ -n "${1:-}" ]]; then
    echo 'Dumping logs for nodes provided as args to dump_nodes() function'
    node_names=( "$@" )
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
    if [[ -n "${NODE_NAMES:-}" ]]; then
      node_names=( "${NODE_NAMES[@]}" )
    fi
    if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
      windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
    fi
  fi

  if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
    echo 'No nodes found!'
    return
  fi

  node_logfiles_all="${node_logfiles}"
  if [[ "${ENABLE_HOLLOW_NODE_LOGS:-}" == "true" ]]; then
    node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
  fi

  linux_nodes_selected_for_logs=()
  if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
    # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs.
    for index in $(shuf -i 0-$(( ${#node_names[*]} - 1 )) -n "${LOGDUMP_ONLY_N_RANDOM_NODES}"); do
      linux_nodes_selected_for_logs+=("${node_names[$index]}")
    done
  else
    linux_nodes_selected_for_logs=( "${node_names[@]}" )
  fi
  all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
  all_selected_nodes+=( "${windows_node_names[@]}" )

  proc=${max_dump_processes}
  start="$(date +%s)"
  # log_dump_ssh_timeout_seconds is the maximal number of seconds the log
  # dumping over SSH operation can take. Please note that the logic enforcing
  # the timeout is only best effort. The actual time of the operation may be
  # longer due to waiting for all the child processes below.
  log_dump_ssh_timeout_seconds="${LOG_DUMP_SSH_TIMEOUT_SECONDS:-}"
  for i in "${!all_selected_nodes[@]}"; do
    node_name="${all_selected_nodes[$i]}"
    node_dir="${report_dir}/${node_name}"
    mkdir -p "${node_dir}"
    if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
      # Save logs in the background. This speeds things up when there are
      # many nodes.
      save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
    else
      save-logs-windows "${node_name}" "${node_dir}" &
    fi

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
      now="$(date +%s)"
      if [[ -n "${log_dump_ssh_timeout_seconds}" && $((now - start)) -gt ${log_dump_ssh_timeout_seconds} ]]; then
        echo "WARNING: Hit timeout after ${log_dump_ssh_timeout_seconds} seconds, finishing log dumping over SSH shortly"
        break
      fi
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}
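
# Example (hypothetical values): dump only 50 random Linux nodes and cap the
# SSH dumping phase at roughly 30 minutes:
#   LOGDUMP_ONLY_N_RANDOM_NODES=50 LOG_DUMP_SSH_TIMEOUT_SECONDS=1800 ./log-dump.sh /tmp/artifacts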

# Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2), as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (it just takes ~1s for 5k nodes).
#
# Assumes:
#   NODE_NAMES
# Sets:
#   NON_LOGEXPORTED_NODES
function find_non_logexported_nodes() {
  local file="${gcs_artifacts_dir}/logexported-nodes-registry"
  echo "Listing marker files ($file) for successful nodes..."
  succeeded_nodes=$(gsutil ls "${file}") || return 1
  echo 'Successfully listed marker files for successful nodes'
  NON_LOGEXPORTED_NODES=()
  for node in "${NODE_NAMES[@]}"; do
    if [[ ! "${succeeded_nodes}" =~ ${node} ]]; then
      NON_LOGEXPORTED_NODES+=("${node}")
    fi
  done
}
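
# For illustration (hypothetical bucket layout): each node whose logexporter
# succeeded is represented by a marker object named after it, so the listing
# looks like:
#   gs://my-bucket/logs/logexported-nodes-registry/e2e-test-minion-group-abcd
#   gs://my-bucket/logs/logexported-nodes-registry/e2e-test-minion-group-efgh
# Any NODE_NAMES entry missing from the listing is queued for SSH log dumping.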

# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
function dump_nodes_with_logexporter() {
  detect-node-names &> /dev/null

  if [[ -z "${NODE_NAMES:-}" ]]; then
    echo 'No nodes found!'
    return
  fi

  # Obtain parameters required by logexporter.
  if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]]; then
    local -r service_account_credentials="$(base64 "${GOOGLE_APPLICATION_CREDENTIALS}" | tr -d '\n')"
  fi
  local -r cloud_provider="${KUBERNETES_PROVIDER}"
  local -r enable_hollow_node_logs="${ENABLE_HOLLOW_NODE_LOGS:-false}"
  local -r logexport_sleep_seconds="$(( 90 + NUM_NODES / 3 ))"
  if [[ -z "${ZONE_NODE_SELECTOR_DISABLED:-}" ]]; then
    local -r node_selector="${ZONE_NODE_SELECTOR_LABEL:-topology.kubernetes.io/zone}: ${ZONE}"
  fi
  local -r use_application_default_credentials="${LOGEXPORTER_USE_APPLICATION_DEFAULT_CREDENTIALS:-false}"

  # Fill in the parameters in the logexporter daemonset template.
  local -r tmp="${KUBE_TEMP}/logexporter"
  local -r manifest_yaml="${tmp}/logexporter-daemonset.yaml"
  mkdir -p "${tmp}"
  local -r cwd=$(dirname "${BASH_SOURCE[0]}")
  cp "${cwd}/logexporter-daemonset.yaml" "${manifest_yaml}"

  sed -i'' -e "s@{{.NodeSelector}}@${node_selector:-}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.LogexporterNamespace}}@${logexporter_namespace}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ServiceAccountCredentials}}@${service_account_credentials:-}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.CloudProvider}}@${cloud_provider}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.GCSPath}}@${gcs_artifacts_dir}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.EnableHollowNodeLogs}}@${enable_hollow_node_logs}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.DumpSystemdJournal}}@${dump_systemd_journal}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraLogFiles}}@${extra_log_files}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraSystemdServices}}@${extra_systemd_services}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.UseApplicationDefaultCredentials}}@${use_application_default_credentials}@g" "${manifest_yaml}"

  # Create the logexporter namespace, service-account secret and the logexporter daemonset within that namespace.
  if ! kubectl create -f "${manifest_yaml}"; then
    echo 'Failed to create logexporter daemonset; falling back to logdump through SSH'
    kubectl delete namespace "${logexporter_namespace}" || true
    dump_nodes "${NODE_NAMES[@]}"
    logexporter_failed=1
    return
  fi

  # Periodically fetch the list of already logexported nodes to check
  # whether we are already done.
  start="$(date +%s)"
  while true; do
    now="$(date +%s)"
    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
      echo 'Waiting for all nodes to be logexported timed out.'
      break
    fi
    if find_non_logexported_nodes; then
      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
        break
      fi
    fi
    sleep 15
  done

  # Store logs from logexporter pods to allow debugging the log exporting
  # process itself.
  proc=${max_dump_processes}
  kubectl get pods -n "${logexporter_namespace}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | (while read -r pod node; do
    echo "Fetching logs from ${pod} running on ${node}"
    mkdir -p "${report_dir}/${node}"
    kubectl logs -n "${logexporter_namespace}" "${pod}" > "${report_dir}/${node}/${pod}.log" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  # Wait for any remaining processes.
  done; wait)

  # List the registry of marker files (of nodes whose logexporter succeeded) from GCS.
  for retry in {1..10}; do
    if find_non_logexported_nodes; then
      break
    else
      echo "Attempt ${retry} failed to list marker files for successful nodes"
      if [[ "${retry}" == 10 ]]; then
        echo 'Final attempt to list marker files failed; falling back to logdump through SSH'
        # The timeout prevents the test from waiting too long to delete resources
        # and never uploading logs, as happened in https://github.com/kubernetes/kubernetes/issues/111111
        kubectl delete namespace "${logexporter_namespace}" --timeout 15m || true
        dump_nodes "${NODE_NAMES[@]}"
        logexporter_failed=1
        return
      fi
      sleep 2
    fi
  done

  failed_nodes=()
  # The following if is needed, because defaulting for empty arrays
  # seems to treat them as non-empty with a single empty string.
  if [[ -n "${NON_LOGEXPORTED_NODES:-}" ]]; then
    for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
      failed_nodes+=("${node}")
    done
  fi

  # If less than a certain ratio of the nodes got logexported, report an error.
  if [[ $(((${#NODE_NAMES[@]} - ${#failed_nodes[@]}) * 100)) -lt $((${#NODE_NAMES[@]} * log_dump_expected_success_percentage )) ]]; then
    logexporter_failed=1
  fi

  # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
  kubectl get pods --namespace "${logexporter_namespace}" || true
  # The timeout prevents the test from waiting too long to delete resources
  # and never uploading logs, as happened in https://github.com/kubernetes/kubernetes/issues/111111
  kubectl delete namespace "${logexporter_namespace}" --timeout 15m || true
  if [[ "${#failed_nodes[@]}" != 0 ]]; then
    echo -e "Dumping logs through SSH for the following nodes:\n${failed_nodes[*]}"
    dump_nodes "${failed_nodes[@]}"
  fi
}
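
# Worked example of the success-percentage check above (hypothetical numbers):
# with 1000 nodes, 40 SSH fallbacks and LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=97,
# (1000 - 40) * 100 = 96000 is less than 1000 * 97 = 97000, so
# logexporter_failed is set.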

# Writes node information that's available through the gcloud and kubectl API
# surfaces to a nodes/ subdirectory of $report_dir.
function dump_node_info() {
  nodes_dir="${report_dir}/nodes"
  mkdir -p "${nodes_dir}"

  detect-node-names
  if [[ -n "${NODE_NAMES:-}" ]]; then
    printf "%s\n" "${NODE_NAMES[@]}" > "${nodes_dir}/node_names.txt"
  fi
  if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
    printf "%s\n" "${WINDOWS_NODE_NAMES[@]}" > "${nodes_dir}/windows_node_names.txt"
  fi

  # If we are not able to reach the server, just bail out, as the other
  # kubectl calls below would fail anyway (we don't want to error out while
  # collecting logs).
  kubectl version || return 0

  kubectl get nodes -o yaml > "${nodes_dir}/kubectl_get_nodes.yaml"

  api_node_names=()
  api_node_names+=($( kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\tReady="}{@.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | awk '/Ready=True/ {print $1}'))
  if [[ "${#api_node_names[@]}" -le 5 ]]; then
    for node_name in "${api_node_names[@]}"; do
      mkdir -p "${nodes_dir}/${node_name}"
      kubectl get --raw "/api/v1/nodes/${node_name}/proxy/metrics" > "${nodes_dir}/${node_name}/kubelet_metrics.txt"
    done
  fi
}

function detect_node_failures() {
  if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    return
  fi

  detect-node-names
  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
    local all_instance_groups=("${INSTANCE_GROUPS[@]}" "${WINDOWS_INSTANCE_GROUPS[@]}")
  else
    local all_instance_groups=("${INSTANCE_GROUPS[@]}")
  fi

  if [ -z "${all_instance_groups:-}" ]; then
    return
  fi
  for group in "${all_instance_groups[@]}"; do
    local creation_timestamp
    creation_timestamp=$(gcloud compute instance-groups managed describe \
      "${group}" \
      --project "${PROJECT}" \
      --zone "${ZONE}" \
      --format='value(creationTimestamp)')
    echo "Failures for ${group} (if any):"
    gcloud logging read --order=asc \
      --format='table(timestamp,jsonPayload.resource.name,jsonPayload.event_subtype)' \
      --project "${PROJECT}" \
      "resource.type=\"gce_instance\"
      logName=\"projects/${PROJECT}/logs/compute.googleapis.com%2Factivity_log\"
      (jsonPayload.event_subtype=\"compute.instances.hostError\" OR jsonPayload.event_subtype=\"compute.instances.automaticRestart\")
      jsonPayload.resource.name:\"${group}\"
      timestamp >= \"${creation_timestamp}\""
  done
}

function dump_logs() {
  # Copy master logs to the artifacts dir locally (through SSH).
  echo "Dumping logs from master locally to '${report_dir}'"
  dump_masters
  if [[ "${DUMP_ONLY_MASTER_LOGS:-}" == "true" ]]; then
    echo 'Skipping dumping of node logs'
    return
  fi

  # Copy logs from nodes to GCS directly, or to the artifacts dir locally (through SSH).
  if [[ -n "${gcs_artifacts_dir}" ]]; then
    echo "Dumping logs from nodes to GCS directly at '${gcs_artifacts_dir}' using logexporter"
    dump_nodes_with_logexporter
  else
    echo "Dumping logs from nodes locally to '${report_dir}'"
    dump_nodes
  fi
}
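
# Example invocations (hypothetical paths):
#   ./log-dump.sh                                         # dump to ./_artifacts over SSH
#   ./log-dump.sh /tmp/artifacts gs://my-bucket/e2e-logs  # export node logs to GCS via logexporter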

# Unless ${DUMP_TO_GCS_ONLY} == 'true':
# * only logs exported by logexporter will be uploaded to
#   ${gcs_artifacts_dir}
# * other logs (master logs, nodes where logexporter failed) will be
#   fetched locally to ${report_dir}.
# If ${DUMP_TO_GCS_ONLY} == 'true', all logs will be uploaded directly to
# ${gcs_artifacts_dir}.
function main() {
  setup
  kube::util::ensure-temp-dir
  if [[ "${DUMP_TO_GCS_ONLY:-}" == "true" ]] && [[ -n "${gcs_artifacts_dir}" ]]; then
    report_dir="${KUBE_TEMP}/logs"
    mkdir -p "${report_dir}"
    echo "${gcs_artifacts_dir}" > "${local_report_dir}/master-and-node-logs.link.txt"
    echo "Dumping logs temporarily to '${report_dir}'. Will upload to '${gcs_artifacts_dir}' later."
  else
    report_dir="${local_report_dir}"
  fi

  dump_logs
  dump_node_info

  if [[ "${DUMP_TO_GCS_ONLY:-}" == "true" ]] && [[ -n "${gcs_artifacts_dir}" ]]; then
    if [[ "$(ls -A "${report_dir}")" ]]; then
      echo "Uploading '${report_dir}' to '${gcs_artifacts_dir}'"

      if gsutil ls "${gcs_artifacts_dir}" > /dev/null; then
        # If "${gcs_artifacts_dir}" exists, the simple call
        # `gsutil cp -r /tmp/dir/logs ${gcs_artifacts_dir}` would
        # create a subdirectory 'logs' in ${gcs_artifacts_dir}.
        #
        # In that case we instead want to merge its content with the local
        # logs. To do that we use the following trick:
        # * Let's say that ${gcs_artifacts_dir} == 'gs://a/b/c'.
        # * We rename 'logs' to 'c'.
        # * Call `gsutil cp -r /tmp/dir/c gs://a/b/`.
        #
        # A similar pattern is used in bootstrap.py#L409-L416.
        # It is a known issue that gsutil cp behavior is that complex.
        # For more information on this, see:
        # https://cloud.google.com/storage/docs/gsutil/commands/cp#how-names-are-constructed
        remote_dir=$(dirname "${gcs_artifacts_dir}")
        remote_basename=$(basename "${gcs_artifacts_dir}")
        mv "${report_dir}" "${KUBE_TEMP}/${remote_basename}"
        gsutil -m cp -r -c -z log,txt,xml "${KUBE_TEMP}/${remote_basename}" "${remote_dir}"
        rm -rf "${KUBE_TEMP}/${remote_basename}"
      else # ${gcs_artifacts_dir} doesn't exist.
        gsutil -m cp -r -c -z log,txt,xml "${report_dir}" "${gcs_artifacts_dir}"
        rm -rf "${report_dir}"
      fi
    else
      echo "Skipping upload of '${report_dir}' as it's empty."
    fi
  fi

  detect_node_failures
  if [[ ${logexporter_failed} -ne 0 && ${log_dump_expected_success_percentage} -gt 0 ]]; then
    return 1
  fi
}

main