k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/log-dump/log-dump.sh

#!/usr/bin/env bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Call this to dump all master and node logs into the folder specified in $1
# (defaults to _artifacts). Only works if the provider supports SSH.

# TODO(shyamjvs): This script should be moved to test/e2e which is where it ideally belongs.
set -o errexit
set -o nounset
set -o pipefail

readonly report_dir="${1:-_artifacts}"
readonly gcs_artifacts_dir="${2:-}"
readonly logexporter_namespace="${3:-logexporter}"
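
# Example invocation (the GCS bucket below is illustrative, not a default):
#   ./cluster/log-dump/log-dump.sh /tmp/artifacts gs://my-test-bucket/logs logexporter
# With no arguments, logs are dumped over SSH into ./_artifacts.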

# To make it easier to extend log-dump for custom deployments, check for a
# function named log_dump_custom_get_instances. If it's defined, we assume it
# can be called with one argument, the role, which is either "master" or
# "node".
echo 'Checking for custom logdump instances, if any'
if [[ $(type -t log_dump_custom_get_instances) == "function" ]]; then
  readonly use_custom_instance_list=yes
else
  readonly use_custom_instance_list=
fi
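
# A minimal sketch of such a hook (hypothetical; the inventory file path is
# made up) that a custom deployment could define before sourcing this script.
# It must print one instance name per line for the requested role:
#
#   function log_dump_custom_get_instances() {
#     local -r role="$1"   # "master" or "node"
#     cat "/etc/my-cluster/${role}-instances.txt"
#   }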

readonly master_ssh_supported_providers="gce aws"
readonly node_ssh_supported_providers="gce gke aws"
readonly gcloud_supported_providers="gce gke"

readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log kube-controller-manager.log cloud-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov kube-network-policies.log"
readonly node_systemd_services="node-problem-detector"
readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
readonly aws_logfiles="cloud-init-output.log"
readonly gce_logfiles="startupscript.log"
readonly kern_logfile="kern.log"
readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
readonly extra_log_files="${LOG_DUMP_EXTRA_FILES:-}"
readonly extra_systemd_services="${LOG_DUMP_SAVE_SERVICES:-}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
# Log files found in WINDOWS_LOGS_DIR on Windows nodes:
readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log docker_images.log csi-proxy.log"
# Log files found in other directories on Windows nodes:
readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"
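
# Example of tuning the dump through the LOG_DUMP_* environment variables
# above (the values are illustrative, not defaults):
#   LOG_DUMP_SYSTEMD_SERVICES="containerd" \
#   LOG_DUMP_SAVE_SERVICES="my-agent" \
#   LOG_DUMP_EXTRA_FILES="my-daemon.log" \
#   LOG_DUMP_SYSTEMD_JOURNAL=true \
#   ./cluster/log-dump/log-dump.sh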

# Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters.
readonly max_dump_processes=25

# Indicates whether we experienced a significant failure during logexporter
# creation or execution.
logexporter_failed=0

# Percentage of nodes that must be logexported successfully (otherwise the
# process will exit with a non-zero exit code).
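# For example (illustrative value), LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=90
# makes the run exit non-zero when fewer than 90% of the nodes were
# logexported successfully.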
readonly log_dump_expected_success_percentage="${LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE:-0}"

function print-deprecation-note() {
  local -r dashline=$(printf -- '-%.0s' {1..100})
  echo "${dashline}"
  echo "k/k version of the log-dump.sh script is deprecated!"
  echo "Please migrate your test job to use test-infra's repo version of log-dump.sh!"
  echo "Migration steps can be found in the readme file."
  echo "${dashline}"
}

# TODO: Get rid of all the sourcing of bash dependencies eventually.
function setup() {
  KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../..
  if [[ -z "${use_custom_instance_list}" ]]; then
    : "${KUBE_CONFIG_FILE:=config-test.sh}"
    echo 'Sourcing kube-util.sh'
    source "${KUBE_ROOT}/cluster/kube-util.sh"
    echo 'Detecting project'
    detect-project 2>&1
  elif [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    echo "Using 'use_custom_instance_list' with gke, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
    # Source the below script for the ssh-to-node utility function.
    # Hack to save and restore the value of the ZONE env as the script overwrites it.
    local gke_zone="${ZONE:-}"
    source "${KUBE_ROOT}/cluster/gce/util.sh"
    ZONE="${gke_zone}"
  elif [[ -z "${LOG_DUMP_SSH_KEY:-}" ]]; then
    echo 'LOG_DUMP_SSH_KEY not set, but required when using log_dump_custom_get_instances'
    exit 1
  elif [[ -z "${LOG_DUMP_SSH_USER:-}" ]]; then
    echo 'LOG_DUMP_SSH_USER not set, but required when using log_dump_custom_get_instances'
    exit 1
  fi
  source "${KUBE_ROOT}/hack/lib/util.sh"
}

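# Run command $2 on node $1 over SSH. On gcloud-based providers this defers to
# the provider's ssh-to-node helper; otherwise it falls back to a plain ssh
# invocation using LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER.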
function log-dump-ssh() {
  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    ssh-to-node "$@"
    return
  fi

  local host="$1"
  local cmd="$2"

  ssh -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${host}" "${cmd}"
}

# Copy files from /var/log/ on node $1 into local dir $2.
# The remaining arguments ($3 and onward) are the names of the log files to copy.
# This function shouldn't ever trigger errexit, but it does not suppress stderr.
function copy-logs-from-node() {
    local -r node="${1}"
    local -r dir="${2}"
    shift
    shift
    local files=("$@")
    # Append "*"
    # The * at the end is needed to also copy rotated logs (which happens
    # in large clusters and long runs).
    files=( "${files[@]/%/*}" )
    # Prepend "/var/log/"
    files=( "${files[@]/#/\/var\/log\/}" )
    # Comma delimit (even the singleton, or scp does the wrong thing), surround by braces.
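    # e.g. files=("kube-proxy.log" "fluentd.log") yields
    # scp_files="{/var/log/kube-proxy.log*,/var/log/fluentd.log*,}".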
    local -r scp_files="{$(printf "%s," "${files[@]}")}"

    if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
      # get-serial-port-output lets you ask for ports 1-4, but currently (11/21/2016) only port 1 contains useful information
      gcloud compute instances get-serial-port-output --project "${PROJECT}" --zone "${ZONE}" --port 1 "${node}" > "${dir}/serial-1.log" || true
      source_file_args=()
      for single_file in "${files[@]}"; do
        source_file_args+=( "${node}:${single_file}" )
      done
      gcloud compute scp --recurse --project "${PROJECT}" --zone "${ZONE}" "${source_file_args[@]}" "${dir}" > /dev/null || true
    elif [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
      local ip
      ip=$(get_ssh_hostname "${node}")
      scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" "${SSH_USER}@${ip}:${scp_files}" "${dir}" > /dev/null || true
    elif [[ -n "${use_custom_instance_list}" ]]; then
      scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${node}:${scp_files}" "${dir}" > /dev/null || true
    else
      echo "Unknown cloud-provider '${KUBERNETES_PROVIDER}' and use_custom_instance_list is unset too - skipping logdump for '${node}'"
    fi
}

# Save logs for node $1 into directory $2. Pass in any non-common files in $3.
# Pass in any non-common systemd services in $4.
# $3 and $4 should be space-separated lists.
# Set $5 to true to indicate it is on the master. Defaults to false.
# This function shouldn't ever trigger errexit.
function save-logs() {
    local -r node_name="${1}"
    local -r dir="${2}"
    local files=()
    IFS=' ' read -r -a files <<< "$3"
    local opt_systemd_services="${4:-""}"
    local on_master="${5:-"false"}"

    local extra=()
    IFS=' ' read -r -a extra <<< "$extra_log_files"
    files+=("${extra[@]}")
    if [[ -n "${use_custom_instance_list}" ]]; then
      if [[ -n "${LOG_DUMP_SAVE_LOGS:-}" ]]; then
        local dump=()
        IFS=' ' read -r -a dump <<< "${LOG_DUMP_SAVE_LOGS:-}"
        files+=("${dump[@]}")
      fi
    else
      local providerlogs=()
      case "${KUBERNETES_PROVIDER}" in
        gce|gke)
          IFS=' ' read -r -a providerlogs <<< "${gce_logfiles}"
          ;;
        aws)
          IFS=' ' read -r -a providerlogs <<< "${aws_logfiles}"
          ;;
      esac
      files+=("${providerlogs[@]}")
    fi
    local services
    read -r -a services <<< "${systemd_services} ${opt_systemd_services} ${extra_systemd_services}"

    if log-dump-ssh "${node_name}" "command -v journalctl" &> /dev/null; then
        if [[ "${on_master}" == "true" ]]; then
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-installation.service" > "${dir}/kube-master-installation.log" || true
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-configuration.service" > "${dir}/kube-master-configuration.log" || true
        else
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-installation.service" > "${dir}/kube-node-installation.log" || true
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-configuration.service" > "${dir}/kube-node-configuration.log" || true
        fi
        log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true

        for svc in "${services[@]}"; do
            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u ${svc}.service" > "${dir}/${svc}.log" || true
        done

        if [[ "$dump_systemd_journal" == "true" ]]; then
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true
        fi
    else
        local tmpfiles=()
        for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do
            IFS=' ' read -r -a tmpfiles <<< "$f"
            files+=("${tmpfiles[@]}")
        done
    fi

    # log where we pull the images from
    log-dump-ssh "${node_name}" "sudo ctr -n k8s.io images ls" > "${dir}/images-containerd.log" || true
    log-dump-ssh "${node_name}" "sudo docker images --all" > "${dir}/images-docker.log" || true

    # Try dumping coverage profiles, if it looks like coverage is enabled in the first place.
    if log-dump-ssh "${node_name}" "stat /var/log/kubelet.cov" &> /dev/null; then
      if log-dump-ssh "${node_name}" "command -v docker" &> /dev/null; then
        if [[ "${on_master}" == "true" ]]; then
          run-in-docker-container "${node_name}" "kube-apiserver" "cat /tmp/k8s-kube-apiserver.cov" > "${dir}/kube-apiserver.cov" || true
          run-in-docker-container "${node_name}" "kube-scheduler" "cat /tmp/k8s-kube-scheduler.cov" > "${dir}/kube-scheduler.cov" || true
          run-in-docker-container "${node_name}" "kube-controller-manager" "cat /tmp/k8s-kube-controller-manager.cov" > "${dir}/kube-controller-manager.cov" || true
        else
          run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true
        fi
      else
        echo 'Coverage profiles seem to exist, but cannot be retrieved from inside containers.'
      fi
    fi

    echo 'Changing logfiles to be world-readable for download'
    log-dump-ssh "${node_name}" "sudo chmod -R a+r /var/log" || true

    echo "Copying '${files[*]}' from ${node_name}"
    copy-logs-from-node "${node_name}" "${dir}" "${files[@]}"
}

# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
# on node $1.
function export-windows-docker-event-log() {
    local -r node="${1}"

    local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker.log'\""

    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
        --command "$powershell_cmd"; then
        break
      else
        sleep 10
      fi
    done
}

# Saves the list of prepulled Windows Docker images to ${WINDOWS_LOGS_DIR}\docker_images.log
# on node $1.
function export-windows-docker-images-list() {
    local -r node="${1}"

    local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\""

    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
        --command "$powershell_cmd"; then
        break
      else
        sleep 10
      fi
    done
}

# Saves log files collected by the diagnostics tool
# (https://github.com/GoogleCloudPlatform/compute-image-tools/tree/master/cli_tools/diagnostics).
function save-windows-logs-via-diagnostics-tool() {
    local node="${1}"
    local dest_dir="${2}"

    gcloud compute instances add-metadata "${node}" --metadata enable-diagnostics=true --project="${PROJECT}" --zone="${ZONE}"
    local logs_archive_in_gcs
    logs_archive_in_gcs=$(gcloud alpha compute diagnose export-logs "${node}" "--zone=${ZONE}" "--project=${PROJECT}" | tail -n 1)
    local temp_local_path="${node}.zip"
    for retry in {1..20}; do
      if gsutil mv "${logs_archive_in_gcs}" "${temp_local_path}" > /dev/null 2>&1; then
        echo "Downloaded diagnostics log from ${logs_archive_in_gcs}"
        break
      else
        sleep 10
      fi
    done

    if [[ -f "${temp_local_path}" ]]; then
      unzip "${temp_local_path}" -d "${dest_dir}" > /dev/null
      rm -f "${temp_local_path}"
    fi
}

# Saves log files over SSH.
function save-windows-logs-via-ssh() {
    local node="${1}"
    local dest_dir="${2}"

    export-windows-docker-event-log "${node}"
    export-windows-docker-images-list "${node}"

    local remote_files=()
    for file in "${windows_node_logfiles[@]}"; do
      remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
    done
    remote_files+=( "${windows_node_otherfiles[@]}" )

    # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
    # same time.
    for remote_file in "${remote_files[@]}"; do
      # Retry up to 3 times to allow ssh keys to be properly propagated and
      # stored.
      for retry in {1..3}; do
        if gcloud compute scp --recurse --project "${PROJECT}" \
          --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
          > /dev/null; then
          break
        else
          sleep 10
        fi
      done
    done
}

# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
function save-logs-windows() {
    local -r node="${1}"
    local -r dest_dir="${2}"

    if [[ ! "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
      echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
      return
    fi

    if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
      save-windows-logs-via-diagnostics-tool "${node}" "${dest_dir}"
    else
      save-windows-logs-via-ssh "${node}" "${dest_dir}"
    fi

    # Serial port 1 contains the Windows console output.
    gcloud compute instances get-serial-port-output --project "${PROJECT}" \
      --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
}

# Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution.
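# Example (as used for the coverage dumps above):
#   run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov"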
function run-in-docker-container() {
  local node_name="$1"
  local container="$2"
  shift 2
  log-dump-ssh "${node_name}" "docker exec \"\$(docker ps -f label=io.kubernetes.container.name=${container} --format \"{{.ID}}\")\" $*"
}

function dump_masters() {
  local master_names=()
  if [[ -n "${use_custom_instance_list}" ]]; then
    while IFS='' read -r line; do master_names+=("$line"); done < <(log_dump_custom_get_instances master)
  elif [[ ! "${master_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Master SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  elif [[ -n "${KUBEMARK_MASTER_NAME:-}" ]]; then
    master_names=( "${KUBEMARK_MASTER_NAME}" )
  else
    if ! (detect-master); then
      echo 'Master not detected. Is the cluster up?'
      return
    fi
    master_names=( "${MASTER_NAME}" )
  fi

  if [[ "${#master_names[@]}" == 0 ]]; then
    echo 'No masters found?'
    return
  fi

  proc=${max_dump_processes}
  for master_name in "${master_names[@]}"; do
    master_dir="${report_dir}/${master_name}"
    mkdir -p "${master_dir}"
    save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}

# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
# specified via $1 or $use_custom_instance_list. If not specified then the nodes
# to dump logs for will be detected using detect-node-names(); if Windows nodes
# are present then they will be detected and their logs will be dumped too.
function dump_nodes() {
  local node_names=()
  local windows_node_names=()
  if [[ -n "${1:-}" ]]; then
    echo 'Dumping logs for nodes provided as args to dump_nodes() function'
    node_names=( "$@" )
  elif [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    while IFS='' read -r line; do node_names+=("$line"); done < <(log_dump_custom_get_instances node)
  elif [[ ! "${node_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Node SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
    if [[ -n "${NODE_NAMES:-}" ]]; then
      node_names=( "${NODE_NAMES[@]}" )
    fi
    if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
      windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
    fi
  fi

  if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
    echo 'No nodes found!'
    return
  fi

  node_logfiles_all="${node_logfiles}"
  if [[ "${ENABLE_HOLLOW_NODE_LOGS:-}" == "true" ]]; then
    node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
  fi

  linux_nodes_selected_for_logs=()
  if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
    # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs.
    for index in $(shuf -i 0-$(( ${#node_names[*]} - 1 )) -n "${LOGDUMP_ONLY_N_RANDOM_NODES}")
    do
      linux_nodes_selected_for_logs+=("${node_names[$index]}")
    done
  else
    linux_nodes_selected_for_logs=( "${node_names[@]}" )
  fi
  all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
  all_selected_nodes+=( "${windows_node_names[@]}" )

  proc=${max_dump_processes}
  start="$(date +%s)"
  # log_dump_ssh_timeout_seconds is the maximum number of seconds the log
  # dumping over SSH operation may take. Please note that the logic enforcing
  # the timeout is only a best effort. The actual time of the operation may be
  # longer due to waiting for all the child processes below.
  log_dump_ssh_timeout_seconds="${LOG_DUMP_SSH_TIMEOUT_SECONDS:-}"
  for i in "${!all_selected_nodes[@]}"; do
    node_name="${all_selected_nodes[$i]}"
    node_dir="${report_dir}/${node_name}"
    mkdir -p "${node_dir}"
    if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
      # Save logs in the background. This speeds up things when there are
      # many nodes.
      save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
    else
      save-logs-windows "${node_name}" "${node_dir}" &
    fi

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
      now="$(date +%s)"
      if [[ -n "${log_dump_ssh_timeout_seconds}" && $((now - start)) -gt ${log_dump_ssh_timeout_seconds} ]]; then
        echo "WARNING: Hit timeout after ${log_dump_ssh_timeout_seconds} seconds, finishing log dumping over SSH shortly"
        break
      fi
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}

# Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes:
#   NODE_NAMES
# Sets:
#   NON_LOGEXPORTED_NODES
function find_non_logexported_nodes() {
  local file="${gcs_artifacts_dir}/logexported-nodes-registry"
  echo "Listing marker files ($file) for successful nodes..."
  succeeded_nodes=$(gsutil ls "${file}") || return 1
  echo 'Successfully listed marker files for successful nodes'
  NON_LOGEXPORTED_NODES=()
  for node in "${NODE_NAMES[@]}"; do
    if [[ ! "${succeeded_nodes}" =~ ${node} ]]; then
      NON_LOGEXPORTED_NODES+=("${node}")
    fi
  done
}

# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
function dump_nodes_with_logexporter() {
  if [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    NODE_NAMES=()
    while IFS='' read -r line; do NODE_NAMES+=("$line"); done < <(log_dump_custom_get_instances node)
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
  fi

  if [[ -z "${NODE_NAMES:-}" ]]; then
    echo 'No nodes found!'
    return
  fi

  # Obtain parameters required by logexporter.
  local -r service_account_credentials="$(base64 "${GOOGLE_APPLICATION_CREDENTIALS}" | tr -d '\n')"
  local -r cloud_provider="${KUBERNETES_PROVIDER}"
  local -r enable_hollow_node_logs="${ENABLE_HOLLOW_NODE_LOGS:-false}"
  local -r logexport_sleep_seconds="$(( 90 + NUM_NODES / 3 ))"
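  # e.g. (illustrative) a 5000-node cluster gets 90 + 5000/3 = 1756 seconds to
  # finish exporting before we stop waiting below.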
  if [[ -z "${ZONE_NODE_SELECTOR_DISABLED:-}" ]]; then
    local -r node_selector="${ZONE_NODE_SELECTOR_LABEL:-topology.kubernetes.io/zone}: ${ZONE}"
  fi

  # Fill in the parameters in the logexporter daemonset template.
  local -r tmp="${KUBE_TEMP}/logexporter"
  local -r manifest_yaml="${tmp}/logexporter-daemonset.yaml"
  mkdir -p "${tmp}"
  cp "${KUBE_ROOT}/cluster/log-dump/logexporter-daemonset.yaml" "${manifest_yaml}"

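  # Substitute the {{.*}} placeholders in the template in place; '@' is used as
  # the sed delimiter, which avoids having to escape the '/' characters in
  # paths and label keys.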
  sed -i'' -e "s@{{.NodeSelector}}@${node_selector:-}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.LogexporterNamespace}}@${logexporter_namespace}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ServiceAccountCredentials}}@${service_account_credentials}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.CloudProvider}}@${cloud_provider}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.GCSPath}}@${gcs_artifacts_dir}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.EnableHollowNodeLogs}}@${enable_hollow_node_logs}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.DumpSystemdJournal}}@${dump_systemd_journal}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraLogFiles}}@${extra_log_files}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraSystemdServices}}@${extra_systemd_services}@g" "${manifest_yaml}"

  # Create the logexporter namespace, service-account secret and the logexporter daemonset within that namespace.
  KUBECTL="${KUBE_ROOT}/cluster/kubectl.sh"
  if ! "${KUBECTL}" create -f "${manifest_yaml}"; then
    echo 'Failed to create logexporter daemonset... falling back to logdump through SSH'
    "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
    dump_nodes "${NODE_NAMES[@]}"
    logexporter_failed=1
    return
  fi

  # Periodically fetch the list of already logexported nodes to check
  # whether we are already done.
  start="$(date +%s)"
  while true; do
    now="$(date +%s)"
    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
      echo 'Waiting for all nodes to be logexported timed out.'
      break
    fi
    if find_non_logexported_nodes; then
      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
        break
      fi
    fi
    sleep 15
  done

  # Store logs from logexporter pods to allow debugging the log exporting
  # process itself.
  proc=${max_dump_processes}
  "${KUBECTL}" get pods -n "${logexporter_namespace}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | (while read -r pod node; do
    echo "Fetching logs from ${pod} running on ${node}"
    mkdir -p "${report_dir}/${node}"
    "${KUBECTL}" logs -n "${logexporter_namespace}" "${pod}" > "${report_dir}/${node}/${pod}.log" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  # Wait for any remaining processes.
  done; wait)

  # List registry of marker files (of nodes whose logexporter succeeded) from GCS.
  for retry in {1..10}; do
    if find_non_logexported_nodes; then
      break
    else
      echo "Attempt ${retry} failed to list marker files for successful nodes"
      if [[ "${retry}" == 10 ]]; then
        echo 'Final attempt to list marker files failed... falling back to logdump through SSH'
        "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
        dump_nodes "${NODE_NAMES[@]}"
        logexporter_failed=1
        return
      fi
      sleep 2
    fi
  done

  failed_nodes=()
  # The following if is needed because defaulting for empty arrays
  # seems to treat them as non-empty, with a single empty string.
  if [[ -n "${NON_LOGEXPORTED_NODES:-}" ]]; then
    for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
      failed_nodes+=("${node}")
    done
  fi

  # If the fraction of successfully logexported nodes is below the expected percentage, report an error.
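  # e.g. (illustrative) with 100 nodes and LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=90,
  # more than 10 failed nodes marks the whole run as failed.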
  if [[ $(((${#NODE_NAMES[@]} - ${#failed_nodes[@]}) * 100)) -lt $((${#NODE_NAMES[@]} * log_dump_expected_success_percentage )) ]]; then
    logexporter_failed=1
  fi

  # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
  "${KUBECTL}" get pods --namespace "${logexporter_namespace}" || true
  "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
  if [[ "${#failed_nodes[@]}" != 0 ]]; then
    echo -e "Dumping logs through SSH for the following nodes:\n${failed_nodes[*]}"
    dump_nodes "${failed_nodes[@]}"
  fi
}

function detect_node_failures() {
  if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    return
  fi

  detect-node-names
  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
    local all_instance_groups=("${INSTANCE_GROUPS[@]}" "${WINDOWS_INSTANCE_GROUPS[@]}")
  else
    local all_instance_groups=("${INSTANCE_GROUPS[@]}")
  fi

  if [ -z "${all_instance_groups:-}" ]; then
    return
  fi
  for group in "${all_instance_groups[@]}"; do
    local creation_timestamp
    creation_timestamp=$(gcloud compute instance-groups managed describe \
                         "${group}" \
                         --project "${PROJECT}" \
                         --zone "${ZONE}" \
                         --format='value(creationTimestamp)')
    echo "Failures for ${group} (if any):"
    gcloud logging read --order=asc \
          --format='table(timestamp,jsonPayload.resource.name,jsonPayload.event_subtype)' \
          --project "${PROJECT}" \
          "resource.type=\"gce_instance\"
           logName=\"projects/${PROJECT}/logs/compute.googleapis.com%2Factivity_log\"
           (jsonPayload.event_subtype=\"compute.instances.hostError\" OR jsonPayload.event_subtype=\"compute.instances.automaticRestart\")
           jsonPayload.resource.name:\"${group}\"
           timestamp >= \"${creation_timestamp}\""
  done
}

function main() {
  print-deprecation-note
  setup
  kube::util::ensure-temp-dir
  # Copy master logs to artifacts dir locally (through SSH).
  echo "Dumping logs from master locally to '${report_dir}'"
  dump_masters
  if [[ "${DUMP_ONLY_MASTER_LOGS:-}" == "true" ]]; then
    echo 'Skipping dumping of node logs'
    return
  fi

  # Copy logs from nodes to GCS directly or to artifacts dir locally (through SSH).
  if [[ -n "${gcs_artifacts_dir}" ]]; then
    echo "Dumping logs from nodes to GCS directly at '${gcs_artifacts_dir}' using logexporter"
    dump_nodes_with_logexporter
  else
    echo "Dumping logs from nodes locally to '${report_dir}'"
    dump_nodes
  fi

  detect_node_failures
  if [[ ${logexporter_failed} -ne 0 && ${log_dump_expected_success_percentage} -gt 0 ]]; then
    return 1
  fi
}

main