k8s.io/test-infra@v0.0.0-20240520184403-27c6b4c223d8/logexporter/cluster/log-dump.sh

     1  #!/usr/bin/env bash
     2  
     3  # Copyright 2017 The Kubernetes Authors.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  # Call this to dump all master and node logs into the folder specified in $1
    18  # (defaults to _artifacts). Only works if the provider supports SSH.
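        #
        # Example invocation (hypothetical report directory and GCS bucket, assuming a
        # GCE cluster brought up with kube-up.sh):
        #   KUBERNETES_PROVIDER=gce ./log-dump.sh /tmp/artifacts gs://my-bucket/logs
        # The optional third argument selects the logexporter namespace (default: logexporter).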
    19  
    20  set -o errexit
    21  set -o nounset
    22  set -o pipefail
    23  
    24  readonly local_report_dir="${1:-_artifacts}"
    25  report_dir=""
    26  readonly gcs_artifacts_dir="${2:-}"
    27  readonly logexporter_namespace="${3:-logexporter}"
    28  
    29  # In order to more trivially extend log-dump for custom deployments,
    30  # check for a function named log_dump_custom_get_instances. If it's
    31  # defined, we assume the function can be called with one argument, the
    32  # role, which is either "master" or "node".
    33  echo 'Checking for custom logdump instances, if any'
    34  if [[ $(type -t log_dump_custom_get_instances) == "function" ]]; then
    35    readonly use_custom_instance_list=yes
    36  else
    37    readonly use_custom_instance_list=
    38  fi
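        # A minimal sketch of such a hook (hypothetical instance names). It could be
        # exported with `export -f` by the harness that invokes this script, or defined
        # in a wrapper that sources it:
        #
        #   function log_dump_custom_get_instances() {
        #     local role="$1"   # "master" or "node"
        #     if [[ "${role}" == "master" ]]; then
        #       echo "my-cluster-master-0"
        #     else
        #       printf '%s\n' "my-cluster-node-0" "my-cluster-node-1"
        #     fi
        #   }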
    39  
    40  readonly master_ssh_supported_providers="gce aws"
    41  readonly node_ssh_supported_providers="gce gke aws"
    42  readonly gcloud_supported_providers="gce gke"
    43  
    44  readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log cloud-controller-manager.log kube-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
    45  readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov"
    46  readonly node_systemd_services="node-problem-detector"
    47  readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
    48  readonly aws_logfiles="cloud-init-output.log"
    49  readonly gce_logfiles="startupscript.log"
    50  readonly kern_logfile="kern.log"
    51  readonly initd_logfiles="docker/log"
    52  readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
    53  readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
    54  readonly extra_log_files="${LOG_DUMP_EXTRA_FILES:-}"
    55  readonly extra_systemd_services="${LOG_DUMP_SAVE_SERVICES:-}"
    56  readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
    57  
    58  # Root directory for Kubernetes files on Windows nodes.
    59  WINDOWS_K8S_DIR="C:\\etc\\kubernetes"
    60  # Directory where Kubernetes log files will be stored on Windows nodes.
    61  export WINDOWS_LOGS_DIR="${WINDOWS_K8S_DIR}\\logs"
    62  # Log files found in WINDOWS_LOGS_DIR on Windows nodes:
    63  readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log docker_images.log csi-proxy.log"
    64  # Log files found in other directories on Windows nodes:
    65  readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"
    66  
    67  # Limit the number of concurrent node connections so that we don't run out of
    68  # file descriptors for large clusters.
    69  readonly max_dump_processes=25
    70  
    71  # Indicator variable whether we experienced a significant failure during
    72  # logexporter creation or execution.
    73  logexporter_failed=0
    74  
    75  # Percentage of nodes that must be logexported successfully (otherwise the
    76  # process will exit with a non-zero exit code).
    77  readonly log_dump_expected_success_percentage="${LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE:-0}"
    78  
    79  # Example:  kube::util::trap_add 'echo "in trap DEBUG"' DEBUG
    80  # See: http://stackoverflow.com/questions/3338030/multiple-bash-traps-for-the-same-signal
    81  kube::util::trap_add() {
    82    local trap_add_cmd
    83    trap_add_cmd=$1
    84    shift
    85  
    86    for trap_add_name in "$@"; do
    87      local existing_cmd
    88      local new_cmd
    89  
    90      # Grab the currently defined trap commands for this trap
    91      existing_cmd=$(trap -p "${trap_add_name}" |  awk -F"'" '{print $2}')
    92  
    93      if [[ -z "${existing_cmd}" ]]; then
    94        new_cmd="${trap_add_cmd}"
    95      else
    96        new_cmd="${trap_add_cmd};${existing_cmd}"
    97      fi
    98  
    99      # Assign the trap. Disable the shellcheck warning saying that trap
   100      # commands should be single quoted to avoid evaluating them at this
   101      # point instead of at run time. The logic of adding new commands to a
   102      # single trap requires them to be evaluated right away.
   103      # shellcheck disable=SC2064
   104      trap "${new_cmd}" "${trap_add_name}"
   105    done
   106  }
   107  
   108  # Opposite of kube::util::ensure-temp-dir()
   109  kube::util::cleanup-temp-dir() {
   110    rm -rf "${KUBE_TEMP}"
   111  }
   112  
   113  # Create a temp dir that'll be deleted at the end of this bash session.
   114  #
   115  # Vars set:
   116  #   KUBE_TEMP
   117  kube::util::ensure-temp-dir() {
   118    if [[ -z ${KUBE_TEMP-} ]]; then
   119      KUBE_TEMP=$(mktemp -d 2>/dev/null || mktemp -d -t kubernetes.XXXXXX)
   120      kube::util::trap_add kube::util::cleanup-temp-dir EXIT
   121    fi
   122  }
   123  
   124  # Use the gcloud defaults to find the project.  If it is already set in the
   125  # environment then go with that.
   126  #
   127  # Vars set:
   128  #   PROJECT
   129  #   NETWORK_PROJECT
   130  #   PROJECT_REPORTED
   131  function detect-project() {
   132    if [[ -z "${PROJECT-}" ]]; then
   133      PROJECT=$(gcloud config list project --format 'value(core.project)')
   134    fi
   135  
   136    NETWORK_PROJECT=${NETWORK_PROJECT:-${PROJECT}}
   137  
   138    if [[ -z "${PROJECT-}" ]]; then
   139      echo "Could not detect Google Cloud Platform project.  Set the default project using " >&2
   140      echo "'gcloud config set project <PROJECT>'" >&2
   141      exit 1
   142    fi
   143    if [[ -z "${PROJECT_REPORTED-}" ]]; then
   144      echo "Project: ${PROJECT}" >&2
   145      echo "Network Project: ${NETWORK_PROJECT}" >&2
   146      echo "Zone: ${ZONE}" >&2
   147      PROJECT_REPORTED=true
   148    fi
   149  }
   150  
   151  # Detect Linux and Windows nodes in the cluster.
   152  #
   153  # If a custom get-instances function has been set, this function will use it
   154  # to set the NODE_NAMES array.
   155  #
   156  # Otherwise this function will attempt to detect the nodes based on the GCP
   157  # instance group information. If Windows nodes are present they will be detected
   158  # separately. The following arrays will be set:
   159  #   NODE_NAMES
   160  #   INSTANCE_GROUPS
   161  #   WINDOWS_NODE_NAMES
   162  #   WINDOWS_INSTANCE_GROUPS
   163  function detect-node-names() {
   164    NODE_NAMES=()
   165    INSTANCE_GROUPS=()
   166    WINDOWS_INSTANCE_GROUPS=()
   167    WINDOWS_NODE_NAMES=()
   168  
   169    if [[ -n "${use_custom_instance_list}" ]]; then
   170      echo 'Detecting node names using log_dump_custom_get_instances() function'
   171      while IFS='' read -r line; do NODE_NAMES+=("$line"); done < <(log_dump_custom_get_instances node)
   172      echo "NODE_NAMES=${NODE_NAMES[*]:-}" >&2
   173      return
   174    fi
   175  
   176    if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
   177      echo "gcloud not supported for ${KUBERNETES_PROVIDER}, can't detect node names"
   178      return
   179    fi
   180  
   181    # These prefixes must not be prefixes of each other, so that they can be used to
   182    # detect mutually exclusive sets of nodes.
   183    local -r NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-minion"}
   184    local -r WINDOWS_NODE_INSTANCE_PREFIX=${WINDOWS_NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-windows-node"}
   185    detect-project
   186    echo 'Detecting nodes in the cluster'
   187    INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \
   188      --project "${PROJECT}" \
   189      --filter "name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
   190      --format='value(name)' || true))
   191    WINDOWS_INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \
   192      --project "${PROJECT}" \
   193      --filter "name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
   194      --format='value(name)' || true))
   195  
   196    if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
   197      for group in "${INSTANCE_GROUPS[@]}"; do
   198        NODE_NAMES+=($(gcloud compute instance-groups managed list-instances \
   199          "${group}" --zone "${ZONE}" --project "${PROJECT}" \
   200          --format='value(name)'))
   201      done
   202    fi
   203    # Add heapster node name to the list too (if it exists).
   204    if [[ -n "${HEAPSTER_MACHINE_TYPE:-}" ]]; then
   205      NODE_NAMES+=("${NODE_INSTANCE_PREFIX}-heapster")
   206    fi
   207    if [[ -n "${WINDOWS_INSTANCE_GROUPS[@]:-}" ]]; then
   208      for group in "${WINDOWS_INSTANCE_GROUPS[@]}"; do
   209        WINDOWS_NODE_NAMES+=($(gcloud compute instance-groups managed \
   210          list-instances "${group}" --zone "${ZONE}" --project "${PROJECT}" \
   211          --format='value(name)'))
   212      done
   213    fi
   214  
   215    echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]:-}" >&2
   216    echo "NODE_NAMES=${NODE_NAMES[*]:-}" >&2
   217    echo "WINDOWS_INSTANCE_GROUPS=${WINDOWS_INSTANCE_GROUPS[*]:-}" >&2
   218    echo "WINDOWS_NODE_NAMES=${WINDOWS_NODE_NAMES[*]:-}" >&2
   219  }
   220  
   221  # Detect the IP for the master
   222  #
   223  # Assumed vars:
   224  #   MASTER_NAME
   225  #   ZONE
   226  #   REGION
   227  # Vars set:
   228  #   KUBE_MASTER
   229  #   KUBE_MASTER_IP
   230  function detect-master() {
   231    detect-project
   232    KUBE_MASTER=${MASTER_NAME}
   233    echo "Trying to find master named '${MASTER_NAME}'" >&2
   234    if [[ -z "${KUBE_MASTER_IP-}" ]]; then
   235      local master_address_name="${MASTER_NAME}-ip"
   236      echo "Looking for address '${master_address_name}'" >&2
   237      if ! KUBE_MASTER_IP=$(gcloud compute addresses describe "${master_address_name}" \
   238        --project "${PROJECT}" --region "${REGION}" -q --format='value(address)') || \
   239        [[ -z "${KUBE_MASTER_IP-}" ]]; then
   240        echo "Could not detect Kubernetes master node.  Make sure you've launched a cluster with 'kube-up.sh'" >&2
   241        exit 1
   242      fi
   243    fi
   244    if [[ -z "${KUBE_MASTER_INTERNAL_IP-}" ]] && [[ ${GCE_PRIVATE_CLUSTER:-} == "true" ]]; then
   245        local master_address_name="${MASTER_NAME}-internal-ip"
   246        echo "Looking for address '${master_address_name}'" >&2
   247        if ! KUBE_MASTER_INTERNAL_IP=$(gcloud compute addresses describe "${master_address_name}" \
   248          --project "${PROJECT}" --region "${REGION}" -q --format='value(address)') || \
   249          [[ -z "${KUBE_MASTER_INTERNAL_IP-}" ]]; then
   250          echo "Could not detect Kubernetes master node.  Make sure you've launched a cluster with 'kube-up.sh'" >&2
   251          exit 1
   252        fi
   253    fi
   254    echo "Using master: $KUBE_MASTER (external IP: $KUBE_MASTER_IP; internal IP: ${KUBE_MASTER_INTERNAL_IP:-(not set)})" >&2
   255  }
   256  
   257  # Set up provider/cluster variables (zone, project, instance prefix, ...), or
        # validate the SSH settings required when using a custom instance list.
   258  function setup() {
   259    if [[ -z "${use_custom_instance_list}" ]]; then
   260      echo "Using gce provider, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
   261      ZONE="${KUBE_GCE_ZONE:-us-central1-b}"
   262      REGION="${ZONE%-*}"
   263      INSTANCE_PREFIX="${KUBE_GCE_INSTANCE_PREFIX:-kubernetes}"
   264      CLUSTER_NAME="${CLUSTER_NAME:-${INSTANCE_PREFIX}}"
   265      MASTER_NAME="${INSTANCE_PREFIX}-master"
   266      GCE_PRIVATE_CLUSTER="${KUBE_GCE_PRIVATE_CLUSTER:-false}"
   267      detect-project 2>&1
   268    elif [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
   269      NUM_NODES=${NUM_NODES:-3}
   270      echo "Using 'use_custom_instance_list' with gke, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
   271    elif [[ -z "${LOG_DUMP_SSH_KEY:-}" ]]; then
   272      echo 'LOG_DUMP_SSH_KEY not set, but required when using log_dump_custom_get_instances'
   273      exit 1
   274    elif [[ -z "${LOG_DUMP_SSH_USER:-}" ]]; then
   275      echo 'LOG_DUMP_SSH_USER not set, but required when using log_dump_custom_get_instances'
   276      exit 1
   277    fi
   278  }
   279  
        # SSH to a node by name ($1) and run a command ($2).
   280  function log-dump-ssh() {
   281    local host="$1"
   282    local cmd="$2"
   283  
   284    if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
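            # Probe the connection first (up to 5 attempts, 5s apart) so that transient
            # failures, e.g. while SSH keys are still propagating, don't make the real
            # command below fail outright.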
   285      for (( i=0; i<5; i++)); do
   286        if gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --ssh-flag="-o ConnectTimeout=30" --project "${PROJECT}" --zone="${ZONE}" "${host}" --command "echo test > /dev/null"; then
   287          break
   288        fi
   289        sleep 5
   290      done
   291      # Then actually try the command.
   292      gcloud compute ssh --ssh-flag="-o LogLevel=quiet" --ssh-flag="-o ConnectTimeout=30" --project "${PROJECT}" --zone="${ZONE}" "${host}" --command "${cmd}"
   293      return
   294    fi
   295  
   296    ssh -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${host}" "${cmd}"
   297  }
   298  
   299  # Copy the given log files from /var/log on node $1 into local dir $2.
   300  # The file names are passed as the remaining arguments ($3 and onwards).
   301  # This function shouldn't ever trigger errexit, but doesn't block stderr.
   302  function copy-logs-from-node() {
   303      local -r node="${1}"
   304      local -r dir="${2}"
   305      shift
   306      shift
   307      local files=("$@")
   308      # Append "*"
   309      # The * at the end is needed to also copy rotated logs (which happens
   310      # in large clusters and long runs).
   311      files=( "${files[@]/%/*}" )
   312      # Prepend "/var/log/"
   313      files=( "${files[@]/#/\/var\/log\/}" )
   314      # Comma delimit (even the singleton, or scp does the wrong thing), surround by braces.
   315      local -r scp_files="{$(printf "%s," "${files[@]}")}"
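            # For illustration, files=(kube-proxy.log fluentd.log) would yield
            #   scp_files="{/var/log/kube-proxy.log*,/var/log/fluentd.log*,}"
            # after the prefix/suffix substitutions above.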
   316  
   317      if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
   318        # get-serial-port-output lets you ask for ports 1-4, but currently (11/21/2016) only port 1 contains useful information
   319        gcloud compute instances get-serial-port-output --project "${PROJECT}" --zone "${ZONE}" --port 1 "${node}" > "${dir}/serial-1.log" || true
   320        # FIXME(dims): bug in gcloud prevents multiple source files specified using curly braces, so we just loop through for now
   321        for single_file in "${files[@]}"; do
   322          # gcloud scp doesn't work very well when trying to fetch constantly changing files such as logs, as it blocks forever sometimes.
   323        # We set ConnectTimeout to 5s to avoid blocking for the default timeout (roughly 2m, as observed on 2023-11-17).
   324          gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" --command "tar -zcvf - ${single_file}" -- -o ConnectTimeout=5 | tar -zxf - --strip-components=2 -C "${dir}" || true
   325        done
   326      elif  [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
   327        local ip
   328        ip=$(get_ssh_hostname "${node}")
   329        scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" "${SSH_USER}@${ip}:${scp_files}" "${dir}" > /dev/null || true
   330      elif  [[ -n "${use_custom_instance_list}" ]]; then
   331        scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${node}:${scp_files}" "${dir}" > /dev/null || true
   332      else
   333        echo "Unknown cloud-provider '${KUBERNETES_PROVIDER}' and use_custom_instance_list is unset too - skipping logdump for '${node}'"
   334      fi
   335  }
   336  
   337  # Save logs for node $1 into directory $2. Pass in any non-common files in $3.
   338  # Pass in any non-common systemd services in $4.
   339  # $3 and $4 should each be a space-separated list.
   340  # Set $5 to true to indicate the node is a master. Defaults to false.
   341  # This function shouldn't ever trigger errexit
   342  function save-logs() {
   343      local -r node_name="${1}"
   344      local -r dir="${2}"
   345      local files=()
   346      IFS=' ' read -r -a files <<< "$3"
   347      local opt_systemd_services="${4:-""}"
   348      local on_master="${5:-"false"}"
   349  
   350      local extra=()
   351      IFS=' ' read -r -a extra <<< "$extra_log_files"
   352      files+=("${extra[@]}")
   353      if [[ -n "${use_custom_instance_list}" ]]; then
   354        if [[ -n "${LOG_DUMP_SAVE_LOGS:-}" ]]; then
   355          local dump=()
   356          IFS=' ' read -r -a dump <<< "${LOG_DUMP_SAVE_LOGS:-}"
   357          files+=("${dump[@]}")
   358        fi
   359      else
   360        local providerlogs=()
   361        case "${KUBERNETES_PROVIDER}" in
   362          gce|gke)
   363            IFS=' ' read -r -a providerlogs <<< "${gce_logfiles}"
   364            ;;
   365          aws)
   366            IFS=' ' read -r -a providerlogs <<< "${aws_logfiles}"
   367            ;;
   368        esac
   369        files+=("${providerlogs[@]}")
   370      fi
   371      local services
   372      read -r -a services <<< "${systemd_services} ${opt_systemd_services} ${extra_systemd_services}"
   373  
   374      if log-dump-ssh "${node_name}" "command -v journalctl" &> /dev/null; then
   375          if [[ "${on_master}" == "true" ]]; then
   376            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-installation.service" > "${dir}/kube-master-installation.log" || true
   377            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-configuration.service" > "${dir}/kube-master-configuration.log" || true
   378          else
   379            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-installation.service" > "${dir}/kube-node-installation.log" || true
   380            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-configuration.service" > "${dir}/kube-node-configuration.log" || true
   381          fi
   382          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true
   383  
   384          for svc in "${services[@]}"; do
   385              log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u ${svc}.service" > "${dir}/${svc}.log" || true
   386          done
   387  
   388          if [[ "$dump_systemd_journal" == "true" ]]; then
   389            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true
   390          fi
   391      else
   392          local tmpfiles=()
   393          for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do
   394              IFS=' ' read -r -a tmpfiles <<< "$f"
   395              files+=("${tmpfiles[@]}")
   396          done
   397      fi
   398  
   399      # Try dumping coverage profiles, if it looks like coverage is enabled in the first place.
   400      if log-dump-ssh "${node_name}" "stat /var/log/kubelet.cov" &> /dev/null; then
   401        if log-dump-ssh "${node_name}" "command -v docker" &> /dev/null; then
   402          if [[ "${on_master}" == "true" ]]; then
   403            run-in-docker-container "${node_name}" "kube-apiserver" "cat /tmp/k8s-kube-apiserver.cov" > "${dir}/kube-apiserver.cov" || true
   404            run-in-docker-container "${node_name}" "kube-scheduler" "cat /tmp/k8s-kube-scheduler.cov" > "${dir}/kube-scheduler.cov" || true
   405            run-in-docker-container "${node_name}" "kube-controller-manager" "cat /tmp/k8s-kube-controller-manager.cov" > "${dir}/kube-controller-manager.cov" || true
   406          else
   407            run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true
   408          fi
   409        else
   410          echo 'Coverage profiles seem to exist, but cannot be retrieved from inside containers.'
   411        fi
   412      fi
   413  
   414      echo 'Changing logfiles to be world-readable for download'
   415      log-dump-ssh "${node_name}" "sudo chmod -R a+r /var/log" || true
   416  
   417      echo "Copying '${files[*]}' from ${node_name}"
   418      copy-logs-from-node "${node_name}" "${dir}" "${files[@]}"
   419  }
   420  
   421  # Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
   422  # on node $1.
   423  function export-windows-docker-event-log() {
   424      local -r node="${1}"
   425  
   426      local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker.log'\""
   427  
   428      # Retry up to 3 times to allow ssh keys to be properly propagated and
   429      # stored.
   430      for retry in {1..3}; do
   431        if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
   432          --command "$powershell_cmd"; then
   433          break
   434        else
   435          sleep 10
   436        fi
   437      done
   438  }
   439  
   440  # Saves prepulled Windows Docker images list to ${WINDOWS_LOGS_DIR}\docker_images.log
   441  # on node $1.
   442  function export-windows-docker-images-list() {
   443      local -r node="${1}"
   444  
   445      local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\""
   446  
   447      # Retry up to 3 times to allow ssh keys to be properly propagated and
   448      # stored.
   449      for retry in {1..3}; do
   450        if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
   451          --command "$powershell_cmd"; then
   452          break
   453        else
   454          sleep 10
   455        fi
   456      done
   457  }
   458  
   459  # Saves log files collected by the diagnostics tool (https://github.com/GoogleCloudPlatform/compute-image-tools/tree/master/cli_tools/diagnostics).
   460  function save-windows-logs-via-diagnostics-tool() {
   461      local node="${1}"
   462      local dest_dir="${2}"
   463  
   464      gcloud compute instances add-metadata "${node}" --metadata enable-diagnostics=true --project="${PROJECT}" --zone="${ZONE}"
   465      local logs_archive_in_gcs
   466      logs_archive_in_gcs=$(gcloud alpha compute diagnose export-logs "${node}" "--zone=${ZONE}" "--project=${PROJECT}" | tail -n 1)
   467      local temp_local_path="${node}.zip"
   468      for retry in {1..20}; do
   469        if gsutil mv "${logs_archive_in_gcs}" "${temp_local_path}"  > /dev/null 2>&1; then
   470          echo "Downloaded diagnostics log from ${logs_archive_in_gcs}"
   471          break
   472        else
   473          sleep 10
   474        fi
   475      done
   476  
   477      if [[ -f "${temp_local_path}" ]]; then
   478        unzip "${temp_local_path}" -d "${dest_dir}" > /dev/null
   479        rm -f "${temp_local_path}"
   480      fi
   481  }
   482  
   483  # Saves log files via SSH.
   484  function save-windows-logs-via-ssh() {
   485      local node="${1}"
   486      local dest_dir="${2}"
   487  
   488      export-windows-docker-event-log "${node}"
   489      export-windows-docker-images-list "${node}"
   490  
   491      local remote_files=()
            # ${windows_node_logfiles} is a space-separated string; split it into an
            # array first (expanding the scalar as "${windows_node_logfiles[@]}" would
            # produce a single element containing all of the file names).
            local windows_files=()
            IFS=' ' read -r -a windows_files <<< "${windows_node_logfiles}"
   492      for file in "${windows_files[@]}"; do
   493        remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
   494      done
   495      remote_files+=( "${windows_node_otherfiles[@]}" )
   496  
   497      # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
   498      # same time.
   499      for remote_file in "${remote_files[@]}"; do
   500        # Retry up to 3 times to allow ssh keys to be properly propagated and
   501        # stored.
   502        for retry in {1..3}; do
   503          if gcloud compute scp --recurse --project "${PROJECT}" \
   504            --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
   505            > /dev/null; then
   506            break
   507          else
   508            sleep 10
   509          fi
   510        done
   511      done
   512  }
   513  
   514  # Save log files and serial console output from Windows node $1 into local
   515  # directory $2.
   516  # This function shouldn't ever trigger errexit.
   517  function save-logs-windows() {
   518      local -r node="${1}"
   519      local -r dest_dir="${2}"
   520  
   521      if [[ ! "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
   522        echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
   523        return
   524      fi
   525  
   526      if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
   527        save-windows-logs-via-diagnostics-tool "${node}" "${dest_dir}"
   528      else
   529        save-windows-logs-via-ssh "${node}" "${dest_dir}"
   530      fi
   531  
   532      # Serial port 1 contains the Windows console output.
   533      gcloud compute instances get-serial-port-output --project "${PROJECT}" \
   534        --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
   535  }
   536  
   537  # Execute a command in container $2 on node $1.
   538  # Uses docker because the container may not ordinarily permit direct execution.
   539  function run-in-docker-container() {
   540    local node_name="$1"
   541    local container="$2"
   542    shift 2
   543    log-dump-ssh "${node_name}" "docker exec \"\$(docker ps -f label=io.kubernetes.container.name=${container} --format \"{{.ID}}\")\" $*"
   544  }
   545  
   546  function dump_masters() {
   547    local master_names=()
   548    if [[ -n "${use_custom_instance_list}" ]]; then
   549      while IFS='' read -r line; do master_names+=("$line"); done < <(log_dump_custom_get_instances master)
   550    elif [[ ! "${master_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
   551      echo "Master SSH not supported for ${KUBERNETES_PROVIDER}"
   552      return
   553    elif [[ -n "${KUBEMARK_MASTER_NAME:-}" ]]; then
   554      master_names=( "${KUBEMARK_MASTER_NAME}" )
   555    else
   556      if ! (detect-master); then
   557        echo 'Master not detected. Is the cluster up?'
   558        return
   559      fi
   560      master_names=( "${MASTER_NAME}" )
   561    fi
   562  
   563    if [[ "${#master_names[@]}" == 0 ]]; then
   564      echo 'No masters found?'
   565      return
   566    fi
   567  
   568    proc=${max_dump_processes}
   569    for master_name in "${master_names[@]}"; do
   570      master_dir="${report_dir}/${master_name}"
   571      mkdir -p "${master_dir}"
   572      save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true" &
   573  
   574      # We don't want to run more than ${max_dump_processes} at a time, so
   575      # wait once we hit that many nodes. This isn't ideal, since one might
   576      # take much longer than the others, but it should help.
   577      proc=$((proc - 1))
   578      if [[ proc -eq 0 ]]; then
   579        proc=${max_dump_processes}
   580        wait
   581      fi
   582    done
   583    # Wait for any remaining processes.
   584    if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
   585      wait
   586    fi
   587  }
   588  
   589  # Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
   590  # specified via $1 or $use_custom_instance_list. If not specified then the nodes
   591  # to dump logs for will be detected using detect-node-names(); if Windows nodes
   592  # are present then they will be detected and their logs will be dumped too.
   593  function dump_nodes() {
   594    local node_names=()
   595    local windows_node_names=()
   596    if [[ -n "${1:-}" ]]; then
   597      echo 'Dumping logs for nodes provided as args to dump_nodes() function'
   598      node_names=( "$@" )
   599    else
   600      echo 'Detecting nodes in the cluster'
   601      detect-node-names &> /dev/null
   602      if [[ -n "${NODE_NAMES:-}" ]]; then
   603        node_names=( "${NODE_NAMES[@]}" )
   604      fi
   605      if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
   606        windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
   607      fi
   608    fi
   609  
   610    if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
   611      echo 'No nodes found!'
   612      return
   613    fi
   614  
   615    node_logfiles_all="${node_logfiles}"
   616    if [[ "${ENABLE_HOLLOW_NODE_LOGS:-}" == "true" ]]; then
   617      node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
   618    fi
   619  
   620    linux_nodes_selected_for_logs=()
   621    if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
   622      # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs.
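            # For example, LOGDUMP_ONLY_N_RANDOM_NODES=5 on a 100-node cluster picks 5
            # random indices out of 0..99 via shuf below.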
   623      for index in $(shuf -i 0-$(( ${#node_names[*]} - 1 )) -n "${LOGDUMP_ONLY_N_RANDOM_NODES}")
   624      do
   625        linux_nodes_selected_for_logs+=("${node_names[$index]}")
   626      done
   627    else
   628      linux_nodes_selected_for_logs=( "${node_names[@]}" )
   629    fi
   630    all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
   631    all_selected_nodes+=( "${windows_node_names[@]}" )
   632  
   633    proc=${max_dump_processes}
   634    start="$(date +%s)"
   635    # log_dump_ssh_timeout_seconds is the maximum number of seconds the log dumping
   636    # over SSH may take. Please note that the logic enforcing the timeout is only
   637    # best effort. The actual time of the operation may be longer due to waiting
   638    # for all the child processes below.
   639    log_dump_ssh_timeout_seconds="${LOG_DUMP_SSH_TIMEOUT_SECONDS:-}"
   640    for i in "${!all_selected_nodes[@]}"; do
   641      node_name="${all_selected_nodes[$i]}"
   642      node_dir="${report_dir}/${node_name}"
   643      mkdir -p "${node_dir}"
   644      if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
   645        # Save logs in the background. This speeds up things when there are
   646        # many nodes.
   647        save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
   648      else
   649        save-logs-windows "${node_name}" "${node_dir}" &
   650      fi
   651  
   652      # We don't want to run more than ${max_dump_processes} at a time, so
   653      # wait once we hit that many nodes. This isn't ideal, since one might
   654      # take much longer than the others, but it should help.
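            # (Illustrative count, not from a real run: with max_dump_processes=25 and 60
            # selected nodes, the loop pauses after nodes 25 and 50, and the final wait
            # below reaps the last 10 background jobs.)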
   655      proc=$((proc - 1))
   656      if [[ proc -eq 0 ]]; then
   657        proc=${max_dump_processes}
   658        wait
   659        now="$(date +%s)"
   660        if [[ -n "${log_dump_ssh_timeout_seconds}" && $((now - start)) -gt ${log_dump_ssh_timeout_seconds} ]]; then
   661          echo "WARNING: Hit timeout after ${log_dump_ssh_timeout_seconds} seconds, finishing log dumping over SSH shortly"
   662          break
   663        fi
   664      fi
   665    done
   666    # Wait for any remaining processes.
   667    if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
   668      wait
   669    fi
   670  }
   671  
   672  # Collect names of nodes which didn't run logexporter successfully.
   673  # This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
   674  # does not run on Windows nodes.
   675  #
   676  # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
   677  # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
   678  # Assumes:
   679  #   NODE_NAMES
   680  # Sets:
   681  #   NON_LOGEXPORTED_NODES
   682  function find_non_logexported_nodes() {
   683    local file="${gcs_artifacts_dir}/logexported-nodes-registry"
   684    echo "Listing marker files ($file) for successful nodes..."
   685    succeeded_nodes=$(gsutil ls "${file}") || return 1
   686    echo 'Successfully listed marker files for successful nodes'
   687    NON_LOGEXPORTED_NODES=()
   688    for node in "${NODE_NAMES[@]}"; do
   689      if [[ ! "${succeeded_nodes}" =~ ${node} ]]; then
   690        NON_LOGEXPORTED_NODES+=("${node}")
   691      fi
   692    done
   693  }
   694  
   695  # This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
   696  # does not run on Windows nodes.
   697  function dump_nodes_with_logexporter() {
   698    detect-node-names &> /dev/null
   699  
   700    if [[ -z "${NODE_NAMES:-}" ]]; then
   701      echo 'No nodes found!'
   702      return
   703    fi
   704  
   705    # Obtain parameters required by logexporter.
   706    if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]]; then
   707      local -r service_account_credentials="$(base64 "${GOOGLE_APPLICATION_CREDENTIALS}" | tr -d '\n')"
   708    fi
   709    local -r cloud_provider="${KUBERNETES_PROVIDER}"
   710    local -r enable_hollow_node_logs="${ENABLE_HOLLOW_NODE_LOGS:-false}"
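          # logexport_sleep_seconds bounds how long the polling loop below waits for the
          # logexporter pods to finish; it scales with cluster size, e.g. NUM_NODES=3000
          # gives 90 + 3000/3 = 1090 seconds.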
   711    local -r logexport_sleep_seconds="$(( 90 + NUM_NODES / 3 ))"
   712    if [[ -z "${ZONE_NODE_SELECTOR_DISABLED:-}" ]]; then
   713      local -r node_selector="${ZONE_NODE_SELECTOR_LABEL:-topology.kubernetes.io/zone}: ${ZONE}"
   714    fi
   715    local -r use_application_default_credentials="${LOGEXPORTER_USE_APPLICATION_DEFAULT_CREDENTIALS:-false}"
   716  
   717    # Fill in the parameters in the logexporter daemonset template.
   718    local -r tmp="${KUBE_TEMP}/logexporter"
   719    local -r manifest_yaml="${tmp}/logexporter-daemonset.yaml"
   720    mkdir -p "${tmp}"
   721    local -r cwd=$(dirname "${BASH_SOURCE[0]}")
   722    cp "${cwd}/logexporter-daemonset.yaml" "${manifest_yaml}"
   723  
   724    sed -i'' -e "s@{{.NodeSelector}}@${node_selector:-}@g" "${manifest_yaml}"
   725    sed -i'' -e "s@{{.LogexporterNamespace}}@${logexporter_namespace}@g" "${manifest_yaml}"
   726    sed -i'' -e "s@{{.ServiceAccountCredentials}}@${service_account_credentials:-}@g" "${manifest_yaml}"
   727    sed -i'' -e "s@{{.CloudProvider}}@${cloud_provider}@g" "${manifest_yaml}"
   728    sed -i'' -e "s@{{.GCSPath}}@${gcs_artifacts_dir}@g" "${manifest_yaml}"
   729    sed -i'' -e "s@{{.EnableHollowNodeLogs}}@${enable_hollow_node_logs}@g" "${manifest_yaml}"
   730    sed -i'' -e "s@{{.DumpSystemdJournal}}@${dump_systemd_journal}@g" "${manifest_yaml}"
   731    sed -i'' -e "s@{{.ExtraLogFiles}}@${extra_log_files}@g" "${manifest_yaml}"
   732    sed -i'' -e "s@{{.ExtraSystemdServices}}@${extra_systemd_services}@g" "${manifest_yaml}"
   733    sed -i'' -e "s@{{.UseApplicationDefaultCredentials}}@${use_application_default_credentials}@g" "${manifest_yaml}"
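          # For illustration only (not the actual template contents): a daemonset line
          # such as `value: "{{.CloudProvider}}"` would be rewritten by the substitutions
          # above to `value: "gce"` when KUBERNETES_PROVIDER=gce.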
   734  
   735    # Create the logexporter namespace, service-account secret and the logexporter daemonset within that namespace.
   736    if ! kubectl create -f "${manifest_yaml}"; then
   737      echo 'Failed to create logexporter daemonset.. falling back to logdump through SSH'
   738      kubectl delete namespace "${logexporter_namespace}" || true
   739      dump_nodes "${NODE_NAMES[@]}"
   740      logexporter_failed=1
   741      return
   742    fi
   743  
   744    # Periodically fetch list of already logexported nodes to verify
   745    # if we aren't already done.
   746    start="$(date +%s)"
   747    while true; do
   748      now="$(date +%s)"
   749      if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
   750        echo 'Waiting for all nodes to be logexported timed out.'
   751        break
   752      fi
   753      if find_non_logexported_nodes; then
   754        if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
   755          break
   756        fi
   757      fi
   758      sleep 15
   759    done
   760  
   761    # Store logs from logexporter pods to allow debugging log exporting process
   762    # itself.
   763    proc=${max_dump_processes}
   764    kubectl get pods -n "${logexporter_namespace}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | (while read -r pod node; do
   765      echo "Fetching logs from ${pod} running on ${node}"
   766      mkdir -p "${report_dir}/${node}"
   767      kubectl logs -n "${logexporter_namespace}" "${pod}" > "${report_dir}/${node}/${pod}.log" &
   768  
   769      # We don't want to run more than ${max_dump_processes} at a time, so
   770      # wait once we hit that many nodes. This isn't ideal, since one might
   771      # take much longer than the others, but it should help.
   772      proc=$((proc - 1))
   773      if [[ proc -eq 0 ]]; then
   774        proc=${max_dump_processes}
   775        wait
   776      fi
   777    # Wait for any remaining processes.
   778    done; wait)
   779  
   780    # List registry of marker files (of nodes whose logexporter succeeded) from GCS.
   781    for retry in {1..10}; do
   782      if find_non_logexported_nodes; then
   783        break
   784      else
   785        echo "Attempt ${retry} failed to list marker files for successful nodes"
   786        if [[ "${retry}" == 10 ]]; then
   787          echo 'Final attempt to list marker files failed.. falling back to logdump through SSH'
   788          # Timeout prevents the test waiting too long to delete resources and
   789          # never uploading logs, as happened in https://github.com/kubernetes/kubernetes/issues/111111
   790          kubectl delete namespace "${logexporter_namespace}" --timeout 15m || true
   791          dump_nodes "${NODE_NAMES[@]}"
   792          logexporter_failed=1
   793          return
   794        fi
   795        sleep 2
   796      fi
   797    done
   798  
   799    failed_nodes=()
   800    # The following 'if' is needed because expanding an unset array with a default
   801    # ("${NON_LOGEXPORTED_NODES[@]:-}") yields a single empty string rather than nothing.
   802    if [[ -n "${NON_LOGEXPORTED_NODES:-}" ]]; then
   803      for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
   804        echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
   805        failed_nodes+=("${node}")
   806      done
   807    fi
   808  
   809    # If less than a certain ratio of the nodes got logexported, report an error.
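          # (Worked example: with 100 nodes, 40 failed nodes and
          #  LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=75, the check below is
          #  (100-40)*100 = 6000 < 100*75 = 7500, so logexporter_failed is set.)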
   810    if [[ $(((${#NODE_NAMES[@]} - ${#failed_nodes[@]}) * 100)) -lt $((${#NODE_NAMES[@]} * log_dump_expected_success_percentage )) ]]; then
   811      logexporter_failed=1
   812    fi
   813  
   814    # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
   815    kubectl get pods --namespace "${logexporter_namespace}" || true
   816    # Timeout prevents the test waiting too long to delete resources and
   817    # never uploading logs, as happened in https://github.com/kubernetes/kubernetes/issues/111111
   818    kubectl delete namespace "${logexporter_namespace}" --timeout 15m || true
   819    if [[ "${#failed_nodes[@]}" != 0 ]]; then
   820      echo -e "Dumping logs through SSH for the following nodes:\n${failed_nodes[*]}"
   821      dump_nodes "${failed_nodes[@]}"
   822    fi
   823  }
   824  
   825  # Writes node information that's available through the gcloud and kubectl API
   826  # surfaces to a nodes/ subdirectory of $report_dir.
   827  function dump_node_info() {
   828    nodes_dir="${report_dir}/nodes"
   829    mkdir -p "${nodes_dir}"
   830  
   831    detect-node-names
   832    if [[ -n "${NODE_NAMES:-}" ]]; then
   833      printf "%s\n" "${NODE_NAMES[@]}" > "${nodes_dir}/node_names.txt"
   834    fi
   835    if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
   836      printf "%s\n" "${WINDOWS_NODE_NAMES[@]}" > "${nodes_dir}/windows_node_names.txt"
   837    fi
   838  
   839    # If we are not able to reach the server, just bail out as the other
   840    # kubectl calls below will fail anyway (we don't want to error out collecting logs)
   841    kubectl version || return 0
   842  
   843    kubectl get nodes -o yaml > "${nodes_dir}/kubectl_get_nodes.yaml"
   844  
   845    api_node_names=()
   846    api_node_names+=($( kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\tReady="}{@.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | awk '/Ready=True/ {print $1}'))
   847    if [[ "${#api_node_names[@]}" -le 5 ]]; then
   848      for node_name in "${api_node_names[@]}"; do
   849        mkdir -p "${nodes_dir}/${node_name}"
   850        kubectl get --raw "/api/v1/nodes/${node_name}/proxy/metrics" > "${nodes_dir}/${node_name}/kubelet_metrics.txt"
   851      done
   852    fi
   853  }
   854  
   855  function detect_node_failures() {
   856    if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
   857      return
   858    fi
   859  
   860    detect-node-names
   861    if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
   862      local all_instance_groups=("${INSTANCE_GROUPS[@]}" "${WINDOWS_INSTANCE_GROUPS[@]}")
   863    else
   864      local all_instance_groups=("${INSTANCE_GROUPS[@]}")
   865    fi
   866  
   867    if [ -z "${all_instance_groups:-}" ]; then
   868      return
   869    fi
   870    for group in "${all_instance_groups[@]}"; do
   871      local creation_timestamp
   872      creation_timestamp=$(gcloud compute instance-groups managed describe \
   873                           "${group}" \
   874                           --project "${PROJECT}" \
   875                           --zone "${ZONE}" \
   876                           --format='value(creationTimestamp)')
   877      echo "Failures for ${group} (if any):"
   878      gcloud logging read --order=asc \
   879            --format='table(timestamp,jsonPayload.resource.name,jsonPayload.event_subtype)' \
   880            --project "${PROJECT}" \
   881            "resource.type=\"gce_instance\"
   882             logName=\"projects/${PROJECT}/logs/compute.googleapis.com%2Factivity_log\"
   883             (jsonPayload.event_subtype=\"compute.instances.hostError\" OR jsonPayload.event_subtype=\"compute.instances.automaticRestart\")
   884             jsonPayload.resource.name:\"${group}\"
   885             timestamp >= \"${creation_timestamp}\""
   886    done
   887  }
   888  
   889  function dump_logs() {
   890    # Copy master logs to artifacts dir locally (through SSH).
   891    echo "Dumping logs from master locally to '${report_dir}'"
   892    dump_masters
   893    if [[ "${DUMP_ONLY_MASTER_LOGS:-}" == "true" ]]; then
   894      echo 'Skipping dumping of node logs'
   895      return
   896    fi
   897  
   898    # Copy logs from nodes to GCS directly or to artifacts dir locally (through SSH).
   899    if [[ -n "${gcs_artifacts_dir}" ]]; then
   900      echo "Dumping logs from nodes to GCS directly at '${gcs_artifacts_dir}' using logexporter"
   901      dump_nodes_with_logexporter
   902    else
   903      echo "Dumping logs from nodes locally to '${report_dir}'"
   904      dump_nodes
   905    fi
   906  }
   907  
   908  # When ${DUMP_TO_GCS_ONLY} is not 'true':
   909  # * only logs exported by logexporter will be uploaded to
   910  #   ${gcs_artifacts_dir}
   911  # * other logs (master logs, nodes where logexporter failed) will be
   912  #   fetched locally to ${report_dir}.
   913  # If $DUMP_TO_GCS_ONLY == 'true', all logs will be uploaded directly to
   914  # ${gcs_artifacts_dir}.
   915  function main() {
   916    setup
   917    kube::util::ensure-temp-dir
   918    if [[ "${DUMP_TO_GCS_ONLY:-}" == "true" ]] && [[ -n "${gcs_artifacts_dir}" ]]; then
   919      report_dir="${KUBE_TEMP}/logs"
   920      mkdir -p "${report_dir}"
   921      echo "${gcs_artifacts_dir}" > "${local_report_dir}/master-and-node-logs.link.txt"
   922      echo "Dumping logs temporarily to '${report_dir}'. Will upload to '${gcs_artifacts_dir}' later."
   923    else
   924      report_dir="${local_report_dir}"
   925    fi
   926  
   927    dump_logs
   928    dump_node_info
   929  
   930    if [[ "${DUMP_TO_GCS_ONLY:-}" == "true" ]] && [[ -n "${gcs_artifacts_dir}" ]]; then
   931      if [[ -n "$(ls -A "${report_dir}")" ]]; then
   932        echo "Uploading '${report_dir}' to '${gcs_artifacts_dir}'"
   933  
   934        if gsutil ls "${gcs_artifacts_dir}" > /dev/null; then
   935          # If "${gcs_artifacts_dir}" exists, the simple call:
   936          # `gsutil cp -r /tmp/dir/logs ${gcs_artifacts_dir}` will
   937          #  create subdirectory 'logs' in ${gcs_artifacts_dir}
   938          #
   939          # If "${gcs_artifacts_dir}" exists, we want to merge its content
   940          # with local logs. To do that we do the following trick:
   941          # * Let's say that ${gcs_artifacts_dir} == 'gs://a/b/c'.
   942          # * We rename 'logs' to 'c'
   943          # * Call `gsutil cp -r /tmp/dir/c gs://a/b/`
   944          #
   945          # Similar pattern is used in bootstrap.py#L409-L416.
   946          # It is a known issue that gsutil cp behavior is that complex.
   947          # For more information on this, see:
   948          # https://cloud.google.com/storage/docs/gsutil/commands/cp#how-names-are-constructed
   949          remote_dir=$(dirname "${gcs_artifacts_dir}")
   950          remote_basename=$(basename "${gcs_artifacts_dir}")
   951          mv "${report_dir}" "${KUBE_TEMP}/${remote_basename}"
   952          gsutil -m cp -r -c -z log,txt,xml "${KUBE_TEMP}/${remote_basename}" "${remote_dir}"
   953          rm -rf "${KUBE_TEMP}/${remote_basename}"
   954        else  # ${gcs_artifacts_dir} doesn't exist.
   955          gsutil -m cp -r -c -z log,txt,xml "${report_dir}" "${gcs_artifacts_dir}"
   956          rm -rf "${report_dir}"
   957        fi
   958      else
   959        echo "Skipping upload of '${report_dir}' as it's empty."
   960      fi
   961    fi
   962  
   963    detect_node_failures
   964    if [[ ${logexporter_failed} -ne 0 && ${log_dump_expected_success_percentage} -gt 0 ]]; then
   965      return 1
   966    fi
   967  }
   968  
   969  main