agones.dev/agones@v1.54.0/build/e2e_upgrade_test.sh (about)

     1  #!/usr/bin/env bash
     2  
     3  # Copyright 2025 Google LLC All Rights Reserved.
     4  #
     5  # Licensed under the Apache License, Version 2.0 (the "License");
     6  # you may not use this file except in compliance with the License.
     7  # You may obtain a copy of the License at
     8  #
     9  #     http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  set -e
    18  set -o pipefail
    19  
    20  BASE_VERSION=$1
    21  PROJECT_ID=$2
    22  BUCKET_NAME="upgrade-test-container-logs"
    23  
    24  apt-get update && apt-get install -y jq
    25  export SHELL="/bin/bash"
    26  export KUBECONFIG="/root/.kube/config"
    27  mkdir -p /go/src/agones.dev/ /root/.kube/
    28  ln -s /workspace /go/src/agones.dev/agones
    29  cd /go/src/agones.dev/agones/test/upgrade
    30  
    31  # --- Function to print failure logs ---
    32  print_failure_logs() {
    33      local testCluster=$1
    34      local testClusterLocation=$2
    35      echo "ERROR: Upgrade test failed on cluster: $testCluster"
    36      gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"
    37  
    38      # Get all pods for the job
    39      job_pods_json=$(kubectl get pods -l job-name=upgrade-test-runner -o json)
    40  
    41      # Check if any pods were found
    42      if [[ $(echo "$job_pods_json" | jq '.items | length') -eq 0 ]]; then
    43        echo "No pods found for job upgrade-test-runner. They might have failed to schedule or were deleted."
    44      else
    45        # Get the name of the first (and only) pod
    46        job_pod=$(echo "$job_pods_json" | jq -r '.items[0].metadata.name')
    47        pod_status=$(kubectl get pod "$job_pod" -o jsonpath='{.status.phase}')
    48  
    49        echo "--- Pod $job_pod status $pod_status. Retrieving termination message. ---"
    50        # A non-restarting pod will have its termination message in 'state.terminated'.
    51        termination_message=$(kubectl get pod "$job_pod" -o go-template='{{range .status.containerStatuses}}{{if eq .name "upgrade-test-controller"}}{{.state.terminated.message}}{{end}}{{end}}')
    52  
    53        if [ -n "$termination_message" ]; then
    54          echo "Fatal Error: $termination_message"
    55        else
    56          echo "No termination message found for pod $job_pod. Dumping logs:"
    57          containers=$(kubectl get pod "$job_pod" -o jsonpath='{.spec.containers[*].name}')
    58          for container in $containers; do
    59            if [[ "$container" == "sdk-client-test" || "$container" == "upgrade-test-controller" ]]; then
    60              echo "----- Logs from pod: $job_pod, container: $container -----"
    61              kubectl logs "$job_pod" -c "$container" --tail=50 || echo "Failed to retrieve logs from $job_pod/$container"
    62            fi
    63          done
    64        fi
    65      fi
    66  
    67    echo "Logs from log bucket: https://console.cloud.google.com/logs/query;storageScope=storage,projects%2F${PROJECT_ID}%2Flocations%2Fglobal%2Fbuckets%2F${BUCKET_NAME}%2Fviews%2F_AllLogs?hl=en&inv=1&invt=Ab4o5A&mods=logs_tg_prod&project=${PROJECT_ID}"
    68  }
    69  # ------------------------------------------------------
    70  
    71  pids=()
    72  typeset -A waitPids    # Associative array for mapping `kubectl wait job` pid -> `kubectl wait job` output log name
    73  declare -A clusterRegionMap   # Associative array for mapping cluster name -> cluster location
    74  tmpdir=$(mktemp -d)
    75  trap 'rm -rf -- "$tmpdir"' EXIT SIGTERM
    76  
    77  # Update image tags to include the current build version.
    78  DevVersion="${BASE_VERSION}-dev-$(git rev-parse --short=7 HEAD)"
    79  export DevVersion
    80  sed "s/\${DevVersion}/${DevVersion}/" upgradeTest.yaml > "${tmpdir}"/upgradeTest.yaml
    81  sed "s/\${DevVersion}/${DevVersion}/" versionMap.yaml > "${tmpdir}"/versionMap.yaml
    82  
    83  # Kill all currently running child processes on exit or if a non-zero signal is seen
    84  trap 'echo Cleaning up any remaining running pids: $(jobs -p) ; kill $(jobs -p) 2> /dev/null || :' EXIT SIGTERM
    85  
    86  cloudProducts=("generic" "gke-autopilot")
    87  declare -A versionsAndRegions=( [1.33]=us-central1 [1.32]=us-west1 [1.31]=us-east1 )
    88  
    89  for cloudProduct in "${cloudProducts[@]}"
    90  do
    91      for version in "${!versionsAndRegions[@]}"
    92      do
    93      region=${versionsAndRegions[$version]}
    94      if [ "$cloudProduct" = generic ]; then
    95          testCluster="standard-upgrade-test-cluster-${version//./-}"
    96      else
    97          testCluster="gke-autopilot-upgrade-test-cluster-${version//./-}"
    98      fi
    99      testClusterLocation="${region}"
   100  
   101      # Store mapping for later lookup
   102      clusterRegionMap["$testCluster"]="$testClusterLocation"
   103  
   104      echo "===== Processing cluster: $testCluster in $testClusterLocation ====="
   105  
   106      gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"
   107  
   108      if [ "$cloudProduct" = gke-autopilot ]; then
   109          # For autopilot clusters use evictable "balloon" pods to keep a buffer in node pool autoscaling.
   110          kubectl apply -f evictablePods.yaml
   111      fi
   112  
   113      # Clean up any existing job / namespace / apiservice from previous run
   114      echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}".
   115      if kubectl get jobs | grep upgrade-test-runner ; then
   116          echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}".
   117          kubectl delete job upgrade-test-runner --ignore-not-found=true
   118          kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m
   119      fi
   120  
   121      # Check if there are any dangling game servers.
   122      if kubectl get gs | grep ".*"; then
   123          # Remove any finalizers so that dangling game servers can be manually deleted.
   124          kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge
   125          sleep 5
   126          echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}".
   127          kubectl delete gs -l app=sdk-client-test --ignore-not-found=true
   128      fi
   129  
   130      if kubectl get po -l app=sdk-client-test | grep ".*"; then
   131          echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}".
   132          kubectl delete po -l app=sdk-client-test --ignore-not-found=true
   133          kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m
   134      fi
   135  
   136      # The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating.
   137      if kubectl get apiservice | grep v1.allocation.agones.dev ; then
   138          echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}".
   139          kubectl delete apiservice v1.allocation.agones.dev --ignore-not-found=true
   140      fi
   141  
   142      if kubectl get namespace | grep agones-system ; then
   143          echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}".
   144          kubectl delete namespace agones-system --ignore-not-found=true
   145          kubectl wait --for=delete ns agones-system --timeout=5m
   146      fi
   147  
   148      if kubectl get crds | grep agones ; then
   149          echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}".
   150          kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd --ignore-not-found=true
   151      fi
   152  
   153      echo kubectl apply -f permissions.yaml on cluster "${testCluster}"
   154      kubectl apply -f permissions.yaml
   155      echo kubectl apply -f versionMap.yaml on cluster "${testCluster}"
   156      kubectl apply -f "${tmpdir}"/versionMap.yaml
   157      echo kubectl apply -f gameserverTemplate.yaml on cluster "${testCluster}"
   158      kubectl apply -f gameserverTemplate.yaml
   159  
   160      echo kubectl apply -f upgradeTest.yaml on cluster "${testCluster}"
   161      kubectl apply -f "${tmpdir}"/upgradeTest.yaml
   162  
   163      # We need to wait for job pod to be created and ready before we can wait on the job itself.
   164      kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m
   165  
   166      # Wait for the pod to become ready (or timeout)
   167      if ! kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m; then
   168          echo "ERROR: The pod for job 'upgrade-test-runner' did not become ready within the timeout period."
   169          print_failure_logs "$testCluster" "$testClusterLocation"
   170          exit 1
   171      fi
   172  
   173      echo Wait for job upgrade-test-runner to complete or fail on cluster "${testCluster}"
   174      logPath="${tmpdir}/${testCluster}.log"
   175      kubectl wait job/upgrade-test-runner --timeout=30m --for jsonpath='{.status.conditions[*].status}'=True -o jsonpath='{.status.conditions[*]}' | tee "$logPath" &
   176      waitPid=$!
   177      pids+=( "$waitPid" )
   178      waitPids[$waitPid]="$logPath"
   179  
   180      done
   181  done
   182  
   183  for pid in "${pids[@]}"; do
   184      # This block executes when the process exits and pid status==0
   185      if wait "$pid"; then
   186          outputLog="${waitPids[$pid]}"
   187          # wait for output to finish writing to file
   188          until [ -s "$outputLog" ]; do sleep 1; done
   189          output_json=$(<"${outputLog}")
   190  
   191          echo "Reading output from log file: $outputLog:"
   192          echo "$output_json" | jq '.'
   193  
   194          job_condition_type=$(echo "$output_json" | jq -r '.type')
   195          job_condition_message=$(echo "$output_json" | jq -r '.message')
   196  
   197          # "Complete" is successful job run.
   198          # Version 1.31 has "SuccessCriteriaMet" as the first completion status returned, or "FailureTarget" in case of failure.
   199          if [ "$job_condition_type" == "Complete" ] || [ "$job_condition_type" == "SuccessCriteriaMet" ]; then
   200              echo "Job completed successfully on cluster associated with log: $outputLog"
   201              continue
   202          else
   203              echo "Unexpected job status: '$job_condition_type' with message: '$job_condition_message' in log $outputLog"
   204              clusterName="$(basename "$outputLog" .log)"
   205              print_failure_logs "$clusterName" "${clusterRegionMap[$clusterName]}"
   206              exit 1
   207          fi
   208      # This block executes when the process exits and pid status!=0
   209      else
   210          status=$?
   211          outputLog="${waitPids[$pid]}"
   212          clusterName="$(basename "$outputLog" .log)"
   213          echo "One of the upgrade tests pid $pid from cluster log $outputLog exited with a non-zero status ${status}."
   214          print_failure_logs "$clusterName" "${clusterRegionMap[$clusterName]}"
   215          exit $status
   216      fi
   217  done
   218  
   219  echo "End of Upgrade Tests"