#!/usr/bin/env bash

# Copyright 2025 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the Agones upgrade test job on every upgrade test cluster and reports
# the result of each job.
#
# Usage: e2e_upgrade_test.sh BASE_VERSION PROJECT_ID
#   BASE_VERSION  version prefix used to build the "-dev-<sha>" image tag
#   PROJECT_ID    GCP project passed to gcloud when fetching cluster credentials

set -e
set -o pipefail

# Fail fast with a usage message instead of running gcloud/kubectl with empty values.
BASE_VERSION=${1:?usage: $0 BASE_VERSION PROJECT_ID}
PROJECT_ID=${2:?usage: $0 BASE_VERSION PROJECT_ID}
BUCKET_NAME="upgrade-test-container-logs"

apt-get update && apt-get install -y jq
export SHELL="/bin/bash"
export KUBECONFIG="/root/.kube/config"
mkdir -p /go/src/agones.dev/ /root/.kube/
ln -s /workspace /go/src/agones.dev/agones
cd /go/src/agones.dev/agones/test/upgrade

#######################################
# Print diagnostics for a failed upgrade test run on one cluster.
#
# Switches the kubectl context to the given cluster, then prints either the
# termination message of the upgrade-test-controller container or the last 50
# log lines of the sdk-client-test / upgrade-test-controller containers, and
# finally a link to the log bucket.
#
# Every step is best-effort: this function only runs on a path that is already
# failing, so a broken diagnostic command must not abort the script (set -e)
# before the log bucket link has been printed.
#
# Globals:   PROJECT_ID (read), BUCKET_NAME (read)
# Arguments: $1 - cluster name
#            $2 - cluster region
# Outputs:   diagnostics on stdout
#######################################
print_failure_logs() {
  local testCluster=$1
  local testClusterLocation=$2
  local job_pods_json job_pod pod_status termination_message containers container

  echo "ERROR: Upgrade test failed on cluster: $testCluster"

  if ! gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"; then
    # Without fresh credentials kubectl would still point at whichever cluster
    # was configured last, so skip the pod diagnostics rather than mislead.
    echo "Failed to get credentials for cluster $testCluster. Skipping pod diagnostics."
  elif ! job_pods_json=$(kubectl get pods -l job-name=upgrade-test-runner -o json); then
    echo "Failed to list pods for job upgrade-test-runner on cluster $testCluster."
  elif [[ $(jq '.items | length' <<<"$job_pods_json") -eq 0 ]]; then
    echo "No pods found for job upgrade-test-runner. They might have failed to schedule or were deleted."
  else
    # Get the name of the first (and only) pod
    job_pod=$(jq -r '.items[0].metadata.name' <<<"$job_pods_json")
    pod_status=$(kubectl get pod "$job_pod" -o jsonpath='{.status.phase}') || pod_status=""

    echo "--- Pod $job_pod status $pod_status. Retrieving termination message. ---"
    # A non-restarting pod will have its termination message in 'state.terminated'.
    termination_message=$(kubectl get pod "$job_pod" -o go-template='{{range .status.containerStatuses}}{{if eq .name "upgrade-test-controller"}}{{.state.terminated.message}}{{end}}{{end}}') || termination_message=""

    # Go templates render a missing key as the literal "<no value>" (e.g. when
    # the container has not terminated), which is not a real message.
    # NOTE(review): assumes kubectl keeps text/template's default missingkey
    # behaviour - confirm against the kubectl version in use.
    if [[ -n "$termination_message" && "$termination_message" != "<no value>" ]]; then
      echo "Fatal Error: $termination_message"
    else
      echo "No termination message found for pod $job_pod. Dumping logs:"
      containers=$(kubectl get pod "$job_pod" -o jsonpath='{.spec.containers[*].name}') || containers=""
      # Intentionally unquoted: the jsonpath output is a space separated list.
      for container in $containers; do
        if [[ "$container" == "sdk-client-test" || "$container" == "upgrade-test-controller" ]]; then
          echo "----- Logs from pod: $job_pod, container: $container -----"
          kubectl logs "$job_pod" -c "$container" --tail=50 || echo "Failed to retrieve logs from $job_pod/$container"
        fi
      done
    fi
  fi

  echo "Logs from log bucket: https://console.cloud.google.com/logs/query;storageScope=storage,projects%2F${PROJECT_ID}%2Flocations%2Fglobal%2Fbuckets%2F${BUCKET_NAME}%2Fviews%2F_AllLogs?hl=en&inv=1&invt=Ab4o5A&mods=logs_tg_prod&project=${PROJECT_ID}"
}
# ------------------------------------------------------

pids=()
typeset -A waitPids # Associative array for mapping `kubectl wait job` pid -> `kubectl wait job` output log name
declare -A clusterRegionMap # Associative array for mapping cluster name -> cluster location
tmpdir=$(mktemp -d)
trap 'rm -rf -- "$tmpdir"' EXIT SIGTERM

# Update image tags to include the current build version.
DevVersion="${BASE_VERSION}-dev-$(git rev-parse --short=7 HEAD)"
export DevVersion
sed "s/\${DevVersion}/${DevVersion}/" upgradeTest.yaml > "${tmpdir}"/upgradeTest.yaml
sed "s/\${DevVersion}/${DevVersion}/" versionMap.yaml > "${tmpdir}"/versionMap.yaml

#######################################
# Exit handler: kill any background `kubectl wait` pipelines that are still
# running and remove the temporary directory.
#
# A trap set for a signal replaces any trap previously set for that signal, so
# the temp dir removal has to live in the same handler as the job cleanup;
# otherwise the rendered manifests and logs in $tmpdir are never deleted.
#
# Globals: tmpdir (read)
#######################################
cleanup() {
  # Word splitting of the pid list is intended in both commands below.
  # shellcheck disable=SC2046
  echo Cleaning up any remaining running pids: $(jobs -p)
  # shellcheck disable=SC2046
  kill $(jobs -p) 2> /dev/null || :
  if [[ -n "${tmpdir:-}" ]]; then
    rm -rf -- "$tmpdir"
  fi
}
trap cleanup EXIT
# Turn SIGTERM into a real exit (128 + 15) so the script stops instead of
# carrying on after its resources were cleaned up; the EXIT trap does the cleanup.
trap 'exit 143' SIGTERM

cloudProducts=("generic" "gke-autopilot")
declare -A versionsAndRegions=( [1.33]=us-central1 [1.32]=us-west1 [1.31]=us-east1 )

# Phase 1: for every cloud product / Kubernetes version pair, clean up leftovers
# from a previous run, start the upgrade test job and start a background
# `kubectl wait` that records the job's final condition in a per-cluster log file.
for cloudProduct in "${cloudProducts[@]}"; do
  for version in "${!versionsAndRegions[@]}"; do
    region=${versionsAndRegions[$version]}
    if [ "$cloudProduct" = generic ]; then
      testCluster="standard-upgrade-test-cluster-${version//./-}"
    else
      testCluster="gke-autopilot-upgrade-test-cluster-${version//./-}"
    fi
    testClusterLocation="${region}"

    # Store mapping for later lookup
    clusterRegionMap["$testCluster"]="$testClusterLocation"

    echo "===== Processing cluster: $testCluster in $testClusterLocation ====="

    gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"

    if [ "$cloudProduct" = gke-autopilot ]; then
      # For autopilot clusters use evictable "balloon" pods to keep a buffer in node pool autoscaling.
      kubectl apply -f evictablePods.yaml
    fi

    # Clean up any existing job / namespace / apiservice from previous run
    echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}".
    if kubectl get jobs | grep upgrade-test-runner ; then
      echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete job upgrade-test-runner --ignore-not-found=true
      kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m
    fi

    # Check if there are any dangling game servers.
    if kubectl get gs | grep ".*"; then
      # Remove any finalizers so that dangling game servers can be manually deleted.
      kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge
      sleep 5
      echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete gs -l app=sdk-client-test --ignore-not-found=true
    fi

    if kubectl get po -l app=sdk-client-test | grep ".*"; then
      echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete po -l app=sdk-client-test --ignore-not-found=true
      kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m
    fi

    # The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating.
    if kubectl get apiservice | grep v1.allocation.agones.dev ; then
      echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete apiservice v1.allocation.agones.dev --ignore-not-found=true
    fi

    if kubectl get namespace | grep agones-system ; then
      echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete namespace agones-system --ignore-not-found=true
      kubectl wait --for=delete ns agones-system --timeout=5m
    fi

    if kubectl get crds | grep agones ; then
      echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd --ignore-not-found=true
    fi

    echo kubectl apply -f permissions.yaml on cluster "${testCluster}"
    kubectl apply -f permissions.yaml
    echo kubectl apply -f versionMap.yaml on cluster "${testCluster}"
    kubectl apply -f "${tmpdir}"/versionMap.yaml
    echo kubectl apply -f gameserverTemplate.yaml on cluster "${testCluster}"
    kubectl apply -f gameserverTemplate.yaml

    echo kubectl apply -f upgradeTest.yaml on cluster "${testCluster}"
    kubectl apply -f "${tmpdir}"/upgradeTest.yaml

    # We need to wait for job pod to be created and ready before we can wait on the job itself.
    kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m

    # Wait for the pod to become ready (or timeout)
    if ! kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m; then
      echo "ERROR: The pod for job 'upgrade-test-runner' did not become ready within the timeout period."
      print_failure_logs "$testCluster" "$testClusterLocation"
      exit 1
    fi

    echo Wait for job upgrade-test-runner to complete or fail on cluster "${testCluster}"
    logPath="${tmpdir}/${testCluster}.log"
    # Runs in the background so all clusters are tested in parallel; the log
    # file name encodes the cluster name, which phase 2 uses to map a pid back
    # to its cluster.
    kubectl wait job/upgrade-test-runner --timeout=30m --for jsonpath='{.status.conditions[*].status}'=True -o jsonpath='{.status.conditions[*]}' | tee "$logPath" &
    waitPid=$!
    pids+=( "$waitPid" )
    waitPids[$waitPid]="$logPath"

  done
done

# Upper bound, in seconds, on how long to wait for a finished `kubectl wait`
# pipeline's output to show up in its log file.
logWaitSeconds=30

# Phase 2: collect the result of every background wait. The first failure
# prints diagnostics for its cluster and aborts the script.
for pid in "${pids[@]}"; do
  # This block executes when the process exits and pid status==0
  if wait "$pid"; then
    outputLog="${waitPids[$pid]}"
    clusterName="$(basename "$outputLog" .log)"

    # wait for output to finish writing to file; bounded so that a wait that
    # succeeded without producing output fails the build instead of hanging it
    waited=0
    until [ -s "$outputLog" ]; do
      if (( waited >= logWaitSeconds )); then
        echo "No job status was written to $outputLog after ${logWaitSeconds}s."
        print_failure_logs "$clusterName" "${clusterRegionMap[$clusterName]}"
        exit 1
      fi
      waited=$((waited + 1))
      sleep 1
    done
    output_json=$(<"${outputLog}")

    echo "Reading output from log file: $outputLog:"
    echo "$output_json" | jq '.'

    job_condition_type=$(echo "$output_json" | jq -r '.type')
    job_condition_message=$(echo "$output_json" | jq -r '.message')

    # "Complete" is successful job run.
    # Version 1.31 has "SuccessCriteriaMet" as the first completion status returned, or "FailureTarget" in case of failure.
    if [ "$job_condition_type" == "Complete" ] || [ "$job_condition_type" == "SuccessCriteriaMet" ]; then
      echo "Job completed successfully on cluster associated with log: $outputLog"
      continue
    else
      echo "Unexpected job status: '$job_condition_type' with message: '$job_condition_message' in log $outputLog"
      print_failure_logs "$clusterName" "${clusterRegionMap[$clusterName]}"
      exit 1
    fi
  # This block executes when the process exits and pid status!=0
  else
    # $? still holds the exit status of the `wait` used as the if condition.
    status=$?
    outputLog="${waitPids[$pid]}"
    clusterName="$(basename "$outputLog" .log)"
    echo "One of the upgrade tests pid $pid from cluster log $outputLog exited with a non-zero status ${status}."
    print_failure_logs "$clusterName" "${clusterRegionMap[$clusterName]}"
    exit "$status"
  fi
done

echo "End of Upgrade Tests"