#!/usr/bin/env bash
# Source: github.com/MetalBlockchain/metalgo@v1.11.9/.github/workflows/run-net-outage-sim.sh
#
# Network-outage simulation for a metalgo node:
#   1. remove old database files, then download and extract today's mainnet
#      DB backup from S3,
#   2. start metalgo in a Docker container attached to a dedicated Docker
#      network,
#   3. wait for the node's health endpoint to return 200,
#   4. disconnect the container from that network for 60 minutes,
#   5. reconnect it and report how long the node took to become healthy again.
# Exits with status 1 if the node does not become healthy within 3 hours in
# either waiting phase.

set -euo pipefail

# Result of the most recent wait_until_healthy call: 0 = healthy, 1 = timed out.
SUCCESS=1

#######################################
# Polls the local metalgo health endpoint every 30 seconds until it returns
# HTTP 200 or 3 hours have elapsed.
# Globals:   SUCCESS (written) - 0 if the node became healthy, 1 on timeout
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   always 0; callers must inspect SUCCESS
#######################################
wait_until_healthy() {
  local stop now
  local response=0

  # Deadline: give up if the node is still unhealthy 3 hours from now.
  stop=$(date -d "+ 3 hour" +%s)

  while true; do
    echo "Checking if local node is healthy..."
    # Ignore curl's exit status: an ephemeral failure to reach the node's API
    # makes curl exit non-zero, which would otherwise abort the whole script
    # under `set -e`. The loop simply retries on the next iteration.
    response=$(curl --write-out '%{http_code}' --silent --output /dev/null localhost:9650/ext/health) || true
    echo "got status code $response from health endpoint"

    # Report success as soon as a 200 is seen, so no extra sleep is added to
    # the measured recovery time and a healthy node is never reported as a
    # timeout.
    if [[ "$response" == "200" ]]; then
      echo "Node became healthy"
      SUCCESS=0
      return 0
    fi

    # Check that 3 hours haven't passed.
    now=$(date +%s)
    if (( now >= stop )); then
      # Timeout: let the caller decide how to report it.
      SUCCESS=1
      return 0
    fi

    # No timeout yet; wait 30s before retrying.
    sleep 30
  done
}

# Remove any existing database files.
echo "removing existing database files..."
rm /opt/mainnet-db-daily* 2>/dev/null || true # || true: ignore the error if the files don't exist yet
rm -rf /var/lib/metalgo 2>/dev/null || true   # || true: ignore the error if the directory doesn't exist yet
echo "done existing database files"

# Download the latest mainnet DB backup (file name is prefixed with today's date).
FILENAME="mainnet-db-daily-"
DATE=$(date +'%m-%d-%Y')
DB_FILE="$FILENAME$DATE"
echo "Copying database file $DB_FILE from S3 to local..."
aws s3 cp s3://avalanche-db-daily/ /opt/ --no-progress --recursive --exclude "*" --include "$DB_FILE*"
echo "Done downloading database"

# Extract the DB into the directory that is later mounted into the container.
echo "Extracting database..."
mkdir -p /var/lib/metalgo/db
tar -zxf /opt/"$DB_FILE"*-tar.gz -C /var/lib/metalgo/db
echo "Done extracting database"

echo "Creating Docker network..."
docker network create controlled-net

echo "Starting Docker container..."
# Run detached (-d) so the container ID is captured for the later
# disconnect/connect calls.
containerID=$(docker run --name="net_outage_simulation" --memory="12g" --memory-reservation="11g" --cpus="6.0" --net=controlled-net -p 9650:9650 -p 9651:9651 -v /var/lib/metalgo/db:/db -d avaplatform/metalgo:latest /metalgo/build/metalgo --db-dir /db --http-host=0.0.0.0)

echo "Waiting 30 seconds for node to start..."
sleep 30
echo "Waiting until healthy..."
wait_until_healthy
if (( SUCCESS != 0 )); then
  echo "Timed out waiting for node to become healthy; exiting."
  exit 1
fi

# To simulate an internet outage, disable the container's Docker network connection.
echo "Disconnecting node from internet..."
docker network disconnect controlled-net "$containerID"
echo "Sleeping 60 minutes..."
sleep 3600
echo "Reconnecting node to internet..."
docker network connect controlled-net "$containerID"
echo "Reconnected to internet. Waiting until healthy..."

# Now repeatedly check the node's health until it returns healthy, timing the recovery.
start=$(date +%s)
SUCCESS=1 # reset; wait_until_healthy overwrites it with the real outcome
wait_until_healthy
if (( SUCCESS != 0 )); then
  echo "Timed out waiting for node to become healthy after outage; exiting."
  exit 1
fi

# The node returned healthy; print how long it took.
end=$(date +%s)

DELAY=$((end - start))
echo "Node became healthy again after complete outage after $DELAY seconds."
echo "Test completed"