github.com/MetalBlockchain/metalgo@v1.11.9/.github/workflows/run-net-outage-sim.sh

github.com/MetalBlockchain/metalgo@v1.11.9/.github/workflows/run-net-outage-sim.sh (about)

     1  #!/usr/bin/env bash
     2  
     3  set -euo pipefail
     4  
     5  SUCCESS=1
     6  
     7  # Polls metalgo until it's healthy. When it is,
     8  # sets SUCCESS to 0 and returns. If metalgo
     9  # doesn't become healthy within 3 hours, sets
    10  # SUCCESS to 1 and returns.
    11  wait_until_healthy () {
    12    # timeout: if after 3 hours it is not healthy, return
    13    stop=$(date -d "+ 3 hour" +%s)
    14    # store the response code here
    15    response=0
    16    # while the endpoint doesn't return 200
    17    while [ "$response" -ne 200 ]
    18    do
    19      echo "Checking if local node is healthy..."
    20      # Ignore error in case of ephemeral failure to hit node's API
    21      response=$(curl --write-out '%{http_code}' --silent --output /dev/null localhost:9650/ext/health)
    22      echo "got status code $response from health endpoint"
    23      # check that 3 hours haven't passed
    24      now=$(date +%s)
    25      if [ "$now" -ge "$stop" ];
    26      then
    27        # timeout: exit
    28        SUCCESS=1
    29        return
    30      fi
    31      # no timeout yet, wait 30s until retry
    32      sleep 30
    33    done
    34    # response returned 200, therefore exit
    35    echo "Node became healthy"
    36    SUCCESS=0
    37  }
    38  
    39  #remove any existing database files
    40  echo "removing existing database files..."
    41  rm /opt/mainnet-db-daily* 2>/dev/null || true # Do || true to ignore error if files dont exist yet
    42  rm -rf /var/lib/metalgo 2>/dev/null || true # Do || true to ignore error if files dont exist yet
    43  echo "done existing database files"
    44  
    45  #download latest mainnet DB backup
    46  FILENAME="mainnet-db-daily-"
    47  DATE=$(date +'%m-%d-%Y')
    48  DB_FILE="$FILENAME$DATE"
    49  echo "Copying database file $DB_FILE from S3 to local..."
    50  aws s3 cp s3://avalanche-db-daily/ /opt/ --no-progress --recursive --exclude "*" --include "$DB_FILE*" 
    51  echo "Done downloading database"
    52  
    53  # extract DB
    54  echo "Extracting database..."
    55  mkdir -p /var/lib/metalgo/db 
    56  tar -zxf /opt/"$DB_FILE"*-tar.gz -C /var/lib/metalgo/db 
    57  echo "Done extracting database"
    58  
    59  echo "Creating Docker network..."
    60  docker network create controlled-net
    61  
    62  echo "Starting Docker container..."
    63  containerID=$(docker run --name="net_outage_simulation" --memory="12g" --memory-reservation="11g" --cpus="6.0" --net=controlled-net -p 9650:9650 -p 9651:9651 -v /var/lib/metalgo/db:/db -d avaplatform/metalgo:latest /metalgo/build/metalgo --db-dir /db --http-host=0.0.0.0)
    64  
    65  echo "Waiting 30 seconds for node to start..."
    66  sleep 30
    67  echo "Waiting until healthy..."
    68  wait_until_healthy
    69  if [ $SUCCESS -eq 1 ];
    70  then
    71    echo "Timed out waiting for node to become healthy; exiting."
    72    exit 1
    73  fi
    74  
    75  # To simulate internet outage, we will disable the docker network connection
    76  echo "Disconnecting node from internet..."
    77  docker network disconnect controlled-net "$containerID"
    78  echo "Sleeping 60 minutes..."
    79  sleep 3600
    80  echo "Reconnecting node to internet..."
    81  docker network connect controlled-net "$containerID"
    82  echo "Reconnected to internet. Waiting until healthy..."
    83  
    84  # now repeatedly check the node's health until it returns healthy
    85  start=$(date +%s)
    86  SUCCESS=-1
    87  wait_until_healthy
    88  if [ $SUCCESS -eq 1 ];
    89  then
    90    echo "Timed out waiting for node to become healthy after outage; exiting."
    91    exit 1
    92  fi
    93  
    94  # The node returned healthy, print how long it took
    95  end=$(date +%s)
    96  
    97  DELAY=$((end - start))
    98  echo "Node became healthy again after complete outage after $DELAY seconds."
    99  echo "Test completed"