github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/ops/terraform/remote_files/scripts/install-node.sh (about)

     1  #!/bin/bash
     2  # shellcheck disable=SC1091,SC2312
     3  set -euo pipefail
     4  IFS=$'\n\t'
     5  
     6  source /terraform_node/variables
     7  
     8  function install-go() {
     9    echo "Installing Go..."
    10    rm -fr /usr/local/go /usr/local/bin/go
    11    curl --silent --show-error --location --fail https://go.dev/dl/go1.19.6.linux-amd64.tar.gz | sudo tar --extract --gzip --file=- --directory=/usr/local
    12    sudo ln -s /usr/local/go/bin/go /usr/local/bin/go
    13    go version
    14  }
    15  
    16  function install-docker() {
    17    echo "Installing Docker"
    18    sudo apt-get install -y \
    19        ca-certificates \
    20        curl \
    21        gnupg \
    22        lsb-release
    23    sudo mkdir -p /etc/apt/keyrings
    24    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
    25    echo \
    26      "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
    27      $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
    28    sudo apt-get update -y
    29    sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
    30  }
    31  
    32  function install-gpu() {
    33    echo "Installing GPU drivers"
    34    if [[ "${GPU_NODE}" = "true" ]]; then
    35      echo "Installing GPU drivers"
    36      distribution=$(. /etc/os-release;echo "${ID}${VERSION_ID}" | sed -e 's/\.//g') \
    37        && wget https://developer.download.nvidia.com/compute/cuda/repos/"${distribution}"/x86_64/cuda-keyring_1.0-1_all.deb \
    38        && sudo dpkg -i cuda-keyring_1.0-1_all.deb
    39      distribution=$(. /etc/os-release;echo "${ID}${VERSION_ID}") \
    40        && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    41        && curl -s -L https://nvidia.github.io/libnvidia-container/"${distribution}"/libnvidia-container.list | \
    42              sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    43              sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
    44  
    45      sudo apt-get update && sudo apt-get install -y \
    46        linux-headers-"$(uname -r)" \
    47        cuda-drivers \
    48        nvidia-docker2
    49      sudo systemctl restart docker
    50      nvidia-smi # No idea why we have to run this once, but we do. Only then does nvidia-container-cli work.
    51    else
    52      echo "Not installing GPU drivers because GPU_NODE=${GPU_NODE}"
    53    fi
    54  }
    55  
    56  # Lay down a very basic web server to report when the node is healthy
    57  function install-healthcheck() {
    58    echo "Installing healthcheck"
    59    sudo apt-get -y install --no-install-recommends wget gnupg ca-certificates
    60    wget -O - https://openresty.org/package/pubkey.gpg | sudo apt-key add -
    61    echo "deb http://openresty.org/package/ubuntu $(lsb_release -sc) main" \
    62        | sudo tee /etc/apt/sources.list.d/openresty.list
    63    sudo apt-get update -y
    64    sudo apt-get -y install --no-install-recommends openresty
    65    sudo cp /terraform_node/nginx.conf /usr/local/openresty/nginx/conf/nginx.conf
    66  }
    67  
    68  function install-ipfs() {
    69    echo "Installing IPFS"
    70    wget "https://dist.ipfs.tech/go-ipfs/${IPFS_VERSION}/go-ipfs_${IPFS_VERSION}_linux-amd64.tar.gz"
    71    tar -xvzf "go-ipfs_${IPFS_VERSION}_linux-amd64.tar.gz"
    72    # TODO should reset PWD to home dir after each function call
    73    cd go-ipfs
    74    sudo bash install.sh
    75    ipfs --version
    76  }
    77  
    78  function install-bacalhau() {
    79    if [[ -n "${BACALHAU_BRANCH}" ]] ; then
    80      install-bacalhau-from-source
    81    elif [[ -n "${BACALHAU_VERSION}" ]] ; then
    82      install-bacalhau-from-release
    83    else
    84      echo "No bacalhau version or branch specified. Not installing bacalhau."
    85      exit 1
    86    fi
    87  }
    88  
    89  function install-bacalhau-from-release() {
    90    echo "Installing Bacalhau from release ${BACALHAU_VERSION}"
    91    sudo apt-get -y install --no-install-recommends jq
    92    wget "https://github.com/filecoin-project/bacalhau/releases/download/${BACALHAU_VERSION}/bacalhau_${BACALHAU_VERSION}_linux_amd64.tar.gz"
    93    tar xfv "bacalhau_${BACALHAU_VERSION}_linux_amd64.tar.gz"
    94    sudo mv ./bacalhau /usr/local/bin/bacalhau
    95  }
    96  
    97  function install-bacalhau-from-source() {
    98    echo "Installing Bacalhau from branch ${BACALHAU_BRANCH}"
    99    sudo apt-get -y install --no-install-recommends jq
   100    git clone --depth 1 --branch ${BACALHAU_BRANCH} https://github.com/filecoin-project/bacalhau.git
   101    cd bacalhau
   102    GO111MODULE=on CGO_ENABLED=0 go build -gcflags '-N -l' -trimpath -o ./bacalhau
   103    sudo mv ./bacalhau /usr/local/bin/bacalhau
   104  }
   105  
   106  function install-otel-collector() {
   107    echo "Installing otel collector"
   108    if [[ -z "${OTEL_COLLECTOR_VERSION}" ]] ; then
   109      echo 'OTEL_COLLECTOR_VERSION is undefined. Skipping otel collector installation.'
   110    else
   111      sudo apt -y update
   112      sudo groupadd --system otel
   113      sudo useradd -s /sbin/nologin --system -g otel otel
   114      sudo mkdir -p /etc/otel
   115      sudo mkdir -p /var/lib/otel
   116      wget "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v${OTEL_COLLECTOR_VERSION}/otelcol-contrib_${OTEL_COLLECTOR_VERSION}_linux_amd64.tar.gz"
   117      tar xvf "otelcol-contrib_${OTEL_COLLECTOR_VERSION}_linux_amd64.tar.gz"
   118      sudo mv otelcol-contrib /usr/local/bin/otelcol
   119      # config file
   120      sudo tee /terraform_node/otel-collector.yml > /dev/null <<EOF
   121  
   122  extensions:
   123    health_check:
   124    zpages:
   125      endpoint: :55679
   126    basicauth/prometheus:
   127      client_auth:
   128        username: ${GRAFANA_CLOUD_PROMETHEUS_USER}
   129        password: ${GRAFANA_CLOUD_PROMETHEUS_API_KEY}
   130    basicauth/tempo:
   131      client_auth:
   132        username: ${GRAFANA_CLOUD_TEMPO_USER}
   133        password: ${GRAFANA_CLOUD_TEMPO_API_KEY}
   134    basicauth/loki:
   135      client_auth:
   136        username: ${GRAFANA_CLOUD_LOKI_USER}
   137        password: ${GRAFANA_CLOUD_LOKI_API_KEY}
   138  
   139  receivers:
   140    hostmetrics:
   141      scrapers:
   142        cpu:
   143        disk:
   144        load:
   145        filesystem:
   146        memory:
   147        network:
   148        paging:
   149    otlp:
   150      protocols:
   151        http:
   152    prometheus:
   153      config:
   154        scrape_configs:
   155          - job_name: 'otel-collector'
   156            scrape_interval: 5s
   157            static_configs:
   158              - targets: [ '0.0.0.0:8888' ]
   159  
   160  exporters:
   161    prometheusremotewrite:
   162      endpoint: ${GRAFANA_CLOUD_PROMETHEUS_ENDPOINT}
   163      auth:
   164        authenticator: basicauth/prometheus
   165      resource_to_telemetry_conversion:
   166        enabled: true
   167    otlp:
   168      endpoint: ${GRAFANA_CLOUD_TEMPO_ENDPOINT}
   169      auth:
   170        authenticator: basicauth/tempo
   171    loki:
   172      endpoint: https://${GRAFANA_CLOUD_LOKI_ENDPOINT}/loki/api/v1/push
   173      auth:
   174        authenticator: basicauth/loki
   175  
   176  processors:
   177    batch:
   178    memory_limiter:
   179      check_interval: 5s
   180      limit_mib: 4000
   181      spike_limit_mib: 500
   182    resourcedetection/gcp:
   183      detectors: [ env, gcp ]
   184      timeout: 2s
   185      override: false
   186    resource:
   187      attributes:
   188      - key: deployment.environment
   189        value: ${TERRAFORM_WORKSPACE}
   190        action: insert
   191      - key: service.namespace
   192        value: bacalhau
   193        action: insert
   194    attributes/metrics:
   195      actions:
   196      - pattern: net\.sock.+
   197        action: delete
   198  
   199  service:
   200    extensions: [basicauth/tempo, basicauth/prometheus, basicauth/loki, zpages, health_check]
   201    pipelines:
   202  EOF
   203  
   204      if [[ -n "${GRAFANA_CLOUD_PROMETHEUS_ENDPOINT}" ]] && [[ -n "${GRAFANA_CLOUD_PROMETHEUS_USER}" ]] && [[ -n "${GRAFANA_CLOUD_PROMETHEUS_API_KEY}" ]]; then
   205        sudo tee -a /terraform_node/otel-collector.yml > /dev/null <<EOF
   206      traces:
   207        receivers: [otlp]
   208        processors: [memory_limiter, resourcedetection/gcp, resource, batch]
   209        exporters: [otlp]
   210  EOF
   211      fi
   212  
   213      if [[ -n "${GRAFANA_CLOUD_TEMPO_ENDPOINT}" ]] && [[ -n "${GRAFANA_CLOUD_TEMPO_USER}" ]] && [[ -n "${GRAFANA_CLOUD_TEMPO_API_KEY}" ]]; then
   214        sudo tee -a /terraform_node/otel-collector.yml > /dev/null <<EOF
   215      metrics:
   216        receivers: [otlp, prometheus, hostmetrics]
   217        processors: [memory_limiter, resourcedetection/gcp, resource, attributes/metrics, batch]
   218        exporters: [prometheusremotewrite]
   219  EOF
   220      fi
   221  
   222      if [[ -n "${GRAFANA_CLOUD_LOKI_ENDPOINT}" ]] && [[ -n "${GRAFANA_CLOUD_LOKI_USER}" ]] && [[ -n "${GRAFANA_CLOUD_LOKI_API_KEY}" ]]; then
   223        sudo tee -a /terraform_node/otel-collector.yml > /dev/null <<EOF
   224  
   225  # disabled until promtail receiver is merged in collector-contrib
   226  #    logs:
   227  #      receivers: []
   228  #      processors: [memory_limiter, resourcedetection/gcp, resource, batch]
   229  #      exporters: [loki]
   230  EOF
   231      fi
   232      sudo chown -R otel:otel /terraform_node/otel-collector.yml
   233    fi
   234  }
   235  
   236  function install-promtail() {
   237    echo "Installing Promtail/Loki"
   238    if [[ -z "${LOKI_VERSION}" ]] || [[ -z "${GRAFANA_CLOUD_LOKI_API_KEY}" ]] || [[ -z "${GRAFANA_CLOUD_LOKI_USER}" ]] || [[ -z "${GRAFANA_CLOUD_LOKI_ENDPOINT}" ]]; then
   239      echo 'Any of LOKI_VERSION, GRAFANA_CLOUD_LOKI_API_KEY, GRAFANA_CLOUD_LOKI_USER, GRAFANA_CLOUD_LOKI_ENDPOINT env variables is undefined. Skipping Promtail/Loki installation.'
   240    else
   241      cd ~
   242      curl -O -L "https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/promtail-linux-amd64.zip"
   243      gunzip -S ".zip" promtail-linux-amd64.zip
   244      sudo chmod a+x "promtail-linux-amd64"
   245      sudo mv promtail-linux-amd64 /usr/local/bin/
   246      
   247      # config file
   248      HOSTNAME=$(hostname)
   249      
   250      sudo tee /terraform_node/promtail.yml > /dev/null <<EOF
   251  server:
   252    http_listen_port: 0
   253    grpc_listen_port: 0
   254  
   255  positions:
   256    filename: /tmp/positions.yaml
   257  
   258  clients:
   259    - url: https://${GRAFANA_CLOUD_LOKI_USER}:${GRAFANA_CLOUD_LOKI_API_KEY}@${GRAFANA_CLOUD_LOKI_ENDPOINT}/loki/api/v1/push
   260  
   261  scrape_configs:
   262    - job_name: journal
   263      pipeline_stages:
   264        - json:
   265            expressions:
   266             level:
   267             msg:
   268        - drop:
   269            source: "level"
   270            expression:  "(debug|trace)"
   271      journal:
   272        max_age: 12h
   273        labels:
   274          job: systemd-journal
   275          host: ${HOSTNAME}
   276          label_project: bacalhau
   277          environment: ${TERRAFORM_WORKSPACE}
   278      relabel_configs:
   279        - action: keep
   280          source_labels: [__journal__systemd_unit]
   281          regex: '^bacalhau\.service$'
   282        - source_labels: ['__journal__systemd_unit']
   283          target_label: 'systemd_unit'
   284  EOF
   285      sudo mkdir -p /etc/promtail
   286      sudo cp /terraform_node/promtail.yml /etc/promtail/config.yml
   287    fi
   288  }
   289  
   290  function mount-disk() { 
   291    echo "Mounting disk"
   292    # wait for /dev/sdb to exist
   293    while [[ ! -e /dev/sdb ]]; do
   294      sleep 1
   295      echo "waiting for /dev/sdb to exist"
   296    done
   297    # mount /dev/sdb at /data
   298    sudo mkdir -p /data
   299    sudo mount /dev/sdb /data || (sudo mkfs -t ext4 /dev/sdb && sudo mount /dev/sdb /data)
   300  }
   301  
   302  # make sure that "ipfs init" has been run
   303  function init-ipfs() {
   304    echo "Initializing IPFS"
   305    sudo mkdir -p /data/ipfs
   306    export IPFS_PATH=/data/ipfs
   307  
   308    if [[ ! -e /data/ipfs/version ]]; then
   309      ipfs init
   310    fi
   311  }
   312  
   313  # install any secrets provided as terraform vars
   314  function install-secrets() {
   315    echo "Installing secrets"
   316    # set defaults
   317    export GRAFANA_CLOUD_PROMETHEUS_API_KEY=""
   318    export GRAFANA_CLOUD_TEMPO_API_KEY=""
   319    export GRAFANA_CLOUD_LOKI_API_KEY=""
   320    export ESTUARY_API_KEY=""
   321    if [[ -e /data/secrets.sh ]]; then
   322      source /data/secrets.sh
   323    fi
   324  
   325    # load new values if they were provided
   326    if [[ -n "${SECRETS_GRAFANA_CLOUD_PROMETHEUS_API_KEY}" ]]; then
   327      export GRAFANA_CLOUD_PROMETHEUS_API_KEY="${SECRETS_GRAFANA_CLOUD_PROMETHEUS_API_KEY}"
   328    fi
   329    if [[ -n "${SECRETS_GRAFANA_CLOUD_TEMPO_API_KEY}" ]]; then
   330        export GRAFANA_CLOUD_TEMPO_API_KEY="${SECRETS_GRAFANA_CLOUD_TEMPO_API_KEY}"
   331    fi
   332    if [[ -n "${SECRETS_GRAFANA_CLOUD_LOKI_API_KEY}" ]]; then
   333        export GRAFANA_CLOUD_LOKI_API_KEY="${SECRETS_GRAFANA_CLOUD_LOKI_API_KEY}"
   334    fi
   335    if [[ -n "${SECRETS_ESTUARY_API_KEY}" ]]; then
   336      export ESTUARY_API_KEY="${SECRETS_ESTUARY_API_KEY}"
   337    fi
   338  
   339    # write the secrets to persistent disk
   340    sudo tee /data/secrets.sh > /dev/null <<EOG
   341  export GRAFANA_CLOUD_PROMETHEUS_API_KEY="${GRAFANA_CLOUD_PROMETHEUS_API_KEY}"
   342  export GRAFANA_CLOUD_TEMPO_API_KEY="${GRAFANA_CLOUD_TEMPO_API_KEY}"
   343  export GRAFANA_CLOUD_LOKI_API_KEY="${GRAFANA_CLOUD_LOKI_API_KEY}"
   344  export ESTUARY_API_KEY="${ESTUARY_API_KEY}"
   345  EOG
   346  
   347    # clean up variables file from any secret
   348    sed -e '/^export SECRETS_/d' /terraform_node/variables | sudo tee /terraform_node/variables > /dev/null
   349  }
   350  
   351  # if we are node zero, are in unsafe mode and don't have a private key
   352  # then let's copy the unsafe private key so we have a deterministic id
   353  # that other nodes will connect to
   354  function init-bacalhau() {
   355    echo "Initializing Bacalhau"
   356    export BACALHAU_NODE_PRIVATE_KEY_PATH="/data/.bacalhau/private_key.${BACALHAU_PORT}"
   357    sudo mkdir -p /data/.bacalhau
   358    if [[ "${TERRAFORM_NODE_INDEX}" == "0" ]] && [[ -n "${BACALHAU_UNSAFE_CLUSTER}" ]] && [[ ! -f "${BACALHAU_NODE_PRIVATE_KEY_PATH}" ]]; then
   359      echo "WE ARE NOW INSTALLING THE UNSAFE KEY YO"
   360      sudo cp /terraform_node/bacalhau-unsafe-private-key "${BACALHAU_NODE_PRIVATE_KEY_PATH}"
   361      sudo chmod 0600 "${BACALHAU_NODE_PRIVATE_KEY_PATH}"
   362    fi
   363  }
   364  
   365  function start-services() {
   366    sudo systemctl daemon-reload
   367    sudo systemctl enable ipfs
   368    sudo systemctl enable bacalhau
   369    sudo systemctl enable otel
   370    sudo systemctl enable promtail
   371    sudo systemctl start ipfs
   372    sudo systemctl start bacalhau
   373    sudo systemctl start otel
   374    sudo systemctl start promtail
   375    sudo service openresty reload
   376  }
   377  
   378  function install() {
   379    install-go
   380    install-docker
   381    install-gpu
   382    install-healthcheck
   383    install-ipfs
   384    install-bacalhau
   385    mount-disk
   386    init-ipfs
   387    init-bacalhau
   388    install-secrets
   389    install-otel-collector
   390    install-promtail
   391    start-services
   392  }
   393  
   394  install