#!/bin/bash
# shellcheck disable=SC1091,SC2312
#
# ops/terraform/remote_files/scripts/install-node.sh
#
# Provisions a Bacalhau node: installs Go, Docker, optional GPU drivers, an
# OpenResty health-check server, IPFS, Bacalhau, the OpenTelemetry collector
# and Promtail, mounts the data disk, persists secrets and starts the services.
#
# Configuration is read from /terraform_node/variables, which is sourced below.
set -euo pipefail
IFS=$'\n\t'

source /terraform_node/variables

# Installs Go 1.19.6 under /usr/local/go and links the binary into
# /usr/local/bin, replacing any previous installation.
function install-go() {
  echo "Installing Go..."
  # sudo for consistency with the tar/ln calls below, which write to the same paths.
  sudo rm -fr /usr/local/go /usr/local/bin/go
  curl --silent --show-error --location --fail https://go.dev/dl/go1.19.6.linux-amd64.tar.gz \
    | sudo tar --extract --gzip --file=- --directory=/usr/local
  sudo ln -s /usr/local/go/bin/go /usr/local/bin/go
  go version
}

# Adds the Docker apt repository (with its signing key) and installs the
# Docker engine, CLI, containerd and the compose plugin.
function install-docker() {
  echo "Installing Docker"
  sudo apt-get install -y \
    ca-certificates \
    curl \
    gnupg \
    lsb-release
  sudo mkdir -p /etc/apt/keyrings
  curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
    | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
  echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" \
    | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
  sudo apt-get update -y
  sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
}

# Installs the NVIDIA CUDA drivers and nvidia-docker2 when GPU_NODE is "true";
# otherwise only logs that the step is skipped.
# Globals: GPU_NODE (read)
function install-gpu() {
  if [[ "${GPU_NODE:-}" = "true" ]]; then
    echo "Installing GPU drivers"
    # Each step is its own statement so that `set -e` aborts on the first
    # failure; in an `a && b && c` chain only the last command is checked.
    local cuda_distribution container_distribution

    # The CUDA repo path uses the distribution id without dots (e.g. ubuntu2204).
    cuda_distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}" | sed -e 's/\.//g')
    wget https://developer.download.nvidia.com/compute/cuda/repos/"${cuda_distribution}"/x86_64/cuda-keyring_1.0-1_all.deb
    sudo dpkg -i cuda-keyring_1.0-1_all.deb

    # The libnvidia-container repo path keeps the dots (e.g. ubuntu22.04).
    container_distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
      | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
    curl -s -L https://nvidia.github.io/libnvidia-container/"${container_distribution}"/libnvidia-container.list \
      | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
      | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

    sudo apt-get update
    sudo apt-get install -y \
      linux-headers-"$(uname -r)" \
      cuda-drivers \
      nvidia-docker2
    sudo systemctl restart docker
    nvidia-smi # No idea why we have to run this once, but we do. Only then does nvidia-container-cli work.
  else
    echo "Not installing GPU drivers because GPU_NODE=${GPU_NODE:-}"
  fi
}

# Lay down a very basic web server to report when the node is healthy.
# Installs OpenResty from its apt repository and copies in the nginx.conf
# shipped in /terraform_node.
function install-healthcheck() {
  echo "Installing healthcheck"
  sudo apt-get -y install --no-install-recommends wget gnupg ca-certificates
  # NOTE(review): apt-key is deprecated in current apt releases; consider a
  # signed-by keyring as install-docker does.
  wget -O - https://openresty.org/package/pubkey.gpg | sudo apt-key add -
  echo "deb http://openresty.org/package/ubuntu $(lsb_release -sc) main" \
    | sudo tee /etc/apt/sources.list.d/openresty.list
  sudo apt-get update -y
  sudo apt-get -y install --no-install-recommends openresty
  sudo cp /terraform_node/nginx.conf /usr/local/openresty/nginx/conf/nginx.conf
}

# Downloads the go-ipfs release tarball and runs its bundled installer.
# Globals: IPFS_VERSION (read)
function install-ipfs() {
  echo "Installing IPFS"
  wget "https://dist.ipfs.tech/go-ipfs/${IPFS_VERSION}/go-ipfs_${IPFS_VERSION}_linux-amd64.tar.gz"
  tar -xvzf "go-ipfs_${IPFS_VERSION}_linux-amd64.tar.gz"
  # Subshell keeps the directory change from leaking into later steps.
  (cd go-ipfs && sudo bash install.sh)
  ipfs --version
}

# Installs Bacalhau from a git branch when BACALHAU_BRANCH is set, else from a
# release when BACALHAU_VERSION is set; exits with status 1 when neither is.
# Globals: BACALHAU_BRANCH, BACALHAU_VERSION (read)
function install-bacalhau() {
  # ${VAR:-} so that an unset variable reaches the else branch instead of
  # aborting under `set -u`.
  if [[ -n "${BACALHAU_BRANCH:-}" ]]; then
    install-bacalhau-from-source
  elif [[ -n "${BACALHAU_VERSION:-}" ]]; then
    install-bacalhau-from-release
  else
    echo "No bacalhau version or branch specified. Not installing bacalhau." >&2
    exit 1
  fi
}

# Downloads the released linux_amd64 tarball and installs the binary into
# /usr/local/bin.
# Globals: BACALHAU_VERSION (read)
function install-bacalhau-from-release() {
  echo "Installing Bacalhau from release ${BACALHAU_VERSION}"
  sudo apt-get -y install --no-install-recommends jq
  wget "https://github.com/filecoin-project/bacalhau/releases/download/${BACALHAU_VERSION}/bacalhau_${BACALHAU_VERSION}_linux_amd64.tar.gz"
  tar xfv "bacalhau_${BACALHAU_VERSION}_linux_amd64.tar.gz"
  sudo mv ./bacalhau /usr/local/bin/bacalhau
}

# Shallow-clones the given branch, builds the binary with Go and installs it
# into /usr/local/bin.
# Globals: BACALHAU_BRANCH (read)
function install-bacalhau-from-source() {
  echo "Installing Bacalhau from branch ${BACALHAU_BRANCH}"
  sudo apt-get -y install --no-install-recommends jq
  git clone --depth 1 --branch "${BACALHAU_BRANCH}" https://github.com/filecoin-project/bacalhau.git
  # Subshell keeps the directory change from leaking into later steps.
  (
    cd bacalhau \
      && GO111MODULE=on CGO_ENABLED=0 go build -gcflags '-N -l' -trimpath -o ./bacalhau \
      && sudo mv ./bacalhau /usr/local/bin/bacalhau
  )
}

# Installs the otelcol-contrib binary as /usr/local/bin/otelcol and writes its
# configuration to /terraform_node/otel-collector.yml. Skipped when
# OTEL_COLLECTOR_VERSION is empty. A pipeline is only added for a backend whose
# endpoint, user and API key are all non-empty.
# Globals: OTEL_COLLECTOR_VERSION, GRAFANA_CLOUD_{PROMETHEUS,TEMPO,LOKI}_{ENDPOINT,USER,API_KEY},
#          TERRAFORM_WORKSPACE (read)
function install-otel-collector() {
  echo "Installing otel collector"
  if [[ -z "${OTEL_COLLECTOR_VERSION:-}" ]]; then
    echo 'OTEL_COLLECTOR_VERSION is undefined. Skipping otel collector installation.'
  else
    sudo apt -y update
    # Only create the group/user when missing so a re-run does not abort.
    getent group otel > /dev/null || sudo groupadd --system otel
    id -u otel > /dev/null 2>&1 || sudo useradd -s /sbin/nologin --system -g otel otel
    sudo mkdir -p /etc/otel
    sudo mkdir -p /var/lib/otel
    wget "https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v${OTEL_COLLECTOR_VERSION}/otelcol-contrib_${OTEL_COLLECTOR_VERSION}_linux_amd64.tar.gz"
    tar xvf "otelcol-contrib_${OTEL_COLLECTOR_VERSION}_linux_amd64.tar.gz"
    sudo mv otelcol-contrib /usr/local/bin/otelcol
    # config file
    sudo tee /terraform_node/otel-collector.yml > /dev/null <<EOF

extensions:
  health_check:
  zpages:
    endpoint: :55679
  basicauth/prometheus:
    client_auth:
      username: ${GRAFANA_CLOUD_PROMETHEUS_USER}
      password: ${GRAFANA_CLOUD_PROMETHEUS_API_KEY}
  basicauth/tempo:
    client_auth:
      username: ${GRAFANA_CLOUD_TEMPO_USER}
      password: ${GRAFANA_CLOUD_TEMPO_API_KEY}
  basicauth/loki:
    client_auth:
      username: ${GRAFANA_CLOUD_LOKI_USER}
      password: ${GRAFANA_CLOUD_LOKI_API_KEY}

receivers:
  hostmetrics:
    scrapers:
      cpu:
      disk:
      load:
      filesystem:
      memory:
      network:
      paging:
  otlp:
    protocols:
      http:
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 5s
          static_configs:
            - targets: [ '0.0.0.0:8888' ]

exporters:
  prometheusremotewrite:
    endpoint: ${GRAFANA_CLOUD_PROMETHEUS_ENDPOINT}
    auth:
      authenticator: basicauth/prometheus
    resource_to_telemetry_conversion:
      enabled: true
  otlp:
    endpoint: ${GRAFANA_CLOUD_TEMPO_ENDPOINT}
    auth:
      authenticator: basicauth/tempo
  loki:
    endpoint: https://${GRAFANA_CLOUD_LOKI_ENDPOINT}/loki/api/v1/push
    auth:
      authenticator: basicauth/loki

processors:
  batch:
  memory_limiter:
    check_interval: 5s
    limit_mib: 4000
    spike_limit_mib: 500
  resourcedetection/gcp:
    detectors: [ env, gcp ]
    timeout: 2s
    override: false
  resource:
    attributes:
      - key: deployment.environment
        value: ${TERRAFORM_WORKSPACE}
        action: insert
      - key: service.namespace
        value: bacalhau
        action: insert
  attributes/metrics:
    actions:
      - pattern: net\.sock.+
        action: delete

service:
  extensions: [basicauth/tempo, basicauth/prometheus, basicauth/loki, zpages, health_check]
  pipelines:
EOF

    # The traces pipeline exports through the otlp exporter, which is
    # configured with the Tempo endpoint and credentials, so gate it on those.
    if [[ -n "${GRAFANA_CLOUD_TEMPO_ENDPOINT}" ]] && [[ -n "${GRAFANA_CLOUD_TEMPO_USER}" ]] && [[ -n "${GRAFANA_CLOUD_TEMPO_API_KEY}" ]]; then
      sudo tee -a /terraform_node/otel-collector.yml > /dev/null <<EOF
    traces:
      receivers: [otlp]
      processors: [memory_limiter, resourcedetection/gcp, resource, batch]
      exporters: [otlp]
EOF
    fi

    # The metrics pipeline exports through prometheusremotewrite, so gate it on
    # the Prometheus endpoint and credentials.
    if [[ -n "${GRAFANA_CLOUD_PROMETHEUS_ENDPOINT}" ]] && [[ -n "${GRAFANA_CLOUD_PROMETHEUS_USER}" ]] && [[ -n "${GRAFANA_CLOUD_PROMETHEUS_API_KEY}" ]]; then
      sudo tee -a /terraform_node/otel-collector.yml > /dev/null <<EOF
    metrics:
      receivers: [otlp, prometheus, hostmetrics]
      processors: [memory_limiter, resourcedetection/gcp, resource, attributes/metrics, batch]
      exporters: [prometheusremotewrite]
EOF
    fi

    if [[ -n "${GRAFANA_CLOUD_LOKI_ENDPOINT}" ]] && [[ -n "${GRAFANA_CLOUD_LOKI_USER}" ]] && [[ -n "${GRAFANA_CLOUD_LOKI_API_KEY}" ]]; then
      sudo tee -a /terraform_node/otel-collector.yml > /dev/null <<EOF

# disabled until promtail receiver is merged in collector-contrib
#    logs:
#      receivers: []
#      processors: [memory_limiter, resourcedetection/gcp, resource, batch]
#      exporters: [loki]
EOF
    fi
    sudo chown -R otel:otel /terraform_node/otel-collector.yml
  fi
}

# Installs the promtail binary into /usr/local/bin and writes its config to
# /terraform_node/promtail.yml and /etc/promtail/config.yml. Skipped unless
# LOKI_VERSION and all three GRAFANA_CLOUD_LOKI_* values are non-empty.
# Globals: LOKI_VERSION, GRAFANA_CLOUD_LOKI_{API_KEY,USER,ENDPOINT},
#          TERRAFORM_WORKSPACE (read); HOSTNAME (written)
function install-promtail() {
  echo "Installing Promtail/Loki"
  if [[ -z "${LOKI_VERSION:-}" ]] || [[ -z "${GRAFANA_CLOUD_LOKI_API_KEY:-}" ]] || [[ -z "${GRAFANA_CLOUD_LOKI_USER:-}" ]] || [[ -z "${GRAFANA_CLOUD_LOKI_ENDPOINT:-}" ]]; then
    echo 'Any of LOKI_VERSION, GRAFANA_CLOUD_LOKI_API_KEY, GRAFANA_CLOUD_LOKI_USER, GRAFANA_CLOUD_LOKI_ENDPOINT env variables is undefined. Skipping Promtail/Loki installation.'
  else
    cd ~
    curl -O -L "https://github.com/grafana/loki/releases/download/v${LOKI_VERSION}/promtail-linux-amd64.zip"
    # gunzip with -S strips the ".zip" suffix, leaving "promtail-linux-amd64".
    gunzip -S ".zip" promtail-linux-amd64.zip
    sudo chmod a+x "promtail-linux-amd64"
    sudo mv promtail-linux-amd64 /usr/local/bin/

    # config file
    HOSTNAME=$(hostname)

    sudo tee /terraform_node/promtail.yml > /dev/null <<EOF
server:
  http_listen_port: 0
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: https://${GRAFANA_CLOUD_LOKI_USER}:${GRAFANA_CLOUD_LOKI_API_KEY}@${GRAFANA_CLOUD_LOKI_ENDPOINT}/loki/api/v1/push

scrape_configs:
  - job_name: journal
    pipeline_stages:
      - json:
          expressions:
            level:
            msg:
      - drop:
          source: "level"
          expression: "(debug|trace)"
    journal:
      max_age: 12h
      labels:
        job: systemd-journal
        host: ${HOSTNAME}
        label_project: bacalhau
        environment: ${TERRAFORM_WORKSPACE}
    relabel_configs:
      - action: keep
        source_labels: [__journal__systemd_unit]
        regex: '^bacalhau\.service$'
      - source_labels: ['__journal__systemd_unit']
        target_label: 'systemd_unit'
EOF
    sudo mkdir -p /etc/promtail
    sudo cp /terraform_node/promtail.yml /etc/promtail/config.yml
  fi
}

# Waits for /dev/sdb to appear and mounts it at /data, creating an ext4
# filesystem first if the initial mount attempt fails.
function mount-disk() {
  echo "Mounting disk"
  # wait for /dev/sdb to exist
  while [[ ! -e /dev/sdb ]]; do
    sleep 1
    echo "waiting for /dev/sdb to exist"
  done
  # mount /dev/sdb at /data
  sudo mkdir -p /data
  # Nothing to do on a re-run; without this guard the failed re-mount would
  # fall through to mkfs.
  if mountpoint -q /data; then
    echo "/data is already mounted"
    return 0
  fi
  if ! sudo mount /dev/sdb /data; then
    sudo mkfs -t ext4 /dev/sdb
    sudo mount /dev/sdb /data
  fi
}

# make sure that "ipfs init" has been run
# Exports IPFS_PATH=/data/ipfs and initialises the repo only when
# /data/ipfs/version does not exist yet.
function init-ipfs() {
  echo "Initializing IPFS"
  sudo mkdir -p /data/ipfs
  export IPFS_PATH=/data/ipfs

  if [[ ! -e /data/ipfs/version ]]; then
    ipfs init
  fi
}

# install any secrets provided as terraform vars
# Starts from empty defaults, overlays values previously saved in
# /data/secrets.sh, then overlays any non-empty SECRETS_* values, writes the
# result back to /data/secrets.sh and strips the SECRETS_* lines from
# /terraform_node/variables.
# Globals: SECRETS_* (read); GRAFANA_CLOUD_*_API_KEY, ESTUARY_API_KEY (exported)
function install-secrets() {
  echo "Installing secrets"
  # set defaults
  export GRAFANA_CLOUD_PROMETHEUS_API_KEY=""
  export GRAFANA_CLOUD_TEMPO_API_KEY=""
  export GRAFANA_CLOUD_LOKI_API_KEY=""
  export ESTUARY_API_KEY=""
  if [[ -e /data/secrets.sh ]]; then
    source /data/secrets.sh
  fi

  # load new values if they were provided
  # (${VAR:-} so a missing SECRETS_* variable is treated as "not provided"
  # rather than aborting under `set -u`)
  if [[ -n "${SECRETS_GRAFANA_CLOUD_PROMETHEUS_API_KEY:-}" ]]; then
    export GRAFANA_CLOUD_PROMETHEUS_API_KEY="${SECRETS_GRAFANA_CLOUD_PROMETHEUS_API_KEY}"
  fi
  if [[ -n "${SECRETS_GRAFANA_CLOUD_TEMPO_API_KEY:-}" ]]; then
    export GRAFANA_CLOUD_TEMPO_API_KEY="${SECRETS_GRAFANA_CLOUD_TEMPO_API_KEY}"
  fi
  if [[ -n "${SECRETS_GRAFANA_CLOUD_LOKI_API_KEY:-}" ]]; then
    export GRAFANA_CLOUD_LOKI_API_KEY="${SECRETS_GRAFANA_CLOUD_LOKI_API_KEY}"
  fi
  if [[ -n "${SECRETS_ESTUARY_API_KEY:-}" ]]; then
    export ESTUARY_API_KEY="${SECRETS_ESTUARY_API_KEY}"
  fi

  # write the secrets to persistent disk
  sudo tee /data/secrets.sh > /dev/null <<EOG
export GRAFANA_CLOUD_PROMETHEUS_API_KEY="${GRAFANA_CLOUD_PROMETHEUS_API_KEY}"
export GRAFANA_CLOUD_TEMPO_API_KEY="${GRAFANA_CLOUD_TEMPO_API_KEY}"
export GRAFANA_CLOUD_LOKI_API_KEY="${GRAFANA_CLOUD_LOKI_API_KEY}"
export ESTUARY_API_KEY="${ESTUARY_API_KEY}"
EOG

  # clean up variables file from any secret
  # Edited in place: piping sed into tee on the same path lets tee truncate the
  # file before sed has read it.
  sudo sed -i -e '/^export SECRETS_/d' /terraform_node/variables
}

# if we are node zero, are in unsafe mode and don't have a private key
# then let's copy the unsafe private key so we have a deterministic id
# that other nodes will connect to
# Globals: BACALHAU_PORT, TERRAFORM_NODE_INDEX, BACALHAU_UNSAFE_CLUSTER (read);
#          BACALHAU_NODE_PRIVATE_KEY_PATH (exported)
function init-bacalhau() {
  echo "Initializing Bacalhau"
  export BACALHAU_NODE_PRIVATE_KEY_PATH="/data/.bacalhau/private_key.${BACALHAU_PORT}"
  sudo mkdir -p /data/.bacalhau
  if [[ "${TERRAFORM_NODE_INDEX}" == "0" ]] && [[ -n "${BACALHAU_UNSAFE_CLUSTER:-}" ]] && [[ ! -f "${BACALHAU_NODE_PRIVATE_KEY_PATH}" ]]; then
    echo "WE ARE NOW INSTALLING THE UNSAFE KEY YO"
    sudo cp /terraform_node/bacalhau-unsafe-private-key "${BACALHAU_NODE_PRIVATE_KEY_PATH}"
    sudo chmod 0600 "${BACALHAU_NODE_PRIVATE_KEY_PATH}"
  fi
}

# Reloads systemd, enables and starts the ipfs, bacalhau, otel and promtail
# units, then reloads openresty.
function start-services() {
  sudo systemctl daemon-reload
  sudo systemctl enable ipfs
  sudo systemctl enable bacalhau
  sudo systemctl enable otel
  sudo systemctl enable promtail
  sudo systemctl start ipfs
  sudo systemctl start bacalhau
  sudo systemctl start otel
  sudo systemctl start promtail
  sudo service openresty reload
}

# Runs every provisioning step in order. install-secrets runs before the
# otel/promtail steps because those read the exported API keys.
function install() {
  install-go
  install-docker
  install-gpu
  install-healthcheck
  install-ipfs
  install-bacalhau
  mount-disk
  init-ipfs
  init-bacalhau
  install-secrets
  install-otel-collector
  install-promtail
  start-services
}

install