# Source: github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/.gitlab-ci.yml

# Default image for every job that does not set its own `image`.
image: aistorage/ci:latest

stages:
  - build
  - test-short
  - test-long
  - security

include:
  - project: pstooling/gitlab-templates
    ref: main
    file: /templates/static-analysis/Checkmarx-main-csv.gitlab-ci.yml

variables:
  MODE: debug # run aistore and tests with debug asserts enabled
  SCRIPTS_DIR: "./scripts"
  NUM_TARGET:
    value: "5"
    description: "Number of targets deployed."
  NUM_PROXY:
    value: "5"
    description: "Number of proxies deployed."
  FS_CNT:
    value: "6"
    description: "Number of disks defined for each target."
  CLOUD_BCKS:
    value: "aws://ais-blossom gs://ais-ci-kube" # NOTE: additionally used: aws://ais-cicd and aws://ais-ci-python
    description: "Set of buckets used when running cloud tests."
  GOOGLE_APPLICATION_CREDENTIALS:
    value: "$GCP_JSON_FILE"
    description: "Gitlab file variable containing credentials for GCP"


# Templates

# Stops the nodes, then copies their *.INFO.* log files into logs/ so they are
# uploaded as artifacts when the job fails.
.gather_logs_template: &gather_logs_def
  after_script:
    - make kill # To make sure that nodes flushed the logs.
    # -p: an already existing logs/ directory must not abort log collection.
    - mkdir -p $CI_PROJECT_DIR/logs
    - find /tmp/ais -type f -name "*\.INFO\.*" -exec cp {} $CI_PROJECT_DIR/logs/ \;
  artifacts:
    when: on_failure
    paths: [ logs/ ]
    expire_in: 1 days

# Default triggers: the main branch, merge requests, schedules and pipelines
# started from the GitLab UI.
.default_only_template: &default_only_def
  only:
    - main
    - merge_requests
    - schedules
    # The `only` keyword for UI-triggered pipelines is `web`; any other spelling
    # is treated as a ref name.
    - web

# Short tests: default triggers plus log gathering; skipped for merge requests
# labelled `skip-ci` or `python-tests-only`.
.test_short_template: &test_short_def
  # One merge key with a list instead of two `<<` keys in the same mapping
  # (duplicate mapping keys are invalid YAML). The merged keys do not overlap.
  <<: [*default_only_def, *gather_logs_def]
  stage: test-short
  tags:
    - ais
  timeout: 30m
  except:
    variables:
      - $CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/
      - $CI_MERGE_REQUEST_LABELS =~ /.*python-tests-only.*/

# Short tests offered only as a manual job on merge requests and on main
# (there is no rule for scheduled pipelines).
.test_short_skip_scheduled_template: &test_short_skip_scheduled_def
  stage: test-short
  tags:
    - ais
  timeout: 30m
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  <<: *gather_logs_def


# Short tests that run automatically on scheduled and web pipelines and are
# manual on merge requests and on main.
.test_short_optional_template: &test_short_optional_def
  stage: test-short
  tags:
    - ais
  timeout: 30m
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" || $CI_PIPELINE_SOURCE == "web"'
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  <<: *gather_logs_def

# Long tests that run automatically on schedules and are manual on merge
# requests, on main and on web pipelines.
.test_long_template: &test_long_def
  stage: test-long
  tags:
    - ais
  # NOTE: when changing, make sure to update $SCRIPTS_DIR/bootstrap.sh and GitLab /settings/ci_cd
  timeout: 4h
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main" || $CI_PIPELINE_SOURCE == "web"'
      when: manual
      allow_failure: true
  <<: *gather_logs_def

# Long tests offered only as a manual job on merge requests and on main.
.test_long_skip_scheduled_template: &test_long_skip_scheduled_def
  stage: test-long
  tags:
    - ais
  # NOTE: when changing, make sure to update $SCRIPTS_DIR/bootstrap.sh and GitLab /settings/ci_cd
  timeout: 4h
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  <<: *gather_logs_def

# Stages

# Builds the node binary in several configurations, then authn, cli and aisloader.
build:linux:
  stage: build
  tags:
    - ais
  timeout: 10m
  <<: *default_only_def
  script:
    - MODE="" make node # Build node without backends in production mode.
    - MODE="debug" make node # Build node without backends in debug mode.
    - AIS_BACKEND_PROVIDERS="aws azure gcp" MODE="" make node # Build with all backends (production mode).
    - AIS_BACKEND_PROVIDERS="aws azure gcp" MODE="debug" make node # Build with all backends (debug mode).
    - MEM_PROFILE="/tmp/mem" CPU_PROFILE="/tmp/cpu" make node # Build with profile.
    - TAGS="nethttp" make node # Build with net/http transport support (fasthttp is used by default).
    - make authn
    - make cli
    - make aisloader

# Builds the aisnode image with buildah and pushes it to $CI_REGISTRY_IMAGE,
# tagged with the short commit SHA.
build:k8s:
  stage: build
  image: quay.io/buildah/stable
  variables:
    # Use vfs with buildah. Docker offers overlayfs as a default, but Buildah
    # cannot stack overlayfs on top of another overlayfs filesystem.
    STORAGE_DRIVER: vfs
    # Write all image metadata in the docker format, not the standard OCI format.
    BUILDAH_FORMAT: docker
    FQ_IMAGE_NAME: "$CI_REGISTRY_IMAGE/aisnode:$CI_COMMIT_SHORT_SHA"

  before_script:
    - buildah login -u $CI_REGISTRY_USER -p $CI_REGISTRY_TOKEN $CI_REGISTRY
  script:
    - buildah images
    - buildah build -t $FQ_IMAGE_NAME -f deploy/dev/k8s/Dockerfile --build-arg MODE="debug" --build-arg providers="gcp"
    - buildah images
    - buildah push $FQ_IMAGE_NAME
  rules:
    # Rules are evaluated in order and the first match wins, so the `skip-ci`
    # opt-out has to precede the rules that add the job.
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
      when: never
    - if: '$CI_COMMIT_BRANCH == "main"'
      allow_failure: false
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*etl.*/'
      allow_failure: false
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
      allow_failure: false

# Static checks: linter, formatting and spelling.
lint:linux:
  stage: build
  tags:
    - ais
  timeout: 10m
  <<: *default_only_def
  script:
    - make lint
    - make fmt-check
    - make spell-check

# Runs cluster with 5 proxies and 5 targets (each with 6 mountpaths).
test:short:
  <<: *test_short_def
  variables:
    BUCKET: "ais://test"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --deployment all
    - make test-short
    - FLAGS="--duration=10s" make test-aisloader

# Runs cluster with 5 proxies and 5 targets (each with 6 mountpaths).
# Python SDK, botocore and short pyaisloader tests against an AWS-enabled
# deployment.
test:short:python:
  <<: *test_short_def
  variables:
    AIS_ENDPOINT: 'http://localhost:8080'
    BUCKET: 'aws://ais-ci-python'
  script:
    - >-
      ${SCRIPTS_DIR}/clean_deploy.sh
      --target-cnt $NUM_TARGET
      --proxy-cnt $NUM_PROXY
      --mountpath-cnt $FS_CNT
      --deployment all
      --aws
    - cd python
    - make python_sdk_tests
    - make python_botocore_tests
    - make PYAISLOADER_TEST_TYPE=short test-pyaisloader

  # Replaces the template's `except`: only the `skip-ci` label excludes this job.
  except:
    variables:
      - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'

# PyTorch unit tests; excluded when the labels match `skip-ci` or do not match
# `pytorch`.
test:short:pytorch:
  <<: *test_short_def
  variables:
    AIS_ENDPOINT: 'http://localhost:8080'
    BUCKET: 'aws://ais-ci-python'
  script:
    - >-
      ${SCRIPTS_DIR}/clean_deploy.sh
      --target-cnt $NUM_TARGET
      --proxy-cnt $NUM_PROXY
      --mountpath-cnt $FS_CNT
      --deployment all
      --aws
    - cd python
    - make python_pytorch_unit_tests
  except:
    variables:
      - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
      - '$CI_MERGE_REQUEST_LABELS !~ /.*pytorch.*/'

# Python ETL tests on the `ais-k8s` runners with a single proxy; excluded when
# the labels match `skip-ci` or do not match `etl`.
test:short:python-etl:
  <<: *test_short_def
  tags: [ais-k8s]
  script:
    - >-
      ${SCRIPTS_DIR}/clean_deploy.sh
      --target-cnt $NUM_TARGET
      --proxy-cnt 1
      --mountpath-cnt $FS_CNT
      --deployment all
    - cd python
    - make python_etl_tests
  except:
    variables:
      - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
      - '$CI_MERGE_REQUEST_LABELS !~ /.*etl.*/'
  # Retry up to two times on the failure types listed below.
  retry:
    max: 2
    when:
      - unknown_failure
      - api_failure
      - stuck_or_timeout_failure
      - runner_system_failure
      - job_execution_timeout

# Runs cluster with 1 proxy and 1 target (with 6 mountpaths).
test:short:minimal:
  <<: *test_short_def
  variables:
    BUCKET: "ais://test"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt 1 --proxy-cnt 1 --mountpath-cnt $FS_CNT --deployment all
    - NUM_PROXY=1 NUM_TARGET=1 make test-short

# S3 compatibility tests; excluded on schedules and when the labels do not
# match `s3-compat`.
test:short:s3-compat:
  <<: *test_short_def
  tags:
    - ais
  variables:
    # Run only s3 compat tests we know should pass
    S3_COMPAT_RUN_ALL:
      value: "False"
    # Fail on first failure
    S3_COMPAT_STRICT:
      value: "True"

  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --deployment all
    - ais config cluster features Provide-S3-API-via-Root
    - ais config cluster checksum.type=md5
    - cd python
    - make python_s3_compat_test
  except:
    variables:
      - $CI_MERGE_REQUEST_LABELS !~ /.*s3-compat.*/
      - $CI_PIPELINE_SOURCE == "schedule"


# Runs the tests selected by RE with AuthN enabled, logged in as the superuser.
test:short:authn:
  <<: *test_short_optional_def
  variables:
    AIS_AUTHN_ENABLED: "true"
    AIS_AUTHN_SU_NAME: "admin"
    AIS_AUTHN_SU_PASS: "admin"
    AIS_AUTHN_URL: "http://localhost:52001"
    BUCKET: "ais://test"
    RE: "TestAuth"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT
    - ais auth login $AIS_AUTHN_SU_NAME -p $AIS_AUTHN_SU_PASS
    - make test-run
    - ais auth logout

# Generates a self-signed certificate for localhost, deploys with --https and
# runs the short tests.
test:short:https:
  <<: *test_short_optional_def
  variables:
    AIS_USE_HTTPS: "true"
    AIS_SKIP_VERIFY_CRT: "true"
    AIS_SERVER_CRT: "$CI_PROJECT_DIR/localhost.crt"
    AIS_SERVER_KEY: "$CI_PROJECT_DIR/localhost.key"
    AIS_ENDPOINT: "https://localhost:8080"
    BUCKET: "ais://ais-ci"
  script:
    - openssl req -x509 -out $AIS_SERVER_CRT -keyout $AIS_SERVER_KEY -newkey rsa:2048 -nodes -sha256 -subj '/CN=localhost' -extensions EXT -config <( printf "[dn]\nCN=localhost\n[req]\ndistinguished_name = dn\n[EXT]\nsubjectAltName=DNS:localhost\nkeyUsage=digitalSignature\nextendedKeyUsage=serverAuth")
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --deployment all --https
    - make test-short

# Short tests with the S3-Reverse-Proxy cluster feature enabled.
test:short:s3rproxy:
  <<: *test_short_skip_scheduled_def
  variables:
    BUCKET: "ais://ais-ci"
    RE: "S3"
  script:
    # Uses ${SCRIPTS_DIR} like every other job instead of a hard-coded path.
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT
    - ais config cluster features S3-Reverse-Proxy
    - make test-short

test:short:aws:
  <<: *test_short_skip_scheduled_def
  variables:
    BUCKET: "aws://ais-cicd"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --aws
    - make test-short

test:short:gcp:
  <<: *test_short_skip_scheduled_def
  variables:
    BUCKET: "gs://ais-ci"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --gcp
    - make test-short

test:short:azure:
  <<: *test_short_skip_scheduled_def
  variables:
    BUCKET: "az://ais-ci"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --azure
    - make test-short

# Long tests; each job below overrides NUM_PROXY to 6. Variable values are
# quoted so they are always passed as strings.
test:long:
  <<: *test_long_skip_scheduled_def
  variables:
    NUM_PROXY: "6"
    BUCKET: "ais://ais-ci"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT
    - make test-long

test:long:aws:
  <<: *test_long_def
  variables:
    NUM_PROXY: "6"
    BUCKET: "aws://ais-cicd"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --aws
    - make test-long

test:long:gcp:
  <<: *test_long_def
  variables:
    NUM_PROXY: "6"
    BUCKET: "gs://ais-ci"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --gcp
    - make test-long

test:long:azure:
  <<: *test_long_def
  variables:
    NUM_PROXY: "6"
    BUCKET: "az://ais-ci"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --azure
    - make test-long

# aisloader (5 minutes) followed by the long pyaisloader test; manual on merge
# requests and on main.
test:long:aisloader:
  stage: test-long
  tags:
    - ais
  timeout: 10m
  variables:
    AIS_ENDPOINT: "http://localhost:8080"
  script:
    - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT
    - sleep 10 # make sure that cluster properly starts
    - FLAGS="--duration=5m" make test-aisloader
    - cd ./python; make PYAISLOADER_TEST_TYPE=long test-pyaisloader
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true


#
# Kubernetes stages
#

# Base for all Kubernetes jobs: deploys via `make deploy-ci` before the script,
# collects logs and runs `make cleanup-ci` afterwards.
.test_k8s:
  tags:
    - ais-k8s
  variables:
    BUCKET: "gs://ais-ci-kube"
    TESTS_DIR: "ais/test"
    AISNODE_IMAGE: "$CI_REGISTRY_IMAGE/aisnode:$CI_COMMIT_SHORT_SHA"
    AIS_ENDPOINT: "http://ais-proxy-0.default.svc.cluster.local:8080"
    FS_CNT: "6"
    RE: "TestETL|TestConfig|TestMountpath"
    PROVIDERS: gcp
  before_script:
    - kubectl delete pods,services -l nvidia.com/ais-etl-name # TODO: this can be removed once the lifecycle of transformers is implemented.
    - cd deploy/dev/k8s
    - make deploy-ci
    - cd ../../..
    - make cli
  after_script:
    # -p: an already existing logs/ directory must not abort log collection
    # and the cleanup that follows it.
    - mkdir -p $CI_PROJECT_DIR/logs
    - find /tmp/ais -type f -name "*\.INFO\.*" -exec cp {} $CI_PROJECT_DIR/logs/ \;
    - cd deploy/dev/k8s
    - make cleanup-ci
  artifacts:
    when: on_failure
    paths: [ logs/ ]
    expire_in: 1 days

# Kubernetes short tests: 1 proxy, 1 target.
.test_k8s_short_template:
  stage: test-short
  extends: .test_k8s
  variables:
    NUM_PROXY: "1"
    NUM_TARGET: "1"

# Kubernetes long tests: 1 proxy, 5 targets.
.test_k8s_long_template:
  stage: test-long
  extends: .test_k8s
  variables:
    NUM_PROXY: "1"
    NUM_TARGET: "5"

test:short:k8s:
  extends: .test_k8s_short_template
  timeout: 1h
  only:
    - merge_requests
    - schedules
  except:
    variables:
      - $CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/
      - $CI_MERGE_REQUEST_LABELS =~ /.*python-tests-only.*/
      - $CI_MERGE_REQUEST_LABELS !~ /.*etl.*/
  variables:
    RE: "TestETL|TestConfig|TestMountpath"
  script:
    - make test-short

# e.g. RE: "ETLBucket|ETLConnectionError|ETLInitCode" (or any other regex to select tests)
test:short:assorted:k8s:
  extends: .test_k8s_short_template
  timeout: 30m
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  variables:
    RE: "ETLAnyToAny|ETLMultiObj"
  script:
    - make test-run

test:long:k8s:
  extends: .test_k8s_long_template
  ## NOTE: when changing, make sure to update $SCRIPTS_DIR/bootstrap.sh and GitLab /settings/ci_cd
  timeout: 4h
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main" || $CI_PIPELINE_SOURCE == "web"'
      when: manual
      allow_failure: true
  script:
    - make test-run

test:long:k8s:single-target:
  extends: .test_k8s_long_template
  timeout: 4h
  rules:
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  variables:
    NUM_TARGET: "1"
  script:
    - make test-run

test:long:k8s:aisloader:
  extends: .test_k8s_long_template
  timeout: 15m
  rules:
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  script:
    - sleep 10 # Give some time for the cluster to stabilize.
    - make aisloader
    - BUCKET="ais://test" FLAGS="--duration=2m --etl" make test-aisloader

# Runs the K8s test selection and then the long tests plus aisloader for every
# bucket in CLOUD_BCKS. All suites run even if an earlier one fails; the job
# fails at the end if any of them returned a non-zero exit code.
test:long:k8s:all:
  extends: .test_k8s_long_template
  timeout: 5h
  rules:
    - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
      when: manual
      allow_failure: true
  variables:
    PROVIDERS: "gcp,aws"
  # NOTE(review): this replaces the base before_script and, unlike it, does not
  # run `make cli` - confirm that is intended.
  before_script:
    - kubectl delete pods,services -l nvidia.com/ais-etl-name # TODO: this can be removed once the lifecycle of transformers is implemented.
    - cd deploy/dev/k8s
    - make deploy-ci
    - cd ../../..
    # Make sure that metrics collection is enabled.
    - git clone https://github.com/prometheus-operator/kube-prometheus.git
    - kubectl apply -f kube-prometheus/manifests/setup && kubectl apply -f kube-prometheus/manifests && rm -rf kube-prometheus
  script:
    - |
      # `cmd || exit_code=$?` records a failure without aborting the script,
      # so the remaining suites still run.
      result=0
      echo "----- RUNNING K8S TESTS -----"
      exit_code=0
      BUCKET="aws://ais-blossom" RE="TestETL|TestConfig|TestMountpath" make test-run || exit_code=$?
      result=$((result + exit_code))
      echo "----- K8S TESTS FINISHED WITH: ${exit_code} -----"
    - |
      for bucket in ${CLOUD_BCKS}; do
        echo "----- RUNNING LONG TESTS WITH: ${bucket} -----"
        exit_code=0
        # NOTE(review): BUCKET=${bucket} applies to `make test-long` only;
        # `make test-aisloader` sees the job-level BUCKET - confirm intended.
        { BUCKET=${bucket} make test-long && make test-aisloader; } || exit_code=$?
        result=$((result + exit_code))
        echo "----- LONG TESTS FINISHED WITH: ${exit_code} -----"
      done
    - |
      # Fail the job if any suite above failed.
      if [ "${result}" -ne 0 ]; then
        echo "----- ONE OR MORE TEST SUITES FAILED -----"
        exit 1
      fi

# Runs on scheduled and web pipelines only; failures do not fail the pipeline.
checkmarx-scan-csv:
  stage: security
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" || $CI_PIPELINE_SOURCE == "web"'
      allow_failure: true