github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/.gitlab-ci.yml (about)

     1  image: aistorage/ci:latest  # default image for jobs that do not set their own `image:`
     2  
     3  stages: [build, test-short, test-long, security]
        # Stages run in the order listed above.
     8  
     9  include:
          # Static-analysis (Checkmarx) job template pulled from another GitLab project.
    10    - project: 'pstooling/gitlab-templates'
    11      ref: 'main'
    12      file: '/templates/static-analysis/Checkmarx-main-csv.gitlab-ci.yml'
    13  
    14  variables:
          # Run aistore and tests with debug asserts enabled.
    15    MODE: 'debug'
    16    SCRIPTS_DIR: './scripts'
    17    NUM_TARGET:
    18      value: '5'
    19      description: 'Number of targets deployed.'
    20    NUM_PROXY:
    21      value: '5'
    22      description: 'Number of proxies deployed.'
    23    FS_CNT:
    24      value: '6'
    25      description: 'Number of disks defined for each target.'
    26    CLOUD_BCKS:
            # NOTE: additionally used: aws://ais-cicd and aws://ais-ci-python
    27      value: 'aws://ais-blossom gs://ais-ci-kube'
    28      description: 'Set of buckets used when running cloud tests.'
    29    GOOGLE_APPLICATION_CREDENTIALS:
    30      value: '$GCP_JSON_FILE'
    31      description: 'Gitlab file variable containing credentials for GCP'
    32  
    33  
    34  # Templates
    35  
    36  .gather_logs_template: &gather_logs_def
          # Mixin: after the job, copy node logs into logs/ and keep them as artifacts if the job failed.
    37    after_script:
            # To make sure that nodes flushed the logs.
    38      - make kill
    39      - mkdir $CI_PROJECT_DIR/logs
    40      - find /tmp/ais -type f -name "*\.INFO\.*" -exec cp {} $CI_PROJECT_DIR/logs/ \;
    41    artifacts:
    42      when: on_failure
    43      paths:
              - logs/
    44      expire_in: 1 days
    45  
    46  .default_only_template: &default_only_def
          # Run on `main`, on merge requests, on schedules and for pipelines started from the web UI.
    47    only:
    48      - main
    49      - merge_requests
    50      - schedules
            # `web` is the GitLab keyword for UI-triggered pipelines; `webs` would be read as a branch name.
    51      - web
    52  
    53  .test_short_template: &test_short_def
          # Base for short tests: default triggers plus log gathering; skipped when the MR labels
          # match `skip-ci` or `python-tests-only`.
    54    stage: test-short
    55    tags:
    56      - ais
    57    timeout: 30m
          # One merge key with a list instead of two separate `<<` keys: duplicate mapping keys are
          # invalid YAML. The two anchors define disjoint keys, so the merged result is unchanged.
    58    <<: [*default_only_def, *gather_logs_def]
    59    except:
    60      variables:
    61        - $CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/
    62        - $CI_MERGE_REQUEST_LABELS =~ /.*python-tests-only.*/
    64  
    65  .test_short_skip_scheduled_template: &test_short_skip_scheduled_def
          # Short-test base with a single rule: a manual, non-blocking job on merge requests and on `main`.
    66    stage: test-short
    67    tags: [ais]
    69    timeout: 30m
    70    rules:
    71      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
    72        when: manual
    73        allow_failure: true
    74    <<: *gather_logs_def
    75  
    76  
    77  .test_short_optional_template: &test_short_optional_def
          # Short-test base: runs automatically for scheduled and web-triggered pipelines;
          # manual and non-blocking on merge requests and on `main`.
    78    stage: test-short
    79    tags: [ais]
    81    timeout: 30m
    82    rules:
    83      - if: >-
                $CI_PIPELINE_SOURCE == "schedule" ||
                $CI_PIPELINE_SOURCE == "web"
    84      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
    85        when: manual
    86        allow_failure: true
    87    <<: *gather_logs_def
    88  
    89  .test_long_template: &test_long_def
          # Long-test base: runs automatically on schedules; manual and non-blocking on merge
          # requests, on `main` and for web-triggered pipelines.
    90    stage: test-long
    91    tags: [ais]
    93    # NOTE: when changing, make sure to update $SCRIPTS_DIR/bootstrap.sh and GitLab /settings/ci_cd
    94    timeout: 4h
    95    rules:
    96      - if: '$CI_PIPELINE_SOURCE == "schedule"'
    97      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main" ||
                $CI_PIPELINE_SOURCE == "web"
    98        when: manual
    99        allow_failure: true
   100    <<: *gather_logs_def
   101  
   102  .test_long_skip_scheduled_template: &test_long_skip_scheduled_def
          # Long-test base with a single rule: a manual, non-blocking job on merge requests and on `main`.
   103    stage: test-long
   104    tags: [ais]
   106    # NOTE: when changing, make sure to update $SCRIPTS_DIR/bootstrap.sh and GitLab /settings/ci_cd
   107    timeout: 4h
   108    rules:
   109      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
   110        when: manual
   111        allow_failure: true
   112    <<: *gather_logs_def
   113  
   114  # Jobs
   115  
   116  build:linux:
          # Builds the node in each supported configuration, then authn, cli and aisloader.
   117    stage: build
   118    tags: [ais]
   120    timeout: 10m
   121    <<: *default_only_def
   122    script:
            # Build node without backends in production mode.
   123      - MODE="" make node
            # Build node without backends in debug mode.
   124      - MODE="debug" make node
            # Build with all backends (production mode).
   125      - AIS_BACKEND_PROVIDERS="aws azure gcp" MODE="" make node
            # Build with all backends (debug mode).
   126      - AIS_BACKEND_PROVIDERS="aws azure gcp" MODE="debug" make node
            # Build with profile.
   127      - MEM_PROFILE="/tmp/mem" CPU_PROFILE="/tmp/cpu" make node
            # Build with net/http transport support (fasthttp is used by default).
   128      - TAGS="nethttp" make node
   129      - make authn
   130      - make cli
   131      - make aisloader
   132  
   133  build:k8s:
          # Builds the aisnode image with Buildah from deploy/dev/k8s/Dockerfile and pushes it to
          # the project registry, tagged with the short commit SHA.
   134    stage: build
   135    image: quay.io/buildah/stable
   136    variables:
   137      # Use vfs with buildah. Docker offers overlayfs as a default, but Buildah
   138      # cannot stack overlayfs on top of another overlayfs filesystem.
   139      STORAGE_DRIVER: vfs
   140      # Write all image metadata in the docker format, not the standard OCI format.
   141      BUILDAH_FORMAT: docker
   142      FQ_IMAGE_NAME: "$CI_REGISTRY_IMAGE/aisnode:$CI_COMMIT_SHORT_SHA"
   144    before_script:
   145      - buildah login -u $CI_REGISTRY_USER -p $CI_REGISTRY_TOKEN $CI_REGISTRY
   146    script:
   147      - buildah images
   148      - buildah build -t $FQ_IMAGE_NAME -f deploy/dev/k8s/Dockerfile --build-arg MODE="debug" --build-arg providers="gcp"
   149      - buildah images
   150      - buildah push $FQ_IMAGE_NAME
   151    rules:
            # Rules are evaluated top-down and the first match wins, so the `skip-ci` opt-out
            # must come first; as the last rule it could never exclude the job.
   152      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
   153        when: never
   154      - if: '$CI_COMMIT_BRANCH == "main"'
   155        allow_failure: false
   156      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*etl.*/'
   157        allow_failure: false
   158      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
   159        allow_failure: false
   160  
   161  lint:linux:
          # Static checks: lint, formatting check and spell check.
   162    stage: build
   163    tags: [ais]
   165    timeout: 10m
   166    <<: *default_only_def
   167    script:
   168      - 'make lint'
   169      - 'make fmt-check'
   170      - 'make spell-check'
   169      - make fmt-check
   170      - make spell-check
   171  
   172  # Runs cluster with 5 proxies and 5 targets (each with 6 mountpaths).
   173  test:short:
   174    <<: *test_short_def
   175    variables:
   176      BUCKET: 'ais://test'
   177    script:
            # Folded scalar: the lines below are joined with single spaces into one command.
   178      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --deployment all
   179      - make test-short
   180      - FLAGS="--duration=10s" make test-aisloader
   181  
   182  # Runs cluster with 5 proxies and 5 targets (each with 6 mountpaths).
   183  test:short:python:
   184    <<: *test_short_def
   185    variables:
   186      AIS_ENDPOINT: 'http://localhost:8080'
   187      BUCKET: 'aws://ais-ci-python'
   188    script:
            # Folded scalar: the lines below are joined with single spaces into one command.
   189      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --deployment all
              --aws
   190      - cd python
   191      - make python_sdk_tests
   192      - make python_botocore_tests
   193      - make PYAISLOADER_TEST_TYPE=short test-pyaisloader
          # Replaces the template's `except`: only the `skip-ci` label excludes this job.
   195    except:
   196      variables:
   197        - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
   198  
   199  test:short:pytorch:
          # PyTorch unit tests against a freshly deployed cluster with the AWS backend.
   200    <<: *test_short_def
   201    variables:
   202      AIS_ENDPOINT: 'http://localhost:8080'
   203      BUCKET: 'aws://ais-ci-python'
   204    script:
   205      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --deployment all
              --aws
   206      - cd python
   207      - make python_pytorch_unit_tests
          # Excluded when the MR labels match `skip-ci` or do not match `pytorch`.
   208    except:
   209      variables:
   210        - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
   211        - '$CI_MERGE_REQUEST_LABELS !~ /.*pytorch.*/'
   212  
   213  test:short:python-etl:
          # Python ETL tests; scheduled on `ais-k8s` runners instead of the template's `ais` tag.
   214    <<: *test_short_def
   215    tags: [ais-k8s]
   217    script:
   218      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt 1
              --mountpath-cnt $FS_CNT
              --deployment all
   219      - cd python
   220      - make python_etl_tests
          # Excluded when the MR labels match `skip-ci` or do not match `etl`.
   221    except:
   222      variables:
   223        - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
   224        - '$CI_MERGE_REQUEST_LABELS !~ /.*etl.*/'
          # Retry up to twice, but only for the listed failure reasons.
   225    retry:
   226      max: 2
   227      when:
   228        - 'unknown_failure'
   229        - 'api_failure'
   230        - 'stuck_or_timeout_failure'
   231        - 'runner_system_failure'
   232        - 'job_execution_timeout'
   233  
   234  # Runs cluster with 1 proxy and 1 target (with 6 mountpaths).
   235  test:short:minimal:
   236    <<: *test_short_def
   237    variables:
   238      BUCKET: 'ais://test'
   239    script:
   240      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt 1
              --proxy-cnt 1
              --mountpath-cnt $FS_CNT
              --deployment all
            # The node counts are overridden for this command only, to match the deployment above.
   241      - NUM_PROXY=1 NUM_TARGET=1 make test-short
   242  
   243  test:short:s3-compat:
          # S3 compatibility tests against a cluster configured with the
          # Provide-S3-API-via-Root feature and md5 checksums.
   244    <<: *test_short_def
   245    tags: [ais]
   247    variables:
   248      # Run only s3 compat tests we know should pass
   249      S3_COMPAT_RUN_ALL:
   250        value: 'False'
   251      # Fail on first failure
   252      S3_COMPAT_STRICT:
   253        value: 'True'
   255    script:
   256      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --deployment all
   257      - ais config cluster features Provide-S3-API-via-Root
   258      - ais config cluster checksum.type=md5
   259      - cd python
   260      - make python_s3_compat_test
          # Excluded when the MR labels do not match `s3-compat`, and on scheduled pipelines.
   261    except:
   262      variables:
   263        - '$CI_MERGE_REQUEST_LABELS !~ /.*s3-compat.*/'
   264        - '$CI_PIPELINE_SOURCE == "schedule"'
   265  
   266  
   267  test:short:authn:
          # Runs the tests selected by RE with AuthN enabled, logged in as the AuthN superuser.
   268    <<: *test_short_optional_def
   269    variables:
   270      AIS_AUTHN_ENABLED: 'true'
   271      AIS_AUTHN_SU_NAME: 'admin'
   272      AIS_AUTHN_SU_PASS: 'admin'
   273      AIS_AUTHN_URL: 'http://localhost:52001'
   274      BUCKET: 'ais://test'
   275      RE: 'TestAuth'
   276    script:
   277      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
   278      - ais auth login $AIS_AUTHN_SU_NAME -p $AIS_AUTHN_SU_PASS
   279      - make test-run
   280      - ais auth logout
   281  
   282  test:short:https:
          # Short tests against a cluster deployed with --https, using a self-signed
          # certificate for `localhost` that the job generates itself.
   283    <<: *test_short_optional_def
   284    variables:
   285      AIS_USE_HTTPS: "true"
            # The certificate below is self-signed, hence verification is skipped.
   286      AIS_SKIP_VERIFY_CRT: "true"
   287      AIS_SERVER_CRT: "$CI_PROJECT_DIR/localhost.crt"
   288      AIS_SERVER_KEY: "$CI_PROJECT_DIR/localhost.key"
   289      AIS_ENDPOINT: "https://localhost:8080"
   290      BUCKET: "ais://ais-ci"
   291    script:
            # Create an unencrypted 2048-bit RSA key and a self-signed certificate (CN and SAN
            # `localhost`); the openssl config is passed inline via process substitution.
   292      - openssl req -x509 -out $AIS_SERVER_CRT -keyout $AIS_SERVER_KEY -newkey rsa:2048 -nodes -sha256 -subj '/CN=localhost' -extensions EXT -config <( printf "[dn]\nCN=localhost\n[req]\ndistinguished_name = dn\n[EXT]\nsubjectAltName=DNS:localhost\nkeyUsage=digitalSignature\nextendedKeyUsage=serverAuth")
   293      - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT --deployment all --https
   294      - make test-short
   295  
   296  test:short:s3rproxy:
          # Short tests matching RE "S3" with the S3-Reverse-Proxy cluster feature enabled.
   297    <<: *test_short_skip_scheduled_def
   298    variables:
   299      BUCKET: "ais://ais-ci"
   300      RE: "S3"
   301    script:
            # Use ${SCRIPTS_DIR} like every other job instead of the hard-coded `deploy/scripts` path.
   302      - ${SCRIPTS_DIR}/clean_deploy.sh --target-cnt $NUM_TARGET --proxy-cnt $NUM_PROXY --mountpath-cnt $FS_CNT
   303      - ais config cluster features S3-Reverse-Proxy
   304      - make test-short
   305  
   306  test:short:aws:
          # Short tests against an AWS bucket; the cluster is deployed with the AWS backend.
   307    <<: *test_short_skip_scheduled_def
   308    variables:
   309      BUCKET: 'aws://ais-cicd'
   310    script:
   311      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --aws
   312      - make test-short
   313  
   314  test:short:gcp:
          # Short tests against a GCP bucket; the cluster is deployed with the GCP backend.
   315    <<: *test_short_skip_scheduled_def
   316    variables:
   317      BUCKET: 'gs://ais-ci'
   318    script:
   319      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --gcp
   320      - make test-short
   321  
   322  test:short:azure:
          # Short tests against an Azure bucket; the cluster is deployed with the Azure backend.
   323    <<: *test_short_skip_scheduled_def
   324    variables:
   325      BUCKET: 'az://ais-ci'
   326    script:
   327      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --azure
   328      - make test-short
   329  
   330  test:long:
          # Long tests against an ais:// bucket with 6 proxies.
   331    <<: *test_long_skip_scheduled_def
   332    variables:
            # Quoted so the value is a string, like the other CI variables.
   333      NUM_PROXY: '6'
   334      BUCKET: 'ais://ais-ci'
   335    script:
   336      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
   337      - make test-long
   338  
   339  test:long:aws:
          # Long tests against an AWS bucket with 6 proxies.
   340    <<: *test_long_def
   341    variables:
            # Quoted so the value is a string, like the other CI variables.
   342      NUM_PROXY: '6'
   343      BUCKET: 'aws://ais-cicd'
   344    script:
   345      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --aws
   346      - make test-long
   347  
   348  test:long:gcp:
          # Long tests against a GCP bucket with 6 proxies.
   349    <<: *test_long_def
   350    variables:
            # Quoted so the value is a string, like the other CI variables.
   351      NUM_PROXY: '6'
   352      BUCKET: 'gs://ais-ci'
   353    script:
   354      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --gcp
   355      - make test-long
   356  
   357  test:long:azure:
          # Long tests against an Azure bucket with 6 proxies.
   358    <<: *test_long_def
   359    variables:
            # Quoted so the value is a string, like the other CI variables.
   360      NUM_PROXY: '6'
   361      BUCKET: 'az://ais-ci'
   362    script:
   363      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
              --azure
   364      - make test-long
   365  
   366  test:long:aisloader:
          # 5-minute aisloader run followed by the long pyaisloader test; manual and non-blocking.
   367    stage: test-long
   368    tags: [ais]
   370    timeout: 10m
   371    variables:
   372      AIS_ENDPOINT: 'http://localhost:8080'
   373    script:
   374      - >-
              ${SCRIPTS_DIR}/clean_deploy.sh
              --target-cnt $NUM_TARGET
              --proxy-cnt $NUM_PROXY
              --mountpath-cnt $FS_CNT
            # make sure that cluster properly starts
   375      - sleep 10
   376      - FLAGS="--duration=5m" make test-aisloader
   377      - cd ./python; make PYAISLOADER_TEST_TYPE=long test-pyaisloader
   378    rules:
   379      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
   380        when: manual
   381        allow_failure: true
   382  
   383  
   384  #
   385  # Kubernetes stages
   386  #
   387  
   388  .test_k8s:
          # Base for Kubernetes jobs: deploys via `make deploy-ci` in deploy/dev/k8s, builds the CLI,
          # and afterwards copies logs and runs `make cleanup-ci`.
   389    tags: [ais-k8s]
   391    variables:
   392      BUCKET: 'gs://ais-ci-kube'
   393      TESTS_DIR: 'ais/test'
   394      AISNODE_IMAGE: '$CI_REGISTRY_IMAGE/aisnode:$CI_COMMIT_SHORT_SHA'
   395      AIS_ENDPOINT: 'http://ais-proxy-0.default.svc.cluster.local:8080'
   396      FS_CNT: '6'
   397      RE: 'TestETL|TestConfig|TestMountpath'
   398      PROVIDERS: 'gcp'
   399    before_script:
            # TODO: this can be removed once the lifecycle of transformers is implemented.
   400      - kubectl delete pods,services -l nvidia.com/ais-etl-name
   401      - cd deploy/dev/k8s
   402      - make deploy-ci
   403      - cd ../../..
   404      - make cli
   405    after_script:
   406      - mkdir $CI_PROJECT_DIR/logs
   407      - find /tmp/ais -type f -name "*\.INFO\.*" -exec cp {} $CI_PROJECT_DIR/logs/ \;
   408      - cd deploy/dev/k8s
   409      - make cleanup-ci
   410    artifacts:
   411      when: on_failure
   412      paths:
              - logs/
   413      expire_in: 1 days
   414  
   415  .test_k8s_short_template:
          # Short Kubernetes tests: 1 proxy, 1 target.
   416    stage: test-short
   417    extends: .test_k8s
   418    variables:
   419      NUM_PROXY: '1'
   420      NUM_TARGET: '1'
   421  
   422  .test_k8s_long_template:
          # Long Kubernetes tests: 1 proxy, 5 targets.
   423    stage: test-long
   424    extends: .test_k8s
   425    variables:
   426      NUM_PROXY: '1'
   427      NUM_TARGET: '5'
   428  
   429  test:short:k8s:
          # Short Kubernetes tests for merge requests and schedules.
   430    extends: .test_k8s_short_template
   431    timeout: 1h
   432    only: [merge_requests, schedules]
          # Excluded when the MR labels match `skip-ci` or `python-tests-only`, or do not match `etl`.
   435    except:
   436      variables:
   437        - '$CI_MERGE_REQUEST_LABELS =~ /.*skip-ci.*/'
   438        - '$CI_MERGE_REQUEST_LABELS =~ /.*python-tests-only.*/'
   439        - '$CI_MERGE_REQUEST_LABELS !~ /.*etl.*/'
   440    variables:
   441      RE: 'TestETL|TestConfig|TestMountpath'
   442    script:
   443      - make test-short
   444  
   445  # e.g. RE: "ETLBucket|ETLConnectionError|ETLInitCode" (or any other regex to select tests)
   446  test:short:assorted:k8s:
   447    extends: .test_k8s_short_template
   448    timeout: 30m
   449    rules:
   450      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
   451        when: manual
   452        allow_failure: true
   453    variables:
   454      RE: 'ETLAnyToAny|ETLMultiObj'
   455    script:
   456      - make test-run
   457  
   458  test:long:k8s:
          # Long Kubernetes tests: automatic on schedules and for MRs labelled `k8s-ci`;
          # otherwise manual and non-blocking on MRs, on `main` and for web-triggered pipelines.
   459    extends: .test_k8s_long_template
   460    # NOTE: when changing, make sure to update $SCRIPTS_DIR/bootstrap.sh and GitLab /settings/ci_cd
   461    timeout: 4h
   462    rules:
   463      - if: '$CI_PIPELINE_SOURCE == "schedule"'
   464      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
   465      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main" ||
                $CI_PIPELINE_SOURCE == "web"
   466        when: manual
   467        allow_failure: true
   468    script:
   469      - make test-run
   470  
   471  test:long:k8s:single-target:
          # Same as the long Kubernetes template but with a single target.
   472    extends: .test_k8s_long_template
   473    timeout: 4h
   474    rules:
   475      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
   476      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
   477        when: manual
   478        allow_failure: true
   479    variables:
   480      NUM_TARGET: '1'
   481    script:
   482      - make test-run
   483  
   484  test:long:k8s:aisloader:
          # Builds aisloader and runs it for 2 minutes with --etl against the Kubernetes deployment.
   485    extends: .test_k8s_long_template
   486    timeout: 15m
   487    rules:
   488      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
   489      - if: >-
                $CI_PIPELINE_SOURCE == "merge_request_event" ||
                $CI_COMMIT_BRANCH == "main"
   490        when: manual
   491        allow_failure: true
   492    script:
            # Give some time for the cluster to stabilize.
   493      - sleep 10
   494      - make aisloader
   495      - BUCKET="ais://test" FLAGS="--duration=2m --etl" make test-aisloader
   496  
   497  test:long:k8s:all:
          # Runs the K8s tests, then the long tests plus aisloader for every bucket in CLOUD_BCKS,
          # and fails at the end if any of them failed.
   498    extends: .test_k8s_long_template
   499    timeout: 5h
   500    rules:
   501      - if: '$CI_MERGE_REQUEST_LABELS =~ /.*k8s-ci.*/'
   502      - if: '$CI_PIPELINE_SOURCE == "merge_request_event" || $CI_COMMIT_BRANCH == "main"'
   503        when: manual
   504        allow_failure: true
   505    variables:
   506      PROVIDERS: "gcp,aws"
          # Replaces the inherited `before_script`.
   507    before_script:
            # TODO: this can be removed once the lifecycle of transformers is implemented.
   508      - kubectl delete pods,services -l nvidia.com/ais-etl-name
   509      - cd deploy/dev/k8s
   510      - make deploy-ci
   511      - cd ../../..
   512      # Make sure that metrics collection is enabled.
   513      - git clone https://github.com/prometheus-operator/kube-prometheus.git
   514      - kubectl apply -f kube-prometheus/manifests/setup && kubectl apply -f kube-prometheus/manifests && rm -rf kube-prometheus
   515    script:
            # The runner shell exits on the first failing command, so a bare `exit_code=$?` after
            # a test command is only ever reached with 0. Each test command is therefore guarded
            # with `|| exit_code=$?`, and the accumulated result decides the job status at the end.
   516      - |
              result=0
   517        echo "----- RUNNING K8S TESTS -----"
              exit_code=0
   518        BUCKET="aws://ais-blossom" RE="TestETL|TestConfig|TestMountpath" make test-run || exit_code=$?
   520        result=$((result + exit_code))
   521        echo "----- K8S TESTS FINISHED WITH: ${exit_code} -----"
   523        for bucket in ${CLOUD_BCKS}; do
   524          echo "----- RUNNING LONG TESTS WITH: ${bucket} -----"
                exit_code=0
   525          BUCKET=${bucket} make test-long && make test-aisloader || exit_code=$?
   527          result=$((result + exit_code))
   528          echo "----- LONG TESTS FINISHED WITH: ${exit_code} -----"
   529        done
              # Compare against zero instead of `exit ${result}`: a sum of exit codes may wrap to 0.
              if [ "${result}" -ne 0 ]; then
                echo "----- SOME TESTS FAILED -----"
                exit 1
              fi
   530  
   531  checkmarx-scan-csv:
          # Security-stage job; runs only for scheduled and web-triggered pipelines and is non-blocking.
   532    stage: security
   533    rules:
   534      - if: >-
                $CI_PIPELINE_SOURCE == "schedule" ||
                $CI_PIPELINE_SOURCE == "web"
   535        allow_failure: true