github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/.github/workflows/dataflow_engine_chaos.yaml (about)

     1  name: Dataflow Engine Chaos
     2  
     3  on:
     4    schedule:
     5      - cron: '0 17-23 * * *' # minute 0 of every hour from 17:00 to 23:00 UTC, i.e. 01:00 ~ 07:00 UTC+8
     6    workflow_dispatch: # manual trigger
     7      inputs:
     8        pr: # read by the PR checkout step as github.event.inputs.pr
     9          description: 'Which PR do you want to trigger (use PR number, such as 6127)'
    10          required: true
    11          default: ''
    12  
    13  # Runs with the same ref and workflow name share one concurrency group. See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
    14  concurrency:
    15    group: ${{ github.ref }}-${{ github.workflow }}
    16    cancel-in-progress: true # a newly queued run cancels the in-progress run of the same group
    17  
    18  # A workflow run is made up of one or more jobs that can run sequentially or in parallel
    19  jobs:
    20    # This workflow contains a single job called "base"
    21    base:
    22      # Runner image for the job. ubuntu-20.04 was retired from GitHub-hosted runners, so use ubuntu-22.04
    23      runs-on: ubuntu-22.04
    24      timeout-minutes: 50 # the whole job is cancelled after 50 minutes
    25      strategy:
    26        fail-fast: false # keep the other chaos scenarios running when one of them fails
    27        matrix:
    28          chaos-obj: # one job per scenario; the name selects engine/chaos/manifests/<chaos-obj>.yaml
    29            [
    30              "pod-failure-dataflow",
    31              "pod-kill-dataflow",
    32              "network-partition-dataflow",
    33              "network-emulation-dataflow",
    34              "time-shift-dataflow",
    35            ]
    36  
    37      # Steps represent a sequence of tasks that will be executed as part of the job
    38      steps:
    39        - uses: actions/checkout@v4
    40  
    41        - name: check out code by workflow dispatch PR
    42          if: ${{ github.event.inputs.pr != '' }} # only when a PR number was supplied via workflow_dispatch
    43          uses: actions/checkout@v4
    44          with:
    45            ref: refs/pull/${{ github.event.inputs.pr }}/head
    46  
    47        - uses: actions/setup-go@v3 # kept at v3: newer majors enable built-in caching, which would overlap the cache step below
    48          with:
    49            go-version: '1.21'
    50  
    51        - name: Cache go modules
    52          uses: actions/cache@v4 # v2 was shut down by GitHub together with the legacy cache service
    53          with:
    54            path: ~/go/pkg/mod
    55            key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}
    56  
    57        - name: Create k8s Kind Cluster
    58          uses: helm/kind-action@v1.4.0
    59          with:
    60            cluster_name: dataflow-engine-cluster # the same name is passed to `kind load` and `kind export logs` in later steps
    61            config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml
    62  
    63        - name: Print cluster information # diagnostics only: prints kubectl/helm views of the new cluster
    64          run: |
    65            kubectl config view
    66            kubectl cluster-info
    67            kubectl get nodes
    68            kubectl get pods -n kube-system
    69            kubectl get sc
    70            kubectl version
    71            helm version
    72  
    73        - name: Build dataflow engine binary # runs the make targets, then copies engine/chaos/cases/conf to bin/engine-conf
    74          run: |
    75            make tiflow tiflow-chaos-case
    76            cp -r $GITHUB_WORKSPACE/engine/chaos/cases/conf/ $GITHUB_WORKSPACE/bin/engine-conf
    77  
    78        - name: Build Dataflow engine docker image # tagged dataflow:chaos, with $GITHUB_WORKSPACE/bin as the build context
    79          run: |
    80            docker build -f $GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
    81            docker image list
    82  
    83        - name: Load docker image to kind cluster # loads dataflow:chaos into the dataflow-engine-cluster kind cluster
    84          run: |
    85            kind load docker-image dataflow:chaos --name dataflow-engine-cluster
    86  
    87        # Set up upstream instances (the pods mysql57-0, mysql8-0 and mariadb-0 are awaited in the next step)
    88        - name: Set up sources
    89          run: |
    90            kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
    91            kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
    92            kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
    93        - name: Wait for sources ready # per-pod waits because `kubectl wait --all` was not working; tolerant wait (`|| true`), then diagnostics, then a strict --timeout=0s re-check that fails the step
    94          run: |
    95            kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true
    96            kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true
    97            kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true
    98            sleep 10
    99            echo show pvc
   100            kubectl get pvc -l app=sources -o wide
   101            echo show pv
   102            kubectl get pv -o wide
   103            echo show svc
   104            kubectl get svc -l app=sources -o wide
   105            echo show sts
   106            kubectl get sts -l app=sources -o wide
   107            echo show po
   108            kubectl get po -l app=sources -o wide
   109            echo describe po
   110            kubectl describe po -l app=sources
   111            echo describe pvc
   112            kubectl describe pvc -l app=sources
   113            kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s
   114            kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s
   115            kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s
   116   
   117        # Set up the downstream TiDB instance (deploys a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
   118        - name: Set up TiDB
   119          run: |
   120            kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
   121            kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
   122            kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
   123        - name: Wait for TiDB ready # tolerant 10m wait (`|| true`), then diagnostics, then a strict --timeout=0s re-check that fails the step
   124          run: |
   125            kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true
   126            echo show pvc
   127            kubectl get pvc -l app=tidb -o wide
   128            echo show pv
   129            kubectl get pv -o wide
   130            echo show svc
   131            kubectl get svc -l app=tidb -o wide
   132            echo show sts
   133            kubectl get sts -l app=tidb -o wide
   134            echo show po
   135            kubectl get po -l app=tidb -o wide
   136            echo describe po
   137            kubectl describe po -l app=tidb
   138            echo describe pvc
   139            kubectl describe pvc -l app=tidb
   140            kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s
   141  
   142        # Set up minio, then run a job that creates a bucket for the tests
   143        - name: Set up minio
   144          run: |
   145            kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml  
   146            kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml 
   147            kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml 
   148        - name: Wait for minio ready # tolerant 10m wait (`|| true`), then diagnostics, then a strict --timeout=0s re-check that fails the step
   149          run: |
   150            kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=10m || true
   151            echo show pvc
   152            kubectl get pvc -l app=minio -o wide
   153            echo show pv
   154            kubectl get pv -o wide
   155            echo show svc
   156            kubectl get svc -l app=minio -o wide
   157            echo show sts
   158            kubectl get sts -l app=minio -o wide
   159            echo show po
   160            kubectl get po -l app=minio -o wide
   161            echo describe po
   162            kubectl describe po -l app=minio
   163            echo describe pvc
   164            kubectl describe pvc -l app=minio
   165            kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=0s
   166        - name: Set up minio-create-bucket job # the step fails unless the job reaches condition Complete within 2m
   167          run: |
   168            kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml 
   169            kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
   170            kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
   171            kubectl wait --for=condition=Complete job/chaos-minio-create-bucket-job --timeout=2m
   172  
   173        # Install the tiflow helm chart as release "chaos" (metastore and basic services)
   174        - name: Set up metastore and basic services
   175          run: |
   176            helm install -f $GITHUB_WORKSPACE/deployments/engine/helm/tiflow/values.yaml chaos $GITHUB_WORKSPACE/deployments/engine/helm/tiflow
   177            helm list
   178            sleep 5
   179            kubectl get pods
   180            
   181        - name: Wait for metastore ready # tolerant 60s waits (`|| true`) with no strict re-check; the rest prints diagnostics for the etcd and mysql metastores
   182          run: |
   183            kubectl wait --for=condition=Ready pod/chaos-metastore-mysql-0 --timeout=60s || true
   184            kubectl wait --for=condition=Ready pod/chaos-metastore-etcd-0 --timeout=60s || true
   185  
   186            echo show pvc
   187            kubectl get pvc -l app=chaos-metastore-etcd -o wide
   188            echo show pv
   189            kubectl get pv -o wide
   190            echo show svc
   191            kubectl get svc -l app=chaos-metastore-etcd -o wide
   192            echo show sts
   193            kubectl get sts -l app=chaos-metastore-etcd -o wide
   194            echo show po
   195            kubectl get po -l app=chaos-metastore-etcd -o wide
   196            echo describe po
   197            kubectl describe po -l app=chaos-metastore-etcd
   198            echo describe pvc
   199            kubectl describe pvc -l app=chaos-metastore-etcd
   200  
   201            echo show pvc
   202            kubectl get pvc -l app=chaos-metastore-mysql -o wide
   203            echo show pv
   204            kubectl get pv -o wide
   205            echo show svc
   206            kubectl get svc -l app=chaos-metastore-mysql -o wide
   207            echo show sts
   208            kubectl get sts -l app=chaos-metastore-mysql -o wide
   209            echo show po
   210            kubectl get po -l app=chaos-metastore-mysql -o wide
   211            echo describe po
   212            kubectl describe po -l app=chaos-metastore-mysql
   213            echo describe pvc
   214            kubectl describe pvc -l app=chaos-metastore-mysql
   215  
   216        - name: Wait for server-master ready # tolerant 20s wait (`|| true`) with no strict re-check; then prints resources plus current/previous logs of server-master pods 0-2
   217          run: |
   218            kubectl wait --for=condition=Ready pod -l app=chaos-server-master --all --timeout=20s|| true
   219            echo "<<<<< show pvc >>>>>"
   220            kubectl get pvc -l app=chaos-server-master -o wide
   221            echo "<<<<< show pv >>>>>"
   222            kubectl get pv -o wide
   223            echo "<<<<< show svc >>>>>"
   224            kubectl get svc -l app=chaos-server-master -o wide
   225            echo "<<<<< show sts >>>>>"
   226            kubectl get sts -l app=chaos-server-master -o wide
   227            echo "<<<<< show po >>>>>"
   228            kubectl get po -l app=chaos-server-master -o wide
   229            echo "<<<<< describe po >>>>>"
   230            kubectl describe po -l app=chaos-server-master
   231            echo "<<<<< describe pvc >>>>>"
   232            kubectl describe pvc -l app=chaos-server-master
   233            echo "<<<<< show current log for chaos-server-master-0 >>>>>"
   234            kubectl logs chaos-server-master-0 || true
   235            echo "<<<<< show previous log for chaos-server-master-0 >>>>>"
   236            kubectl logs chaos-server-master-0 -p || true
   237            echo "<<<<< show current log for chaos-server-master-1 >>>>>"
   238            kubectl logs chaos-server-master-1 || true
   239            echo "<<<<< show previous log for chaos-server-master-1 >>>>>"
   240            kubectl logs chaos-server-master-1 -p || true
   241            echo "<<<<< show current log for chaos-server-master-2 >>>>>"
   242            kubectl logs chaos-server-master-2 || true
   243            echo "<<<<< show previous log for chaos-server-master-2 >>>>>"
   244            kubectl logs chaos-server-master-2 -p || true
   245  
   246            kubectl logs chaos-server-master-0 -c wait-mysql || true
   247  
   248        - name: Wait for executor ready # tolerant 15s wait (`|| true`) with no strict re-check; then prints resources plus current/previous logs of executor pods 0-2
   249          run: |
   250            kubectl wait --for=condition=Ready pod -l app=chaos-executor --all --timeout=15s|| true
   251            echo "<<<<< show pvc >>>>>"
   252            kubectl get pvc -l app=chaos-executor -o wide
   253            echo "<<<<< show pv >>>>>"
   254            kubectl get pv -o wide
   255            echo "<<<<< show svc >>>>>"
   256            kubectl get svc -l app=chaos-executor -o wide
   257            echo "<<<<< show sts >>>>>"
   258            kubectl get sts -l app=chaos-executor -o wide
   259            echo "<<<<< show po >>>>>"
   260            kubectl get po -l app=chaos-executor -o wide
   261            echo "<<<<< describe po >>>>>"
   262            kubectl describe po -l app=chaos-executor
   263            echo "<<<<< describe pvc >>>>>"
   264            kubectl describe pvc -l app=chaos-executor
   265            echo "<<<<< show current log for chaos-executor-0 >>>>>"
   266            kubectl logs chaos-executor-0 || true
   267            echo "<<<<< show previous log for chaos-executor-0 >>>>>"
   268            kubectl logs chaos-executor-0 -p || true
   269            echo "<<<<< show current log for chaos-executor-1 >>>>>"
   270            kubectl logs chaos-executor-1 || true
   271            echo "<<<<< show previous log for chaos-executor-1 >>>>>"
   272            kubectl logs chaos-executor-1 -p || true
   273            echo "<<<<< show current log for chaos-executor-2 >>>>>"
   274            kubectl logs chaos-executor-2 || true
   275            echo "<<<<< show previous log for chaos-executor-2 >>>>>"
   276            kubectl logs chaos-executor-2 -p || true
   277  
   278            kubectl logs chaos-executor-0 -c wait-server-master || true
   279  
   280        - name: Set up chaos test cases # applies cases.yaml and prints the resulting resources and pods
   281          run: |
   282            kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
   283            kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
   284            kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
   285            kubectl get pods
   286  
   287        # FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
   288        - name: Wait DM enter sync stage # greps the chaos-test-case job log once per second, up to 301 attempts; the step still succeeds if the message never appears
   289          run: |
   290            for idx in $(seq 0 300); do
   291              echo "wait dm enter sync stage"
   292              if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed" ; then
   293                break
   294              fi
   295              sleep 1
   296            done
   297  
   298        - name: Encode chaos-mesh action # exports the base64-encoded scenario manifest as CFG_BASE64 through $GITHUB_ENV
   299          run: |
   300            echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV
   301  
   302        - name: Run chaos mesh action
   303          uses: chaos-mesh/chaos-mesh-action@master # NOTE(review): tracks a moving branch; consider pinning to a tag or commit
   304          env:
   305            CFG_BASE64: ${{ env.CFG_BASE64 }}
   306  
   307        # Check whether the test case has completed (1m * 20 times according to the original note; TODO confirm against check-case.sh).
   308        - name: Wait for chaos test case complete
   309          run: |
   310            $GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh
   311  
   312        - name: Pause all chaos # deletes the resources defined in this matrix entry's chaos manifest
   313          if: ${{ always() }} # runs even when an earlier step failed
   314          run: |
   315            kubectl delete -f $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml
   316  
   317        - name: Dump goroutines # writes pprof goroutine dumps to /log/<pod>_goroutines.log inside each server-master and executor pod
   318          if: ${{ failure() }} # only when an earlier step failed
   319          run: |
   320            # Add a delay if test fails, to check whether the cluster can recover after chaos is removed
   321            sleep 60
   322            kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master"|xargs -I{} kubectl exec -i -c server-master {} -- wget http://127.0.0.1:10240/debug/pprof/goroutine?debug=2 -O /log/{}_goroutines.log || true
   323            kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "executor"|xargs -I{} kubectl exec -i -c executor {} -- wget http://127.0.0.1:10241/debug/pprof/goroutine?debug=2 -O /log/{}_goroutines.log || true
   324  
   325        - name: Copy logs to hack permission # copies each pod's /log and the kind logs into ./logs, then chowns the directory to the runner user
   326          if: ${{ always() }} # collect logs regardless of the job outcome
   327          run: |
   328            mkdir ./logs
   329            kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master|executor"|xargs -I{} kubectl cp {}:/log ./logs || true
   330            kind export logs ./logs/kind --name dataflow-engine-cluster
   331            sudo chown -R runner ./logs
   332  
   333        # Uploading logs as an artifact seems not stable, so we set `continue-on-error: true` here.
   334        - name: Upload logs
   335          continue-on-error: true
   336          uses: actions/upload-artifact@v4 # v2 is no longer served by GitHub; v4 needs unique artifact names, which the matrix suffix provides
   337          if: ${{ always() }}
   338          with:
   339            name: chaos-base-logs.${{ matrix.chaos-obj }}
   340            path: |
   341              ./logs
   342  
   343        # Send a Feishu notification if the job failed; a failed notification does not fail the job.
   344        - name: Feishu notification
   345          continue-on-error: true
   346          uses: foxundermoon/feishu-action@v2
   347          if: ${{ failure() }}
   348          with:
   349            url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
   350            msg_type: text
   351            content: |
   352              text: |
   353                dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}