---
name: Dataflow Engine Chaos

on:
  schedule:
    - cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8
  workflow_dispatch:
    inputs:
      pr:
        description: 'Which PR do you want to trigger (use PR number, such as 6127)'
        required: true
        default: ''

# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency:
  group: ${{ github.ref }}-${{ github.workflow }}
  cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "base"
  base:
    # The type of runner that the job will run on
    runs-on: ubuntu-20.04
    timeout-minutes: 50
    strategy:
      fail-fast: false
      matrix:
        chaos-obj:
          [
            "pod-failure-dataflow",
            "pod-kill-dataflow",
            "network-partition-dataflow",
            "network-emulation-dataflow",
            "time-shift-dataflow",
          ]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - uses: actions/checkout@v2

      # When triggered manually, check out the head of the requested PR instead
      # of the default branch.
      - name: check out code by workflow dispatch PR
        if: ${{ github.event.inputs.pr != '' }}
        uses: actions/checkout@v2
        with:
          ref: refs/pull/${{ github.event.inputs.pr }}/head

      - uses: actions/setup-go@v3
        with:
          go-version: '1.21'

      - name: Cache go modules
        uses: actions/cache@v2
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}

      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.4.0
        with:
          cluster_name: dataflow-engine-cluster
          config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml

      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          kubectl get sc
          kubectl version
          helm version

      - name: Build dataflow engine binary
        run: |
          make tiflow tiflow-chaos-case
          cp -r $GITHUB_WORKSPACE/engine/chaos/cases/conf/ $GITHUB_WORKSPACE/bin/engine-conf

      - name: Build Dataflow engine docker image
        run: |
          docker build -f $GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
          docker image list

      - name: Load docker image to kind cluster
        run: |
          kind load docker-image dataflow:chaos --name dataflow-engine-cluster

      # Set up upstream instances
      - name: Set up sources
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/sources.yaml
      - name: Wait for sources ready # kubectl wait --all not working
        run: |
          kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=300s || true
          kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=300s || true
          kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=300s || true
          sleep 10
          echo show pvc
          kubectl get pvc -l app=sources -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=sources -o wide
          echo show sts
          kubectl get sts -l app=sources -o wide
          echo show po
          kubectl get po -l app=sources -o wide
          echo describe po
          kubectl describe po -l app=sources
          echo describe pvc
          kubectl describe pvc -l app=sources
          kubectl wait --for=condition=Ready pod/mysql57-0 --timeout=0s
          kubectl wait --for=condition=Ready pod/mysql8-0 --timeout=0s
          kubectl wait --for=condition=Ready pod/mariadb-0 --timeout=0s

      # Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
      - name: Set up TiDB
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
          kubectl get -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
          kubectl describe -f $GITHUB_WORKSPACE/dm/chaos/manifests/tidb.yaml
      - name: Wait for TiDB ready
        run: |
          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true
          echo show pvc
          kubectl get pvc -l app=tidb -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=tidb -o wide
          echo show sts
          kubectl get sts -l app=tidb -o wide
          echo show po
          kubectl get po -l app=tidb -o wide
          echo describe po
          kubectl describe po -l app=tidb
          echo describe pvc
          kubectl describe pvc -l app=tidb
          kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s

      # Set up minio and create a bucket for tests
      - name: Set up minio
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml
          kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml
          kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio.yaml
      - name: Wait for minio ready
        run: |
          kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=10m || true
          echo show pvc
          kubectl get pvc -l app=minio -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=minio -o wide
          echo show sts
          kubectl get sts -l app=minio -o wide
          echo show po
          kubectl get po -l app=minio -o wide
          echo describe po
          kubectl describe po -l app=minio
          echo describe pvc
          kubectl describe pvc -l app=minio
          kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=0s
      - name: Set up minio-create-bucket job
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
          kubectl get -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
          kubectl describe -f $GITHUB_WORKSPACE/deployments/engine/helm/minio/minio-create-bucket.yaml
          kubectl wait --for=condition=Complete job/chaos-minio-create-bucket-job --timeout=2m

      # Set up metastore and basic services
      - name: Set up metastore and basic services
        run: |
          helm install -f $GITHUB_WORKSPACE/deployments/engine/helm/tiflow/values.yaml chaos $GITHUB_WORKSPACE/deployments/engine/helm/tiflow
          helm list
          sleep 5
          kubectl get pods

      - name: Wait for metastore ready
        run: |
          kubectl wait --for=condition=Ready pod/chaos-metastore-mysql-0 --timeout=60s || true
          kubectl wait --for=condition=Ready pod/chaos-metastore-etcd-0 --timeout=60s || true

          echo show pvc
          kubectl get pvc -l app=chaos-metastore-etcd -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=chaos-metastore-etcd -o wide
          echo show sts
          kubectl get sts -l app=chaos-metastore-etcd -o wide
          echo show po
          kubectl get po -l app=chaos-metastore-etcd -o wide
          echo describe po
          kubectl describe po -l app=chaos-metastore-etcd
          echo describe pvc
          kubectl describe pvc -l app=chaos-metastore-etcd

          echo show pvc
          kubectl get pvc -l app=chaos-metastore-mysql -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=chaos-metastore-mysql -o wide
          echo show sts
          kubectl get sts -l app=chaos-metastore-mysql -o wide
          echo show po
          kubectl get po -l app=chaos-metastore-mysql -o wide
          echo describe po
          # NOTE(review): was `-l app=chaos-metastore-framework`, inconsistent
          # with the mysql selectors above — fixed to describe the mysql pods.
          kubectl describe po -l app=chaos-metastore-mysql
          echo describe pvc
          kubectl describe pvc -l app=chaos-metastore-mysql

      - name: Wait for server-master ready
        run: |
          kubectl wait --for=condition=Ready pod -l app=chaos-server-master --all --timeout=20s || true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=chaos-server-master -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=chaos-server-master -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=chaos-server-master -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=chaos-server-master -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=chaos-server-master
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=chaos-server-master
          echo "<<<<< show current log for chaos-server-master-0 >>>>>"
          kubectl logs chaos-server-master-0 || true
          echo "<<<<< show previous log for chaos-server-master-0 >>>>>"
          kubectl logs chaos-server-master-0 -p || true
          echo "<<<<< show current log for chaos-server-master-1 >>>>>"
          kubectl logs chaos-server-master-1 || true
          echo "<<<<< show previous log for chaos-server-master-1 >>>>>"
          kubectl logs chaos-server-master-1 -p || true
          echo "<<<<< show current log for chaos-server-master-2 >>>>>"
          kubectl logs chaos-server-master-2 || true
          echo "<<<<< show previous log for chaos-server-master-2 >>>>>"
          kubectl logs chaos-server-master-2 -p || true

          kubectl logs chaos-server-master-0 -c wait-mysql || true

      - name: Wait for executor ready
        run: |
          kubectl wait --for=condition=Ready pod -l app=chaos-executor --all --timeout=15s || true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=chaos-executor -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=chaos-executor -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=chaos-executor -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=chaos-executor -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=chaos-executor
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=chaos-executor
          echo "<<<<< show current log for chaos-executor-0 >>>>>"
          kubectl logs chaos-executor-0 || true
          echo "<<<<< show previous log for chaos-executor-0 >>>>>"
          kubectl logs chaos-executor-0 -p || true
          echo "<<<<< show current log for chaos-executor-1 >>>>>"
          kubectl logs chaos-executor-1 || true
          echo "<<<<< show previous log for chaos-executor-1 >>>>>"
          kubectl logs chaos-executor-1 -p || true
          echo "<<<<< show current log for chaos-executor-2 >>>>>"
          kubectl logs chaos-executor-2 || true
          echo "<<<<< show previous log for chaos-executor-2 >>>>>"
          kubectl logs chaos-executor-2 -p || true

          kubectl logs chaos-executor-0 -c wait-server-master || true

      - name: Set up chaos test cases
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl get pods

      # FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
      - name: Wait DM enter sync stage
        run: |
          for idx in $(seq 0 300); do
            echo "wait dm enter sync stage"
            if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed" ; then
              break
            fi
            sleep 1
          done

      - name: Encode chaos-mesh action
        run: |
          echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

      - name: Run chaos mesh action
        uses: chaos-mesh/chaos-mesh-action@master
        env:
          CFG_BASE64: ${{ env.CFG_BASE64 }}

      # check whether complete with 1m * 20 times.
      - name: Wait for chaos test case complete
        run: |
          $GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh

      - name: Pause all chaos
        if: ${{ always() }}
        run: |
          kubectl delete -f $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml

      - name: Dump goroutines
        if: ${{ failure() }}
        run: |
          # Add a delay if test fails, to check whether the cluster can recover after chaos is removed
          sleep 60
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master"|xargs -I{} kubectl exec -i -c server-master {} -- wget http://127.0.0.1:10240/debug/pprof/goroutine?debug=2 -O /log/{}_goroutines.log || true
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "executor"|xargs -I{} kubectl exec -i -c executor {} -- wget http://127.0.0.1:10241/debug/pprof/goroutine?debug=2 -O /log/{}_goroutines.log || true

      - name: Copy logs to hack permission
        if: ${{ always() }}
        run: |
          mkdir ./logs
          kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master|executor"|xargs -I{} kubectl cp {}:/log ./logs || true
          kind export logs ./logs/kind --name dataflow-engine-cluster
          sudo chown -R runner ./logs

      # Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
      - name: Upload logs
        continue-on-error: true
        uses: actions/upload-artifact@v2
        if: ${{ always() }}
        with:
          name: chaos-base-logs.${{ matrix.chaos-obj }}
          path: |
            ./logs

      # Send feishu notification if failed.
      - name: Feishu notification
        continue-on-error: true
        uses: foxundermoon/feishu-action@v2
        if: ${{ failure() }}
        with:
          url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
          msg_type: text
          content: |
            text: |
              dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}