github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/ha_master/run.sh

#!/bin/bash

set -eu

cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $cur/../_utils/test_prepare
WORK_DIR=$TEST_DIR/$TEST_NAME
API_VERSION="v1alpha1"
MASTER_PORT1=8261
MASTER_PORT2=8361
MASTER_PORT3=8461
MASTER_PORT4=8561
MASTER_PORT5=8661

LEADER_NAME="master1"
LEADER_PORT=$MASTER_PORT1

function set_leader_port() {
	case $LEADER_NAME in
	"master1")
		LEADER_PORT=$MASTER_PORT1
		;;
	"master2")
		LEADER_PORT=$MASTER_PORT2
		;;
	"master3")
		LEADER_PORT=$MASTER_PORT3
		;;
	"master4")
		LEADER_PORT=$MASTER_PORT4
		;;
	"master5")
		LEADER_PORT=$MASTER_PORT5
		;;
	esac
}

function test_evict_leader() {
	echo "[$(date)] <<<<<< start test_evict_leader >>>>>>"

	master_ports=($MASTER_PORT1 $MASTER_PORT2 $MASTER_PORT3 $MASTER_PORT4 $MASTER_PORT5)

	# evict leader
	for i in $(seq 0 4); do
		LEADER_NAME=$(get_leader $WORK_DIR 127.0.0.1:${MASTER_PORT1})
		echo "leader is $LEADER_NAME"
		set_leader_port

		run_dm_ctl $WORK_DIR "127.0.0.1:$LEADER_PORT" \
			"operate-leader evict" \
			"\"result\": true" 1

		# evict the leader a second time, and test leader eviction via the HTTP interface
		curl -X PUT 127.0.0.1:$LEADER_PORT/apis/v1alpha1/leader/1 >$WORK_DIR/evict_leader.log
		check_log_contains $WORK_DIR/evict_leader.log "\"result\": true" 1

		# get_leader will fail because the leader has been evicted on all masters, so just skip
		if [ $i = 4 ]; then
			continue
		fi
		# Leader eviction does not take effect immediately, so wait for a while.
		for _ in {0..30}; do
			NEW_LEADER_NAME=$(get_leader "$WORK_DIR" 127.0.0.1:${MASTER_PORT1})
			if [ "$NEW_LEADER_NAME" != "$LEADER_NAME" ]; then
				break
			fi
			sleep 1
		done
		if [ "$NEW_LEADER_NAME" = "$LEADER_NAME" ]; then
			echo "leader evict failed"
			exit 1
		fi
		echo "new leader is $NEW_LEADER_NAME"
	done

	echo "cancel evict leader on master1, and master1 will be the leader"
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT1" \
		"operate-leader cancel-evict" \
		"\"result\": true" 1

	echo "cancel evict leader twice, and test cancel evict leader from http interface"
	curl -X PUT 127.0.0.1:$MASTER_PORT1/apis/v1alpha1/leader/2 >$WORK_DIR/cancel_evict_leader.log
	check_log_contains $WORK_DIR/cancel_evict_leader.log "\"result\": true" 1

	LEADER_NAME=$(get_leader $WORK_DIR 127.0.0.1:${MASTER_PORT1})
	echo "leader is $LEADER_NAME"
	if [ "$LEADER_NAME" != "master1" ]; then
		echo "cancel evict leader failed"
		exit 1
	fi

	echo "cancel evict leader on all masters"
	for i in $(seq 1 4); do
		echo "cancel master port ${master_ports[$i]}"
		run_dm_ctl $WORK_DIR "127.0.0.1:${master_ports[$i]}" \
			"operate-leader cancel-evict" \
			"\"result\": true" 1
	done

	echo "[$(date)] <<<<<< finish test_evict_leader >>>>>>"
}

function test_list_member() {
	echo "[$(date)] <<<<<< start test_list_member_command >>>>>>"

	master_ports=(0 $MASTER_PORT1 $MASTER_PORT2 $MASTER_PORT3 $MASTER_PORT4 $MASTER_PORT5)
	master_peer_ports=(0 $MASTER_PEER_PORT1 $MASTER_PEER_PORT2 $MASTER_PEER_PORT3 $MASTER_PEER_PORT4 $MASTER_PEER_PORT5)

	alive=(1 2 3 4 5)
	leaders=()
	leader_idx=0

	# TODO: when removing 3 masters (use `seq 0 2`), this test sometimes fails.
	# In these cases, DM-master campaigns successfully but fails to `get` from etcd while starting the scheduler; it eventually recovers.
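	# Rolling leader-kill loop: each iteration drops the previously killed leader
	# from the alive set, verifies that all remaining masters agree on the current
	# leader, checks `list-member --master` (expecting $((5 - i)) alive members),
	# and then kills that leader.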
	for i in $(seq 0 1); do
		alive=("${alive[@]/$leader_idx/}")
		leaders=()

		# get the leader from every alive master
		for idx in ${alive[@]}; do
			leaders+=($(get_leader $WORK_DIR 127.0.0.1:${master_ports[$idx]}))
		done
		leader=${leaders[0]}
		leader_idx=${leader:6}
		echo "current leader is" $leader

		# check that the leader is the same on every master
		for ld in ${leaders[@]}; do
			if [ "$leader" != "$ld" ]; then
				echo "leader not consistent"
				exit 1
			fi
		done

		# check list-member master
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:${master_ports[$leader_idx]}" \
			"list-member --master" \
			"\"alive\": true" $((5 - i))

		# kill leader
		echo "kill leader" $leader
		kill_process $leader
		check_master_port_offline $leader_idx
	done

	# rejoin the masters that have been killed
	alive=("${alive[@]/$leader_idx/}")
	for idx in $(seq 1 5); do
		if [[ ! " ${alive[@]} " =~ " ${idx} " ]]; then
			run_dm_master $WORK_DIR/master${idx} ${master_ports[$idx]} $cur/conf/dm-master${idx}.toml
			check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:${master_ports[$idx]}
		fi
	done
	# check that the leader is the same on every master
	alive=(1 2 3 4 5)
	leaders=()
	for idx in ${alive[@]}; do
		leaders+=($(get_leader $WORK_DIR 127.0.0.1:${master_ports[$idx]}))
	done
	leader=${leaders[0]}
	leader_idx=${leader:6}
	echo "current leader is" $leader
	for ld in ${leaders[@]}; do
		if [ "$leader" != "$ld" ]; then
			echo "leader not consistent"
			exit 1
		fi
	done
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --master" \
		"\"alive\": true" 5

	# restart follower
	for idx in $(seq 1 5); do
		leader=$(get_leader $WORK_DIR 127.0.0.1:${master_ports[$idx]})
		leader_idx=${leader:6}
		echo "current leader is" $leader
		if [[ $idx = $leader_idx ]]; then
			continue
		fi
		echo "kill master$idx"
		kill_process dm-master$idx
		check_master_port_offline $idx
		sleep 5
		run_dm_master $WORK_DIR/master${idx} ${master_ports[$idx]} $cur/conf/dm-master${idx}.toml
		check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:${master_ports[$idx]}
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
			"list-member --master" \
			"\"alive\": true" 5
	done

	# check list-member worker
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker --name=worker1,worker2" \
		"\"stage\": \"bound\"" 2

	dmctl_operate_source stop $WORK_DIR/source1.yaml $SOURCE_ID1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker" \
		"\"stage\": \"bound\"" 1 \
		"\"stage\": \"free\"" 1

	dmctl_operate_source stop $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member" \
		"\"stage\": \"free\"" 2

	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --name=worker1,worker2" \
		"\"stage\": \"bound\"" 2

	# kill worker
	echo "kill worker1"
	kill_process dm-worker1
	check_port_offline $WORKER1_PORT 20

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --name=worker1,worker2" \
		"\"stage\": \"bound\"" 1 \
		"\"stage\": \"offline\"" 1

	# kill worker
	echo "kill worker2"
	kill_process dm-worker2
	check_port_offline $WORKER2_PORT 20

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member" \
		"\"stage\": \"offline\"" 2

	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker" \
		"\"stage\": \"bound\"" 2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"config master master1" \
		'name = \\"master1\\"' 1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"config master master2" \
		'name = \\"master2\\"' 1

	echo "[$(date)] <<<<<< finish test_list_member_command >>>>>>"
}

function run() {
	run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	check_contains 'Query OK, 2 rows affected'
	run_sql_file $cur/data/db2.prepare.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
	check_contains 'Query OK, 3 rows affected'

	echo "start DM worker and master"
	# start 5 dm-master instances
	run_dm_master $WORK_DIR/master1 $MASTER_PORT1 $cur/conf/dm-master1.toml
	run_dm_master $WORK_DIR/master2 $MASTER_PORT2 $cur/conf/dm-master2.toml
	run_dm_master $WORK_DIR/master3 $MASTER_PORT3 $cur/conf/dm-master3.toml
	run_dm_master $WORK_DIR/master4 $MASTER_PORT4 $cur/conf/dm-master4.toml
	run_dm_master $WORK_DIR/master5 $MASTER_PORT5 $cur/conf/dm-master5.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT1
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT2
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT3
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT4
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT5

	# wait for the master raft logs to catch up
	sleep 5

	# disabled because https://github.com/pingcap/dm/issues/1458
	# # a valid version should exist after the cluster bootstrapped.
	# version_key=$(echo -n '/dm-cluster/version' | base64)
	# curl http://127.0.0.1:$MASTER_PORT1/v3/kv/range -X POST -d '{"key": "'"${version_key}"'"}' > $WORK_DIR/cluster_version.log
	# check_log_contains $WORK_DIR/cluster_version.log "value" 1 # only check that the version exists; do not check the value for now

	# kill dm-master1 and dm-master2 to simulate that the first two dm-master addresses in the join config are invalid
	echo "kill dm-master1 and kill dm-master2"
	kill_process dm-master1
	check_master_port_offline 1
	kill_process dm-master2
	check_master_port_offline 2

	# wait for the masters to switch leader and re-set up
	get_leader $WORK_DIR 127.0.0.1:$MASTER_PORT3

	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	# start dm-master1 and dm-master2 again
	echo "start dm-master1 and dm-master2 again"
	run_dm_master $WORK_DIR/master1 $MASTER_PORT1 $cur/conf/dm-master1.toml
	run_dm_master $WORK_DIR/master2 $MASTER_PORT2 $cur/conf/dm-master2.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT1
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT2

	echo "operate mysql config to worker"
	cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml
	cp $cur/conf/source2.yaml $WORK_DIR/source2.yaml
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" $WORK_DIR/source1.yaml
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" $WORK_DIR/source2.yaml
	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --master" \
		"\"alive\": true" 5

	test_evict_leader
	test_list_member # TICASE-942, 944, 945, 946, 947

	worker1bound=$($PWD/bin/dmctl.test DEVEL --master-addr "127.0.0.1:$MASTER_PORT1" list-member --name worker1 |
		grep 'source' | awk -F: '{print $2}' | cut -d'"' -f 2)
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $worker1bound worker1" \
		"\"result\": true" 2

	echo "start DM task"
	dmctl_start_task "$cur/conf/dm-task.yaml" "--remove-meta"

	echo "use sync_diff_inspector to check full dump loader"
	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml

	echo "flush logs to force rotate binlog file"
	run_sql "flush logs;" $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql "flush logs;" $MYSQL_PORT2 $MYSQL_PASSWORD2

	echo "kill dm-master1 and kill dm-master2"
	kill_process dm-master1
	check_master_port_offline 1
	kill_process dm-master2
	check_master_port_offline 2

	echo "wait and check task running"
	check_http_alive 127.0.0.1:$MASTER_PORT3/apis/${API_VERSION}/status/test '"stage": "Running"' 10
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT3" \
		"query-status test" \
		"\"stage\": \"Running\"" 3

	run_sql_file $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql_file $cur/data/db2.increment.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
	sleep 2

	echo "use sync_diff_inspector to check data now!"
	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml

	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"offline-member --master --name master1" \
		"\"result\": true" 1
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"offline-member --master --name master2" \
		"\"result\": true" 1

	echo "kill dm-master3"
	kill_process dm-master3
	check_master_port_offline 3

	sleep 2
	# the remaining two masters should elect a new leader and continue serving requests
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"query-status test" \
		"\"stage\": \"Running\"" 3

	# run master3 again
	run_dm_master $WORK_DIR/master3 $MASTER_PORT3 $cur/conf/dm-master3.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT3

	sleep 5

	rm -rf $WORK_DIR/master1/default.master1
	# rejoin master1 after it was offlined, TICASE-933, 943
	run_dm_master $WORK_DIR/master1 $MASTER_PORT1 $cur/conf/dm-master-join1.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"pause-task test" \
		"\"result\": true" 3 \
		"\"source\": \"$SOURCE_ID1\"" 1 \
		"\"source\": \"$SOURCE_ID2\"" 1
}

cleanup_data ha_master_test
# also clean up dm processes in case the last run failed
cleanup_process $*
run $*
cleanup_process $*

echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"