github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/ha_master/run.sh

#!/bin/bash

set -eu

cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $cur/../_utils/test_prepare
WORK_DIR=$TEST_DIR/$TEST_NAME
API_VERSION="v1alpha1"
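# client ports for the five dm-master instances that form the HA cluster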
MASTER_PORT1=8261
MASTER_PORT2=8361
MASTER_PORT3=8461
MASTER_PORT4=8561
MASTER_PORT5=8661

LEADER_NAME="master1"
LEADER_PORT=$MASTER_PORT1

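# map LEADER_NAME (e.g. "master3") to the matching client port so dmctl can address the leader directly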
function set_leader_port() {
	case $LEADER_NAME in
	"master1")
		LEADER_PORT=$MASTER_PORT1
		;;
	"master2")
		LEADER_PORT=$MASTER_PORT2
		;;
	"master3")
		LEADER_PORT=$MASTER_PORT3
		;;
	"master4")
		LEADER_PORT=$MASTER_PORT4
		;;
	"master5")
		LEADER_PORT=$MASTER_PORT5
		;;
	esac
}

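# test_evict_leader: evict the leader on each master in turn (via dmctl and the HTTP API),
# then cancel the eviction and check that master1 can take the leadership back.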
function test_evict_leader() {
	echo "[$(date)] <<<<<< start test_evict_leader >>>>>>"

	master_ports=($MASTER_PORT1 $MASTER_PORT2 $MASTER_PORT3 $MASTER_PORT4 $MASTER_PORT5)

	# evict leader
	for i in $(seq 0 4); do
		LEADER_NAME=$(get_leader $WORK_DIR 127.0.0.1:${MASTER_PORT1})
		echo "leader is $LEADER_NAME"
		set_leader_port

		run_dm_ctl $WORK_DIR "127.0.0.1:$LEADER_PORT" \
			"operate-leader evict" \
			"\"result\": true" 1

		# evict the leader a second time, and exercise leader eviction via the HTTP interface
		curl -X PUT 127.0.0.1:$LEADER_PORT/apis/v1alpha1/leader/1 >$WORK_DIR/evict_leader.log
		check_log_contains $WORK_DIR/evict_leader.log "\"result\": true" 1

		# get_leader will fail once the leader has been evicted on every master, so skip the check on the last iteration
		if [ $i = 4 ]; then
			continue
		fi
		# Leader eviction does not take effect immediately, so wait a while for the switch.
		for _ in {0..30}; do
			NEW_LEADER_NAME=$(get_leader "$WORK_DIR" 127.0.0.1:${MASTER_PORT1})
			if [ "$NEW_LEADER_NAME" != "$LEADER_NAME" ]; then
				break
			fi
			sleep 1
		done
		if [ "$NEW_LEADER_NAME" = "$LEADER_NAME" ]; then
			echo "leader evict failed"
			exit 1
		fi
		echo "new leader is $NEW_LEADER_NAME"
	done

	echo "cancel evict leader on master1, and master1 will be the leader"
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT1" \
		"operate-leader cancel-evict" \
		"\"result\": true" 1

	echo "cancel evict leader twice, and test cancel evict leader from http interface"
	curl -X PUT 127.0.0.1:$MASTER_PORT1/apis/v1alpha1/leader/2 >$WORK_DIR/cancel_evict_leader.log
	check_log_contains $WORK_DIR/cancel_evict_leader.log "\"result\": true" 1

	LEADER_NAME=$(get_leader $WORK_DIR 127.0.0.1:${MASTER_PORT1})
	echo "leader is $LEADER_NAME"
	if [ "$LEADER_NAME" != "master1" ]; then
		echo "cancel evict leader failed"
		exit 1
	fi

	echo "cancel evict leader on all masters"
	for i in $(seq 1 4); do
		echo "cancel master port ${master_ports[$i]}"
		run_dm_ctl $WORK_DIR "127.0.0.1:${master_ports[$i]}" \
			"operate-leader cancel-evict" \
			"\"result\": true" 1
	done

	echo "[$(date)] <<<<<< finish test_evict_leader >>>>>>"
}

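# test_list_member: repeatedly kill the current leader and verify that the remaining
# masters agree on a new one and that list-member reports master/worker states correctly.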
function test_list_member() {
	echo "[$(date)] <<<<<< start test_list_member_command >>>>>>"

	master_ports=(0 $MASTER_PORT1 $MASTER_PORT2 $MASTER_PORT3 $MASTER_PORT4 $MASTER_PORT5)
	master_peer_ports=(0 $MASTER_PEER_PORT1 $MASTER_PEER_PORT2 $MASTER_PEER_PORT3 $MASTER_PEER_PORT4 $MASTER_PEER_PORT5)

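	# track which master indexes are still alive, plus the leader reported by each of them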
	alive=(1 2 3 4 5)
	leaders=()
	leader_idx=0

	# TODO: when removing 3 masters (use `seq 0 2`), this test will sometimes fail:
	# DM-master campaigns successfully but fails to `get` from etcd while starting the scheduler; it eventually recovers.
	for i in $(seq 0 1); do
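		# drop the previous leader's index from alive; the substitution leaves an empty
		# element, which the unquoted expansions below skip (a no-op on the first pass)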
		alive=("${alive[@]/$leader_idx/}")
		leaders=()

		# get the leader from every alive master
		for idx in ${alive[@]}; do
			leaders+=($(get_leader $WORK_DIR 127.0.0.1:${master_ports[$idx]}))
		done
		leader=${leaders[0]}
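		# leader names have the form masterN; strip the 6-character "master" prefix to get N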
		leader_idx=${leader:6}
		echo "current leader is" $leader

		# check that the leader is the same on every master
		for ld in ${leaders[@]}; do
			if [ "$leader" != "$ld" ]; then
				echo "leader not consistent"
				exit 1
			fi
		done

		# check list-member master
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:${master_ports[$leader_idx]}" \
			"list-member --master" \
			"\"alive\": true" $((5 - i))

		# kill leader
		echo "kill leader" $leader
		kill_process $leader
		check_master_port_offline $leader_idx
	done

	# rejoin the masters that have been killed
	alive=("${alive[@]/$leader_idx/}")
	for idx in $(seq 1 5); do
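		# whitespace-padded regex membership test: restart any master whose index is no longer in alive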
		if [[ ! " ${alive[@]} " =~ " ${idx} " ]]; then
			run_dm_master $WORK_DIR/master${idx} ${master_ports[$idx]} $cur/conf/dm-master${idx}.toml
			check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:${master_ports[$idx]}
		fi
	done
	# check that the leader is the same on every master
	alive=(1 2 3 4 5)
	leaders=()
	for idx in ${alive[@]}; do
		leaders+=($(get_leader $WORK_DIR 127.0.0.1:${master_ports[$idx]}))
	done
	leader=${leaders[0]}
	leader_idx=${leader:6}
	echo "current leader is" $leader
	for ld in ${leaders[@]}; do
		if [ "$leader" != "$ld" ]; then
			echo "leader not consistent"
			exit 1
		fi
	done
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --master" \
		"\"alive\": true" 5

	# restart the followers one by one
	for idx in $(seq 1 5); do
		leader=$(get_leader $WORK_DIR 127.0.0.1:${master_ports[$idx]})
		leader_idx=${leader:6}
		echo "current leader is" $leader
		if [[ $idx = $leader_idx ]]; then
			continue
		fi
		echo "kill master$idx"
		kill_process dm-master$idx
		check_master_port_offline $idx
		sleep 5
		run_dm_master $WORK_DIR/master${idx} ${master_ports[$idx]} $cur/conf/dm-master${idx}.toml
		check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:${master_ports[$idx]}
		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
			"list-member --master" \
			"\"alive\": true" 5
	done

	# check list-member worker
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker --name=worker1,worker2" \
		"\"stage\": \"bound\"" 2

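	# stopping a source should release its worker: one worker stays bound, the other turns free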
	dmctl_operate_source stop $WORK_DIR/source1.yaml $SOURCE_ID1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker" \
		"\"stage\": \"bound\"" 1 \
		"\"stage\": \"free\"" 1

	dmctl_operate_source stop $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member" \
		"\"stage\": \"free\"" 2

	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --name=worker1,worker2" \
		"\"stage\": \"bound\"" 2

	# kill worker
	echo "kill worker1"
	kill_process dm-worker1
	check_port_offline $WORKER1_PORT 20

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --name=worker1,worker2" \
		"\"stage\": \"bound\"" 1 \
		"\"stage\": \"offline\"" 1

	# kill worker
	echo "kill worker2"
	kill_process dm-worker2
	check_port_offline $WORKER2_PORT 20

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member" \
		"\"stage\": \"offline\"" 2

	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker" \
		"\"stage\": \"bound\"" 2

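	# fetch each master's config and check that its name field matches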
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"config master master1" \
		'name = \\"master1\\"' 1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"config master master2" \
		'name = \\"master2\\"' 1

	echo "[$(date)] <<<<<< finish test_list_member_command >>>>>>"
}

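# run: the main scenario. Prepare upstream data, boot a 5-master cluster, kill and
# rejoin masters at various points, and verify the replication task keeps running.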
function run() {
	run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	check_contains 'Query OK, 2 rows affected'
	run_sql_file $cur/data/db2.prepare.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
	check_contains 'Query OK, 3 rows affected'

	echo "start DM worker and master"
	# start 5 dm-master
	run_dm_master $WORK_DIR/master1 $MASTER_PORT1 $cur/conf/dm-master1.toml
	run_dm_master $WORK_DIR/master2 $MASTER_PORT2 $cur/conf/dm-master2.toml
	run_dm_master $WORK_DIR/master3 $MASTER_PORT3 $cur/conf/dm-master3.toml
	run_dm_master $WORK_DIR/master4 $MASTER_PORT4 $cur/conf/dm-master4.toml
	run_dm_master $WORK_DIR/master5 $MASTER_PORT5 $cur/conf/dm-master5.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT1
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT2
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT3
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT4
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT5

	# wait for master raft log to catch up
	sleep 5

	# disabled because https://github.com/pingcap/dm/issues/1458
	#    # a valid version should exist after the cluster bootstrapped.
	#    version_key=$(echo -n '/dm-cluster/version' | base64)
	#    curl http://127.0.0.1:$MASTER_PORT1/v3/kv/range -X POST -d '{"key": "'"${version_key}"'"}' > $WORK_DIR/cluster_version.log
	#    check_log_contains $WORK_DIR/cluster_version.log "value" 1 # only check that the version exists but do not check the value now

	# kill dm-master1 and dm-master2 to simulate that the first two dm-master addresses in the join config are invalid
	echo "kill dm-master1 and kill dm-master2"
	kill_process dm-master1
	check_master_port_offline 1
	kill_process dm-master2
	check_master_port_offline 2

	# wait for the masters to switch leader and re-set up
	get_leader $WORK_DIR 127.0.0.1:$MASTER_PORT3

	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	# start dm-master1 and dm-master2 again
	echo "start dm-master1 and dm-master2 again"
	run_dm_master $WORK_DIR/master1 $MASTER_PORT1 $cur/conf/dm-master1.toml
	run_dm_master $WORK_DIR/master2 $MASTER_PORT2 $cur/conf/dm-master2.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT1
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT2

	echo "operate mysql config to worker"
	cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml
	cp $cur/conf/source2.yaml $WORK_DIR/source2.yaml
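	# insert a per-worker relay-dir line just above relay-binlog-name in each source config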
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" $WORK_DIR/source1.yaml
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" $WORK_DIR/source2.yaml
	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --master" \
		"\"alive\": true" 5

	test_evict_leader
	test_list_member # TICASE-942, 944, 945, 946, 947

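	# pull the source bound to worker1 out of list-member output; a bound entry carries a
	# line like "source": "mysql-replica-01" (hypothetical id) that grep/awk/cut extract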
	worker1bound=$($PWD/bin/dmctl.test DEVEL --master-addr "127.0.0.1:$MASTER_PORT1" list-member --name worker1 |
		grep 'source' | awk -F: '{print $2}' | cut -d'"' -f 2)
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $worker1bound worker1" \
		"\"result\": true" 2

	echo "start DM task"
	dmctl_start_task "$cur/conf/dm-task.yaml" "--remove-meta"

	echo "use sync_diff_inspector to check full dump loader"
	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml

	echo "flush logs to force rotate binlog file"
	run_sql "flush logs;" $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql "flush logs;" $MYSQL_PORT2 $MYSQL_PASSWORD2

	echo "kill dm-master1 and kill dm-master2"
	kill_process dm-master1
	check_master_port_offline 1
	kill_process dm-master2
	check_master_port_offline 2

	echo "wait and check task running"
	check_http_alive 127.0.0.1:$MASTER_PORT3/apis/${API_VERSION}/status/test '"stage": "Running"' 10
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT3" \
		"query-status test" \
		"\"stage\": \"Running\"" 3

	run_sql_file $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql_file $cur/data/db2.increment.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
	sleep 2

	echo "use sync_diff_inspector to check data now!"
	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml

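	# remove the two killed masters from the cluster membership for good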
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"offline-member --master --name master1" \
		"\"result\": true" 1
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"offline-member --master --name master2" \
		"\"result\": true" 1

	echo "kill dm-master3"
	kill_process dm-master3
	check_master_port_offline 3

	sleep 2
	# the last two masters should elect a new leader and continue to provide service
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"query-status test" \
		"\"stage\": \"Running\"" 3

	# run master3 again
	run_dm_master $WORK_DIR/master3 $MASTER_PORT3 $cur/conf/dm-master3.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT3

	sleep 5

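	# wipe master1's old data dir (default.master1) so it can rejoin as a fresh member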
	rm -rf $WORK_DIR/master1/default.master1
	# rejoin master1 after it was offlined, TICASE-933, 943
	run_dm_master $WORK_DIR/master1 $MASTER_PORT1 $cur/conf/dm-master-join1.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT4" \
		"pause-task test" \
		"\"result\": true" 3 \
		"\"source\": \"$SOURCE_ID1\"" 1 \
		"\"source\": \"$SOURCE_ID2\"" 1
}

cleanup_data ha_master_test
# also clean up DM processes in case the last run failed
cleanup_process $*
run $*
cleanup_process $*

echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"