github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/ha_cases2/run.sh (about)

     1  #!/bin/bash
     2  
     3  set -eu
     4  
     5  cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
     6  source $cur/../_utils/test_prepare
     7  WORK_DIR=$TEST_DIR/$TEST_NAME
     8  API_VERSION="v1alpha1"
     9  # import helper functions
    10  source $cur/../_utils/ha_cases_lib.sh
    11  
    12  function print_debug_status() {
    13  	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT3" \
    14  		"query-status test" \
    15  		"fail me!" 1 &&
    16  		run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT3" \
    17  			"query-status test2" \
    18  			"fail me!" 1 && exit 1
    19  }
    20  
    21  function test_multi_task_running() {
    22  	echo "[$(date)] <<<<<< start test_multi_task_running >>>>>>"
    23  	cleanup
    24  	prepare_sql_multi_task
    25  	start_multi_tasks_cluster
    26  
    27  	# make sure task to step in "Sync" stage
    28  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT3" \
    29  		"query-status test" \
    30  		"\"stage\": \"Running\"" 2 \
    31  		"\"unit\": \"Sync\"" 2
    32  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT3" \
    33  		"query-status test2" \
    34  		"\"stage\": \"Running\"" 2 \
    35  		"\"unit\": \"Sync\"" 2
    36  
    37  	echo "use sync_diff_inspector to check full dump loader"
    38  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
    39  	check_sync_diff $WORK_DIR $cur/conf/diff_config_multi_task.toml
    40  
    41  	echo "flush logs to force rotate binlog file"
    42  	run_sql "flush logs;" $MYSQL_PORT1 $MYSQL_PASSWORD1
    43  	run_sql "flush logs;" $MYSQL_PORT2 $MYSQL_PASSWORD2
    44  
    45  	echo "apply increment data before restart dm-worker to ensure entering increment phase"
    46  	run_sql_file_withdb $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 $ha_test
    47  	run_sql_file_withdb $cur/data/db2.increment.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2 $ha_test
    48  	run_sql_file_withdb $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 $ha_test2
    49  	run_sql_file_withdb $cur/data/db2.increment.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2 $ha_test2
    50  
    51  	sleep 5 # wait for flush checkpoint
    52  	echo "use sync_diff_inspector to check increment data"
    53  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml 50 || print_debug_status
    54  	check_sync_diff $WORK_DIR $cur/conf/diff_config_multi_task.toml 50 || print_debug_status
    55  	echo "[$(date)] <<<<<< finish test_multi_task_running >>>>>>"
    56  }
    57  
    58  function test_pause_task() {
    59  	echo "[$(date)] <<<<<< start test_pause_task >>>>>>"
    60  	test_multi_task_running
    61  
    62  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    63  		"start-relay -s $SOURCE_ID1 worker1" \
    64  		"\"result\": true" 2
    65  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    66  		"start-relay -s $SOURCE_ID2 worker2" \
    67  		"\"result\": true" 2
    68  
    69  	echo "start dumping SQLs into source"
    70  	load_data $MYSQL_PORT1 $MYSQL_PASSWORD1 "a" &
    71  	load_data $MYSQL_PORT2 $MYSQL_PASSWORD2 "b" &
    72  
    73  	task_name=(test test2)
    74  	for name in ${task_name[@]}; do
    75  		echo "pause tasks $name"
    76  
    77  		# because some SQL may running (often remove checkpoint record), pause will cause that SQL failed
    78  		# thus `result` is not true
    79  		run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    80  			"pause-task $name"
    81  
    82  		# pause twice, just used to test pause by the way
    83  		run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    84  			"pause-task $name"
    85  
    86  		run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    87  			"query-status $name" \
    88  			"\"stage\": \"Paused\"" 2
    89  	done
    90  
    91  	sleep 1
    92  
    93  	for name in ${task_name[@]}; do
    94  		echo "resume tasks $name"
    95  		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    96  			"resume-task $name" \
    97  			"\"result\": true" 3
    98  
    99  		# resume twice, just used to test resume by the way
   100  		run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
   101  			"resume-task $name" \
   102  			"\"result\": true" 3
   103  
   104  		run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
   105  			"query-status $name" \
   106  			"\"stage\": \"Running\"" 4
   107  	done
   108  
   109  	# waiting for syncing
   110  	wait
   111  	sleep 1
   112  
   113  	echo "use sync_diff_inspector to check increment data"
   114  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
   115  	check_sync_diff $WORK_DIR $cur/conf/diff_config_multi_task.toml
   116  	echo "[$(date)] <<<<<< finish test_pause_task >>>>>>"
   117  }
   118  
   119  function test_multi_task_reduce_and_restart_worker() {
   120  	echo "[$(date)] <<<<<< start test_multi_task_reduce_and_restart_worker >>>>>>"
   121  	test_multi_task_running
   122  
   123  	echo "start dumping SQLs into source"
   124  	load_data $MYSQL_PORT1 $MYSQL_PASSWORD1 "a" &
   125  	load_data $MYSQL_PORT2 $MYSQL_PASSWORD2 "b" &
   126  	load_data $MYSQL_PORT1 $MYSQL_PASSWORD1 "a" $ha_test2 &
   127  	load_data $MYSQL_PORT2 $MYSQL_PASSWORD2 "b" $ha_test2 &
   128  	worker_ports=($WORKER1_PORT $WORKER2_PORT $WORKER3_PORT $WORKER4_PORT $WORKER5_PORT)
   129  
   130  	# find which worker is in use
   131  	task_name=(test test2)
   132  	worker_inuse=("") # such as ("worker1" "worker4")
   133  	status=$($PWD/bin/dmctl.test DEVEL --master-addr "127.0.0.1:$MASTER_PORT" query-status test |
   134  		grep 'worker' | awk -F: '{print $2}')
   135  	echo $status
   136  	for w in ${status[@]}; do
   137  		worker_inuse=(${worker_inuse[*]} ${w:0-9:7})
   138  		echo "find workers: ${w:0-9:7} for task: test"
   139  	done
   140  	echo "find all workers: ${worker_inuse[@]} (total: ${#worker_inuse[@]})"
   141  
   142  	for idx in $(seq 1 5); do
   143  		if [[ ! " ${worker_inuse[@]} " =~ " worker${idx} " ]]; then
   144  			echo "restart unuse worker${idx}"
   145  
   146  			echo "try to kill worker port ${worker_ports[$(($idx - 1))]}"
   147  			kill_process dm-worker${idx}
   148  			run_dm_ctl_with_retry $WORK_DIR 127.0.0.1:$MASTER_PORT2 "list-member --worker --name=worker$idx" '"stage": "offline"' 1
   149  
   150  			echo "start dm-worker${idx}"
   151  			run_dm_worker $WORK_DIR/worker${idx} ${worker_ports[$(($idx - 1))]} $cur/conf/dm-worker${idx}.toml
   152  			check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:${worker_ports[$(($idx - 1))]}
   153  		fi
   154  	done
   155  
   156  	for ((i = 0; i < ${#worker_inuse[@]}; i++)); do
   157  		wk=${worker_inuse[$i]:0-1:1}                                 # get worker id, such as ("1", "4")
   158  		echo "try to kill worker port ${worker_ports[$(($wk - 1))]}" # get relative worker port
   159  		kill_process dm-${worker_inuse[$i]}
   160  		check_port_offline ${worker_ports[$(($wk - 1))]} 20
   161  		# just one worker was killed should be safe
   162  		echo "${worker_inuse[$i]} was killed"
   163  		for name in ${task_name[@]}; do
   164  			run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
   165  				"query-status $name" \
   166  				"\"stage\": \"Running\"" 2
   167  		done
   168  		if [ $i = 0 ]; then
   169  			# waiting for syncing
   170  			wait
   171  			sleep 2
   172  			echo "use sync_diff_inspector to check increment data"
   173  			check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
   174  			check_sync_diff $WORK_DIR $cur/conf/diff_config_multi_task.toml
   175  			echo "data checked after one worker was killed"
   176  		fi
   177  	done
   178  	echo "[$(date)] <<<<<< finish test_multi_task_reduce_and_restart_worker >>>>>>"
   179  }
   180  
   181  function run() {
   182  	test_pause_task                           # TICASE-990
   183  	test_multi_task_reduce_and_restart_worker # TICASE-968, 994, 995, 964, 966, 979, 981, 982, 985, 986, 989, 993
   184  }
   185  
   186  cleanup_data $ha_test
   187  cleanup_data $ha_test2
   188  # also cleanup dm processes in case of last run failed
   189  cleanup_process $*
   190  run $*
   191  cleanup_process $*
   192  
   193  echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"