github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/incremental_mode/run.sh

#!/bin/bash

set -eu

cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source $cur/../_utils/test_prepare
WORK_DIR=$TEST_DIR/$TEST_NAME
TASK_NAME="test"

API_VERSION="v1alpha1"

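# print the server_uuid of the MySQL instance at host $1, port $2
# (used below to build a GTID set like "<uuid>:0")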
function get_uuid() {
	uuid=$(echo "show variables like '%server_uuid%';" | MYSQL_PWD=123456 mysql -uroot -h$1 -P$2 | awk 'FNR == 2 {print $2}')
	echo $uuid
}

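# print the name of the oldest binlog file of the MySQL instance at host $1, port $2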
function get_binlog_name() {
	binlog_name=$(echo "SHOW BINARY LOGS;" | MYSQL_PWD=123456 mysql -uroot -h$1 -P$2 | awk 'FNR == 2 {print $1}')
	echo $binlog_name
}

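# print the name of the newest binlog file of the MySQL instance at host $1, port $2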
function get_latest_name() {
	binlog_name=$(echo "SHOW BINARY LOGS;" | MYSQL_PWD=123456 mysql -uroot -h$1 -P$2 | awk 'END{print $1}')
	echo $binlog_name
}

######################################################
#   this test is also used by the binlog 999999 test
######################################################
function run() {
	run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	check_contains 'Query OK, 2 rows affected'
	run_sql_file $cur/data/db2.prepare.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
	check_contains 'Query OK, 3 rows affected'
	uuid=($(get_uuid $MYSQL_HOST1 $MYSQL_PORT1))
	binlog_name=($(get_binlog_name $MYSQL_HOST2 $MYSQL_PORT2))

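	# shorten the worker keepalive TTL to 1 second via failpoint, so the master
	# notices dead workers quickly in the offline check below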
	export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/worker/defaultKeepAliveTTL=return(1)"

	run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	# the keepalive TTL was changed to 1 second by the failpoint, so the master should mark the killed workers offline within a few seconds
	killall -9 dm-worker.test
	sleep 3
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member" \
		"\"stage\": \"offline\"" 2
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	# apply the MySQL source configs to the workers
	cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml
	cp $cur/conf/source2.yaml $WORK_DIR/source2.yaml
	sed -i "s/binlog-gtid-placeholder/$uuid:0/g" $WORK_DIR/source1.yaml
	sed -i "s/binlog-name-placeholder/$binlog_name/g" $WORK_DIR/source2.yaml
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" $WORK_DIR/source1.yaml
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" $WORK_DIR/source2.yaml
	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2

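	# find out which source each worker is bound to, so start-relay targets the matching worker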
	worker1bound=$($PWD/bin/dmctl.test DEVEL --master-addr "127.0.0.1:$MASTER_PORT1" list-member --name worker1 |
		grep 'source' | awk -F: '{print $2}' | cut -d'"' -f 2)
	worker2bound=$($PWD/bin/dmctl.test DEVEL --master-addr "127.0.0.1:$MASTER_PORT1" list-member --name worker2 |
		grep 'source' | awk -F: '{print $2}' | cut -d'"' -f 2)
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $worker1bound worker1" \
		"\"result\": true" 2
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $worker2bound worker2" \
		"\"result\": true" 2

	# relay should be running after start-relay
	sleep 2
	# with relay enabled the keepalive TTL is back to the default of 30 minutes,
	# so the master should still list the killed workers as bound
	killall -9 dm-worker.test
	sleep 3
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member" \
		"\"stage\": \"bound\"" 2
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	kill_dm_worker
	check_port_offline $WORKER1_PORT 20
	check_port_offline $WORKER2_PORT 20

	# use an account with limited privileges
	run_sql_file $cur/data/db1.prepare.user.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	check_count 'Query OK, 0 rows affected' 7
	run_sql_file $cur/data/db2.prepare.user.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
	check_count 'Query OK, 0 rows affected' 7

	# update the source configs to use the limited-privilege user
	sed -i "s/root/dm_incremental/g" $WORK_DIR/source1.yaml
	sed -i "s/root/dm_incremental/g" $WORK_DIR/source2.yaml

	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"operate-source update $WORK_DIR/source1.yaml" \
		"Update worker config is not supported by dm-ha now" 1
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"operate-source update $WORK_DIR/source2.yaml" \
		"Update worker config is not supported by dm-ha now" 1
	# updating a source config in place is not supported by dm-ha yet, so stop and re-create the sources to apply the new config

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"stop-relay -s $worker1bound worker1" \
		"\"result\": true" 2
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"stop-relay -s $worker2bound worker2" \
		"\"result\": true" 2

	dmctl_operate_source stop $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source stop $WORK_DIR/source2.yaml $SOURCE_ID2
	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $worker1bound worker1" \
		"\"result\": true" 2
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $worker2bound worker2" \
		"\"result\": true" 2

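	# the sed splits occurrences onto separate lines so grep -c counts every occurrence
	# of $SOURCE_ID1 in worker1's log; the count decides which worker's log to inspect
	# for the active-relay-log check further below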
	worker1_run_source_1=$(sed "s/$SOURCE_ID1/$SOURCE_ID1\n/g" $WORK_DIR/worker1/log/dm-worker.log | grep -c "$SOURCE_ID1") || true
	echo "start task in incremental mode with zero gtid/pos"
	sed "s/binlog-gtid-placeholder-1/$uuid:0/g" $cur/conf/dm-task.yaml >$WORK_DIR/dm-task.yaml
	sed -i "s/binlog-name-placeholder-2/$binlog_name/g" $WORK_DIR/dm-task.yaml
	sed -i "s/binlog-pos-placeholder-2/4/g" $WORK_DIR/dm-task.yaml
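	# binlog position 4 is the first event in a binlog file, right after the 4-byte magic header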

	# test that the injected get-event error is surfaced gracefully
	export GO_FAILPOINTS='github.com/pingcap/tiflow/dm/syncer/binlogstream/GetEventError=return'
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-task $WORK_DIR/dm-task.yaml --remove-meta"
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"\"ErrCode\": 36069" 2

	kill_dm_worker
	check_port_offline $WORKER1_PORT 20
	check_port_offline $WORKER2_PORT 20

	# mock pulling binlog failing only once; WaitUserCancel makes the syncer
	# sleep 8 seconds while executing SQLs (see the context-cancel check below)
	export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/WaitUserCancel=return(8);github.com/pingcap/tiflow/dm/syncer/binlogstream/GetEventError=1*return"
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	sleep 3
	# check that the active relay log is updated even when no binlog name is specified
	if [ $worker1_run_source_1 -gt 0 ]; then
		grep -E ".*current earliest active relay log.*$binlog_name" $WORK_DIR/worker2/log/dm-worker.log
	else
		grep -E ".*current earliest active relay log.*$binlog_name" $WORK_DIR/worker1/log/dm-worker.log
	fi

	run_sql_file $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql_file $cur/data/db2.increment.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"Running" 4
	# check that the binlog puller was reset successfully
	grep -Fq "reset replication binlog puller" $WORK_DIR/worker1/log/dm-worker.log
	grep -Fq "reset replication binlog puller" $WORK_DIR/worker2/log/dm-worker.log

	check_log_contain_with_retry 'finish to handle ddls in normal mode.*create table t2' $WORK_DIR/worker1/log/dm-worker.log $WORK_DIR/worker2/log/dm-worker.log

	# the failpoint makes the worker sleep 8 seconds while executing SQLs, to raise the
	# probability of hitting a context-cancel error when the task is paused.
	# if the checks below pass, either we filtered that error out or it didn't happen.
	# we only look at failures, to catch any unfiltered context-cancel error;
	# the output should not contain errors like:
	#   - `driver: bad connection`
	#   - `sql: connection is already closed`
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"pause-task test" \
		"\"result\": true" 3

	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"stop-task test" \
		"\"result\": true" 3
	kill_dm_worker
	check_port_offline $WORKER1_PORT 20
	check_port_offline $WORKER2_PORT 20

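	# the FlushCheckpointStage failpoint injects an error at every checkpoint-flush stage,
	# so the task pauses at each stage below and we resume it step by step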
	export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/FlushCheckpointStage=return(100)" # for all stages
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	sleep 3
	# start the DM task; don't check the result because it will hit the injected error soon
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-task $WORK_DIR/dm-task.yaml"

	# the task should be paused by the `FlushCheckpointStage` failpoint before flushing the old checkpoint.
	# `db2.increment.sql` has no DDL, so the message should appear only once.
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"failpoint error for FlushCheckpointStage before flush old checkpoint" 1

	# resume-task to next stage
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"resume-task test"

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"failpoint error for FlushCheckpointStage before track DDL" 1

	# resume-task to next stage
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"resume-task test"

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"failpoint error for FlushCheckpointStage before execute DDL" 1

	# resume-task to next stage
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"resume-task test"

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"failpoint error for FlushCheckpointStage before save checkpoint" 1

	# resume-task to next stage
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"resume-task test"

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"failpoint error for FlushCheckpointStage before flush checkpoint" 1

	# resume-task to continue the sync
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"resume-task test" \
		"\"result\": true" 3

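	# verify that upstream and downstream data are now consistent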
	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml

	# test binlog rotation: after the rotate and a DDL, the master binlog position should equal the syncer binlog position
	run_sql "flush logs;" $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql "truncate table incremental_mode.t1;" $MYSQL_PORT1 $MYSQL_PASSWORD1

	sleep 2
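	# fetch the task status from the master's HTTP API and extract mysql-replica-01's
	# syncerBinlog and masterBinlog positions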
	curl -X GET 127.0.0.1:$MASTER_PORT/apis/${API_VERSION}/status/test >$WORK_DIR/status.log
	SYNCER_BINLOG=$(sed 's/.*mysql-replica-01.*\"syncerBinlog\":\"\(.*\)\",\"syncerBinlogGtid.*mysql-replica-02.*/\1/g' $WORK_DIR/status.log)
	MASTER_BINLOG=$(sed 's/.*mysql-replica-01.*\"masterBinlog\":\"\(.*\)\",\"masterBinlogGtid.*mysql-replica-02.*/\1/g' $WORK_DIR/status.log)

	if [ "$MASTER_BINLOG" != "$SYNCER_BINLOG" ]; then
		echo "master binlog is not equal to syncer binlog"
		cat $WORK_DIR/status.log
		exit 1
	fi

	export GO_FAILPOINTS=''
	# stop the DM task
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"stop-task $WORK_DIR/dm-task.yaml"

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"stop-relay -s $worker1bound worker1" \
		"\"result\": true" 2
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"stop-relay -s $worker2bound worker2" \
		"\"result\": true" 2

	# test binlog rotation again: after rotating, purge some binlogs and check whether DM's precheck handles it correctly
	uuid=($(get_uuid $MYSQL_HOST1 $MYSQL_PORT1)) # get uuid before truncate
	binlog_name=($(get_latest_name $MYSQL_HOST2 $MYSQL_PORT2))

	run_sql "flush logs;" $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql "flush logs;" $MYSQL_PORT2 $MYSQL_PASSWORD2
	new_binlog_name1=($(get_latest_name $MYSQL_HOST1 $MYSQL_PORT1))
	new_binlog_name2=($(get_latest_name $MYSQL_HOST2 $MYSQL_PORT2))

	sed "s/binlog-gtid-placeholder-1/$uuid:0/g" $cur/conf/dm-task.yaml >$WORK_DIR/dm-task.yaml
	sed -i "s/binlog-name-placeholder-2/$new_binlog_name2/g" $WORK_DIR/dm-task.yaml
	sed -i "s/binlog-pos-placeholder-2/4/g" $WORK_DIR/dm-task.yaml
	# precheck the DM task: the start binlogs still exist, so "meta position check" should not be reported (count 0)
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"check-task $WORK_DIR/dm-task.yaml" \
		"\"name\": \"meta position check\"" 0

	run_sql "purge binary logs to '$new_binlog_name2'" $MYSQL_PORT2 $MYSQL_PASSWORD2
	run_sql "truncate table incremental_mode.t1;" $MYSQL_PORT1 $MYSQL_PASSWORD1
	run_sql "truncate table incremental_mode.t2;" $MYSQL_PORT2 $MYSQL_PASSWORD2

	sed "s/binlog-gtid-placeholder-1/$uuid:0/g" $cur/conf/dm-task.yaml >$WORK_DIR/dm-task.yaml
	sed -i "s/binlog-name-placeholder-2/$binlog_name/g" $WORK_DIR/dm-task.yaml
	sed -i "s/binlog-pos-placeholder-2/4/g" $WORK_DIR/dm-task.yaml

	# precheck the DM task: source 2's start binlog has been purged, so "meta position check" should be reported once
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"check-task $WORK_DIR/dm-task.yaml" \
		"\"name\": \"meta position check\"" 1

	run_sql "purge binary logs to '$new_binlog_name1'" $MYSQL_PORT1 $MYSQL_PASSWORD1
	sed "s/binlog-gtid-placeholder-1/$uuid:0/g" $cur/conf/dm-task.yaml >$WORK_DIR/dm-task.yaml
	sed -i "s/binlog-name-placeholder-2/$new_binlog_name2/g" $WORK_DIR/dm-task.yaml
	sed -i "s/binlog-pos-placeholder-2/4/g" $WORK_DIR/dm-task.yaml

	# precheck the DM task: source 1's required binlogs have been purged, so "meta position check" should be reported once
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"check-task $WORK_DIR/dm-task.yaml" \
		"\"name\": \"meta position check\"" 1

	# starting the DM task should fail the same meta position precheck
	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-task $WORK_DIR/dm-task.yaml --remove-meta" \
		"\"name\": \"meta position check\"" 1
}

cleanup_data $TEST_NAME
# also clean up DM processes in case the last run failed
cleanup_process $*
run $*
cleanup_process $*

echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"