#!/bin/bash
# Source: github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/safe_mode/run.sh
#
# Safe-mode integration test driver. It runs, in order:
#   1. consistency_none    - full dump with "--consistency none", then checks
#                            the dump metadata and the worker logs.
#   2. safe_mode_recover   - four rounds with the SafeModeExit failpoint set
#                            to return(1)..return(4).
#   3. safe_mode_duration  - task restarts with safe-mode-duration 0s/30s.
#   4. the body of run()   - worker exit/restart around a re-sync using the
#                            ReSyncExit / ShardSyncedExecutionExit failpoints.
#
# Helpers such as run_sql_file, run_dm_master, run_dm_worker, run_dm_ctl,
# check_sync_diff, cleanup_process, ... are not defined in this file;
# presumably they come from the sourced _utils/test_prepare.
#
# Quoting note: filesystem paths are quoted. Bare environment-variable
# arguments handed to those helpers (hosts, ports, passwords, source ids) are
# deliberately left unquoted, exactly as before, so that an empty value does
# not change the number of arguments a helper receives.

set -eu

cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source "$cur/../_utils/test_prepare"
WORK_DIR=$TEST_DIR/$TEST_NAME

# Fail the test unless the given dumpling metadata file contains exactly two
# lines without any non-whitespace character.
# Arguments: $1 - path to the metadata file
function check_dump_metadata_blank_lines() {
  local metadata_file=$1
  local blank_lines
  # grep -c exits with status 1 when the count is 0, which under `set -e`
  # would abort the script before the message below is printed; tolerate it
  # and let the comparison report the problem instead.
  blank_lines=$(grep -cvE '\S' "$metadata_file" || true)
  # String comparison so an empty result (e.g. unreadable file) is reported
  # as a mismatch rather than as a `[` syntax error.
  if [ "$blank_lines" != "2" ]; then
    echo "wrong number of empty line in dumpling's metadata"
    exit 1
  fi
}

# Read the last "Log:", "Pos:" and "GTID:" entries from a worker's dump
# metadata and wait until the worker log contains the matching
# "compare exitPoint and beginLocation" line.
# Arguments: $1 - worker directory (contains dumped_data.test/ and log/)
function check_exit_point_logged() {
  local worker_dir=$1
  local metadata_file=$worker_dir/dumped_data.test/metadata
  local name pos gtid
  name=$(grep "Log: " "$metadata_file" | tail -1 | awk -F: '{print $2}' | tr -d ' ')
  pos=$(grep "Pos: " "$metadata_file" | tail -1 | awk -F: '{print $2}' | tr -d ' ')
  gtid=$(grep "GTID:" "$metadata_file" | tail -1 | awk -F: '{print $2,":",$3}' | tr -d ' ')
  check_log_contain_with_retry "\[\"compare exitPoint and beginLocation\"\] \[task=test\] \[unit=\"binlog replication\"\] \[exitPoint=\"position: ($name, $pos), gtid-set: $gtid\"\]" "$worker_dir/log/dm-worker.log"
}

# Write a CREATE TABLE statement to $WORK_DIR/schema.sql and apply it with
# "operate-schema set" for task "test", database safe_mode_test.
# Arguments: $1 - source id, $2 - table name, $3 - CREATE TABLE statement
function set_table_schema() {
  local schema_source=$1
  local table=$2
  local create_sql=$3
  printf '%s\n' "$create_sql" >"${WORK_DIR}/schema.sql"
  run_dm_ctl "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
    "operate-schema set -s $schema_source test -d safe_mode_test -t $table ${WORK_DIR}/schema.sql" \
    "\"result\": true" 2
}

# Dump with "--consistency none" and verify the dump metadata layout and the
# exit point reported in both worker logs. Source 2 runs with
# SQL_MODE='ANSI_QUOTES' for the duration of the case.
# run() calls this without arguments, so "$@" below is empty in practice.
function consistency_none() {
  run_sql_source2 "SET @@GLOBAL.SQL_MODE='ANSI_QUOTES'"
  run_sql_file "$cur/data/db1.prepare.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  check_contains 'Query OK, 2 rows affected'
  run_sql_file "$cur/data/db2.prepare.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
  check_contains 'Query OK, 3 rows affected'

  run_dm_master "$WORK_DIR/master" $MASTER_PORT "$cur/conf/dm-master.toml"
  check_rpc_alive "$cur/../bin/check_master_online" "127.0.0.1:$MASTER_PORT"
  run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
  run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
  cp "$cur/conf/source1.yaml" "$WORK_DIR/source1.yaml"
  cp "$cur/conf/source2.yaml" "$WORK_DIR/source2.yaml"
  sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" "$WORK_DIR/source1.yaml"
  sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" "$WORK_DIR/source2.yaml"
  dmctl_operate_source create "$WORK_DIR/source1.yaml" $SOURCE_ID1
  dmctl_operate_source create "$WORK_DIR/source2.yaml" $SOURCE_ID2

  # Work on a copy of the task config: keep the dump files and pass
  # "--consistency none" as the extra dump argument.
  cp "$cur/conf/dm-task.yaml" "$WORK_DIR/dm-task.yaml"
  sed -i "/enable-heartbeat/i\clean-dump-file: false" "$WORK_DIR/dm-task.yaml"
  sed -i "s/extra-args: \"\"/extra-args: \"--consistency none\"/g" "$WORK_DIR/dm-task.yaml"
  dmctl_start_task "$WORK_DIR/dm-task.yaml" "--remove-meta"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

  sleep 1
  # make sure dumpling's metadata added empty line after two SHOW MASTER STATUS
  check_dump_metadata_blank_lines "$WORK_DIR/worker1/dumped_data.test/metadata"
  check_dump_metadata_blank_lines "$WORK_DIR/worker2/dumped_data.test/metadata"

  check_exit_point_logged "$WORK_DIR/worker1"
  check_exit_point_logged "$WORK_DIR/worker2"

  run_sql_source2 "SET @@GLOBAL.SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'"
  cleanup_process "$@"
  cleanup_data safe_mode_target
}

# Run the check_exit_safe_binlog binary for one source.
# Arguments: $1 - source id
#            $2 - compare_gtid flag ("true" / "false")
#            $3 - comparison operator handed to the binary ("<" or "=")
function check_exit_safe_binlog() {
  local source_id=$1
  local compare_gtid=$2
  local compare=$3

  # The operator is wrapped in escaped double quotes inside the command
  # string so the inner shell receives "<" as an argument, not a redirection.
  bash -c "$cur/../bin/check_exit_safe_binlog $TIDB_PASSWORD $source_id $compare_gtid \"$compare\""
}

# Run four rounds (i = 1..4). Each round syncs the prepared data, restarts
# the workers with the SafeModeExit failpoint set to return($i), checks the
# paused state and the exit-safe binlog, then clears the failpoint and checks
# that syncing completes.
# run() calls this without arguments, so "$@" below is empty in practice.
function safe_mode_recover() {
  local i=1
  local expected_paused compare
  while [ "$i" -lt 5 ]; do
    echo "start to run safe mode case $i"
    export GO_FAILPOINTS=""

    run_sql_file "$cur/data/db1.prepare.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
    check_contains 'Query OK, 2 rows affected'
    run_sql_file "$cur/data/db2.prepare.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
    check_contains 'Query OK, 3 rows affected'

    run_dm_master "$WORK_DIR/master" $MASTER_PORT "$cur/conf/dm-master.toml"
    check_rpc_alive "$cur/../bin/check_master_online" "127.0.0.1:$MASTER_PORT"
    run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
    check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
    run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
    check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
    # operate mysql config to worker
    cp "$cur/conf/source1.yaml" "$WORK_DIR/source1.yaml"
    cp "$cur/conf/source2.yaml" "$WORK_DIR/source2.yaml"
    sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" "$WORK_DIR/source1.yaml"
    sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" "$WORK_DIR/source2.yaml"
    dmctl_operate_source create "$WORK_DIR/source1.yaml" $SOURCE_ID1
    dmctl_operate_source create "$WORK_DIR/source2.yaml" $SOURCE_ID2

    dmctl_start_task "$cur/conf/dm-task.yaml" "--remove-meta"
    check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

    kill_dm_worker

    export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/SafeModeExit=return($i)"
    run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
    check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
    run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
    check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"

    # DM-worker returns error due to the mocked error
    run_sql_file "$cur/data/db1.increment.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
    run_sql_file "$cur/data/db2.increment.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
    # Rounds 2 and 3 expect a single "Paused" match; rounds 1 and 4 expect two.
    expected_paused=2
    if [ "$i" -ge 2 ] && [ "$i" -le 3 ]; then
      expected_paused=1
    fi
    run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
      "query-status test" \
      "Paused" "$expected_paused"
    if [ "$expected_paused" -eq 1 ]; then
      # Pause the task and set the table schemas explicitly for both sources.
      run_dm_ctl "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
        "pause-task test" \
        "\"result\": true" 3
      set_table_schema "mysql-replica-01" "t1" \
        'create table t1 (id bigint auto_increment, uid int, name varchar(80), primary key (`id`), unique key(`uid`)) DEFAULT CHARSET=utf8mb4;'
      set_table_schema "mysql-replica-01" "t2" \
        'create table t2 (id bigint auto_increment, uid int, name varchar(80), primary key (`id`), unique key(`uid`)) DEFAULT CHARSET=utf8mb4 AUTO_INCREMENT = 100;'
      set_table_schema "mysql-replica-02" "t2" \
        'create table t2 (id bigint auto_increment, uid int, name varchar(80), primary key (`id`), unique key(`uid`)) DEFAULT CHARSET=utf8mb4 AUTO_INCREMENT = 200;'
      set_table_schema "mysql-replica-02" "t3" \
        'create table t3 (id bigint auto_increment, uid int, name varchar(80), primary key (`id`), unique key(`uid`)) DEFAULT CHARSET=utf8mb4 AUTO_INCREMENT = 300;'
    fi

    kill_dm_worker

    # Round 1 checks with "=", later rounds with "<".
    compare="<"
    if [ "$i" -lt 2 ]; then
      compare="="
    fi
    check_exit_safe_binlog "mysql-replica-01" "true" "$compare"
    check_exit_safe_binlog "mysql-replica-02" "false" "$compare"

    export GO_FAILPOINTS=""
    # clean failpoint, should sync successfully
    run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
    run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
    check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
    check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"

    run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
      "resume-task test" \
      "\"result\": true" 3
    sleep 3
    run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
      "query-status test" \
      "Running" 3
    echo "check sync diff after clean SafeModeExit failpoint"
    check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

    # DM-worker exit when waiting for sharding group synced
    run_sql_file "$cur/data/db1.increment2.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
    run_sql_file "$cur/data/db2.increment2.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
    echo "check sync diff after restart DDL owner"
    sleep 3
    check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"
    run_dm_ctl "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
      "pause-task test" \
      "\"result\": true" 3
    run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
      "query-status test" \
      "Paused" 2

    check_exit_safe_binlog "mysql-replica-01" "true" "="
    check_exit_safe_binlog "mysql-replica-02" "false" "="

    echo "finish running run safe mode recover case $i"
    ((i += 1))
    cleanup_process "$@"
    cleanup_data safe_mode_target
  done
}

# Start the task from dm-task-safe-mode-duration.yaml with the
# SkipSaveGlobalPoint failpoint, restart the workers without it, expect the
# "safe-mode-duration ... 0s" message in query-status, then restart the task
# with safe-mode-duration 30s and again with 0s, checking sync diff each time.
# run() calls this without arguments, so "$@" below is empty in practice.
function safe_mode_duration() {
  run_sql_file "$cur/data/db1.prepare.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  check_contains 'Query OK, 2 rows affected'
  run_sql_file "$cur/data/db2.prepare.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
  check_contains 'Query OK, 3 rows affected'

  # make sure exitPoint not equal to beginPosition
  export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/SkipSaveGlobalPoint=return()"

  run_dm_master "$WORK_DIR/master" $MASTER_PORT "$cur/conf/dm-master.toml"
  check_rpc_alive "$cur/../bin/check_master_online" "127.0.0.1:$MASTER_PORT"
  # worker1 -> source1
  run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
  cp "$cur/conf/source1.yaml" "$WORK_DIR/source1.yaml"
  sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" "$WORK_DIR/source1.yaml"
  dmctl_operate_source create "$WORK_DIR/source1.yaml" $SOURCE_ID1
  # worker2 -> source2
  run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
  cp "$cur/conf/source2.yaml" "$WORK_DIR/source2.yaml"
  sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" "$WORK_DIR/source2.yaml"
  dmctl_operate_source create "$WORK_DIR/source2.yaml" $SOURCE_ID2

  dmctl_start_task "$cur/conf/dm-task-safe-mode-duration.yaml" "--remove-meta"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

  run_sql_file "$cur/data/db1.increment.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  run_sql_file "$cur/data/db2.increment.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2

  # make sure worker1 enter to sync status, and complete one dml
  check_log_contain_with_retry "event=XID" "$WORK_DIR/worker1/log/dm-worker.log"
  # restart workers
  kill_dm_worker

  export GO_FAILPOINTS=""
  run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
  run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"

  run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
    "query-status test" \
    "your \`safe-mode-duration\` in task.yaml is set to 0s" 1

  # stop and start task success
  run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
    "stop-task test" \
    "\"result\": true" 3

  cp "$cur/conf/dm-task-safe-mode-duration.yaml" "$WORK_DIR/dm-task-safe-mode-duration.yaml"
  sed -i "s/safe-mode-duration: \"0s\"/safe-mode-duration: \"30s\"/" "$WORK_DIR/dm-task-safe-mode-duration.yaml"

  dmctl_start_task "$WORK_DIR/dm-task-safe-mode-duration.yaml"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

  sed -i "s/safe-mode-duration: \"30s\"/safe-mode-duration: \"0s\"/" "$WORK_DIR/dm-task-safe-mode-duration.yaml"
  # stop and start task success
  run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
    "stop-task test" \
    "\"result\": true" 3

  run_sql_file "$cur/data/db1.increment2.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  run_sql_file "$cur/data/db2.increment2.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2

  dmctl_start_task "$WORK_DIR/dm-task-safe-mode-duration.yaml"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

  echo "finish running safe mode duration case"
  cleanup_process "$@"
  cleanup_data safe_mode_target
}

# Entry point: run the three cases above, then the re-sync scenario:
# start with the ReSyncExit failpoint, wait for both workers to go offline,
# restart them with ShardSyncedExecutionExit + SafeModeInitPhaseSeconds=300s,
# then restart whichever worker goes offline next with
# SafeModeInitPhaseSeconds=0s and check sync diff.
# The sub-cases are invoked without arguments (as before).
function run() {
  consistency_none
  safe_mode_recover
  safe_mode_duration

  run_sql_file "$cur/data/db1.prepare.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  check_contains 'Query OK, 2 rows affected'
  run_sql_file "$cur/data/db2.prepare.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
  check_contains 'Query OK, 3 rows affected'

  export GO_FAILPOINTS='github.com/pingcap/tiflow/dm/syncer/ReSyncExit=return(true)'

  run_dm_master "$WORK_DIR/master" $MASTER_PORT "$cur/conf/dm-master.toml"
  check_rpc_alive "$cur/../bin/check_master_online" "127.0.0.1:$MASTER_PORT"
  run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
  run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
  # operate mysql config to worker
  cp "$cur/conf/source1.yaml" "$WORK_DIR/source1.yaml"
  cp "$cur/conf/source2.yaml" "$WORK_DIR/source2.yaml"
  sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" "$WORK_DIR/source1.yaml"
  sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" "$WORK_DIR/source2.yaml"
  dmctl_operate_source create "$WORK_DIR/source1.yaml" $SOURCE_ID1
  dmctl_operate_source create "$WORK_DIR/source2.yaml" $SOURCE_ID2

  dmctl_start_task "$cur/conf/dm-task.yaml" "--remove-meta"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

  # DM-worker exit during re-sync after sharding group synced
  run_sql_file "$cur/data/db1.increment.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  run_sql_file "$cur/data/db2.increment.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2

  check_port_offline $WORKER1_PORT 20
  check_port_offline $WORKER2_PORT 20

  export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/ShardSyncedExecutionExit=return(true);github.com/pingcap/tiflow/dm/syncer/SafeModeInitPhaseSeconds=return(\"300s\")"

  run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
  run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
  check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"

  sleep 5
  echo "check sync diff after set SafeModeInitPhaseSeconds failpoint"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"

  # DM-worker exit when waiting for sharding group synced
  run_sql_file "$cur/data/db1.increment2.sql" $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
  run_sql_file "$cur/data/db2.increment2.sql" $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2

  # OWNER_PORT and check_instance_id are assigned below but not read anywhere
  # in this file; they are kept as globals in case a sourced helper uses them.
  OWNER_PORT=""
  local i=0
  # Poll up to 10 times, one second apart, for a worker to go offline.
  while [ "$i" -lt 10 ]; do
    # we can't determine which DM-worker is the sharding lock owner, so we try both of them
    # DM-worker1 is sharding lock owner and exits
    if [ "$(check_port_return $WORKER1_PORT)" == "0" ]; then
      echo "DM-worker1 is sharding lock owner and detects it offline"
      export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/SafeModeInitPhaseSeconds=return(\"0s\")"
      run_dm_worker "$WORK_DIR/worker1" $WORKER1_PORT "$cur/conf/dm-worker1.toml"
      check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
      check_instance_id="1"
      OWNER_PORT=$WORKER1_PORT
      break
    fi
    # DM-worker2 is sharding lock owner and exits
    if [ "$(check_port_return $WORKER2_PORT)" == "0" ]; then
      echo "DM-worker2 is sharding lock owner and detects it offline"
      export GO_FAILPOINTS="github.com/pingcap/tiflow/dm/syncer/SafeModeInitPhaseSeconds=return(\"0s\")"
      run_dm_worker "$WORK_DIR/worker2" $WORKER2_PORT "$cur/conf/dm-worker2.toml"
      check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
      check_instance_id="2"
      OWNER_PORT=$WORKER2_PORT
      break
    fi

    ((i += 1))
    echo "wait for one of DM-worker offine failed, retry later" && sleep 1
  done
  # The counter only reaches 10 when no worker was seen offline.
  if [ "$i" -ge 10 ]; then
    echo "wait DM-worker offline timeout"
    exit 1
  fi

  sleep 5
  echo "check sync diff after restart DDL owner"
  check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"
}

cleanup_data safe_mode_target
# also cleanup dm processes in case of last run failed
cleanup_process "$@"
run "$@"
cleanup_process "$@"

echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"