#!/bin/bash
# Origin: github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/metrics/run.sh
#
# DM integration test "metrics": starts a DM master and two DM workers,
# runs the task "test", and checks worker metrics, query-status output and
# worker logs while different Go failpoints are injected.
#
# Helpers such as run_dm_master, run_dm_worker, check_metric, run_sql_file,
# check_sync_diff, ... are not defined in this file; presumably they come from
# the sourced ../_utils/test_prepare -- confirm there.
# Variables such as TEST_DIR, TEST_NAME, MASTER_PORT, WORKER{1,2}_PORT,
# MYSQL_{HOST,PORT,PASSWORD}{1,2} and SOURCE_ID{1,2} are likewise expected to
# be provided by the test environment.

set -eu

cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source "$cur/../_utils/test_prepare"
WORK_DIR=$TEST_DIR/$TEST_NAME

# Runs check_grafana_dashboard_datasource on both Grafana dashboard JSON files
# (Standard and Professional). The paths are relative to the caller's working
# directory.
function check_dashboard_datasource() {
	echo "check dashboard data source"
	check_grafana_dashboard_datasource "../metrics/grafana/DM-Monitor-Standard.json"
	check_grafana_dashboard_datasource "../metrics/grafana/DM-Monitor-Professional.json"
	echo "check dashboard data source success"
}

# Main test body. Phases:
#   1. DDL lag        (failpoints BlockDDLJob / ShowLagInLog / PrintStatusCheckSeconds)
#   2. DML/skip lag   (failpoints BlockExecuteSQLs / ShowLagInLog)
#   3. zero lag after restarting the workers without failpoints
#   4. "no job in queue" log with worker1 restarted under
#      noJobInQueueLog / IgnoreSomeTypeEvent
# GO_FAILPOINTS is exported for the processes started afterwards and is reset
# to the empty string at the end.
function run() {
	# Keep the failpoint list out of the global namespace.
	local -a inject_points

	check_dashboard_datasource

	inject_points=(
		"github.com/pingcap/tiflow/dm/syncer/BlockDDLJob=return(1)"
		"github.com/pingcap/tiflow/dm/syncer/ShowLagInLog=return(1)" # test lag metric >= 1 because we inject BlockDDLJob(ddl) to sleep(1)
		"github.com/pingcap/tiflow/dm/worker/PrintStatusCheckSeconds=return(1)"
	)
	# Assign first, export second, so a failing join_string is not hidden by
	# the exit status of `export`.
	GO_FAILPOINTS="$(join_string \; "${inject_points[@]}")"
	export GO_FAILPOINTS

	run_sql_file "$cur/data/db1.prepare.sql" "$MYSQL_HOST1" "$MYSQL_PORT1" "$MYSQL_PASSWORD1"
	run_sql_file "$cur/data/db2.prepare.sql" "$MYSQL_HOST2" "$MYSQL_PORT2" "$MYSQL_PASSWORD2"

	# start DM worker and master
	run_dm_master "$WORK_DIR/master" "$MASTER_PORT" "$cur/conf/dm-master.toml"
	check_rpc_alive "$cur/../bin/check_master_online" "127.0.0.1:$MASTER_PORT"

	# copy the source configs into WORK_DIR and insert a relay-dir entry
	# before the relay-binlog-name line of each copy
	cp "$cur/conf/source1.yaml" "$WORK_DIR/source1.yaml"
	cp "$cur/conf/source2.yaml" "$WORK_DIR/source2.yaml"
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" "$WORK_DIR/source1.yaml"
	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" "$WORK_DIR/source2.yaml"

	run_dm_worker "$WORK_DIR/worker1" "$WORKER1_PORT" "$cur/conf/dm-worker1.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
	dmctl_operate_source create "$WORK_DIR/source1.yaml" "$SOURCE_ID1"

	run_dm_worker "$WORK_DIR/worker2" "$WORKER2_PORT" "$cur/conf/dm-worker2.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
	dmctl_operate_source create "$WORK_DIR/source2.yaml" "$SOURCE_ID2"

	# start DM task
	cp "$cur/conf/dm-task.yaml" "$WORK_DIR/dm-task.yaml"
	dmctl_start_task "$WORK_DIR/dm-task.yaml" "--remove-meta"
	run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"\"result\": true" 3

	# NOTE(review): the three numbers after each metric selector are presumably
	# retry count, lower bound and upper bound -- confirm against check_metric.
	check_metric "$WORKER1_PORT" 'dm_worker_task_state{source_id="mysql-replica-01",task="test",worker="worker1"}' 10 1 3
	check_metric "$WORKER2_PORT" 'dm_worker_task_state{source_id="mysql-replica-02",task="test",worker="worker2"}' 10 1 3

	# check that dm_syncer_binlog_file is updated in a timely manner
	run_sql_source1 "flush logs;"
	check_metric "$WORKER1_PORT" 'dm_syncer_binlog_file{node="syncer",source_id="mysql-replica-01",task="test"}' 10 1 3
	check_metric "$WORKER1_PORT" 'dm_syncer_binlog_file{node="master",source_id="mysql-replica-01",task="test"}' 10 1 3

	# check ddl job lag
	run_sql_source1 "alter table metrics.t1 add column new_col1 int;"
	run_sql_source2 "alter table metrics.t2 add column new_col1 int;"

	# check that both workers' lag is >= 1
	# The pattern is the plain substring "ShowLagInLog", as in the DML phase
	# below. The former "[ShowLagInLog]" would be a bracket expression (any one
	# of those characters) if the helper matches with regex grep, which would
	# make the check vacuous -- presumably it does; confirm against the helper.
	check_log_contain_with_retry "ShowLagInLog" "$WORK_DIR/worker1/log/dm-worker.log"
	check_log_contain_with_retry "ShowLagInLog" "$WORK_DIR/worker2/log/dm-worker.log"

	check_metric "$WORKER1_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 0 999
	check_metric "$WORKER2_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 0 999

	check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"
	# after the DDL has been replicated, query-status should report a lag of 0
	run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"\"secondsBehindMaster\": \"0\"" 2
	echo "check ddl lag done!"

	# check that the new metric dm_syncer_flush_checkpoints_time_interval exists
	check_metric "$WORKER1_PORT" 'dm_syncer_flush_checkpoints_time_interval_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 -1 99999
	check_metric "$WORKER2_PORT" 'dm_syncer_flush_checkpoints_time_interval_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 -1 99999

	# restart dm worker
	kill_dm_worker
	rm -rf "$WORK_DIR/worker1/log/dm-worker.log" # clean up the old log
	rm -rf "$WORK_DIR/worker2/log/dm-worker.log" # clean up the old log
	inject_points=(
		"github.com/pingcap/tiflow/dm/syncer/BlockExecuteSQLs=return(2)"
		"github.com/pingcap/tiflow/dm/syncer/ShowLagInLog=return(2)" # test lag metric >= 2 because we inject BlockExecuteSQLs to sleep(2) although skip lag is 0 (locally), but we use that lag of all dml/skip lag, so lag still >= 2
	)
	GO_FAILPOINTS="$(join_string \; "${inject_points[@]}")"
	export GO_FAILPOINTS

	run_dm_worker "$WORK_DIR/worker1" "$WORKER1_PORT" "$cur/conf/dm-worker1.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
	run_dm_worker "$WORK_DIR/worker2" "$WORKER2_PORT" "$cur/conf/dm-worker2.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"

	run_sql_source1 "create view metrics.v1 as select * from metrics.t1;"                         # make skip job
	run_sql_file "$cur/data/db1.increment.sql" "$MYSQL_HOST1" "$MYSQL_PORT1" "$MYSQL_PASSWORD1" # make dml job
	run_sql_file "$cur/data/db2.increment.sql" "$MYSQL_HOST2" "$MYSQL_PORT2" "$MYSQL_PASSWORD2" # make dml job

	# check that both workers' lag is >= 2
	check_log_contain_with_retry "ShowLagInLog" "$WORK_DIR/worker1/log/dm-worker.log"
	check_log_contain_with_retry "ShowLagInLog" "$WORK_DIR/worker2/log/dm-worker.log"
	check_metric "$WORKER1_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 1 999
	check_metric "$WORKER2_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 1 999
	check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"
	# this update will be blocked by the BlockExecuteSQLs failpoint; during that
	# time dm_syncer_replication_lag_sum keeps increasing
	# NOTE(review): the original comment said "blocked for 10s", while the
	# failpoint above is injected with return(2) -- confirm the real duration.
	run_sql_source1 'UPDATE metrics.t1 SET name="ehco" WHERE id = 1001'
	check_metric "$WORKER1_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 2 999
	echo "check dml/skip lag done!"

	# check that the new metrics dm_syncer_replication_lag_sum, dm_syncer_replication_lag_gauge,
	# finished_transaction_total, dm_syncer_ideal_qps, dm_syncer_binlog_event_row and
	# replication_transaction_batch exist
	check_metric "$WORKER1_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 -1 999
	check_metric "$WORKER2_PORT" 'dm_syncer_replication_lag_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 -1 999

	check_metric "$WORKER1_PORT" 'dm_syncer_replication_lag_gauge{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 -1 999
	check_metric "$WORKER2_PORT" 'dm_syncer_replication_lag_gauge{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 -1 999

	check_metric "$WORKER1_PORT" 'dm_syncer_finished_transaction_total{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 1 99999
	check_metric "$WORKER2_PORT" 'dm_syncer_finished_transaction_total{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 1 99999

	# NOTE(review): unlike every other selector in this file, these two have no
	# closing '}'. Left unchanged because the matching is done by check_metric,
	# which is not visible here -- confirm whether the prefix match is intended.
	check_metric "$WORKER1_PORT" 'dm_syncer_ideal_qps{source_id="mysql-replica-01",task="test",worker="worker1"' 5 1 99999
	check_metric "$WORKER2_PORT" 'dm_syncer_ideal_qps{source_id="mysql-replica-02",task="test",worker="worker2"' 5 1 99999

	check_metric "$WORKER1_PORT" 'dm_syncer_binlog_event_row_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 0 99999
	check_metric "$WORKER2_PORT" 'dm_syncer_binlog_event_row_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 0 99999

	check_metric "$WORKER1_PORT" 'dm_syncer_replication_transaction_batch_count.*type="rows"' 5 0 99999
	check_metric "$WORKER1_PORT" 'dm_syncer_replication_transaction_batch_count.*type="statements"' 5 0 99999
	check_metric "$WORKER2_PORT" 'dm_syncer_replication_transaction_batch_count.*type="rows"' 5 0 99999
	check_metric "$WORKER2_PORT" 'dm_syncer_replication_transaction_batch_count.*type="statements"' 5 0 99999

	# restart dm worker without any failpoint
	kill_dm_worker
	export GO_FAILPOINTS=''
	run_dm_worker "$WORK_DIR/worker1" "$WORKER1_PORT" "$cur/conf/dm-worker1.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"
	run_dm_worker "$WORK_DIR/worker2" "$WORKER2_PORT" "$cur/conf/dm-worker2.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER2_PORT"
	check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"
	# with no new DML, dmctl query-status should report a lag of 0
	run_dm_ctl_with_retry "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
		"query-status test" \
		"\"secondsBehindMaster\": \"0\"" 2
	echo "check zero job done!"

	# restart dm-worker1
	# best effort: pkill exits non-zero when no process matches, which must not
	# abort the script under `set -e`
	pkill -hup -f dm-worker1.toml 2>/dev/null || true
	wait_pattern_exit dm-worker1.toml

	inject_points=(
		"github.com/pingcap/tiflow/dm/syncer/noJobInQueueLog=return()"
		"github.com/pingcap/tiflow/dm/syncer/IgnoreSomeTypeEvent=return(\"HeartbeatEvent\")"
	)
	GO_FAILPOINTS="$(join_string \; "${inject_points[@]}")"
	export GO_FAILPOINTS
	rm -rf "$WORK_DIR/worker1/log/dm-worker.log" # clean up the old log
	run_dm_worker "$WORK_DIR/worker1" "$WORKER1_PORT" "$cur/conf/dm-worker1.toml"
	check_rpc_alive "$cur/../bin/check_worker_online" "127.0.0.1:$WORKER1_PORT"

	echo "make a dml job"
	run_sql_source1 "insert into metrics.t1 (id, name, ts) values (1004, 'zmj4', '2022-05-11 12:01:05')"
	check_sync_diff "$WORK_DIR" "$cur/conf/diff_config.toml"
	check_log_contain_with_retry 'no job in queue, update lag to zero' "$WORK_DIR/worker1/log/dm-worker.log"
	run_dm_ctl "$WORK_DIR" "127.0.0.1:$MASTER_PORT" \
		"stop-task test" \
		"\"result\": true" 3
	export GO_FAILPOINTS=''
}

cleanup_data metrics
# also clean up DM processes in case the last run failed
cleanup_process
run
cleanup_process

echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"