github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/tests/metrics/run.sh (about)

     1  #!/bin/bash
     2  
     3  set -eu
     4  
     5  cur=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
     6  source $cur/../_utils/test_prepare
     7  WORK_DIR=$TEST_DIR/$TEST_NAME
     8  
     9  function check_dashboard_datasource() {
    10  	echo "check dashboard data source"
    11  	check_grafana_dashboard_datasource "../metrics/grafana/DM-Monitor-Standard.json"
    12  	check_grafana_dashboard_datasource "../metrics/grafana/DM-Monitor-Professional.json"
    13  	echo "check dashboard data source success"
    14  }
    15  
    16  function run() {
    17  
    18  	check_dashboard_datasource
    19  
    20  	inject_points=(
    21  		"github.com/pingcap/tiflow/dm/syncer/BlockDDLJob=return(1)"
    22  		"github.com/pingcap/tiflow/dm/syncer/ShowLagInLog=return(1)" # test lag metric >= 1 because we inject BlockDDLJob(ddl) to sleep(1)
    23  		"github.com/pingcap/tiflow/dm/worker/PrintStatusCheckSeconds=return(1)"
    24  	)
    25  	export GO_FAILPOINTS="$(join_string \; ${inject_points[@]})"
    26  
    27  	run_sql_file $cur/data/db1.prepare.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1
    28  	run_sql_file $cur/data/db2.prepare.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2
    29  
    30  	# start DM worker and master
    31  	run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml
    32  	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT
    33  
    34  	# operate mysql config to worker
    35  	cp $cur/conf/source1.yaml $WORK_DIR/source1.yaml
    36  	cp $cur/conf/source2.yaml $WORK_DIR/source2.yaml
    37  	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker1/relay_log" $WORK_DIR/source1.yaml
    38  	sed -i "/relay-binlog-name/i\relay-dir: $WORK_DIR/worker2/relay_log" $WORK_DIR/source2.yaml
    39  
    40  	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
    41  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
    42  	dmctl_operate_source create $WORK_DIR/source1.yaml $SOURCE_ID1
    43  
    44  	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
    45  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT
    46  	dmctl_operate_source create $WORK_DIR/source2.yaml $SOURCE_ID2
    47  
    48  	# start DM task
    49  	cp $cur/conf/dm-task.yaml $WORK_DIR/dm-task.yaml
    50  	dmctl_start_task "$WORK_DIR/dm-task.yaml" "--remove-meta"
    51  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    52  		"query-status test" \
    53  		"\"result\": true" 3
    54  
    55  	check_metric $WORKER1_PORT 'dm_worker_task_state{source_id="mysql-replica-01",task="test",worker="worker1"}' 10 1 3
    56  	check_metric $WORKER2_PORT 'dm_worker_task_state{source_id="mysql-replica-02",task="test",worker="worker2"}' 10 1 3
    57  
    58  	# check dm_syncer_binlog_file is updated timely
    59  	run_sql_source1 "flush logs;"
    60  	check_metric $WORKER1_PORT 'dm_syncer_binlog_file{node="syncer",source_id="mysql-replica-01",task="test"}' 10 1 3
    61  	check_metric $WORKER1_PORT 'dm_syncer_binlog_file{node="master",source_id="mysql-replica-01",task="test"}' 10 1 3
    62  
    63  	# check ddl job lag
    64  	run_sql_source1 "alter table metrics.t1 add column new_col1 int;"
    65  	run_sql_source2 "alter table metrics.t2 add column new_col1 int;"
    66  
    67  	# check two worker's lag >= 1
    68  	check_log_contain_with_retry "[ShowLagInLog]" $WORK_DIR/worker1/log/dm-worker.log
    69  	check_log_contain_with_retry "[ShowLagInLog]" $WORK_DIR/worker2/log/dm-worker.log
    70  
    71  	check_metric $WORKER1_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 0 999
    72  	check_metric $WORKER2_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 0 999
    73  
    74  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
    75  	# check the after ddl query-status lag should be set to 0
    76  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
    77  		"query-status test" \
    78  		"\"secondsBehindMaster\": \"0\"" 2
    79  	echo "check ddl lag done!"
    80  
    81  	# check new metric dm_syncer_flush_checkpoints_time_interval exists
    82  	check_metric $WORKER1_PORT 'dm_syncer_flush_checkpoints_time_interval_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 -1 99999
    83  	check_metric $WORKER2_PORT 'dm_syncer_flush_checkpoints_time_interval_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 -1 99999
    84  
    85  	# restart dm worker
    86  	kill_dm_worker
    87  	rm -rf $WORK_DIR/worker1/log/dm-worker.log # clean up the old log
    88  	rm -rf $WORK_DIR/worker2/log/dm-worker.log # clean up the old log
    89  	inject_points=(
    90  		"github.com/pingcap/tiflow/dm/syncer/BlockExecuteSQLs=return(2)"
    91  		"github.com/pingcap/tiflow/dm/syncer/ShowLagInLog=return(2)" # test lag metric >= 2 because we inject BlockExecuteSQLs to sleep(2) although skip lag is 0 (locally), but we use that lag of all dml/skip lag, so lag still >= 2
    92  	)
    93  	export GO_FAILPOINTS="$(join_string \; ${inject_points[@]})"
    94  
    95  	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
    96  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
    97  	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
    98  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT
    99  
   100  	run_sql_source1 "create view metrics.v1 as select * from metrics.t1;"               # make skip job
   101  	run_sql_file $cur/data/db1.increment.sql $MYSQL_HOST1 $MYSQL_PORT1 $MYSQL_PASSWORD1 # make dml job
   102  	run_sql_file $cur/data/db2.increment.sql $MYSQL_HOST2 $MYSQL_PORT2 $MYSQL_PASSWORD2 # make dml job
   103  
   104  	# check two worker's lag >= 2
   105  	check_log_contain_with_retry "ShowLagInLog" $WORK_DIR/worker1/log/dm-worker.log
   106  	check_log_contain_with_retry "ShowLagInLog" $WORK_DIR/worker2/log/dm-worker.log
   107  	check_metric $WORKER1_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 1 999
   108  	check_metric $WORKER2_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 1 999
   109  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
   110  	# this updated will blocked for 10s by failpoints(BlockExecuteSQLs), but during this time, dm_syncer_replication_lag_sum will continue increasing
   111  	run_sql_source1 'UPDATE metrics.t1 SET name="ehco" WHERE id = 1001'
   112  	check_metric $WORKER1_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 2 999
   113  	echo "check dml/skip lag done!"
   114  
   115  	# check new metric: dm_syncer_replication_lag_sum,dm_syncer_replication_lag_gauge,
   116  	# finished_transaction_total,dm_syncer_ideal_qps,dm_syncer_binlog_event_row,replication_transaction_batch exists
   117  	check_metric $WORKER1_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 -1 999
   118  	check_metric $WORKER2_PORT 'dm_syncer_replication_lag_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 -1 999
   119  
   120  	check_metric $WORKER1_PORT 'dm_syncer_replication_lag_gauge{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 -1 999
   121  	check_metric $WORKER2_PORT 'dm_syncer_replication_lag_gauge{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 -1 999
   122  
   123  	check_metric $WORKER1_PORT 'dm_syncer_finished_transaction_total{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 1 99999
   124  	check_metric $WORKER2_PORT 'dm_syncer_finished_transaction_total{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 1 99999
   125  
   126  	check_metric $WORKER1_PORT 'dm_syncer_ideal_qps{source_id="mysql-replica-01",task="test",worker="worker1"' 5 1 99999
   127  	check_metric $WORKER2_PORT 'dm_syncer_ideal_qps{source_id="mysql-replica-02",task="test",worker="worker2"' 5 1 99999
   128  
   129  	check_metric $WORKER1_PORT 'dm_syncer_binlog_event_row_sum{source_id="mysql-replica-01",task="test",worker="worker1"}' 5 0 99999
   130  	check_metric $WORKER2_PORT 'dm_syncer_binlog_event_row_sum{source_id="mysql-replica-02",task="test",worker="worker2"}' 5 0 99999
   131  
   132  	check_metric $WORKER1_PORT 'dm_syncer_replication_transaction_batch_count.*type="rows"' 5 0 99999
   133  	check_metric $WORKER1_PORT 'dm_syncer_replication_transaction_batch_count.*type="statements"' 5 0 99999
   134  	check_metric $WORKER2_PORT 'dm_syncer_replication_transaction_batch_count.*type="rows"' 5 0 99999
   135  	check_metric $WORKER2_PORT 'dm_syncer_replication_transaction_batch_count.*type="statements"' 5 0 99999
   136  
   137  	# restart dm worker
   138  	kill_dm_worker
   139  	export GO_FAILPOINTS=''
   140  	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
   141  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
   142  	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
   143  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT
   144  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
   145  	# check the dmctl query-status no new dml, lag should be set to 0
   146  	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
   147  		"query-status test" \
   148  		"\"secondsBehindMaster\": \"0\"" 2
   149  	echo "check zero job done!"
   150  
   151  	# restart dm-worker1
   152  	pkill -hup -f dm-worker1.toml 2>/dev/null || true
   153  	wait_pattern_exit dm-worker1.toml
   154  
   155  	inject_points=(
   156  		"github.com/pingcap/tiflow/dm/syncer/noJobInQueueLog=return()"
   157  		"github.com/pingcap/tiflow/dm/syncer/IgnoreSomeTypeEvent=return(\"HeartbeatEvent\")"
   158  	)
   159  	export GO_FAILPOINTS="$(join_string \; ${inject_points[@]})"
   160  	rm -rf $WORK_DIR/worker1/log/dm-worker.log # clean up the old log
   161  	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
   162  	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
   163  
   164  	echo "make a dml job"
   165  	run_sql_source1 "insert into metrics.t1 (id, name, ts) values (1004, 'zmj4', '2022-05-11 12:01:05')"
   166  	check_sync_diff $WORK_DIR $cur/conf/diff_config.toml
   167  	check_log_contain_with_retry 'no job in queue, update lag to zero' $WORK_DIR/worker1/log/dm-worker.log
   168  	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
   169  		"stop-task test" \
   170  		"\"result\": true" 3
   171  	export GO_FAILPOINTS=''
   172  }
   173  
   174  cleanup_data metrics
   175  # also cleanup dm processes in case of last run failed
   176  cleanup_process
   177  run
   178  cleanup_process
   179  
   180  echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"