github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/tests/integration_tests/availability/owner.sh (about)

     1  #!/bin/bash
     2  set -eu
     3  CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
     4  source $CUR/../_utils/test_prepare
     5  WORK_DIR=$OUT_DIR/$TEST_NAME
     6  CDC_BINARY=cdc.test
     7  MAX_RETRIES=10
     8  function test_owner_ha() {
     9  	test_kill_owner
    10  	test_hang_up_owner
    11  	test_expire_owner
    12  	test_owner_cleanup_stale_tasks
    13  	test_owner_retryable_error
    14  	test_gap_between_watch_capture
    15  	test_delete_owner_key
    16  }
    17  # test_kill_owner starts two captures and kill the owner
    18  # we expect the live capture will be elected as the new
    19  # owner
    20  function test_kill_owner() {
    21  	# record tso before we create tables to skip the system table DDLs
    22  	start_ts=$(run_cdc_cli_tso_query ${UP_PD_HOST_1} ${UP_PD_PORT_1})
    23  	run_sql "CREATE table test.availability1(id int primary key, val int);"
    24  	run_sql "CREATE table test.availability2(id int primary key, val int);"
    25  	run_sql "CREATE table test.availability3(id int primary key, val int);"
    26  	echo "run test case test_kill_owner"
    27  	# start a capture server
    28  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_kill_owner.server1
    29  	# create changefeed after cdc is started
    30  	run_cdc_cli changefeed create --start-ts=$start_ts \
    31  		--sink-uri="mysql://normal:123456@127.0.0.1:3306/"
    32  	# ensure the server become the owner
    33  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
    34  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
    35  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
    36  	echo "owner pid:" $owner_pid
    37  	echo "owner id" $owner_id
    38  	# run another server
    39  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8301" --logsuffix test_kill_owner.server2
    40  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep -v cluster_id | grep id"
    41  	capture_id=$($CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
    42  	echo "capture_id:" $capture_id
    43  	# kill the server
    44  	kill_cdc_pid $owner_pid
    45  	# check that the new owner is elected
    46  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 |grep $capture_id -A1 | grep '\"is-owner\": true'"
    47  	echo "test_kill_owner: pass"
    48  	cleanup_process $CDC_BINARY
    49  }
    50  # test_hang_up_owner starts two captures and stops the owner
    51  # by sending a SIGSTOP signal.
    52  # We expect another capture will be elected as the new owner
    53  function test_hang_up_owner() {
    54  	echo "run test case test_hang_up_owner"
    55  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_hang_up_owner.server1
    56  	# ensure the server become the owner
    57  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
    58  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
    59  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
    60  	echo "owner pid:" $owner_pid
    61  	echo "owner id" $owner_id
    62  	# run another server
    63  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8301" --logsuffix test_hang_up_owner.server2
    64  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8301'  2>&1 | grep -v \"$owner_id\" | grep id"
    65  	capture_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
    66  	echo "capture_id:" $capture_id
    67  	# stop the owner
    68  	kill -SIGSTOP $owner_pid
    69  	# check that the new owner is elected
    70  	ensure $MAX_RETRIES "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep '$capture_id'"
    71  	# resume the original process
    72  	kill -SIGCONT $owner_pid
    73  	echo "test_hang_up_owner: pass"
    74  	cleanup_process $CDC_BINARY
    75  }
    76  # test_expire_owner stops the owner by sending
    77  # the SIGSTOP signal and wait unitl its session
    78  # expires.
    79  # We expect when the owner process resumes, it suicides
    80  # itself and recovers from the death.
    81  function test_expire_owner() {
    82  	echo "run test case test_expire_owner"
    83  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_expire_owner.server1
    84  	# ensure the server become the owner
    85  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
    86  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
    87  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
    88  	echo "owner pid:" $owner_pid
    89  	echo "owner id" $owner_id
    90  	# stop the owner
    91  	kill -SIGSTOP $owner_pid
    92  	echo "process status:" $(ps -h -p $owner_pid -o "s")
    93  	# ensure the session has expired
    94  	ensure $MAX_RETRIES "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep -v '$owner_id'"
    95  	# resume the owner
    96  	kill -SIGCONT $owner_pid
    97  	echo "process status:" $(ps -h -p $owner_pid -o "s")
    98  	# ensure the owner has recovered
    99  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
   100  	echo "test_expire_owner pass"
   101  	cleanup_process $CDC_BINARY
   102  }
   103  function test_owner_cleanup_stale_tasks() {
   104  	echo "run test case test_owner_cleanup_stale_tasks"
   105  	# start a capture server
   106  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_owner_cleanup_stale_tasks.server1
   107  	# ensure the server become the owner
   108  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
   109  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
   110  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
   111  	echo "owner pid:" $owner_pid
   112  	echo "owner id" $owner_id
   113  	# run another server
   114  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8301" --logsuffix test_owner_cleanup_stale_tasks.server2
   115  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 | grep -v \"$owner_id\" | grep id"
   116  	capture_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}' | grep -v "$owner_pid")
   117  	capture_id=$($CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
   118  	echo "capture_id:" $capture_id
   119  	kill -SIGKILL $owner_pid
   120  	kill -SIGKILL $capture_pid
   121  	# wait capture info expires
   122  	sleep 3
   123  	# simulate task status is deleted but task position stales
   124  	ETCDCTL_API=3 etcdctl del /tidb/cdc/task/status --prefix
   125  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8302" --logsuffix test_owner_cleanup_stale_tasks.server3
   126  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8302' 2>&1 | grep '\"is-owner\": true'"
   127  	run_sql "INSERT INTO test.availability1(id, val) VALUES (1, 1);"
   128  	ensure $MAX_RETRIES nonempty 'select id, val from test.availability1 where id=1 and val=1'
   129  	run_sql "UPDATE test.availability1 set val = 22 where id = 1;"
   130  	ensure $MAX_RETRIES nonempty 'select id, val from test.availability1 where id=1 and val=22'
   131  	run_sql "DELETE from test.availability1 where id=1;"
   132  	ensure $MAX_RETRIES empty 'select id, val from test.availability1 where id=1'
   133  	echo "test_owner_cleanup_stale_tasks pass"
   134  	cleanup_process $CDC_BINARY
   135  }
   136  # test some retryable error meeting in the campaign owner loop
   137  function test_owner_retryable_error() {
   138  	echo "run test case test_owner_retryable_error"
   139  	export GO_FAILPOINTS='github.com/pingcap/tiflow/cdc/capture/capture-campaign-compacted-error=1*return(true)'
   140  
   141  	# start a capture server
   142  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_owner_retryable_error.server1
   143  	# ensure the server become the owner
   144  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
   145  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
   146  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
   147  	echo "owner pid:" $owner_pid
   148  	echo "owner id" $owner_id
   149  	export GO_FAILPOINTS='github.com/pingcap/tiflow/cdc/owner/owner-run-with-error=1*return(true);github.com/pingcap/tiflow/cdc/capture/capture-resign-failed=1*return(true)'
   150  	# run another server
   151  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_owner_retryable_error.server2 --addr "127.0.0.1:8301"
   152  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
   153  	capture_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}' | grep -v "$owner_pid")
   154  	capture_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
   155  	echo "capture_id:" $capture_id
   156  	# resign the first capture, the second capture campaigns to be owner.
   157  	# However we have injected two failpoints, the second capture owner runs
   158  	# with error and before it exits resign owner also failed, so the second
   159  	# capture will restart and the first capture campaigns to be owner again.
   160  	curl -X POST http://127.0.0.1:8300/capture/owner/resign
   161  	ensure $MAX_RETRIES "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep  '$owner_id'"
   162  	# The second capture will restart but not exit, so there are two capture servers.
   163  	# So the wc -l will be 2.
   164  	ensure $MAX_RETRIES "ps -C $CDC_BINARY -o pid= | awk '{print \$1}' | wc -l | grep 2"
   165  	echo "test_owner_retryable_error pass"
   166  	export GO_FAILPOINTS=''
   167  	cleanup_process $CDC_BINARY
   168  }
   169  function test_gap_between_watch_capture() {
   170  	echo "run test case test_gap_between_watch_capture"
   171  	export GO_FAILPOINTS='github.com/pingcap/tiflow/cdc/owner/sleep-in-owner-tick=1*sleep(6000)'
   172  	# start a capture server
   173  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_gap_between_watch_capture.server1
   174  	# ensure the server become the owner
   175  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
   176  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
   177  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
   178  	echo "owner pid:" $owner_pid
   179  	echo "owner id" $owner_id
   180  	# run another server
   181  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8301" --logsuffix test_gap_between_watch_capture.server2
   182  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
   183  	capture_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}' | grep -v "$owner_pid")
   184  	capture_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
   185  	echo "capture_id:" $capture_id
   186  	kill -SIGKILL $capture_pid
   187  	# wait capture info expires
   188  	sleep 3
   189  	for i in $(seq 1 3); do
   190  		run_sql "INSERT INTO test.availability$i(id, val) VALUES (1, 1);"
   191  		ensure $MAX_RETRIES nonempty "select id, val from test.availability$i where id=1 and val=1"
   192  		run_sql "UPDATE test.availability$i set val = 22 where id = 1;"
   193  		ensure $MAX_RETRIES nonempty "select id, val from test.availability$i where id=1 and val=22"
   194  		run_sql "DELETE from test.availability$i where id=1;"
   195  		ensure $MAX_RETRIES empty "select id, val from test.availability$i where id=1"
   196  	done
   197  	export GO_FAILPOINTS=''
   198  	echo "test_gap_between_watch_capture pass"
   199  	cleanup_process $CDC_BINARY
   200  }
   201  
   202  # make sure when owner key in etcd is deleted, the owner will resign,
   203  # and only one owner exists in the cluster at the same time.
   204  function test_delete_owner_key() {
   205  	echo "run test case delete_owner_key"
   206  
   207  	# start a capture server
   208  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --logsuffix test_gap_between_watch_capture.server1
   209  	# ensure the server become the owner
   210  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
   211  	owner_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}')
   212  	owner_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
   213  	owner_key=$(etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep -B 1 "$owner_id" | head -n 1)
   214  	echo "owner pid:" $owner_pid
   215  	echo "owner id" $owner_id
   216  	echo "owner key" $owner_key
   217  
   218  	# run another server
   219  	run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8301" --logsuffix test_gap_between_watch_capture.server2
   220  	ensure $MAX_RETRIES "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
   221  	capture_pid=$(ps -C $CDC_BINARY -o pid= | awk '{print $1}' | grep -v "$owner_pid")
   222  	capture_id=$($CDC_BINARY cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
   223  	echo "capture_id:" $capture_id
   224  
   225  	etcdctl del $owner_key
   226  	ensure $MAX_RETRIES "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep  '$capture_id'"
   227  	# ensure the first capture has resign owner
   228  	ensure $MAX_RETRIES "curl -X GET http://127.0.0.1:8300/status | grep '\"is_owner\": false'"
   229  
   230  	sleep 3
   231  
   232  	for i in $(seq 1 3); do
   233  		run_sql "INSERT INTO test.availability$i(id, val) VALUES (1, 1);"
   234  		ensure $MAX_RETRIES nonempty "select id, val from test.availability$i where id=1 and val=1"
   235  		run_sql "UPDATE test.availability$i set val = 22 where id = 1;"
   236  		ensure $MAX_RETRIES nonempty "select id, val from test.availability$i where id=1 and val=22"
   237  		run_sql "DELETE from test.availability$i where id=1;"
   238  		ensure $MAX_RETRIES empty "select id, val from test.availability$i where id=1"
   239  	done
   240  
   241  	export GO_FAILPOINTS=''
   242  	echo "delete_owner_key pass"
   243  	cleanup_process $CDC_BINARY
   244  }