#!/bin/bash
#
# Owner high-availability scenarios for the CDC "availability" integration
# test. Every scenario starts one or more capture servers, disturbs the owner
# (kill, SIGSTOP, resign, deleting its etcd key, ...) and then checks that
# exactly one capture ends up as owner and/or that replication keeps working.
#
# This file only defines functions; test_owner_ha is the entry point.
#
# Names used but not defined here (presumably provided by test_prepare, by the
# script that sources this file, or by PATH -- confirm there):
#   run_sql, run_cdc_server, run_cdc_cli, run_cdc_cli_tso_query, ensure,
#   nonempty, empty, kill_cdc_pid, cleanup_process,
#   OUT_DIR, TEST_NAME, UP_PD_HOST_1, UP_PD_PORT_1
#
# Variables assigned inside the functions (owner_pid, owner_id, capture_pid,
# capture_id, start_ts, ...) are intentionally left global, as in the original.

set -eu

CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
source "$CUR/../_utils/test_prepare"
WORK_DIR=$OUT_DIR/$TEST_NAME
CDC_BINARY=cdc.test
MAX_RETRIES=10

# test_owner_ha runs every owner availability scenario in sequence.
# test_kill_owner must run first: it creates the tables and the changefeed
# that the later scenarios write to.
function test_owner_ha() {
	test_kill_owner
	test_hang_up_owner
	test_expire_owner
	test_owner_cleanup_stale_tasks
	test_owner_retryable_error
	test_gap_between_watch_capture
	test_delete_owner_key
}

# test_kill_owner starts two captures and kills the owner.
# We expect the surviving capture to be elected as the new owner.
function test_kill_owner() {
	# record tso before we create tables to skip the system table DDLs
	start_ts=$(run_cdc_cli_tso_query "${UP_PD_HOST_1}" "${UP_PD_PORT_1}")
	run_sql "CREATE table test.availability1(id int primary key, val int);"
	run_sql "CREATE table test.availability2(id int primary key, val int);"
	run_sql "CREATE table test.availability3(id int primary key, val int);"

	echo "run test case test_kill_owner"
	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_kill_owner.server1
	# create changefeed after cdc is started
	run_cdc_cli changefeed create --start-ts="$start_ts" \
		--sink-uri="mysql://normal:123456@127.0.0.1:3306/"

	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	# only one server process exists at this point, so ps yields a single pid
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_kill_owner.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep -v cluster_id | grep id"
	capture_id=$("$CDC_BINARY" cli capture list --server 'http://127.0.0.1:8301' 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	# kill the server
	kill_cdc_pid "$owner_pid"
	# check that the new owner is elected
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 |grep $capture_id -A1 | grep '\"is-owner\": true'"
	echo "test_kill_owner: pass"
	cleanup_process "$CDC_BINARY"
}

# test_hang_up_owner starts two captures and stops the owner
# by sending a SIGSTOP signal.
# We expect the other capture to be elected as the new owner.
function test_hang_up_owner() {
	echo "run test case test_hang_up_owner"

	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_hang_up_owner.server1
	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_hang_up_owner.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 | grep -v \"$owner_id\" | grep id"
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	# stop the owner
	kill -SIGSTOP "$owner_pid"
	# check that the new owner is elected
	ensure "$MAX_RETRIES" "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep '$capture_id'"
	# resume the original process
	kill -SIGCONT "$owner_pid"
	echo "test_hang_up_owner: pass"
	cleanup_process "$CDC_BINARY"
}

# test_expire_owner stops the owner by sending the SIGSTOP signal and
# waits until its session expires.
# We expect that when the owner process resumes, it kills itself and then
# recovers from the death.
function test_expire_owner() {
	echo "run test case test_expire_owner"

	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_expire_owner.server1
	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# stop the owner
	kill -SIGSTOP "$owner_pid"
	echo "process status:" "$(ps -h -p "$owner_pid" -o "s")"

	# ensure the session has expired
	# NOTE(review): `grep -v` succeeds as soon as ANY output line lacks the
	# owner id (presumably the etcd key line), and fails on empty output, so
	# this may pass before the session has actually expired -- confirm intent.
	ensure "$MAX_RETRIES" "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep -v '$owner_id'"

	# resume the owner
	kill -SIGCONT "$owner_pid"
	echo "process status:" "$(ps -h -p "$owner_pid" -o "s")"
	# ensure the owner has recovered
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	echo "test_expire_owner pass"
	cleanup_process "$CDC_BINARY"
}

# test_owner_cleanup_stale_tasks kills both captures, deletes the task status
# keys from etcd, then starts a third capture and checks that
# insert/update/delete on test.availability1 still pass the
# nonempty/empty checks.
function test_owner_cleanup_stale_tasks() {
	echo "run test case test_owner_cleanup_stale_tasks"

	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_cleanup_stale_tasks.server1
	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_owner_cleanup_stale_tasks.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8301' 2>&1 | grep -v \"$owner_id\" | grep id"
	# -x: drop only the line that IS the owner pid; a plain substring match
	# would also drop e.g. pid 1234 when the owner pid is 123.
	capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vx "$owner_pid")
	capture_id=$("$CDC_BINARY" cli capture list --server 'http://127.0.0.1:8301' 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	kill -SIGKILL "$owner_pid"
	kill -SIGKILL "$capture_pid"
	# wait capture info expires
	sleep 3

	# simulate task status is deleted but task position stales
	ETCDCTL_API=3 etcdctl del /tidb/cdc/task/status --prefix

	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8302" --logsuffix test_owner_cleanup_stale_tasks.server3
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list --server 'http://127.0.0.1:8302' 2>&1 | grep '\"is-owner\": true'"

	run_sql "INSERT INTO test.availability1(id, val) VALUES (1, 1);"
	ensure "$MAX_RETRIES" nonempty 'select id, val from test.availability1 where id=1 and val=1'
	run_sql "UPDATE test.availability1 set val = 22 where id = 1;"
	ensure "$MAX_RETRIES" nonempty 'select id, val from test.availability1 where id=1 and val=22'
	run_sql "DELETE from test.availability1 where id=1;"
	ensure "$MAX_RETRIES" empty 'select id, val from test.availability1 where id=1'

	echo "test_owner_cleanup_stale_tasks pass"
	cleanup_process "$CDC_BINARY"
}

# test some retryable error meeting in the campaign owner loop
function test_owner_retryable_error() {
	echo "run test case test_owner_retryable_error"
	export GO_FAILPOINTS='github.com/pingcap/tiflow/cdc/capture/capture-campaign-compacted-error=1*return(true)'

	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_retryable_error.server1
	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	export GO_FAILPOINTS='github.com/pingcap/tiflow/cdc/owner/owner-run-with-error=1*return(true);github.com/pingcap/tiflow/cdc/capture/capture-resign-failed=1*return(true)'
	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_retryable_error.server2 --addr "127.0.0.1:8301"
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# -x: exact-line match so a pid that merely contains the owner pid survives
	capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vx "$owner_pid")
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	# resign the first capture, the second capture campaigns to be owner.
	# However we have injected two failpoints, the second capture owner runs
	# with error and before it exits resign owner also failed, so the second
	# capture will restart and the first capture campaigns to be owner again.
	curl -X POST http://127.0.0.1:8300/capture/owner/resign
	ensure "$MAX_RETRIES" "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep '$owner_id'"
	# The second capture will restart but not exit, so there are two capture servers.
	# So the wc -l will be 2 (-w: match the whole number, not e.g. 12 or 20).
	ensure "$MAX_RETRIES" "ps -C $CDC_BINARY -o pid= | awk '{print \$1}' | wc -l | grep -w 2"

	echo "test_owner_retryable_error pass"
	export GO_FAILPOINTS=''
	cleanup_process "$CDC_BINARY"
}

# test_gap_between_watch_capture starts the owner with a failpoint that
# sleeps in the owner tick, kills the second capture, and then checks that
# insert/update/delete on all three test tables still pass the
# nonempty/empty checks.
function test_gap_between_watch_capture() {
	echo "run test case test_gap_between_watch_capture"
	export GO_FAILPOINTS='github.com/pingcap/tiflow/cdc/owner/sleep-in-owner-tick=1*sleep(6000)'

	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_gap_between_watch_capture.server1
	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_gap_between_watch_capture.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# -x: exact-line match so a pid that merely contains the owner pid survives
	capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vx "$owner_pid")
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	kill -SIGKILL "$capture_pid"
	# wait capture info expires
	sleep 3

	for i in $(seq 1 3); do
		run_sql "INSERT INTO test.availability$i(id, val) VALUES (1, 1);"
		ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$i where id=1 and val=1"
		run_sql "UPDATE test.availability$i set val = 22 where id = 1;"
		ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$i where id=1 and val=22"
		run_sql "DELETE from test.availability$i where id=1;"
		ensure "$MAX_RETRIES" empty "select id, val from test.availability$i where id=1"
	done

	export GO_FAILPOINTS=''
	echo "test_gap_between_watch_capture pass"
	cleanup_process "$CDC_BINARY"
}

# make sure when owner key in etcd is deleted, the owner will resign,
# and only one owner exists in the cluster at the same time.
function test_delete_owner_key() {
	echo "run test case delete_owner_key"

	# start a capture server
	# (log suffix is specific to this scenario so its logs do not collide with
	# those written by test_gap_between_watch_capture)
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_delete_owner_key.server1
	# ensure the server becomes the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	# match only the "id" field, like the other scenarios; a bare /id/ would
	# also pick up any other field whose name contains "id"
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}')
	# the line printed just before the owner id is taken as the owner's etcd key
	owner_key=$(ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep -B 1 "$owner_id" | head -n 1)
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"
	echo "owner key" "$owner_key"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_delete_owner_key.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# -x: exact-line match so a pid that merely contains the owner pid survives
	capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vx "$owner_pid")
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/\"id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	ETCDCTL_API=3 etcdctl del "$owner_key"
	ensure "$MAX_RETRIES" "ETCDCTL_API=3 etcdctl get /tidb/cdc/default/__cdc_meta__/owner --prefix | grep '$capture_id'"
	# ensure the first capture has resigned as owner
	ensure "$MAX_RETRIES" "curl -X GET http://127.0.0.1:8300/status | grep '\"is_owner\": false'"

	sleep 3

	for i in $(seq 1 3); do
		run_sql "INSERT INTO test.availability$i(id, val) VALUES (1, 1);"
		ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$i where id=1 and val=1"
		run_sql "UPDATE test.availability$i set val = 22 where id = 1;"
		ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$i where id=1 and val=22"
		run_sql "DELETE from test.availability$i where id=1;"
		ensure "$MAX_RETRIES" empty "select id, val from test.availability$i where id=1"
	done

	export GO_FAILPOINTS=''
	echo "delete_owner_key pass"
	cleanup_process "$CDC_BINARY"
}