#!/bin/bash
# Origin: github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/tests/availability/owner.sh
#
# Owner high-availability test cases. The helpers used below (run_cdc_server,
# ensure, cleanup_process, run_sql, ...) are not defined in this file;
# presumably they come from the sourced test_prepare or from the script that
# runs these cases -- confirm there.

set -e

CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# Quoted so that a checkout path containing spaces still resolves.
source "$CUR/../_utils/test_prepare"
WORK_DIR=$OUT_DIR/$TEST_NAME
CDC_BINARY=cdc.test

MAX_RETRIES=10

# test_owner_ha runs every owner availability test case in sequence.
function test_owner_ha() {
	test_kill_owner
	test_hang_up_owner
	test_expire_owner
	test_owner_cleanup_stale_tasks
	# FIXME: this test case should be owner crashed during task cleanup
	# test_owner_cleanup_stale_tasks
	test_owner_retryable_error
	test_gap_between_watch_capture
}

# test_kill_owner starts two captures and kills the owner.
# We expect the surviving capture to be elected as the new owner.
function test_kill_owner() {
	local owner_pid owner_id capture_id

	echo "run test case test_kill_owner"
	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_kill_owner.server1
	# ensure the server become the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	# Only one capture is running at this point, so ps reports a single pid
	# and the capture list holds a single id.
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_kill_owner.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# The id that is not the owner's belongs to the second capture.
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	# kill the server
	kill "$owner_pid"

	# check that the new owner is elected
	# ($capture_id stays unquoted inside the command string on purpose: an
	# empty id then makes grep fail instead of matching every line.)
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 |grep $capture_id -A1 | grep '\"is-owner\": true'"
	echo "test_kill_owner: pass"

	cleanup_process "$CDC_BINARY"
}

# test_hang_up_owner starts two captures and stops the owner
# by sending a SIGSTOP signal.
# We expect another capture will be elected as the new owner
function test_hang_up_owner() {
	local owner_pid owner_id capture_id

	echo "run test case test_hang_up_owner"

	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_hang_up_owner.server1
	# ensure the server become the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"

	# Only one capture is running at this point, so ps reports a single pid
	# and the capture list holds a single id.
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_hang_up_owner.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# The id that is not the owner's belongs to the second capture.
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	# stop the owner
	kill -SIGSTOP "$owner_pid"

	# check that the new owner is elected
	# ($capture_id stays unquoted inside the command string on purpose: an
	# empty id then makes grep fail instead of matching every line.)
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 |grep $capture_id -A1 | grep '\"is-owner\": true'"
	# resume the original process
	kill -SIGCONT "$owner_pid"

	echo "test_hang_up_owner: pass"

	cleanup_process "$CDC_BINARY"
}

# test_expire_owner stops the owner by sending the SIGSTOP signal
# and waits until its session expires.
# We expect that when the owner process resumes, it kills itself
# and then recovers from the death.
function test_expire_owner() {
	# Freezes the only capture with SIGSTOP until the capture list is empty,
	# then resumes it and waits for it to be listed as the owner again.
	local owner_pid owner_id

	echo "run test case test_expire_owner"

	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_expire_owner.server1
	# ensure the server become the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"

	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# stop the owner
	kill -SIGSTOP "$owner_pid"
	echo "process status:" "$(ps -h -p "$owner_pid" -o "s")"

	# ensure the session has expired
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\[\]'"

	# resume the owner
	kill -SIGCONT "$owner_pid"
	echo "process status:" "$(ps -h -p "$owner_pid" -o "s")"
	# ensure the owner has recovered
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	echo "test_expire_owner pass"

	cleanup_process "$CDC_BINARY"
}

# test_owner_cleanup_stale_tasks kills both captures, deletes the task status
# keys from etcd, then starts a third capture and checks that row changes on
# test.availability1 still pass the nonempty/empty checks.
function test_owner_cleanup_stale_tasks() {
	local owner_pid owner_id capture_pid capture_id

	echo "run test case test_owner_cleanup_stale_tasks"

	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_cleanup_stale_tasks.server1
	# ensure the server become the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_owner_cleanup_stale_tasks.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# Whole-line fixed-string match: a plain `grep -v 123` would also drop a
	# follower whose pid merely contains the owner's pid (e.g. 1234).
	capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vxF -- "$owner_pid")
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	kill -SIGKILL "$owner_pid"
	kill -SIGKILL "$capture_pid"
	# wait capture info expires
	sleep 3

	# simulate task status is deleted but task position stales
	etcdctl del /tidb/cdc/task/status --prefix
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8302" --logsuffix test_owner_cleanup_stale_tasks.server3
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"

	run_sql "INSERT INTO test.availability1(id, val) VALUES (1, 1);"
	ensure "$MAX_RETRIES" nonempty 'select id, val from test.availability1 where id=1 and val=1'
	run_sql "UPDATE test.availability1 set val = 22 where id = 1;"
	ensure "$MAX_RETRIES" nonempty 'select id, val from test.availability1 where id=1 and val=22'
	run_sql "DELETE from test.availability1 where id=1;"
	ensure "$MAX_RETRIES" empty 'select id, val from test.availability1 where id=1'

	echo "test_owner_cleanup_stale_tasks pass"
	cleanup_process "$CDC_BINARY"
}

# test some retryable error meeting in the campaign owner loop
function test_owner_retryable_error() {
	local owner_pid owner_id capture_id

	echo "run test case test_owner_retryable_error"

	export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/capture-campaign-compacted-error=1*return(true)'

	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_retryable_error.server1

	# ensure the server become the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/owner-run-with-error=1*return(true);github.com/pingcap/ticdc/cdc/capture-resign-failed=1*return(true)'

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_retryable_error.server2 --addr "127.0.0.1:8301"
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	# resign the first capture, the second capture campaigns to be owner.
	# However we have injected two failpoints, the second capture owner runs
	# with error and before it exits resign owner also failed, so the second
	# capture will exit and the first capture campaigns to be owner again.
	curl -X POST http://127.0.0.1:8300/capture/owner/resign
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep $owner_id -A1 | grep '\"is-owner\": true'"
	# Exactly one process must remain; `grep -w` rejects counts such as 10
	# or 11 that a bare `grep 1` would accept.
	ensure "$MAX_RETRIES" "ps -C $CDC_BINARY -o pid= | awk '{print \$1}' | wc -l | grep -w 1"

	echo "test_owner_retryable_error pass"
	export GO_FAILPOINTS=''
	cleanup_process "$CDC_BINARY"
}

# test_gap_between_watch_capture starts the owner with the
# sleep-before-watch-capture failpoint set, kills the second capture, and
# checks that row changes on test.availability1..3 still pass the
# nonempty/empty checks.
function test_gap_between_watch_capture() {
	local owner_pid owner_id capture_pid capture_id table_no

	echo "run test case test_gap_between_watch_capture"

	export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/sleep-before-watch-capture=1*sleep(6000)'

	# start a capture server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_gap_between_watch_capture.server1
	# ensure the server become the owner
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
	owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
	owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
	echo "owner pid:" "$owner_pid"
	echo "owner id" "$owner_id"

	# run another server
	run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_gap_between_watch_capture.server2
	ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
	# Whole-line fixed-string match: a plain `grep -v 123` would also drop a
	# follower whose pid merely contains the owner's pid (e.g. 1234).
	capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vxF -- "$owner_pid")
	capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
	echo "capture_id:" "$capture_id"

	kill -SIGKILL "$capture_pid"
	# wait capture info expires
	sleep 3

	# The counter is not called `i` so that a helper using a global `i`
	# cannot change the table number in the middle of an iteration.
	for table_no in 1 2 3; do
		run_sql "INSERT INTO test.availability$table_no(id, val) VALUES (1, 1);"
		ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$table_no where id=1 and val=1"
		run_sql "UPDATE test.availability$table_no set val = 22 where id = 1;"
		ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$table_no where id=1 and val=22"
		run_sql "DELETE from test.availability$table_no where id=1;"
		ensure "$MAX_RETRIES" empty "select id, val from test.availability$table_no where id=1"
	done

	export GO_FAILPOINTS=''
	echo "test_gap_between_watch_capture pass"
	cleanup_process "$CDC_BINARY"
}