github.com/pingcap/ticdc@v0.0.0-20220526033649-485a10ef2652/tests/availability/owner.sh (about)

     1  #!/bin/bash
     2  
     # Abort on the first command that fails.
     set -e

     # Absolute directory of this script, resolved through BASH_SOURCE.
     CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
     # Quoted so the helper file is still found when the path contains whitespace.
     source "$CUR/../_utils/test_prepare"
     # OUT_DIR and TEST_NAME are not assigned here; presumably they come from
     # test_prepare or from the caller -- TODO confirm.
     WORK_DIR=$OUT_DIR/$TEST_NAME
     CDC_BINARY=cdc.test

     # Retry count handed to `ensure` as its first argument.
     MAX_RETRIES=10
    11  
    # test_owner_ha runs every owner availability case, one after another,
    # in a fixed order.
    # FIXME: test_owner_cleanup_stale_tasks should really be a case where the
    # owner crashes during task cleanup.
    function test_owner_ha() {
        local ha_case
        for ha_case in \
            test_kill_owner \
            test_hang_up_owner \
            test_expire_owner \
            test_owner_cleanup_stale_tasks \
            test_owner_retryable_error \
            test_gap_between_watch_capture; do
            "$ha_case"
        done
    }
    # test_kill_owner starts two captures and kills the owner.
    # We expect the surviving capture to be elected as the new owner.
    #
    # Reads WORK_DIR, CDC_BINARY and MAX_RETRIES; relies on run_cdc_server,
    # ensure and cleanup_process, which are defined outside this function.
    function test_kill_owner() {
        # keep the bookkeeping variables out of the global scope
        local owner_pid owner_id capture_id

        echo "run test case test_kill_owner"
        # start a capture server
        run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_kill_owner.server1
        # ensure the server becomes the owner
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
        # only one server is running at this point, so its pid/id are the owner's
        owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
        owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
        echo "owner pid:" "$owner_pid"
        echo "owner id" "$owner_id"

        # run another server
        run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_kill_owner.server2
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
        # the id that is not the owner's belongs to the second capture
        capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
        echo "capture_id:" "$capture_id"

        # kill the owner
        kill "$owner_pid"

        # check that the new owner is elected; -A1 keeps the line that follows
        # the capture's id line, where the is-owner flag is looked for
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -A1 -- \"$capture_id\" | grep '\"is-owner\": true'"
        echo "test_kill_owner: pass"

        cleanup_process "$CDC_BINARY"
    }
    51  
    # test_hang_up_owner starts two captures and freezes the owner
    # by sending it a SIGSTOP signal.
    # We expect the other capture to be elected as the new owner.
    # The frozen process is resumed with SIGCONT whether or not the election
    # check succeeds; a failed check is reported through the return status.
    #
    # Reads WORK_DIR, CDC_BINARY and MAX_RETRIES; relies on run_cdc_server,
    # ensure and cleanup_process, which are defined outside this function.
    function test_hang_up_owner() {
        # keep the bookkeeping variables out of the global scope
        local owner_pid owner_id capture_id
        local elected_rc=0

        echo "run test case test_hang_up_owner"

        run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_hang_up_owner.server1
        # ensure the server becomes the owner
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"

        # only one server is running at this point, so its pid/id are the owner's
        owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
        owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
        echo "owner pid:" "$owner_pid"
        echo "owner id" "$owner_id"

        # run another server
        run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_hang_up_owner.server2
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
        capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
        echo "capture_id:" "$capture_id"

        # stop the owner
        kill -SIGSTOP "$owner_pid"

        # check that the new owner is elected; record the verdict instead of
        # letting `set -e` abort right here, otherwise the stopped process
        # would never receive SIGCONT
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -A1 -- \"$capture_id\" | grep '\"is-owner\": true'" || elected_rc=$?
        # resume the original process
        kill -SIGCONT "$owner_pid"
        if [[ $elected_rc -ne 0 ]]; then
            echo "test_hang_up_owner: new owner was not elected" >&2
            return "$elected_rc"
        fi

        echo "test_hang_up_owner: pass"

        cleanup_process "$CDC_BINARY"
    }
    85  
    # test_expire_owner stops the owner by sending
    # the SIGSTOP signal and waits until its session
    # expires (the capture list shows `[]`).
    # We expect that when the owner process resumes, it shuts itself down
    # and recovers, becoming the owner again.
    # The stopped process is resumed with SIGCONT whether or not the expiry
    # check succeeds; a failed check is reported through the return status.
    #
    # Reads WORK_DIR, CDC_BINARY and MAX_RETRIES; relies on run_cdc_server,
    # ensure and cleanup_process, which are defined outside this function.
    function test_expire_owner() {
        # keep the bookkeeping variables out of the global scope
        local owner_pid owner_id
        local expired_rc=0

        echo "run test case test_expire_owner"

        run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_expire_owner.server1
        # ensure the server becomes the owner
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"

        owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
        owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
        echo "owner pid:" "$owner_pid"
        echo "owner id" "$owner_id"

        # stop the owner
        kill -SIGSTOP "$owner_pid"
        echo "process status:" "$(ps -h -p "$owner_pid" -o "s")"

        # ensure the session has expired; record the verdict instead of letting
        # `set -e` abort right here, otherwise the stopped process would never
        # receive SIGCONT
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\[\]'" || expired_rc=$?

        # resume the owner
        kill -SIGCONT "$owner_pid"
        echo "process status:" "$(ps -h -p "$owner_pid" -o "s")"
        if [[ $expired_rc -ne 0 ]]; then
            echo "test_expire_owner: owner session did not expire" >&2
            return "$expired_rc"
        fi
        # ensure the owner has recovered
        ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
        echo "test_expire_owner pass"

        cleanup_process "$CDC_BINARY"
    }
   119  
   # test_owner_cleanup_stale_tasks starts two captures, SIGKILLs both,
   # deletes everything under /tidb/cdc/task/status in etcd and then starts a
   # third capture. Once that capture is the owner, an INSERT/UPDATE/DELETE
   # round trip on test.availability1 is run and checked through
   # `ensure ... nonempty|empty` (presumably against the downstream -- those
   # helpers are defined outside this function).
   #
   # Reads WORK_DIR, CDC_BINARY and MAX_RETRIES; relies on run_cdc_server,
   # ensure, run_sql and cleanup_process from outside this function.
   function test_owner_cleanup_stale_tasks() {
       # keep the bookkeeping variables out of the global scope
       local owner_pid owner_id capture_pid capture_id

       echo "run test case test_owner_cleanup_stale_tasks"

       # start a capture server
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_cleanup_stale_tasks.server1
       # ensure the server becomes the owner
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
       owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
       owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
       echo "owner pid:" "$owner_pid"
       echo "owner id" "$owner_id"

       # run another server
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_owner_cleanup_stale_tasks.server2
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
       # -x -F: drop only the line that IS the owner pid; a plain `grep -v`
       # would also drop any pid that merely contains it (123 vs 1234)
       capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vxF -- "$owner_pid")
       capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
       echo "capture_id:" "$capture_id"

       kill -SIGKILL "$owner_pid"
       kill -SIGKILL "$capture_pid"
       # wait capture info expires
       sleep 3

       # simulate task status is deleted but task position stales
       etcdctl del /tidb/cdc/task/status --prefix
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8302" --logsuffix test_owner_cleanup_stale_tasks.server3
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"

       run_sql "INSERT INTO test.availability1(id, val) VALUES (1, 1);"
       ensure "$MAX_RETRIES" nonempty 'select id, val from test.availability1 where id=1 and val=1'
       run_sql "UPDATE test.availability1 set val = 22 where id = 1;"
       ensure "$MAX_RETRIES" nonempty 'select id, val from test.availability1 where id=1 and val=22'
       run_sql "DELETE from test.availability1 where id=1;"
       ensure "$MAX_RETRIES" empty 'select id, val from test.availability1 where id=1'

       echo "test_owner_cleanup_stale_tasks pass"
       cleanup_process "$CDC_BINARY"
   }
   159  
   # test_owner_retryable_error exercises retryable errors met in the
   # campaign-owner loop, injected through GO_FAILPOINTS.
   # GO_FAILPOINTS is exported while the servers are started and reset to ''
   # before cleanup.
   #
   # Reads WORK_DIR, CDC_BINARY and MAX_RETRIES; relies on run_cdc_server,
   # ensure and cleanup_process, which are defined outside this function.
   function test_owner_retryable_error() {
       # keep the bookkeeping variables out of the global scope
       local owner_pid owner_id capture_pid capture_id

       echo "run test case test_owner_retryable_error"

       export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/capture-campaign-compacted-error=1*return(true)'

       # start a capture server
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_retryable_error.server1

       # ensure the server becomes the owner
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
       owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
       owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
       echo "owner pid:" "$owner_pid"
       echo "owner id" "$owner_id"

       export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/owner-run-with-error=1*return(true);github.com/pingcap/ticdc/cdc/capture-resign-failed=1*return(true)'

       # run another server
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_owner_retryable_error.server2 --addr "127.0.0.1:8301"
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
       # capture_pid is not used further down, but under `set -e` this lookup
       # doubles as a check that a second process exists. -x -F drops only the
       # line that IS the owner pid; a plain `grep -v` would also drop a pid
       # that merely contains it (123 vs 1234) and abort the test spuriously.
       capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vxF -- "$owner_pid")
       capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
       echo "capture_id:" "$capture_id"

       # resign the first capture, the second capture campaigns to be owner.
       # However we have injected two failpoints, the second capture owner runs
       # with error and before it exits resign owner also failed, so the second
       # capture will exit and the first capture campaigns to be owner again.
       curl -X POST http://127.0.0.1:8300/capture/owner/resign
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -A1 -- \"$owner_id\" | grep '\"is-owner\": true'"
       # exactly one process must be left; -w keeps counts such as 10 or 11
       # from matching the way a bare `grep 1` would
       ensure "$MAX_RETRIES" "ps -C $CDC_BINARY -o pid= | awk '{print \$1}' | wc -l | grep -w 1"

       echo "test_owner_retryable_error pass"
       export GO_FAILPOINTS=''
       cleanup_process "$CDC_BINARY"
   }
   197  
   # test_gap_between_watch_capture starts an owner with the
   # sleep-before-watch-capture failpoint enabled, starts a second capture and
   # SIGKILLs it, then runs an INSERT/UPDATE/DELETE round trip on
   # test.availability1..3, each step checked through `ensure ... nonempty|empty`
   # (presumably against the downstream -- those helpers are defined outside
   # this function).
   # GO_FAILPOINTS is reset to '' before cleanup.
   #
   # Reads WORK_DIR, CDC_BINARY and MAX_RETRIES; relies on run_cdc_server,
   # ensure, run_sql and cleanup_process from outside this function.
   function test_gap_between_watch_capture() {
       # keep the bookkeeping variables and the loop index out of the global scope
       local owner_pid owner_id capture_pid capture_id i

       echo "run test case test_gap_between_watch_capture"

       export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/sleep-before-watch-capture=1*sleep(6000)'

       # start a capture server
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --logsuffix test_gap_between_watch_capture.server1
       # ensure the server becomes the owner
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep '\"is-owner\": true'"
       owner_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}')
       owner_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}')
       echo "owner pid:" "$owner_pid"
       echo "owner id" "$owner_id"

       # run another server
       run_cdc_server --workdir "$WORK_DIR" --binary "$CDC_BINARY" --addr "127.0.0.1:8301" --logsuffix test_gap_between_watch_capture.server2
       ensure "$MAX_RETRIES" "$CDC_BINARY cli capture list 2>&1 | grep -v \"$owner_id\" | grep id"
       # -x -F: drop only the line that IS the owner pid; a plain `grep -v`
       # would also drop any pid that merely contains it (123 vs 1234)
       capture_pid=$(ps -C "$CDC_BINARY" -o pid= | awk '{print $1}' | grep -vxF -- "$owner_pid")
       capture_id=$("$CDC_BINARY" cli capture list 2>&1 | awk -F '"' '/id/{print $4}' | grep -v "$owner_id")
       echo "capture_id:" "$capture_id"

       kill -SIGKILL "$capture_pid"
       # wait capture info expires
       sleep 3

       for i in 1 2 3; do
           run_sql "INSERT INTO test.availability$i(id, val) VALUES (1, 1);"
           ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$i where id=1 and val=1"
           run_sql "UPDATE test.availability$i set val = 22 where id = 1;"
           ensure "$MAX_RETRIES" nonempty "select id, val from test.availability$i where id=1 and val=22"
           run_sql "DELETE from test.availability$i where id=1;"
           ensure "$MAX_RETRIES" empty "select id, val from test.availability$i where id=1"
       done

       export GO_FAILPOINTS=''
       echo "test_gap_between_watch_capture pass"
       cleanup_process "$CDC_BINARY"
   }