github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/cdc/owner/feed_state_manager_test.go

     1  // Copyright 2021 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package owner
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/cenkalti/backoff/v4"
    23  	"github.com/pingcap/tiflow/cdc/model"
    24  	"github.com/pingcap/tiflow/cdc/vars"
    25  	"github.com/pingcap/tiflow/pkg/config"
    26  	cerror "github.com/pingcap/tiflow/pkg/errors"
    27  	"github.com/pingcap/tiflow/pkg/etcd"
    28  	"github.com/pingcap/tiflow/pkg/orchestrator"
    29  	"github.com/pingcap/tiflow/pkg/pdutil"
    30  	"github.com/pingcap/tiflow/pkg/upstream"
    31  	"github.com/pingcap/tiflow/pkg/util"
    32  	"github.com/stretchr/testify/require"
    33  	pd "github.com/tikv/pd/client"
    34  )
    35  
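         // mockPD is a minimal pd.Client stub for tests. GetTS returns the result of the
         // injected getTs callback, or (1, 2, nil) when no callback is set.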
    36  type mockPD struct {
    37  	pd.Client
    38  
    39  	getTs func() (int64, int64, error)
    40  }
    41  
    42  func (p *mockPD) GetTS(_ context.Context) (int64, int64, error) {
    43  	if p.getTs != nil {
    44  		return p.getTs()
    45  	}
    46  	return 1, 2, nil
    47  }
    48  
     49  // newFeedStateManager4Test creates a feedStateManager for tests
    50  func newFeedStateManager4Test(
    51  	initialIntervalInMs, maxIntervalInMs, maxElapsedTimeInMs int,
    52  	multiplier float64,
    53  ) *feedStateManager {
    54  	f := new(feedStateManager)
    55  	f.upstream = new(upstream.Upstream)
    56  	f.upstream.PDClient = &mockPD{}
    57  	f.upstream.PDClock = pdutil.NewClock4Test()
    58  
    59  	f.errBackoff = backoff.NewExponentialBackOff()
    60  	f.errBackoff.InitialInterval = time.Duration(initialIntervalInMs) * time.Millisecond
    61  	f.errBackoff.MaxInterval = time.Duration(maxIntervalInMs) * time.Millisecond
    62  	f.errBackoff.MaxElapsedTime = time.Duration(maxElapsedTimeInMs) * time.Millisecond
    63  	f.errBackoff.Multiplier = multiplier
    64  	f.errBackoff.RandomizationFactor = 0
    65  
    66  	f.resetErrRetry()
    67  
    68  	f.changefeedErrorStuckDuration = time.Second * 3
    69  
    70  	return f
    71  }
    72  
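         // TestHandleJob verifies how the feed state manager reacts to admin jobs: jobs for a
         // mismatched changefeed ID or resuming an already-running changefeed are ignored, while
         // stop, resume, and remove update the changefeed state and removal flag accordingly.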
    73  func TestHandleJob(t *testing.T) {
    74  	_, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
    75  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
    76  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
    77  		model.DefaultChangeFeedID(changefeedInfo.ID))
    78  	manager.state = state
    79  	tester := orchestrator.NewReactorStateTester(t, state, nil)
    80  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
    81  		require.Nil(t, info)
    82  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
    83  	})
    84  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
    85  		require.Nil(t, status)
    86  		return &model.ChangeFeedStatus{}, true, nil
    87  	})
    88  	tester.MustApplyPatches()
    89  	manager.Tick(0, state.Status, state.Info)
    90  	tester.MustApplyPatches()
    91  	require.True(t, manager.ShouldRunning())
    92  
     93  	// an admin job whose changefeed ID does not match
    94  	manager.PushAdminJob(&model.AdminJob{
    95  		CfID: model.DefaultChangeFeedID("fake-changefeed-id"),
    96  		Type: model.AdminStop,
    97  	})
    98  	manager.Tick(0, state.Status, state.Info)
    99  	tester.MustApplyPatches()
   100  	require.True(t, manager.ShouldRunning())
   101  
    102  	// a running changefeed can not be resumed
   103  	manager.PushAdminJob(&model.AdminJob{
   104  		CfID: model.DefaultChangeFeedID(changefeedInfo.ID),
   105  		Type: model.AdminResume,
   106  	})
   107  	manager.Tick(0, state.Status, state.Info)
   108  	tester.MustApplyPatches()
   109  	require.True(t, manager.ShouldRunning())
   110  
   111  	// stop a changefeed
   112  	manager.PushAdminJob(&model.AdminJob{
   113  		CfID: model.DefaultChangeFeedID(changefeedInfo.ID),
   114  		Type: model.AdminStop,
   115  	})
   116  	manager.Tick(0, state.Status, state.Info)
   117  	tester.MustApplyPatches()
   118  
   119  	require.False(t, manager.ShouldRunning())
   120  	require.False(t, manager.ShouldRemoved())
   121  	require.Equal(t, state.Info.State, model.StateStopped)
   122  	require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   123  	require.Equal(t, state.Status.AdminJobType, model.AdminStop)
   124  
   125  	// resume a changefeed
   126  	manager.PushAdminJob(&model.AdminJob{
   127  		CfID: model.DefaultChangeFeedID(changefeedInfo.ID),
   128  		Type: model.AdminResume,
   129  	})
   130  	manager.Tick(0, state.Status, state.Info)
   131  	tester.MustApplyPatches()
   132  	require.True(t, manager.ShouldRunning())
   133  	require.False(t, manager.ShouldRemoved())
   134  	require.Equal(t, state.Info.State, model.StateNormal)
   135  	require.Equal(t, state.Info.AdminJobType, model.AdminNone)
   136  	require.Equal(t, state.Status.AdminJobType, model.AdminNone)
   137  
   138  	// remove a changefeed
   139  	manager.PushAdminJob(&model.AdminJob{
   140  		CfID: model.DefaultChangeFeedID(changefeedInfo.ID),
   141  		Type: model.AdminRemove,
   142  	})
   143  	manager.Tick(0, state.Status, state.Info)
   144  	tester.MustApplyPatches()
   145  
   146  	require.False(t, manager.ShouldRunning())
   147  	require.True(t, manager.ShouldRemoved())
   148  	require.False(t, state.Exist())
   149  }
   150  
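         // TestResumeChangefeedWithCheckpointTs verifies resuming a changefeed with an
         // OverwriteCheckpointTs, both from the stopped state and from the failed state caused
         // by a non-retryable error.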
   151  func TestResumeChangefeedWithCheckpointTs(t *testing.T) {
   152  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   153  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   154  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   155  		model.DefaultChangeFeedID(changefeedInfo.ID))
   156  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   157  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   158  		require.Nil(t, info)
   159  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   160  	})
   161  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   162  		require.Nil(t, status)
   163  		return &model.ChangeFeedStatus{}, true, nil
   164  	})
   165  	tester.MustApplyPatches()
   166  	manager.state = state
   167  	manager.Tick(0, state.Status, state.Info)
   168  	tester.MustApplyPatches()
   169  	require.True(t, manager.ShouldRunning())
   170  
   171  	// stop a changefeed
   172  	manager.PushAdminJob(&model.AdminJob{
   173  		CfID: model.DefaultChangeFeedID(changefeedInfo.ID),
   174  		Type: model.AdminStop,
   175  	})
   176  	manager.Tick(0, state.Status, state.Info)
   177  	tester.MustApplyPatches()
   178  
   179  	require.False(t, manager.ShouldRunning())
   180  	require.False(t, manager.ShouldRemoved())
   181  	require.Equal(t, state.Info.State, model.StateStopped)
   182  	require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   183  	require.Equal(t, state.Status.AdminJobType, model.AdminStop)
   184  
   185  	// resume the changefeed in stopped state
   186  	manager.PushAdminJob(&model.AdminJob{
   187  		CfID:                  model.DefaultChangeFeedID(changefeedInfo.ID),
   188  		Type:                  model.AdminResume,
   189  		OverwriteCheckpointTs: 100,
   190  	})
   191  	manager.Tick(0, state.Status, state.Info)
   192  	tester.MustApplyPatches()
   193  	require.True(t, manager.ShouldRunning())
   194  	require.False(t, manager.ShouldRemoved())
   195  	require.Equal(t, state.Info.State, model.StateNormal)
   196  	require.Equal(t, state.Info.AdminJobType, model.AdminNone)
   197  	require.Equal(t, state.Status.AdminJobType, model.AdminNone)
   198  
    199  	// mock a non-retryable error occurring for this changefeed
   200  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   201  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   202  			return &model.TaskPosition{Error: &model.RunningError{
   203  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   204  				Code:    "CDC:ErrStartTsBeforeGC",
   205  				Message: "fake error for test",
   206  			}}, true, nil
   207  		})
   208  	tester.MustApplyPatches()
   209  	manager.Tick(0, state.Status, state.Info)
   210  	tester.MustApplyPatches()
   211  	require.Equal(t, state.Info.State, model.StateFailed)
   212  	require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   213  	require.Equal(t, state.Status.AdminJobType, model.AdminStop)
   214  
   215  	// resume the changefeed in failed state
   216  	manager.isRetrying = true
   217  	manager.PushAdminJob(&model.AdminJob{
   218  		CfID:                  model.DefaultChangeFeedID(changefeedInfo.ID),
   219  		Type:                  model.AdminResume,
   220  		OverwriteCheckpointTs: 200,
   221  	})
   222  	manager.Tick(0, state.Status, state.Info)
   223  	tester.MustApplyPatches()
   224  	require.True(t, manager.ShouldRunning())
   225  	require.False(t, manager.ShouldRemoved())
   226  	require.Equal(t, state.Info.State, model.StateNormal)
   227  	require.Equal(t, state.Info.AdminJobType, model.AdminNone)
   228  	require.Equal(t, state.Status.AdminJobType, model.AdminNone)
   229  	require.False(t, manager.isRetrying)
   230  }
   231  
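         // TestMarkFinished verifies that MarkFinished moves the changefeed into the finished
         // state and stops it from running.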
   232  func TestMarkFinished(t *testing.T) {
   233  	_, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   234  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   235  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   236  		model.DefaultChangeFeedID(changefeedInfo.ID))
   237  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   238  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   239  		require.Nil(t, info)
   240  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   241  	})
   242  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   243  		require.Nil(t, status)
   244  		return &model.ChangeFeedStatus{}, true, nil
   245  	})
   246  	tester.MustApplyPatches()
   247  	manager.state = state
   248  	manager.Tick(0, state.Status, state.Info)
   249  	tester.MustApplyPatches()
   250  	require.True(t, manager.ShouldRunning())
   251  
   252  	manager.MarkFinished()
   253  	manager.Tick(0, state.Status, state.Info)
   254  	tester.MustApplyPatches()
   255  
   256  	require.False(t, manager.ShouldRunning())
   257  	require.Equal(t, state.Info.State, model.StateFinished)
   258  	require.Equal(t, state.Info.AdminJobType, model.AdminFinish)
   259  	require.Equal(t, state.Status.AdminJobType, model.AdminFinish)
   260  }
   261  
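         // TestCleanUpInfos verifies that the capture's task position is cleaned up after the
         // changefeed is marked finished.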
   262  func TestCleanUpInfos(t *testing.T) {
   263  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   264  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   265  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   266  		model.DefaultChangeFeedID(changefeedInfo.ID))
   267  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   268  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   269  		require.Nil(t, info)
   270  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   271  	})
   272  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   273  		require.Nil(t, status)
   274  		return &model.ChangeFeedStatus{}, true, nil
   275  	})
   276  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   277  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   278  			return &model.TaskPosition{}, true, nil
   279  		})
   280  	tester.MustApplyPatches()
   281  	require.Contains(t, state.TaskPositions, globalVars.CaptureInfo.ID)
   282  	manager.state = state
   283  	manager.Tick(0, state.Status, state.Info)
   284  	tester.MustApplyPatches()
   285  	require.True(t, manager.ShouldRunning())
   286  
   287  	manager.MarkFinished()
   288  	manager.Tick(0, state.Status, state.Info)
   289  	tester.MustApplyPatches()
   290  	require.False(t, manager.ShouldRunning())
   291  	require.Equal(t, state.Info.State, model.StateFinished)
   292  	require.Equal(t, state.Info.AdminJobType, model.AdminFinish)
   293  	require.Equal(t, state.Status.AdminJobType, model.AdminFinish)
   294  	require.NotContains(t, state.TaskPositions, globalVars.CaptureInfo.ID)
   295  }
   296  
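         // TestHandleError verifies the exponential backoff on repeated errors and the state
         // transitions pending -> warning -> normal once errors stop and the checkpointTs advances.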
   297  func TestHandleError(t *testing.T) {
   298  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   299  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   300  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   301  		model.DefaultChangeFeedID(changefeedInfo.ID))
   302  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   303  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   304  		require.Nil(t, info)
   305  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   306  	})
   307  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   308  		require.Nil(t, status)
   309  		return &model.ChangeFeedStatus{
   310  			CheckpointTs: 200,
   311  		}, true, nil
   312  	})
   313  
   314  	tester.MustApplyPatches()
   315  	manager.state = state
   316  	manager.Tick(0, state.Status, state.Info)
   317  	tester.MustApplyPatches()
   318  
   319  	intervals := []time.Duration{200, 400, 800, 1600, 1600}
   320  	for i, d := range intervals {
   321  		intervals[i] = d * time.Millisecond
   322  	}
   323  
   324  	for _, d := range intervals {
   325  		require.True(t, manager.ShouldRunning())
   326  		state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   327  			func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   328  				return &model.TaskPosition{Error: &model.RunningError{
   329  					Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   330  					Code:    "[CDC:ErrEtcdSessionDone]",
   331  					Message: "fake error for test",
   332  				}}, true, nil
   333  			})
   334  		tester.MustApplyPatches()
   335  		manager.Tick(0, state.Status, state.Info)
   336  		tester.MustApplyPatches()
   337  		require.False(t, manager.ShouldRunning())
   338  		require.Equal(t, state.Info.State, model.StatePending)
   339  		require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   340  		require.Equal(t, state.Status.AdminJobType, model.AdminStop)
   341  		time.Sleep(d)
   342  		manager.Tick(0, state.Status, state.Info)
   343  		tester.MustApplyPatches()
   344  	}
   345  
   346  	// no error tick, state should be transferred from pending to warning
   347  	manager.Tick(0, state.Status, state.Info)
   348  	require.True(t, manager.ShouldRunning())
   349  	require.Equal(t, model.StateWarning, state.Info.State)
   350  	require.Equal(t, model.AdminNone, state.Info.AdminJobType)
   351  	require.Equal(t, model.AdminNone, state.Status.AdminJobType)
   352  
   353  	// no error tick and checkpointTs is progressing,
   354  	// state should be transferred from warning to normal
   355  	state.PatchStatus(
   356  		func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   357  			status.CheckpointTs += 1
   358  			return status, true, nil
   359  		})
   360  	tester.MustApplyPatches()
   361  	manager.Tick(0, state.Status, state.Info)
   362  	tester.MustApplyPatches()
   363  	require.True(t, manager.ShouldRunning())
   364  	state.PatchStatus(
   365  		func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   366  			status.CheckpointTs += 1
   367  			return status, true, nil
   368  		})
   369  	manager.Tick(0, state.Status, state.Info)
   370  	tester.MustApplyPatches()
   371  	require.Equal(t, model.StateNormal, state.Info.State)
   372  	require.Equal(t, model.AdminNone, state.Info.AdminJobType)
   373  	require.Equal(t, model.AdminNone, state.Status.AdminJobType)
   374  }
   375  
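         // TestHandleFastFailError verifies that a fast-fail error is handled without panicking,
         // including the case where the ChangeFeedInfo has been patched to nil.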
   376  func TestHandleFastFailError(t *testing.T) {
   377  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   378  	manager := newFeedStateManager4Test(0, 0, 0, 0)
   379  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   380  		model.DefaultChangeFeedID(changefeedInfo.ID))
   381  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   382  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   383  		require.Nil(t, info)
   384  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   385  	})
   386  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   387  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   388  			return &model.TaskPosition{Error: &model.RunningError{
   389  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   390  				Code:    "CDC:ErrStartTsBeforeGC",
   391  				Message: "fake error for test",
   392  			}}, true, nil
   393  		})
   394  	tester.MustApplyPatches()
   395  	manager.state = state
   396  	manager.Tick(0, state.Status, state.Info)
   397  	// test handling fast failed error with non-nil ChangeFeedInfo
   398  	tester.MustApplyPatches()
   399  	// test handling fast failed error with nil ChangeFeedInfo
   400  	// set info to nil when this patch is applied
   401  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   402  		return nil, true, nil
   403  	})
   404  	manager.Tick(0, state.Status, state.Info)
   405  	// When the patches are applied, the callback function of PatchInfo in feedStateManager.HandleError will be called.
    406  	// At that time, the nil info is checked instead of causing a panic. See issue #3128 for more details.
   407  	tester.MustApplyPatches()
   408  }
   409  
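         // TestHandleErrorWhenChangefeedIsPaused verifies that HandleError does not change the
         // state of a stopped changefeed.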
   410  func TestHandleErrorWhenChangefeedIsPaused(t *testing.T) {
   411  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   412  	manager := newFeedStateManager4Test(0, 0, 0, 0)
   413  	manager.state = orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   414  		model.DefaultChangeFeedID(changefeedInfo.ID))
   415  	err := &model.RunningError{
   416  		Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   417  		Code:    "CDC:ErrReachMaxTry",
   418  		Message: "fake error for test",
   419  	}
   420  	manager.state.(*orchestrator.ChangefeedReactorState).Info = &model.ChangeFeedInfo{
   421  		State: model.StateStopped,
   422  	}
   423  	manager.HandleError(err)
   424  	require.Equal(t, model.StateStopped, manager.state.(*orchestrator.ChangefeedReactorState).Info.State)
   425  }
   426  
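         // TestChangefeedStatusNotExist verifies handling of a changefeed whose info exists in
         // etcd but whose status does not: it should not run and can still be removed.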
   427  func TestChangefeedStatusNotExist(t *testing.T) {
   428  	changefeedInfo := `
   429  {
    430      "sink-uri": "blackhole:///",
   431      "create-time": "2021-06-05T00:44:15.065939487+08:00",
   432      "start-ts": 425381670108266496,
   433      "target-ts": 0,
   434      "admin-job-type": 1,
   435      "sort-engine": "unified",
   436      "config": {
   437          "case-sensitive": true,
   438          "force-replicate": false,
   439          "check-gc-safe-point": true,
   440          "filter": {
   441              "rules": [
   442                  "*.*"
   443              ],
   444              "ignore-txn-start-ts": null
   445          },
   446          "mounter": {
   447              "worker-num": 16
   448          },
    449          "sink": {
   450              "dispatchers": null,
   451              "protocol": "open-protocol"
   452          }
   453      },
   454      "state": "failed",
   455      "history": [],
   456      "error": {
   457          "addr": "172.16.6.147:8300",
   458          "code": "CDC:ErrSnapshotLostByGC",
   459          "message": ` + "\"[CDC:ErrSnapshotLostByGC]fail to create or maintain changefeed " +
    460  		"due to snapshot loss caused by GC. checkpoint-ts 425381670108266496 " +
   461  		"is earlier than GC safepoint at 0\"" + `
   462      },
   463      "sync-point-enabled": false,
   464      "sync-point-interval": 600000000000,
   465      "creator-version": "v5.0.0-master-dirty"
   466  }
   467  `
   468  	_, changefeedConfig := vars.NewGlobalVarsAndChangefeedInfo4Test()
   469  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   470  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   471  		model.DefaultChangeFeedID(changefeedConfig.ID))
   472  	tester := orchestrator.NewReactorStateTester(t, state, map[string]string{
   473  		fmt.Sprintf("%s/capture/d563bfc0-f406-4f34-bc7d-6dc2e35a44e5",
   474  			etcd.DefaultClusterAndMetaPrefix): `
   475  {"id":"d563bfc0-f406-4f34-bc7d-6dc2e35a44e5",
   476  "address":"172.16.6.147:8300","version":"v5.0.0-master-dirty"}`,
   477  		fmt.Sprintf("%s/changefeed/info/",
   478  			etcd.DefaultClusterAndNamespacePrefix) +
   479  			changefeedConfig.ID: changefeedInfo,
   480  		fmt.Sprintf("%s/owner/156579d017f84a68",
   481  			etcd.DefaultClusterAndMetaPrefix,
   482  		): "d563bfc0-f406-4f34-bc7d-6dc2e35a44e5",
   483  	})
   484  	manager.state = state
   485  	manager.Tick(0, state.Status, state.Info)
   486  	require.False(t, manager.ShouldRunning())
   487  	require.False(t, manager.ShouldRemoved())
   488  	tester.MustApplyPatches()
   489  
   490  	manager.PushAdminJob(&model.AdminJob{
   491  		CfID: model.DefaultChangeFeedID(changefeedConfig.ID),
   492  		Type: model.AdminRemove,
   493  	})
   494  	manager.Tick(0, state.Status, state.Info)
   495  	require.False(t, manager.ShouldRunning())
   496  	require.True(t, manager.ShouldRemoved())
   497  	tester.MustApplyPatches()
   498  	require.Nil(t, state.Info)
   499  	require.False(t, state.Exist())
   500  }
   501  
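         // TestChangefeedNotRetry verifies that retryable errors keep the changefeed running,
         // while unretryable errors such as ErrExpressionColumnNotFound stop it.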
   502  func TestChangefeedNotRetry(t *testing.T) {
   503  	_, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   504  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   505  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   506  		model.DefaultChangeFeedID(changefeedInfo.ID))
   507  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   508  
   509  	// changefeed state normal
   510  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   511  		require.Nil(t, info)
   512  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}, State: model.StateNormal}, true, nil
   513  	})
   514  	tester.MustApplyPatches()
   515  	manager.state = state
   516  	manager.Tick(0, state.Status, state.Info)
   517  	require.True(t, manager.ShouldRunning())
   518  
   519  	// changefeed in error state but error can be retried
   520  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   521  		return &model.ChangeFeedInfo{
   522  			SinkURI: "123",
   523  			Config:  &config.ReplicaConfig{},
   524  			State:   model.StateWarning,
   525  			Error: &model.RunningError{
   526  				Addr: "127.0.0.1",
   527  				Code: "CDC:ErrPipelineTryAgain",
   528  				Message: "pipeline is full, please try again. Internal use only, " +
   529  					"report a bug if seen externally",
   530  			},
   531  		}, true, nil
   532  	})
   533  	tester.MustApplyPatches()
   534  	manager.Tick(0, state.Status, state.Info)
   535  	require.True(t, manager.ShouldRunning())
   536  
   537  	state.PatchTaskPosition("test",
   538  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   539  			if position == nil {
   540  				position = &model.TaskPosition{}
   541  			}
   542  			position.Error = &model.RunningError{
   543  				Time:    time.Now(),
   544  				Addr:    "test",
   545  				Code:    "CDC:ErrExpressionColumnNotFound",
   546  				Message: "what ever",
   547  			}
   548  			return position, true, nil
   549  		})
   550  	tester.MustApplyPatches()
   551  	manager.Tick(0, state.Status, state.Info)
   552  	require.False(t, manager.ShouldRunning())
   553  
   554  	state.PatchTaskPosition("test",
   555  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   556  			if position == nil {
   557  				position = &model.TaskPosition{}
   558  			}
   559  			position.Error = &model.RunningError{
   560  				Addr:    "127.0.0.1",
   561  				Code:    string(cerror.ErrExpressionColumnNotFound.RFCCode()),
   562  				Message: cerror.ErrExpressionColumnNotFound.Error(),
   563  			}
   564  			return position, true, nil
   565  		})
   566  	tester.MustApplyPatches()
   567  	manager.Tick(0, state.Status, state.Info)
   568  	// should be false
   569  	require.False(t, manager.ShouldRunning())
   570  
   571  	state.PatchTaskPosition("test",
   572  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   573  			if position == nil {
   574  				position = &model.TaskPosition{}
   575  			}
   576  			position.Error = &model.RunningError{
   577  				Addr:    "127.0.0.1",
   578  				Code:    string(cerror.ErrExpressionParseFailed.RFCCode()),
   579  				Message: cerror.ErrExpressionParseFailed.Error(),
   580  			}
   581  			return position, true, nil
   582  		})
   583  	tester.MustApplyPatches()
   584  	manager.Tick(0, state.Status, state.Info)
   585  	// should be false
   586  	require.False(t, manager.ShouldRunning())
   587  }
   588  
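         // TestBackoffStopsUnexpectedly verifies that once the backoff's max elapsed time is
         // exceeded, the changefeed is marked failed and is no longer retried.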
   589  func TestBackoffStopsUnexpectedly(t *testing.T) {
   590  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   591  	// after 4000ms, the backoff will stop
   592  	manager := newFeedStateManager4Test(500, 500, 4000, 1.0)
   593  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   594  		model.DefaultChangeFeedID(changefeedInfo.ID))
   595  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   596  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   597  		require.Nil(t, info)
   598  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   599  	})
   600  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   601  		require.Nil(t, status)
   602  		return &model.ChangeFeedStatus{}, true, nil
   603  	})
   604  
   605  	tester.MustApplyPatches()
   606  	manager.state = state
   607  	manager.Tick(0, state.Status, state.Info)
   608  	tester.MustApplyPatches()
   609  
   610  	for i := 1; i <= 10; i++ {
   611  		if i >= 8 {
   612  			// after round 8, the maxElapsedTime of backoff will exceed 4000ms,
    613  			// and NextBackOff() will return backoff.Stop (-1), so the changefeed
    614  			// is marked failed and will not be retried again.
   615  			require.Equal(t, state.Info.State, model.StateFailed)
   616  			require.False(t, manager.ShouldRunning())
   617  		} else {
   618  			if i == 1 {
   619  				require.Equal(t, model.StateNormal, state.Info.State)
   620  			} else {
   621  				require.Equal(t, model.StateWarning, state.Info.State)
   622  			}
   623  			require.True(t, manager.ShouldRunning())
   624  			state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   625  				func(position *model.TaskPosition) (
   626  					*model.TaskPosition, bool, error,
   627  				) {
   628  					return &model.TaskPosition{Error: &model.RunningError{
   629  						Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   630  						Code:    "[CDC:ErrEtcdSessionDone]",
   631  						Message: "fake error for test",
   632  					}}, true, nil
   633  				})
   634  			tester.MustApplyPatches()
   635  			manager.Tick(0, state.Status, state.Info)
   636  			tester.MustApplyPatches()
    637  			// If an error occurs, the manager backs off and stops running the changefeed.
   638  			require.False(t, manager.ShouldRunning())
   639  			require.Equal(t, model.StatePending, state.Info.State)
   640  			require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   641  			require.Equal(t, state.Status.AdminJobType, model.AdminStop)
   642  		}
   643  
    644  		// 500ms is the backoff interval, so sleep 500ms; after the next manager
    645  		// tick, the changefeed will be retried and turn into the warning state
   646  		time.Sleep(500 * time.Millisecond)
   647  		manager.Tick(0, state.Status, state.Info)
   648  		tester.MustApplyPatches()
   649  	}
   650  }
   651  
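         // TestBackoffNeverStops verifies that with no max elapsed time the changefeed keeps
         // being retried after every error.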
   652  func TestBackoffNeverStops(t *testing.T) {
   653  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   654  	// the backoff will never stop
   655  	manager := newFeedStateManager4Test(100, 100, 0, 1.0)
   656  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   657  		model.DefaultChangeFeedID(changefeedInfo.ID))
   658  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   659  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   660  		require.Nil(t, info)
   661  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   662  	})
   663  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   664  		require.Nil(t, status)
   665  		return &model.ChangeFeedStatus{}, true, nil
   666  	})
   667  
   668  	tester.MustApplyPatches()
   669  	manager.state = state
   670  	manager.Tick(0, state.Status, state.Info)
   671  	tester.MustApplyPatches()
   672  
   673  	for i := 1; i <= 30; i++ {
   674  		if i == 1 {
   675  			require.Equal(t, model.StateNormal, state.Info.State)
   676  		} else {
   677  			require.Equal(t, model.StateWarning, state.Info.State)
   678  		}
   679  		require.True(t, manager.ShouldRunning())
   680  		state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   681  			func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   682  				return &model.TaskPosition{Error: &model.RunningError{
   683  					Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   684  					Code:    "[CDC:ErrEtcdSessionDone]",
   685  					Message: "fake error for test",
   686  				}}, true, nil
   687  			})
   688  		tester.MustApplyPatches()
   689  		manager.Tick(0, state.Status, state.Info)
   690  		tester.MustApplyPatches()
   691  		require.False(t, manager.ShouldRunning())
   692  		require.Equal(t, model.StatePending, state.Info.State)
   693  		require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   694  		require.Equal(t, state.Status.AdminJobType, model.AdminStop)
    695  		// 100ms is the backoff interval, so sleep 100ms; after the next manager tick,
    696  		// the changefeed will be retried and turn into the warning state
   697  		time.Sleep(100 * time.Millisecond)
   698  		manager.Tick(0, state.Status, state.Info)
   699  		tester.MustApplyPatches()
   700  	}
   701  }
   702  
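         // TestUpdateChangefeedEpoch verifies that the changefeed epoch changes only when the
         // changefeed state changes.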
   703  func TestUpdateChangefeedEpoch(t *testing.T) {
   704  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   705  	// Set a long backoff time
   706  	manager := newFeedStateManager4Test(int(time.Hour), int(time.Hour), 0, 1.0)
   707  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   708  		model.DefaultChangeFeedID(changefeedInfo.ID))
   709  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   710  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   711  		require.Nil(t, info)
   712  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   713  	})
   714  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   715  		require.Nil(t, status)
   716  		return &model.ChangeFeedStatus{}, true, nil
   717  	})
   718  
   719  	tester.MustApplyPatches()
   720  	manager.state = state
   721  	manager.Tick(0, state.Status, state.Info)
   722  	tester.MustApplyPatches()
   723  	require.Equal(t, state.Info.State, model.StateNormal)
   724  	require.True(t, manager.ShouldRunning())
   725  
   726  	for i := 1; i <= 30; i++ {
   727  		manager.upstream.PDClient.(*mockPD).getTs = func() (int64, int64, error) {
   728  			return int64(i), 0, nil
   729  		}
   730  		previousEpoch := state.Info.Epoch
   731  		previousState := state.Info.State
   732  		state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   733  			func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   734  				return &model.TaskPosition{Error: &model.RunningError{
   735  					Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   736  					Code:    "[CDC:ErrEtcdSessionDone]",
   737  					Message: "fake error for test",
   738  				}}, true, nil
   739  			})
   740  		tester.MustApplyPatches()
   741  		manager.Tick(0, state.Status, state.Info)
   742  		tester.MustApplyPatches()
   743  		require.False(t, manager.ShouldRunning())
   744  		require.Equal(t, model.StatePending, state.Info.State, i)
   745  
   746  		require.Equal(t, state.Info.AdminJobType, model.AdminStop)
   747  		require.Equal(t, state.Status.AdminJobType, model.AdminStop)
   748  
   749  		// Epoch only changes when State changes.
   750  		if previousState == state.Info.State {
   751  			require.Equal(t, previousEpoch, state.Info.Epoch)
   752  		} else {
   753  			require.NotEqual(t, previousEpoch, state.Info.Epoch)
   754  		}
   755  	}
   756  }
   757  
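         // TestHandleWarning verifies warning handling: a warning keeps the changefeed running,
         // the state returns to normal once the checkpointTs advances, and the changefeed fails
         // when the checkpointTs stays stuck longer than changefeedErrorStuckDuration.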
   758  func TestHandleWarning(t *testing.T) {
   759  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   760  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
   761  	manager.changefeedErrorStuckDuration = 100 * time.Millisecond
   762  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   763  		model.DefaultChangeFeedID(changefeedInfo.ID))
   764  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   765  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   766  		require.Nil(t, info)
   767  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   768  	})
   769  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   770  		require.Nil(t, status)
   771  		return &model.ChangeFeedStatus{
   772  			CheckpointTs: 200,
   773  		}, true, nil
   774  	})
   775  
   776  	tester.MustApplyPatches()
   777  	manager.state = state
   778  	manager.Tick(0, state.Status, state.Info)
   779  	tester.MustApplyPatches()
   780  	require.Equal(t, model.StateNormal, state.Info.State)
   781  	require.True(t, manager.ShouldRunning())
   782  
    783  	// 1. test when a warning occurs, the changefeed state will be changed to warning
   784  	// and it will still keep running
   785  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   786  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   787  			return &model.TaskPosition{Warning: &model.RunningError{
   788  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
    789  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
   790  				Message: "fake error for test",
   791  			}}, true, nil
   792  		})
   793  	tester.MustApplyPatches()
   794  	manager.Tick(0, state.Status, state.Info)
   795  	// some patches will be generated when the manager.Tick is called
   796  	// so we need to apply the patches before we check the state
   797  	tester.MustApplyPatches()
   798  	require.Equal(t, model.StateWarning, state.Info.State)
   799  	require.True(t, manager.ShouldRunning())
   800  
   801  	// 2. test when the changefeed is in warning state, and the checkpointTs is not progressing,
   802  	// the changefeed state will remain warning
   803  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   804  		require.NotNil(t, status)
   805  		return &model.ChangeFeedStatus{
   806  			CheckpointTs: 200,
   807  		}, true, nil
   808  	})
   809  	tester.MustApplyPatches()
   810  	manager.Tick(0, state.Status, state.Info)
   811  	tester.MustApplyPatches()
   812  	require.Equal(t, model.StateWarning, state.Info.State)
   813  	require.True(t, manager.ShouldRunning())
   814  
   815  	// 3. test when the changefeed is in warning state, and the checkpointTs is progressing,
   816  	// the changefeed state will be changed to normal
   817  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   818  		require.NotNil(t, status)
   819  		return &model.ChangeFeedStatus{
   820  			CheckpointTs: 201,
   821  		}, true, nil
   822  	})
   823  	tester.MustApplyPatches()
   824  	manager.Tick(0, state.Status, state.Info)
   825  	tester.MustApplyPatches()
   826  	require.Equal(t, model.StateNormal, state.Info.State)
   827  	require.True(t, manager.ShouldRunning())
   828  
    829  	// 4. test when the changefeed is in warning state, and the checkpointTs is not progressing
    830  	// for longer than changefeedErrorStuckDuration, the changefeed state will be changed to failed
    831  	// and it will stop running
   832  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   833  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   834  			return &model.TaskPosition{Warning: &model.RunningError{
   835  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
    836  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
   837  				Message: "fake error for test",
   838  			}}, true, nil
   839  		})
   840  	tester.MustApplyPatches()
   841  	manager.Tick(0, state.Status, state.Info)
   842  	// some patches will be generated when the manager.Tick is called
   843  	// so we need to apply the patches before we check the state
   844  	tester.MustApplyPatches()
   845  	require.Equal(t, model.StateWarning, state.Info.State)
   846  	require.True(t, manager.ShouldRunning())
   847  
   848  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   849  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   850  			return &model.TaskPosition{Warning: &model.RunningError{
   851  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
    852  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
   853  				Message: "fake error for test",
   854  			}}, true, nil
   855  		})
   856  	tester.MustApplyPatches()
    857  	// mock that the checkpointTs has not progressed for longer than changefeedErrorStuckDuration
   858  	manager.checkpointTsAdvanced = manager.
   859  		checkpointTsAdvanced.Add(-(manager.changefeedErrorStuckDuration + 1))
    860  	// resolvedTs = 202 > checkpointTs = 201
   861  	manager.Tick(202, state.Status, state.Info)
   862  	// some patches will be generated when the manager.Tick is called
   863  	// so we need to apply the patches before we check the state
   864  	tester.MustApplyPatches()
   865  	require.Equal(t, model.StateFailed, state.Info.State)
   866  	require.False(t, manager.ShouldRunning())
   867  }
   868  
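         // TestErrorAfterWarning verifies that an error arriving after a warning resets the
         // backoff: the changefeed goes to pending and then back to warning on the next tick.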
   869  func TestErrorAfterWarning(t *testing.T) {
   870  	t.Parallel()
   871  
   872  	maxElapsedTimeInMs := 2000
   873  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   874  	manager := newFeedStateManager4Test(200, 1600, maxElapsedTimeInMs, 2.0)
   875  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   876  		model.DefaultChangeFeedID(changefeedInfo.ID))
   877  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   878  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   879  		require.Nil(t, info)
   880  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   881  	})
   882  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   883  		require.Nil(t, status)
   884  		return &model.ChangeFeedStatus{
   885  			CheckpointTs: 200,
   886  		}, true, nil
   887  	})
   888  
   889  	tester.MustApplyPatches()
   890  	manager.state = state
   891  	manager.Tick(0, state.Status, state.Info)
   892  	tester.MustApplyPatches()
   893  	require.Equal(t, model.StateNormal, state.Info.State)
   894  	require.True(t, manager.ShouldRunning())
   895  
    896  	// 1. test when a warning occurs, the changefeed state will be changed to warning
   897  	// and it will still keep running
   898  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   899  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   900  			return &model.TaskPosition{Warning: &model.RunningError{
   901  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
    902  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
   903  				Message: "fake error for test",
   904  			}}, true, nil
   905  		})
   906  	tester.MustApplyPatches()
   907  	manager.Tick(0, state.Status, state.Info)
   908  	// some patches will be generated when the manager.Tick is called
   909  	// so we need to apply the patches before we check the state
   910  	tester.MustApplyPatches()
   911  	require.Equal(t, model.StateWarning, state.Info.State)
   912  	require.True(t, manager.ShouldRunning())
   913  
   914  	// 2. test when the changefeed is in warning state, and the checkpointTs is not progressing,
   915  	// the changefeed state will remain warning
   916  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   917  		require.NotNil(t, status)
   918  		return &model.ChangeFeedStatus{
   919  			CheckpointTs: 200,
   920  		}, true, nil
   921  	})
   922  	tester.MustApplyPatches()
   923  	manager.Tick(0, state.Status, state.Info)
   924  	tester.MustApplyPatches()
   925  	require.Equal(t, model.StateWarning, state.Info.State)
   926  	require.True(t, manager.ShouldRunning())
   927  
    928  	// 3. Sleep maxElapsedTimeInMs to let the backoff time out. When an error occurs after a warning,
    929  	// the backoff will be reset, and the changefeed state will be changed to warning and it will still
    930  	// keep running.
   931  	time.Sleep(time.Millisecond * time.Duration(maxElapsedTimeInMs))
   932  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   933  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   934  			return &model.TaskPosition{Error: &model.RunningError{
   935  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
    936  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
   937  				Message: "fake error for test",
   938  			}}, true, nil
   939  		})
   940  	tester.MustApplyPatches()
   941  
   942  	manager.Tick(0, state.Status, state.Info)
   943  	// some patches will be generated when the manager.Tick is called
   944  	// so we need to apply the patches before we check the state
   945  	tester.MustApplyPatches()
   946  	require.Equal(t, model.StatePending, state.Info.State)
   947  	require.False(t, manager.ShouldRunning())
   948  	manager.Tick(0, state.Status, state.Info)
   949  
   950  	// some patches will be generated when the manager.Tick is called
   951  	// so we need to apply the patches before we check the state
   952  	tester.MustApplyPatches()
   953  	require.Equal(t, model.StateWarning, state.Info.State)
   954  	require.True(t, manager.ShouldRunning())
   955  }
   956  
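         // TestHandleWarningWhileAdvanceResolvedTs verifies stuck detection while the resolvedTs
         // advances: the changefeed stays in warning while warnings repeat, and fails once the
         // checkpointTs lags behind the resolvedTs for longer than changefeedErrorStuckDuration.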
   957  func TestHandleWarningWhileAdvanceResolvedTs(t *testing.T) {
   958  	t.Parallel()
   959  
   960  	maxElapsedTimeInMs := 2000
   961  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
   962  	manager := newFeedStateManager4Test(200, 1600, maxElapsedTimeInMs, 2.0)
   963  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
   964  		model.DefaultChangeFeedID(changefeedInfo.ID))
   965  	manager.state = state
   966  	tester := orchestrator.NewReactorStateTester(t, state, nil)
   967  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
   968  		require.Nil(t, info)
   969  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
   970  	})
   971  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
   972  		require.Nil(t, status)
   973  		return &model.ChangeFeedStatus{
   974  			CheckpointTs: 200,
   975  		}, true, nil
   976  	})
   977  
   978  	tester.MustApplyPatches()
   979  	manager.Tick(200, state.Status, state.Info)
   980  	tester.MustApplyPatches()
   981  	require.Equal(t, model.StateNormal, state.Info.State)
   982  	require.True(t, manager.ShouldRunning())
   983  
    984  	// 1. test when a warning occurs, the changefeed state will be changed to warning
   985  	// and it will still keep running
   986  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
   987  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
   988  			return &model.TaskPosition{Warning: &model.RunningError{
   989  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
    990  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
   991  				Message: "fake error for test",
   992  			}}, true, nil
   993  		})
   994  	tester.MustApplyPatches()
   995  	manager.Tick(200, state.Status, state.Info)
   996  	// some patches will be generated when the manager.Tick is called
   997  	// so we need to apply the patches before we check the state
   998  	tester.MustApplyPatches()
   999  	require.Equal(t, model.StateWarning, state.Info.State)
  1000  	require.True(t, manager.ShouldRunning())
  1001  
   1002  	// 2. test when the changefeed is in warning state, and the resolvedTs and checkpointTs are not progressing,
  1003  	// the changefeed state will remain warning when a new warning is encountered.
  1004  	time.Sleep(manager.changefeedErrorStuckDuration + 10)
  1005  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
  1006  		require.NotNil(t, status)
  1007  		return &model.ChangeFeedStatus{
  1008  			CheckpointTs: 200,
  1009  		}, true, nil
  1010  	})
  1011  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
  1012  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
  1013  			return &model.TaskPosition{Warning: &model.RunningError{
  1014  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   1015  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
  1016  				Message: "fake error for test",
  1017  			}}, true, nil
  1018  		})
  1019  	tester.MustApplyPatches()
  1020  	manager.Tick(200, state.Status, state.Info)
  1021  	tester.MustApplyPatches()
  1022  	require.Equal(t, model.StateWarning, state.Info.State)
  1023  	require.True(t, manager.ShouldRunning())
  1024  
   1025  	// 3. Test that the changefeed remains in warning state when the resolvedTs is progressing after being stuck beyond the detection time.
  1026  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
  1027  		require.NotNil(t, status)
  1028  		return &model.ChangeFeedStatus{
  1029  			CheckpointTs: 200,
  1030  		}, true, nil
  1031  	})
  1032  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
  1033  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
  1034  			return &model.TaskPosition{Warning: &model.RunningError{
  1035  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   1036  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
  1037  				Message: "fake error for test",
  1038  			}}, true, nil
  1039  		})
  1040  	tester.MustApplyPatches()
  1041  	manager.Tick(400, state.Status, state.Info)
  1042  	tester.MustApplyPatches()
  1043  	require.Equal(t, model.StateWarning, state.Info.State)
  1044  	require.True(t, manager.ShouldRunning())
  1045  
   1046  	// 4. Test that the changefeed turns failed when the checkpointTs has not progressed for changefeedErrorStuckDuration.
  1047  	time.Sleep(manager.changefeedErrorStuckDuration + 10)
  1048  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
  1049  		require.NotNil(t, status)
  1050  		return &model.ChangeFeedStatus{
  1051  			CheckpointTs: 200,
  1052  		}, true, nil
  1053  	})
  1054  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
  1055  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
  1056  			return &model.TaskPosition{Warning: &model.RunningError{
  1057  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   1058  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
  1059  				Message: "fake error for test",
  1060  			}}, true, nil
  1061  		})
  1062  	tester.MustApplyPatches()
  1063  	manager.Tick(400, state.Status, state.Info)
  1064  	tester.MustApplyPatches()
  1065  	require.Equal(t, model.StateFailed, state.Info.State)
  1066  	require.False(t, manager.ShouldRunning())
  1067  }
  1068  
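         // TestUpdateChangefeedWithChangefeedErrorStuckDuration verifies that updating
         // ChangefeedErrorStuckDuration in the changefeed config takes effect after the failed
         // changefeed is resumed.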
  1069  func TestUpdateChangefeedWithChangefeedErrorStuckDuration(t *testing.T) {
  1070  	globalVars, changefeedInfo := vars.NewGlobalVarsAndChangefeedInfo4Test()
  1071  	manager := newFeedStateManager4Test(200, 1600, 0, 2.0)
  1072  	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
  1073  		model.DefaultChangeFeedID(changefeedInfo.ID))
  1074  	tester := orchestrator.NewReactorStateTester(t, state, nil)
  1075  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
  1076  		require.Nil(t, info)
  1077  		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
  1078  	})
  1079  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
  1080  		require.Nil(t, status)
  1081  		return &model.ChangeFeedStatus{}, true, nil
  1082  	})
  1083  	tester.MustApplyPatches()
  1084  	manager.state = state
  1085  	manager.Tick(0, state.Status, state.Info)
  1086  	tester.MustApplyPatches()
  1087  	require.True(t, manager.ShouldRunning())
  1088  
  1089  	stuckDuration := manager.changefeedErrorStuckDuration + time.Second*3
  1090  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
  1091  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
  1092  			return &model.TaskPosition{Warning: &model.RunningError{
  1093  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   1094  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
  1095  				Message: "fake error for test",
  1096  			}}, true, nil
  1097  		})
  1098  	tester.MustApplyPatches()
  1099  	time.Sleep(stuckDuration - time.Second)
  1100  	manager.Tick(100, state.Status, state.Info)
  1101  	tester.MustApplyPatches()
  1102  	require.False(t, manager.ShouldRunning())
  1103  	require.Less(t, manager.changefeedErrorStuckDuration, stuckDuration)
  1104  	require.Equal(t, state.Info.State, model.StateFailed)
  1105  
  1106  	// update ChangefeedErrorStuckDuration
  1107  	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
  1108  		require.NotNil(t, info)
  1109  		info.Config.ChangefeedErrorStuckDuration = util.AddressOf(stuckDuration)
  1110  		return info, true, nil
  1111  	})
  1112  	// update status
  1113  	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
  1114  		require.NotNil(t, status)
  1115  		return &model.ChangeFeedStatus{
  1116  			CheckpointTs: 100,
  1117  		}, true, nil
  1118  	})
  1119  	tester.MustApplyPatches()
  1120  
  1121  	// resume the changefeed in failed state
  1122  	manager.PushAdminJob(&model.AdminJob{
  1123  		CfID:                  model.DefaultChangeFeedID(changefeedInfo.ID),
  1124  		Type:                  model.AdminResume,
  1125  		OverwriteCheckpointTs: 100,
  1126  	})
  1127  
  1128  	manager.Tick(101, state.Status, state.Info)
  1129  	tester.MustApplyPatches()
  1130  	require.True(t, manager.ShouldRunning())
  1131  	require.False(t, manager.ShouldRemoved())
  1132  	require.Equal(t, manager.changefeedErrorStuckDuration, stuckDuration)
  1133  	require.Equal(t, state.Info.State, model.StateNormal)
  1134  	require.Equal(t, state.Info.AdminJobType, model.AdminNone)
  1135  	require.Equal(t, state.Status.AdminJobType, model.AdminNone)
  1136  
  1137  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
  1138  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
  1139  			return &model.TaskPosition{Warning: &model.RunningError{
  1140  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   1141  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
  1142  				Message: "fake error for test",
  1143  			}}, true, nil
  1144  		})
  1145  	tester.MustApplyPatches()
  1146  
  1147  	time.Sleep(stuckDuration - time.Second)
  1148  	manager.Tick(200, state.Status, state.Info)
  1149  	tester.MustApplyPatches()
  1150  	require.True(t, manager.ShouldRunning())
  1151  	require.Equal(t, state.Info.State, model.StateWarning)
  1152  
  1153  	state.PatchTaskPosition(globalVars.CaptureInfo.ID,
  1154  		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
  1155  			return &model.TaskPosition{Warning: &model.RunningError{
  1156  				Addr:    globalVars.CaptureInfo.AdvertiseAddr,
   1157  				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
  1158  				Message: "fake error for test",
  1159  			}}, true, nil
  1160  		})
  1161  	tester.MustApplyPatches()
  1162  
  1163  	time.Sleep(time.Second)
  1164  	manager.Tick(201, state.Status, state.Info)
  1165  	tester.MustApplyPatches()
  1166  	require.False(t, manager.ShouldRunning())
  1167  	require.Equal(t, state.Info.State, model.StateFailed)
  1168  }