github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/source_worker_test.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package worker
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"sync"
    20  	"sync/atomic"
    21  	"testing"
    22  	"time"
    23  
    24  	"github.com/DATA-DOG/go-sqlmock"
    25  	. "github.com/pingcap/check"
    26  	"github.com/pingcap/errors"
    27  	"github.com/pingcap/failpoint"
    28  	"github.com/pingcap/tiflow/dm/config"
    29  	"github.com/pingcap/tiflow/dm/pb"
    30  	"github.com/pingcap/tiflow/dm/pkg/conn"
    31  	"github.com/pingcap/tiflow/dm/pkg/ha"
    32  	"github.com/pingcap/tiflow/dm/pkg/log"
    33  	"github.com/pingcap/tiflow/dm/pkg/terror"
    34  	"github.com/pingcap/tiflow/dm/pkg/utils"
    35  	"github.com/pingcap/tiflow/dm/relay"
    36  	"github.com/pingcap/tiflow/dm/syncer"
    37  	"github.com/pingcap/tiflow/dm/unit"
    38  	"github.com/stretchr/testify/require"
    39  	"github.com/tikv/pd/pkg/utils/tempurl"
    40  	clientv3 "go.etcd.io/etcd/client/v3"
    41  )
    42  
    43  var emptyWorkerStatusInfoJSONLength = 25
    44  
    45  func mockShowMasterStatus(mockDB sqlmock.Sqlmock) {
    46  	rows := mockDB.NewRows([]string{"File", "Position", "Binlog_Do_DB", "Binlog_Ignore_DB", "Executed_Gtid_Set"}).AddRow(
    47  		"mysql-bin.000009", 11232, nil, nil, "074be7f4-f0f1-11ea-95bd-0242ac120002:1-699",
    48  	)
    49  	mockDB.ExpectQuery(`SHOW MASTER STATUS`).WillReturnRows(rows)
    50  }
    51  
    52  func mockShowMasterStatusNoRows(mockDB sqlmock.Sqlmock) {
    53  	rows := mockDB.NewRows([]string{"File", "Position", "Binlog_Do_DB", "Binlog_Ignore_DB", "Executed_Gtid_Set"})
    54  	mockDB.ExpectQuery(`SHOW MASTER STATUS`).WillReturnRows(rows)
    55  }
    56  
    57  type testServer2 struct{}
    58  
    59  var _ = Suite(&testServer2{})
    60  
    61  func (t *testServer2) SetUpSuite(c *C) {
    62  	err := log.InitLogger(&log.Config{})
    63  	c.Assert(err, IsNil)
    64  
    65  	getMinLocForSubTaskFunc = getFakeLocForSubTask
    66  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD", `return(true)`), IsNil)
    67  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/worker/SkipRefreshFromETCDInUT", `return()`), IsNil)
    68  }
    69  
    70  func (t *testServer2) TearDownSuite(c *C) {
    71  	getMinLocForSubTaskFunc = getMinLocForSubTask
    72  	c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD"), IsNil)
    73  	c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/worker/SkipRefreshFromETCDInUT"), IsNil)
    74  }
    75  
    76  func (t *testServer2) TestTaskAutoResume(c *C) {
    77  	var (
    78  		taskName = "sub-task-name"
    79  		port     = 8263
    80  	)
    81  	hostName := "127.0.0.1:18261"
    82  	etcdDir := c.MkDir()
    83  	ETCD, err := createMockETCD(etcdDir, "http://"+hostName)
    84  	c.Assert(err, IsNil)
    85  	defer ETCD.Close()
    86  
    87  	cfg := NewConfig()
    88  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
    89  	cfg.Join = hostName
    90  	sourceConfig := loadSourceConfigWithoutPassword(c)
    91  	sourceConfig.Checker.CheckEnable = true
    92  	sourceConfig.Checker.CheckInterval = config.Duration{Duration: 40 * time.Millisecond}
    93  	sourceConfig.Checker.BackoffMin = config.Duration{Duration: 20 * time.Millisecond}
    94  	sourceConfig.Checker.BackoffMax = config.Duration{Duration: 1 * time.Second}
    95  
    96  	cfg.WorkerAddr = fmt.Sprintf(":%d", port)
    97  
    98  	dir := c.MkDir()
    99  	sourceConfig.RelayDir = dir
   100  	sourceConfig.MetaDir = dir
   101  	sourceConfig.EnableRelay = true
   102  
   103  	NewRelayHolder = NewDummyRelayHolder
   104  	defer func() {
   105  		NewRelayHolder = NewRealRelayHolder
   106  	}()
   107  
   108  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessForever", `return()`), IsNil)
   109  	//nolint:errcheck
   110  	defer failpoint.Disable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessForever")
   111  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/worker/mockCreateUnitsDumpOnly", `return(true)`), IsNil)
   112  	//nolint:errcheck
   113  	defer failpoint.Disable("github.com/pingcap/tiflow/dm/worker/mockCreateUnitsDumpOnly")
   114  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/loader/ignoreLoadCheckpointErr", `return()`), IsNil)
   115  	//nolint:errcheck
   116  	defer failpoint.Disable("github.com/pingcap/tiflow/dm/loader/ignoreLoadCheckpointErr")
   117  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessWithError", `return("test auto resume inject error")`), IsNil)
   118  	//nolint:errcheck
   119  	defer failpoint.Disable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessWithError")
   120  
   121  	s := NewServer(cfg)
   122  	defer s.Close()
   123  	go func() {
   124  		c.Assert(s.Start(), IsNil)
   125  	}()
   126  	c.Assert(utils.WaitSomething(10, 100*time.Millisecond, func() bool {
   127  		if s.closed.Load() {
   128  			return false
   129  		}
   130  		w, err2 := s.getOrStartWorker(sourceConfig, true)
   131  		c.Assert(err2, IsNil)
   132  		// we set sourceConfig.EnableRelay = true above
   133  		c.Assert(w.EnableRelay(false), IsNil)
   134  		c.Assert(w.EnableHandleSubtasks(), IsNil)
   135  		return true
   136  	}), IsTrue)
   137  	// start task
   138  	var subtaskCfg config.SubTaskConfig
   139  	c.Assert(subtaskCfg.Decode(config.SampleSubtaskConfig, true), IsNil)
   140  	c.Assert(err, IsNil)
   141  	subtaskCfg.Mode = "full"
   142  	subtaskCfg.Timezone = "UTC"
   143  	c.Assert(s.getSourceWorker(true).StartSubTask(&subtaskCfg, pb.Stage_Running, pb.Stage_Stopped, true), IsNil)
   144  
   145  	// check task in paused state
   146  	c.Assert(utils.WaitSomething(100, 100*time.Millisecond, func() bool {
   147  		subtaskStatus, _, _ := s.getSourceWorker(true).QueryStatus(context.Background(), taskName)
   148  		for _, st := range subtaskStatus {
   149  			if st.Name == taskName && st.Stage == pb.Stage_Paused {
   150  				return true
   151  			}
   152  		}
   153  		return false
   154  	}), IsTrue)
   155  	//nolint:errcheck
   156  	failpoint.Disable("github.com/pingcap/tiflow/dm/dumpling/dumpUnitProcessWithError")
   157  
   158  	rtsc, ok := s.getSourceWorker(true).taskStatusChecker.(*realTaskStatusChecker)
   159  	c.Assert(ok, IsTrue)
   160  	defer func() {
   161  		// close multiple time
   162  		rtsc.Close()
   163  		rtsc.Close()
   164  	}()
   165  
   166  	// check task will be auto resumed
   167  	c.Assert(utils.WaitSomething(10, 100*time.Millisecond, func() bool {
   168  		sts, _, _ := s.getSourceWorker(true).QueryStatus(context.Background(), taskName)
   169  		for _, st := range sts {
   170  			if st.Name == taskName && st.Stage == pb.Stage_Running {
   171  				return true
   172  			}
   173  		}
   174  		c.Log(sts)
   175  		return false
   176  	}), IsTrue)
   177  }
   178  
   179  type testWorkerFunctionalities struct {
   180  	createUnitCount         int32
   181  	expectedCreateUnitCount int32
   182  }
   183  
   184  var _ = Suite(&testWorkerFunctionalities{})
   185  
   186  func (t *testWorkerFunctionalities) SetUpSuite(c *C) {
   187  	NewRelayHolder = NewDummyRelayHolder
   188  	NewSubTask = NewRealSubTask
   189  	createUnits = func(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, worker string, relay relay.Process) []unit.Unit {
   190  		atomic.AddInt32(&t.createUnitCount, 1)
   191  		mockDumper := NewMockUnit(pb.UnitType_Dump)
   192  		mockLoader := NewMockUnit(pb.UnitType_Load)
   193  		mockSync := NewMockUnit(pb.UnitType_Sync)
   194  		return []unit.Unit{mockDumper, mockLoader, mockSync}
   195  	}
   196  	getMinLocForSubTaskFunc = getFakeLocForSubTask
   197  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD", `return(true)`), IsNil)
   198  }
   199  
   200  func (t *testWorkerFunctionalities) TearDownSuite(c *C) {
   201  	NewRelayHolder = NewRealRelayHolder
   202  	NewSubTask = NewRealSubTask
   203  	createUnits = createRealUnits
   204  	getMinLocForSubTaskFunc = getMinLocForSubTask
   205  	c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD"), IsNil)
   206  }
   207  
   208  func (t *testWorkerFunctionalities) TestWorkerFunctionalities(c *C) {
   209  	var (
   210  		masterAddr   = tempurl.Alloc()[len("http://"):]
   211  		keepAliveTTL = int64(1)
   212  	)
   213  	etcdDir := c.MkDir()
   214  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   215  	c.Assert(err, IsNil)
   216  	defer ETCD.Close()
   217  	cfg := NewConfig()
   218  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   219  	cfg.Join = masterAddr
   220  	cfg.KeepAliveTTL = keepAliveTTL
   221  	cfg.RelayKeepAliveTTL = keepAliveTTL
   222  
   223  	etcdCli, err := clientv3.New(clientv3.Config{
   224  		Endpoints:            GetJoinURLs(cfg.Join),
   225  		DialTimeout:          dialTimeout,
   226  		DialKeepAliveTime:    keepaliveTime,
   227  		DialKeepAliveTimeout: keepaliveTimeout,
   228  	})
   229  	c.Assert(err, IsNil)
   230  	sourceCfg := loadSourceConfigWithoutPassword(c)
   231  	sourceCfg.EnableRelay = false
   232  
   233  	subtaskCfg := config.SubTaskConfig{}
   234  	err = subtaskCfg.Decode(config.SampleSubtaskConfig, true)
   235  	c.Assert(err, IsNil)
   236  
   237  	// start worker
   238  	w, err := NewSourceWorker(sourceCfg, etcdCli, "", "")
   239  	c.Assert(err, IsNil)
   240  	defer w.Stop(true)
   241  	go func() {
   242  		w.Start()
   243  	}()
   244  	c.Assert(utils.WaitSomething(50, 100*time.Millisecond, func() bool {
   245  		return !w.closed.Load()
   246  	}), IsTrue)
   247  
   248  	// test 1: when subTaskEnabled is false, switch on relay
   249  	c.Assert(w.subTaskEnabled.Load(), IsFalse)
   250  	t.testEnableRelay(c, w, etcdCli, sourceCfg, cfg)
   251  
   252  	// test2: when subTaskEnabled is false, switch off relay
   253  	c.Assert(w.subTaskEnabled.Load(), IsFalse)
   254  	t.testDisableRelay(c, w)
   255  
   256  	// test3: when relayEnabled is false, switch on subtask
   257  	c.Assert(w.relayEnabled.Load(), IsFalse)
   258  
   259  	t.testEnableHandleSubtasks(c, w, etcdCli, subtaskCfg, sourceCfg)
   260  
   261  	// test4: when subTaskEnabled is true, switch on relay
   262  	c.Assert(w.subTaskEnabled.Load(), IsTrue)
   263  
   264  	t.testEnableRelay(c, w, etcdCli, sourceCfg, cfg)
   265  	c.Assert(w.subTaskHolder.findSubTask(subtaskCfg.Name).cfg.UseRelay, IsTrue)
   266  	t.expectedCreateUnitCount++
   267  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   268  		return atomic.LoadInt32(&t.createUnitCount) == t.expectedCreateUnitCount
   269  	}), IsTrue)
   270  
   271  	// test5: when subTaskEnabled is true, switch off relay
   272  	c.Assert(w.subTaskEnabled.Load(), IsTrue)
   273  	t.testDisableRelay(c, w)
   274  
   275  	c.Assert(w.subTaskHolder.findSubTask(subtaskCfg.Name).cfg.UseRelay, IsFalse)
   276  	t.expectedCreateUnitCount++
   277  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   278  		return atomic.LoadInt32(&t.createUnitCount) == t.expectedCreateUnitCount
   279  	}), IsTrue)
   280  
   281  	// test6: when relayEnabled is false, switch off subtask
   282  	c.Assert(w.relayEnabled.Load(), IsFalse)
   283  
   284  	w.DisableHandleSubtasks()
   285  	c.Assert(w.subTaskEnabled.Load(), IsFalse)
   286  
   287  	// prepare for test7 & 8
   288  	t.testEnableRelay(c, w, etcdCli, sourceCfg, cfg)
   289  	// test7: when relayEnabled is true, switch on subtask
   290  	c.Assert(w.relayEnabled.Load(), IsTrue)
   291  
   292  	subtaskCfg2 := subtaskCfg
   293  	subtaskCfg2.Name = "sub-task-name-2"
   294  	// we already added subtaskCfg, so below EnableHandleSubtasks will find an extra subtask
   295  	t.expectedCreateUnitCount++
   296  	t.testEnableHandleSubtasks(c, w, etcdCli, subtaskCfg2, sourceCfg)
   297  	c.Assert(w.subTaskHolder.findSubTask(subtaskCfg.Name).cfg.UseRelay, IsTrue)
   298  	c.Assert(w.subTaskHolder.findSubTask(subtaskCfg2.Name).cfg.UseRelay, IsTrue)
   299  
   300  	// test8: when relayEnabled is true, switch off subtask
   301  	c.Assert(w.relayEnabled.Load(), IsTrue)
   302  
   303  	w.DisableHandleSubtasks()
   304  	c.Assert(w.subTaskEnabled.Load(), IsFalse)
   305  }
   306  
   307  func (t *testWorkerFunctionalities) testEnableRelay(c *C, w *SourceWorker, etcdCli *clientv3.Client,
   308  	sourceCfg *config.SourceConfig, cfg *Config,
   309  ) {
   310  	c.Assert(w.EnableRelay(false), IsNil)
   311  
   312  	c.Assert(w.relayEnabled.Load(), IsTrue)
   313  	c.Assert(w.relayHolder.Stage(), Equals, pb.Stage_New)
   314  
   315  	_, err := ha.PutSourceCfg(etcdCli, sourceCfg)
   316  	c.Assert(err, IsNil)
   317  	_, err = ha.PutRelayStageRelayConfigSourceBound(etcdCli, ha.NewRelayStage(pb.Stage_Running, sourceCfg.SourceID),
   318  		ha.NewSourceBound(sourceCfg.SourceID, cfg.Name))
   319  	c.Assert(err, IsNil)
   320  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   321  		return w.relayHolder.Stage() == pb.Stage_Running
   322  	}), IsTrue)
   323  
   324  	_, err = ha.DeleteSourceCfgRelayStageSourceBound(etcdCli, sourceCfg.SourceID, cfg.Name)
   325  	c.Assert(err, IsNil)
   326  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   327  		return w.relayHolder.Stage() == pb.Stage_Stopped
   328  	}), IsTrue)
   329  }
   330  
   331  func (t *testWorkerFunctionalities) testDisableRelay(c *C, w *SourceWorker) {
   332  	w.DisableRelay()
   333  
   334  	c.Assert(w.relayEnabled.Load(), IsFalse)
   335  	c.Assert(w.relayHolder, IsNil)
   336  }
   337  
   338  func (t *testWorkerFunctionalities) testEnableHandleSubtasks(c *C, w *SourceWorker, etcdCli *clientv3.Client,
   339  	subtaskCfg config.SubTaskConfig, sourceCfg *config.SourceConfig,
   340  ) {
   341  	c.Assert(w.EnableHandleSubtasks(), IsNil)
   342  	c.Assert(w.subTaskEnabled.Load(), IsTrue)
   343  
   344  	_, err := ha.PutSubTaskCfgStage(etcdCli, []config.SubTaskConfig{subtaskCfg}, []ha.Stage{ha.NewSubTaskStage(pb.Stage_Running, sourceCfg.SourceID, subtaskCfg.Name)}, nil)
   345  	c.Assert(err, IsNil)
   346  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   347  		return w.subTaskHolder.findSubTask(subtaskCfg.Name) != nil
   348  	}), IsTrue)
   349  	t.expectedCreateUnitCount++
   350  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   351  		return atomic.LoadInt32(&t.createUnitCount) == t.expectedCreateUnitCount
   352  	}), IsTrue)
   353  }
   354  
   355  type testWorkerEtcdCompact struct{}
   356  
   357  var _ = Suite(&testWorkerEtcdCompact{})
   358  
   359  func (t *testWorkerEtcdCompact) SetUpSuite(c *C) {
   360  	NewRelayHolder = NewDummyRelayHolder
   361  	NewSubTask = func(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, worker string) *SubTask {
   362  		cfg.UseRelay = false
   363  		return NewRealSubTask(cfg, etcdClient, worker)
   364  	}
   365  	createUnits = func(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, worker string, relay relay.Process) []unit.Unit {
   366  		mockDumper := NewMockUnit(pb.UnitType_Dump)
   367  		mockLoader := NewMockUnit(pb.UnitType_Load)
   368  		mockSync := NewMockUnit(pb.UnitType_Sync)
   369  		return []unit.Unit{mockDumper, mockLoader, mockSync}
   370  	}
   371  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD", `return(true)`), IsNil)
   372  }
   373  
   374  func (t *testWorkerEtcdCompact) TearDownSuite(c *C) {
   375  	NewRelayHolder = NewRealRelayHolder
   376  	NewSubTask = NewRealSubTask
   377  	createUnits = createRealUnits
   378  	c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD"), IsNil)
   379  }
   380  
   381  func (t *testWorkerEtcdCompact) TestWatchSubtaskStageEtcdCompact(c *C) {
   382  	var (
   383  		masterAddr   = tempurl.Alloc()[len("http://"):]
   384  		keepAliveTTL = int64(1)
   385  		startRev     = int64(1)
   386  	)
   387  
   388  	etcdDir := c.MkDir()
   389  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   390  	c.Assert(err, IsNil)
   391  	defer ETCD.Close()
   392  	cfg := NewConfig()
   393  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   394  	cfg.Join = masterAddr
   395  	cfg.KeepAliveTTL = keepAliveTTL
   396  	cfg.RelayKeepAliveTTL = keepAliveTTL
   397  
   398  	etcdCli, err := clientv3.New(clientv3.Config{
   399  		Endpoints:            GetJoinURLs(cfg.Join),
   400  		DialTimeout:          dialTimeout,
   401  		DialKeepAliveTime:    keepaliveTime,
   402  		DialKeepAliveTimeout: keepaliveTimeout,
   403  	})
   404  	c.Assert(err, IsNil)
   405  	sourceCfg := loadSourceConfigWithoutPassword(c)
   406  	sourceCfg.From = config.GetDBConfigForTest()
   407  	sourceCfg.EnableRelay = false
   408  
   409  	// step 1: start worker
   410  	w, err := NewSourceWorker(sourceCfg, etcdCli, "", "")
   411  	c.Assert(err, IsNil)
   412  	ctx, cancel := context.WithCancel(context.Background())
   413  	defer cancel()
   414  	defer w.Stop(true)
   415  	go func() {
   416  		w.Start()
   417  	}()
   418  	c.Assert(utils.WaitSomething(50, 100*time.Millisecond, func() bool {
   419  		return !w.closed.Load()
   420  	}), IsTrue)
   421  	// step 2: Put a subtask config and subtask stage to this source, then delete it
   422  	subtaskCfg := config.SubTaskConfig{}
   423  	err = subtaskCfg.Decode(config.SampleSubtaskConfig, true)
   424  	c.Assert(err, IsNil)
   425  	subtaskCfg.MydumperPath = mydumperPath
   426  
   427  	_, err = ha.PutSubTaskCfgStage(etcdCli, []config.SubTaskConfig{subtaskCfg}, []ha.Stage{ha.NewSubTaskStage(pb.Stage_Running, sourceCfg.SourceID, subtaskCfg.Name)}, nil)
   428  	c.Assert(err, IsNil)
   429  	rev, err := ha.DeleteSubTaskCfgStage(etcdCli, []config.SubTaskConfig{subtaskCfg},
   430  		[]ha.Stage{ha.NewSubTaskStage(pb.Stage_Stopped, sourceCfg.SourceID, subtaskCfg.Name)}, nil)
   431  	c.Assert(err, IsNil)
   432  	// step 2.1: start a subtask manually
   433  	c.Assert(w.StartSubTask(&subtaskCfg, pb.Stage_Running, pb.Stage_Stopped, true), IsNil)
   434  	// step 3: trigger etcd compaction and check whether we can receive it through watcher
   435  	_, err = etcdCli.Compact(ctx, rev)
   436  	c.Assert(err, IsNil)
   437  	subTaskStageCh := make(chan ha.Stage, 10)
   438  	subTaskErrCh := make(chan error, 10)
   439  	ha.WatchSubTaskStage(ctx, etcdCli, sourceCfg.SourceID, startRev, subTaskStageCh, subTaskErrCh)
   440  	select {
   441  	case err = <-subTaskErrCh:
   442  		c.Assert(errors.Cause(err), Equals, etcdErrCompacted)
   443  	case <-time.After(300 * time.Millisecond):
   444  		c.Fatal("fail to get etcd error compacted")
   445  	}
   446  	// step 4: watch subtask stage from startRev
   447  	c.Assert(w.subTaskHolder.findSubTask(subtaskCfg.Name), NotNil)
   448  	var wg sync.WaitGroup
   449  	ctx1, cancel1 := context.WithCancel(ctx)
   450  	wg.Add(1)
   451  	go func() {
   452  		defer wg.Done()
   453  		c.Assert(w.observeSubtaskStage(ctx1, etcdCli, startRev), IsNil)
   454  	}()
   455  	time.Sleep(time.Second)
   456  	// step 4.1: after observe, invalid subtask should be removed
   457  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   458  		return w.subTaskHolder.findSubTask(subtaskCfg.Name) == nil
   459  	}), IsTrue)
   460  	// step 4.2: add a new subtask stage, worker should receive and start it
   461  	_, err = ha.PutSubTaskCfgStage(etcdCli, []config.SubTaskConfig{subtaskCfg}, []ha.Stage{ha.NewSubTaskStage(pb.Stage_Running, sourceCfg.SourceID, subtaskCfg.Name)}, nil)
   462  	c.Assert(err, IsNil)
   463  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   464  		return w.subTaskHolder.findSubTask(subtaskCfg.Name) != nil
   465  	}), IsTrue)
   466  	mockDB := conn.InitMockDB(c)
   467  	mockShowMasterStatus(mockDB)
   468  	status, _, err := w.QueryStatus(ctx1, subtaskCfg.Name)
   469  	c.Assert(err, IsNil)
   470  	c.Assert(status, HasLen, 1)
   471  	c.Assert(status[0].Name, Equals, subtaskCfg.Name)
   472  	c.Assert(status[0].Stage, Equals, pb.Stage_Running)
   473  	cancel1()
   474  	wg.Wait()
   475  	w.subTaskHolder.closeAllSubTasks()
   476  	// step 5: restart observe and start from startRev, this subtask should be added
   477  	ctx2, cancel2 := context.WithCancel(ctx)
   478  	wg.Add(1)
   479  	go func() {
   480  		defer wg.Done()
   481  		c.Assert(w.observeSubtaskStage(ctx2, etcdCli, startRev), IsNil)
   482  	}()
   483  	time.Sleep(time.Second)
   484  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   485  		return w.subTaskHolder.findSubTask(subtaskCfg.Name) != nil
   486  	}), IsTrue)
   487  	mockShowMasterStatus(mockDB)
   488  	status, _, err = w.QueryStatus(ctx2, subtaskCfg.Name)
   489  	c.Assert(err, IsNil)
   490  	c.Assert(status, HasLen, 1)
   491  	c.Assert(status[0].Name, Equals, subtaskCfg.Name)
   492  	c.Assert(status[0].Stage, Equals, pb.Stage_Running)
   493  	w.Stop(true)
   494  	cancel2()
   495  	wg.Wait()
   496  }
   497  
   498  func (t *testWorkerEtcdCompact) TestWatchValidatorStageEtcdCompact(c *C) {
   499  	var (
   500  		masterAddr   = tempurl.Alloc()[len("http://"):]
   501  		keepAliveTTL = int64(1)
   502  		startRev     = int64(1)
   503  	)
   504  
   505  	etcdDir := c.MkDir()
   506  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   507  	c.Assert(err, IsNil)
   508  	defer ETCD.Close()
   509  	cfg := NewConfig()
   510  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   511  	cfg.Join = masterAddr
   512  	cfg.KeepAliveTTL = keepAliveTTL
   513  	cfg.RelayKeepAliveTTL = keepAliveTTL
   514  
   515  	etcdCli, err := clientv3.New(clientv3.Config{
   516  		Endpoints:            GetJoinURLs(cfg.Join),
   517  		DialTimeout:          dialTimeout,
   518  		DialKeepAliveTime:    keepaliveTime,
   519  		DialKeepAliveTimeout: keepaliveTimeout,
   520  	})
   521  	c.Assert(err, IsNil)
   522  	sourceCfg := loadSourceConfigWithoutPassword(c)
   523  	sourceCfg.From = config.GetDBConfigForTest()
   524  	sourceCfg.EnableRelay = false
   525  
   526  	//
   527  	// step 1: start worker
   528  	w, err := NewSourceWorker(sourceCfg, etcdCli, "", "")
   529  	c.Assert(err, IsNil)
   530  	ctx, cancel := context.WithCancel(context.Background())
   531  	defer cancel()
   532  	defer w.Stop(true)
   533  	go func() {
   534  		w.Start()
   535  	}()
   536  	c.Assert(utils.WaitSomething(50, 100*time.Millisecond, func() bool {
   537  		return !w.closed.Load()
   538  	}), IsTrue)
   539  
   540  	//
   541  	// step 2: Put a subtask config and subtask stage to this source, then delete it
   542  	subtaskCfg := config.SubTaskConfig{}
   543  	err = subtaskCfg.Decode(config.SampleSubtaskConfig, true)
   544  	c.Assert(err, IsNil)
   545  	subtaskCfg.MydumperPath = mydumperPath
   546  	subtaskCfg.ValidatorCfg = config.ValidatorConfig{Mode: config.ValidationNone}
   547  
   548  	// increase revision
   549  	_, err = etcdCli.Put(context.Background(), "/dummy-key", "value")
   550  	c.Assert(err, IsNil)
   551  	rev, err := ha.PutSubTaskCfgStage(etcdCli, []config.SubTaskConfig{subtaskCfg}, []ha.Stage{ha.NewSubTaskStage(pb.Stage_Running, sourceCfg.SourceID, subtaskCfg.Name)}, nil)
   552  	c.Assert(err, IsNil)
   553  
   554  	//
   555  	// step 2.1: start a subtask manually
   556  	c.Assert(w.StartSubTask(&subtaskCfg, pb.Stage_Running, pb.Stage_Stopped, true), IsNil)
   557  
   558  	//
   559  	// step 3: trigger etcd compaction and check whether we can receive it through watcher
   560  	_, err = etcdCli.Compact(ctx, rev)
   561  	c.Assert(err, IsNil)
   562  	subTaskStageCh := make(chan ha.Stage, 10)
   563  	subTaskErrCh := make(chan error, 10)
   564  	ctxForWatch, cancelFunc := context.WithCancel(ctx)
   565  	ha.WatchValidatorStage(ctxForWatch, etcdCli, sourceCfg.SourceID, startRev, subTaskStageCh, subTaskErrCh)
   566  	select {
   567  	case err = <-subTaskErrCh:
   568  		c.Assert(errors.Cause(err), Equals, etcdErrCompacted)
   569  	case <-time.After(300 * time.Millisecond):
   570  		c.Fatal("fail to get etcd error compacted")
   571  	}
   572  	cancelFunc()
   573  
   574  	//
   575  	// step 4: watch subtask stage from startRev
   576  	subTask := w.subTaskHolder.findSubTask(subtaskCfg.Name)
   577  	getValidator := func() *syncer.DataValidator {
   578  		subTask.RLock()
   579  		defer subTask.RUnlock()
   580  		return subTask.validator
   581  	}
   582  	c.Assert(subTask, NotNil)
   583  	c.Assert(getValidator(), IsNil)
   584  	var wg sync.WaitGroup
   585  	ctx1, cancel1 := context.WithCancel(ctx)
   586  	wg.Add(1)
   587  	go func() {
   588  		defer wg.Done()
   589  		c.Assert(w.observeValidatorStage(ctx1, startRev), IsNil)
   590  	}()
   591  	time.Sleep(time.Second)
   592  
   593  	subtaskCfg.ValidatorCfg = config.ValidatorConfig{Mode: config.ValidationFast}
   594  	unitBakup := subTask.units[len(subTask.units)-1]
   595  	subTask.units[len(subTask.units)-1] = &syncer.Syncer{} // validator need a Syncer, not a mocked unit
   596  	validatorStage := ha.NewValidatorStage(pb.Stage_Running, subtaskCfg.SourceID, subtaskCfg.Name)
   597  	_, err = ha.PutSubTaskCfgStage(etcdCli, []config.SubTaskConfig{subtaskCfg}, nil, []ha.Stage{validatorStage})
   598  	c.Assert(err, IsNil)
   599  
   600  	// validator created
   601  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   602  		return getValidator() != nil
   603  	}), IsTrue)
   604  
   605  	subTask.units[len(subTask.units)-1] = unitBakup // restore unit
   606  	cancel1()
   607  	wg.Wait()
   608  
   609  	// test operate validator
   610  	err = w.operateValidatorStage(ha.Stage{IsDeleted: true})
   611  	c.Assert(err, IsNil)
   612  	err = w.operateValidatorStage(ha.Stage{Expect: pb.Stage_Running, Task: "not-exist"})
   613  	c.Assert(err, IsNil)
   614  	err = w.operateValidatorStage(ha.Stage{Expect: pb.Stage_Running, Task: subtaskCfg.Name})
   615  	c.Assert(err, ErrorMatches, ".*failed to get subtask config.*")
   616  	err = w.operateValidatorStage(ha.Stage{Expect: pb.Stage_Running, Source: subtaskCfg.SourceID, Task: subtaskCfg.Name})
   617  	c.Assert(err, IsNil)
   618  }
   619  
   620  func (t *testWorkerEtcdCompact) TestWatchRelayStageEtcdCompact(c *C) {
   621  	var (
   622  		masterAddr   = tempurl.Alloc()[len("http://"):]
   623  		keepAliveTTL = int64(1)
   624  		startRev     = int64(1)
   625  	)
   626  	etcdDir := c.MkDir()
   627  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   628  	c.Assert(err, IsNil)
   629  	defer ETCD.Close()
   630  	cfg := NewConfig()
   631  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   632  	cfg.Join = masterAddr
   633  	cfg.KeepAliveTTL = keepAliveTTL
   634  	cfg.RelayKeepAliveTTL = keepAliveTTL
   635  
   636  	etcdCli, err := clientv3.New(clientv3.Config{
   637  		Endpoints:            GetJoinURLs(cfg.Join),
   638  		DialTimeout:          dialTimeout,
   639  		DialKeepAliveTime:    keepaliveTime,
   640  		DialKeepAliveTimeout: keepaliveTimeout,
   641  	})
   642  	c.Assert(err, IsNil)
   643  	sourceCfg := loadSourceConfigWithoutPassword(c)
   644  	sourceCfg.EnableRelay = true
   645  	sourceCfg.RelayDir = c.MkDir()
   646  	sourceCfg.MetaDir = c.MkDir()
   647  
   648  	// step 1: start worker
   649  	w, err := NewSourceWorker(sourceCfg, etcdCli, "", "")
   650  	c.Assert(err, IsNil)
   651  	ctx, cancel := context.WithCancel(context.Background())
   652  	defer cancel()
   653  	defer w.Stop(true)
   654  	go func() {
   655  		c.Assert(w.EnableRelay(false), IsNil)
   656  		w.Start()
   657  	}()
   658  	c.Assert(utils.WaitSomething(50, 100*time.Millisecond, func() bool {
   659  		return !w.closed.Load()
   660  	}), IsTrue)
   661  	// step 2: Put a relay stage to this source, then delete it
   662  	// put mysql config into relative etcd key adapter to trigger operation event
   663  	c.Assert(w.relayHolder, NotNil)
   664  	_, err = ha.PutSourceCfg(etcdCli, sourceCfg)
   665  	c.Assert(err, IsNil)
   666  	rev, err := ha.PutRelayStageRelayConfigSourceBound(etcdCli, ha.NewRelayStage(pb.Stage_Running, sourceCfg.SourceID),
   667  		ha.NewSourceBound(sourceCfg.SourceID, cfg.Name))
   668  	c.Assert(err, IsNil)
   669  	// check relay stage, should be running
   670  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   671  		return w.relayHolder.Stage() == pb.Stage_Running
   672  	}), IsTrue)
   673  	// step 3: trigger etcd compaction and check whether we can receive it through watcher, then we delete relay stage
   674  	_, err = etcdCli.Compact(ctx, rev)
   675  	c.Assert(err, IsNil)
   676  	_, err = ha.DeleteSourceCfgRelayStageSourceBound(etcdCli, sourceCfg.SourceID, cfg.Name)
   677  	c.Assert(err, IsNil)
   678  	relayStageCh := make(chan ha.Stage, 10)
   679  	relayErrCh := make(chan error, 10)
   680  	ha.WatchRelayStage(ctx, etcdCli, cfg.Name, startRev, relayStageCh, relayErrCh)
   681  	select {
   682  	case err := <-relayErrCh:
   683  		c.Assert(errors.Cause(err), Equals, etcdErrCompacted)
   684  	case <-time.After(300 * time.Millisecond):
   685  		c.Fatal("fail to get etcd error compacted")
   686  	}
   687  	// step 4: should stop the running relay because see deletion after compaction
   688  	time.Sleep(time.Second)
   689  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   690  		return w.relayHolder.Stage() == pb.Stage_Stopped
   691  	}), IsTrue)
   692  }
   693  
   694  func (t *testServer) testSourceWorker(c *C) {
   695  	cfg := loadSourceConfigWithoutPassword(c)
   696  
   697  	dir := c.MkDir()
   698  	cfg.EnableRelay = true
   699  	cfg.RelayDir = dir
   700  	cfg.MetaDir = dir
   701  
   702  	var (
   703  		masterAddr   = tempurl.Alloc()[len("http://"):]
   704  		keepAliveTTL = int64(1)
   705  	)
   706  	etcdDir := c.MkDir()
   707  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   708  	c.Assert(err, IsNil)
   709  	defer ETCD.Close()
   710  	workerCfg := NewConfig()
   711  	c.Assert(workerCfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   712  	workerCfg.Join = masterAddr
   713  	workerCfg.KeepAliveTTL = keepAliveTTL
   714  	workerCfg.RelayKeepAliveTTL = keepAliveTTL
   715  
   716  	etcdCli, err := clientv3.New(clientv3.Config{
   717  		Endpoints:            GetJoinURLs(workerCfg.Join),
   718  		DialTimeout:          dialTimeout,
   719  		DialKeepAliveTime:    keepaliveTime,
   720  		DialKeepAliveTimeout: keepaliveTimeout,
   721  	})
   722  	c.Assert(err, IsNil)
   723  
   724  	NewRelayHolder = NewDummyRelayHolderWithInitError
   725  	defer func() {
   726  		NewRelayHolder = NewRealRelayHolder
   727  	}()
   728  	w, err := NewSourceWorker(cfg, etcdCli, "", "")
   729  	c.Assert(err, IsNil)
   730  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD", `return(true)`), IsNil)
   731  	c.Assert(w.EnableRelay(false), ErrorMatches, "init error")
   732  	c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/worker/MockGetSourceCfgFromETCD"), IsNil)
   733  
   734  	NewRelayHolder = NewDummyRelayHolder
   735  	w, err = NewSourceWorker(cfg, etcdCli, "", "")
   736  	c.Assert(err, IsNil)
   737  	c.Assert(w.GetUnitAndSourceStatusJSON("", nil), HasLen, emptyWorkerStatusInfoJSONLength)
   738  
   739  	// stop twice
   740  	w.Stop(true)
   741  	c.Assert(w.closed.Load(), IsTrue)
   742  	c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0)
   743  	w.Stop(true)
   744  	c.Assert(w.closed.Load(), IsTrue)
   745  	c.Assert(w.subTaskHolder.getAllSubTasks(), HasLen, 0)
   746  	c.Assert(w.closed.Load(), IsTrue)
   747  
   748  	c.Assert(w.StartSubTask(&config.SubTaskConfig{
   749  		Name: "testStartTask",
   750  	}, pb.Stage_Running, pb.Stage_Stopped, true), IsNil)
   751  	task := w.subTaskHolder.findSubTask("testStartTask")
   752  	c.Assert(task, NotNil)
   753  	c.Assert(task.Result().String(), Matches, ".*worker already closed.*")
   754  
   755  	c.Assert(w.StartSubTask(&config.SubTaskConfig{
   756  		Name: "testStartTask-in-stopped",
   757  	}, pb.Stage_Stopped, pb.Stage_Stopped, true), IsNil)
   758  	task = w.subTaskHolder.findSubTask("testStartTask-in-stopped")
   759  	c.Assert(task, NotNil)
   760  	c.Assert(task.Result().String(), Matches, ".*worker already closed.*")
   761  
   762  	err = w.UpdateSubTask(context.Background(), &config.SubTaskConfig{
   763  		Name: "testStartTask",
   764  	}, true)
   765  	c.Assert(err, ErrorMatches, ".*worker already closed.*")
   766  
   767  	err = w.OperateSubTask("testSubTask", pb.TaskOp_Delete)
   768  	c.Assert(err, ErrorMatches, ".*worker already closed.*")
   769  }
   770  
   771  func (t *testServer) TestQueryValidator(c *C) {
   772  	cfg := loadSourceConfigWithoutPassword(c)
   773  
   774  	dir := c.MkDir()
   775  	cfg.EnableRelay = true
   776  	cfg.RelayDir = dir
   777  	cfg.MetaDir = dir
   778  
   779  	w, err := NewSourceWorker(cfg, nil, "", "")
   780  	w.closed.Store(false)
   781  	c.Assert(err, IsNil)
   782  	st := NewSubTaskWithStage(&config.SubTaskConfig{
   783  		Name: "testQueryValidator",
   784  		ValidatorCfg: config.ValidatorConfig{
   785  			Mode: config.ValidationFull,
   786  		},
   787  	}, pb.Stage_Running, nil, "")
   788  	w.subTaskHolder.recordSubTask(st)
   789  	var ret *pb.ValidationStatus
   790  	ret, err = w.GetValidatorStatus("invalidTaskName")
   791  	c.Assert(ret, IsNil)
   792  	c.Assert(terror.ErrWorkerSubTaskNotFound.Equal(err), IsTrue)
   793  }
   794  
   795  func (t *testServer) setupValidator(c *C) *SourceWorker {
   796  	cfg := loadSourceConfigWithoutPassword(c)
   797  
   798  	dir := c.MkDir()
   799  	cfg.EnableRelay = true
   800  	cfg.RelayDir = dir
   801  	cfg.MetaDir = dir
   802  	st := NewSubTaskWithStage(&config.SubTaskConfig{
   803  		Name: "testQueryValidator",
   804  		ValidatorCfg: config.ValidatorConfig{
   805  			Mode: config.ValidationFull,
   806  		},
   807  	}, pb.Stage_Running, nil, "")
   808  	w, err := NewSourceWorker(cfg, nil, "", "")
   809  	st.StartValidator(pb.Stage_Running, false)
   810  	w.subTaskHolder.recordSubTask(st)
   811  	w.closed.Store(false)
   812  	c.Assert(err, IsNil)
   813  	return w
   814  }
   815  
   816  func (t *testServer) TestGetWorkerValidatorErr(c *C) {
   817  	w := t.setupValidator(c)
   818  	// when subtask name not exists
   819  	// return empty array
   820  	errs, err := w.GetWorkerValidatorErr("invalidTask", pb.ValidateErrorState_InvalidErr)
   821  	c.Assert(terror.ErrWorkerSubTaskNotFound.Equal(err), IsTrue)
   822  	c.Assert(errs, IsNil)
   823  }
   824  
   825  func (t *testServer) TestOperateWorkerValidatorErr(c *C) {
   826  	w := t.setupValidator(c)
   827  	// when subtask name not exists
   828  	// return empty array
   829  	taskNotFound := terror.ErrWorkerSubTaskNotFound.Generate("invalidTask")
   830  	c.Assert(w.OperateWorkerValidatorErr("invalidTask", pb.ValidationErrOp_ClearErrOp, 0, true).Error(), Equals, taskNotFound.Error())
   831  }
   832  
   833  func TestMasterBinlogOff(t *testing.T) {
   834  	ctx := context.Background()
   835  	cfg, err := config.SourceCfgFromYamlAndVerify(config.SampleSourceConfig)
   836  	require.NoError(t, err)
   837  	cfg.From.Password = "no need to connect"
   838  
   839  	w, err := NewSourceWorker(cfg, nil, "", "")
   840  	require.NoError(t, err)
   841  	w.closed.Store(false)
   842  
   843  	// start task
   844  	var subtaskCfg config.SubTaskConfig
   845  	require.NoError(t, subtaskCfg.Decode(config.SampleSubtaskConfig, true))
   846  	require.NoError(t, w.StartSubTask(&subtaskCfg, pb.Stage_Running, pb.Stage_Stopped, true))
   847  
   848  	_, mockDB, err := conn.InitMockDBFull()
   849  	require.NoError(t, err)
   850  	mockShowMasterStatusNoRows(mockDB)
   851  	status, _, err := w.QueryStatus(ctx, subtaskCfg.Name)
   852  	require.NoError(t, err)
   853  	require.Len(t, status, 1)
   854  	require.Equal(t, subtaskCfg.Name, status[0].Name)
   855  }