github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/server_test.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package worker
    15  
    16  import (
    17  	"context"
    18  	"io"
    19  	"net/http"
    20  	"net/url"
    21  	"sync"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/go-mysql-org/go-mysql/mysql"
    26  	. "github.com/pingcap/check"
    27  	"github.com/pingcap/errors"
    28  	"github.com/pingcap/failpoint"
    29  	"github.com/pingcap/tiflow/dm/config"
    30  	"github.com/pingcap/tiflow/dm/pb"
    31  	"github.com/pingcap/tiflow/dm/pkg/binlog"
    32  	"github.com/pingcap/tiflow/dm/pkg/gtid"
    33  	"github.com/pingcap/tiflow/dm/pkg/ha"
    34  	"github.com/pingcap/tiflow/dm/pkg/log"
    35  	"github.com/pingcap/tiflow/dm/pkg/terror"
    36  	"github.com/pingcap/tiflow/dm/pkg/utils"
    37  	"github.com/pingcap/tiflow/dm/relay"
    38  	"github.com/pingcap/tiflow/dm/unit"
    39  	"github.com/stretchr/testify/require"
    40  	"github.com/tikv/pd/pkg/utils/tempurl"
    41  	v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
    42  	clientv3 "go.etcd.io/etcd/client/v3"
    43  	"go.etcd.io/etcd/server/v3/embed"
    44  	"google.golang.org/grpc"
    45  )
    46  
        // mydumperPath is the mydumper binary path that TestServer puts into the subtask config.
    47  // do not forget to update this path if the file is removed or renamed.
    48  const (
    49  	mydumperPath = "../../bin/mydumper"
    50  )
    51  
        // etcdErrCompacted is the etcd error that TestWatchSourceBoundEtcdCompact expects
        // as the cause of the watcher error after it compacts the store.
    52  var etcdErrCompacted = v3rpc.ErrCompacted
    53  
    54  func TestServer(t *testing.T) {
    55  	TestingT(t)
    56  }
    57  
        // testServer is the gocheck suite that groups the dm-worker server tests in this file.
    58  type testServer struct{}
    59  
        // register the suite with gocheck.
    60  var _ = Suite(&testServer{})
    61  
    62  func (t *testServer) SetUpSuite(c *C) {
    63  	err := log.InitLogger(&log.Config{})
    64  	c.Assert(err, IsNil)
    65  	getMinLocForSubTaskFunc = getFakeLocForSubTask
    66  }
    67  
    68  func (t *testServer) TearDownSuite(c *C) {
    69  	getMinLocForSubTaskFunc = getMinLocForSubTask
    70  }
    71  
    72  func createMockETCD(dir string, host string) (*embed.Etcd, error) {
    73  	cfg := embed.NewConfig()
    74  	cfg.Dir = dir
    75  	lcurl, _ := url.Parse(host)
    76  	cfg.ListenClientUrls = []url.URL{*lcurl}
    77  	cfg.AdvertiseClientUrls = []url.URL{*lcurl}
    78  	lpurl, _ := url.Parse(tempurl.Alloc())
    79  	cfg.ListenPeerUrls = []url.URL{*lpurl}
    80  	cfg.AdvertisePeerUrls = []url.URL{*lpurl}
    81  	cfg.InitialCluster = "default=" + lpurl.String()
    82  	cfg.Logger = "zap"
    83  	metricsURL, _ := url.Parse(tempurl.Alloc())
    84  	cfg.ListenMetricsUrls = []url.URL{*metricsURL}
    85  	ETCD, err := embed.StartEtcd(cfg)
    86  	if err != nil {
    87  		return nil, err
    88  	}
    89  
    90  	select {
    91  	case <-ETCD.Server.ReadyNotify():
    92  	case <-time.After(5 * time.Second):
    93  		ETCD.Server.Stop() // trigger a shutdown
    94  	}
    95  	// embd.client = v3client.New(embd.ETCD.Server)
    96  	return ETCD, nil
    97  }
    98  
        // TestServer is the end-to-end scenario for the worker server: it starts a
        // mock etcd and a server with mocked relay/subtask units, then drives source,
        // relay and subtask stages through etcd and checks the server reacts.
        // The steps share state and must run in this order.
    99  func (t *testServer) TestServer(c *C) {
   100  	var (
   101  		masterAddr   = tempurl.Alloc()[len("http://"):]
   102  		workerAddr1  = "127.0.0.1:8262"
   103  		keepAliveTTL = int64(1)
   104  	)
   105  	etcdDir := c.MkDir()
   106  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   107  	c.Assert(err, IsNil)
   108  	cfg := NewConfig()
   109  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   110  	cfg.Join = masterAddr
   111  	cfg.KeepAliveTTL = keepAliveTTL
   112  	cfg.RelayKeepAliveTTL = keepAliveTTL
   113  
        	// replace the package-level constructors with mocks; the deferred func below restores them.
   114  	NewRelayHolder = NewDummyRelayHolder
   115  	NewSubTask = func(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, worker string) *SubTask {
   116  		cfg.UseRelay = false
   117  		return NewRealSubTask(cfg, etcdClient, worker)
   118  	}
   119  	createUnits = func(cfg *config.SubTaskConfig, etcdClient *clientv3.Client, worker string, relay relay.Process) []unit.Unit {
   120  		mockDumper := NewMockUnit(pb.UnitType_Dump)
   121  		mockLoader := NewMockUnit(pb.UnitType_Load)
   122  		mockSync := NewMockUnit(pb.UnitType_Sync)
   123  		return []unit.Unit{mockDumper, mockLoader, mockSync}
   124  	}
   125  	defer func() {
   126  		NewRelayHolder = NewRealRelayHolder
   127  		NewSubTask = NewRealSubTask
   128  		createUnits = createRealUnits
   129  	}()
   130  
   131  	s := NewServer(cfg)
   132  	defer s.Close()
        	// run the server in the background; the wait below polls until it is no longer marked closed.
   133  	go func() {
   134  		err1 := s.Start()
   135  		c.Assert(err1, IsNil)
   136  	}()
   137  
   138  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   139  		return !s.closed.Load()
   140  	}), IsTrue)
   141  	dir := c.MkDir()
   142  
   143  	t.testOperateSourceBoundWithoutConfigInEtcd(c, s)
   144  
   145  	t.testOperateWorker(c, s, dir, true)
   146  
   147  	// check worker would retry connecting master rather than stop worker directly.
   148  	ETCD = t.testRetryConnectMaster(c, s, ETCD, etcdDir, masterAddr)
   149  
   150  	// resume contact with ETCD and start worker again
   151  	t.testOperateWorker(c, s, dir, true)
   152  
   153  	// test condition hub
   154  	t.testConidtionHub(c, s)
   155  
   156  	t.testHTTPInterface(c, "status")
   157  	t.testHTTPInterface(c, "metrics")
   158  
   159  	// create client
   160  	cli := t.createClient(c, workerAddr1)
   161  
   162  	// start task
   163  	subtaskCfg := config.SubTaskConfig{}
   164  	err = subtaskCfg.Decode(config.SampleSubtaskConfig, true)
   165  	c.Assert(err, IsNil)
   166  	subtaskCfg.MydumperPath = mydumperPath
   167  
   168  	sourceCfg := loadSourceConfigWithoutPassword(c)
   169  	_, err = ha.PutSubTaskCfgStage(s.etcdClient, []config.SubTaskConfig{subtaskCfg}, []ha.Stage{ha.NewSubTaskStage(pb.Stage_Running, sourceCfg.SourceID, subtaskCfg.Name)}, nil)
   170  	c.Assert(err, IsNil)
   171  
   172  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   173  		return checkSubTaskStatus(cli, pb.Stage_Running)
   174  	}), IsTrue)
   175  
   176  	t.testSubTaskRecover(c, s, dir)
   177  
   178  	// pause relay
   179  	_, err = ha.PutRelayStage(s.etcdClient, ha.NewRelayStage(pb.Stage_Paused, sourceCfg.SourceID))
   180  	c.Assert(err, IsNil)
   181  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   182  		return checkRelayStatus(cli, pb.Stage_Paused)
   183  	}), IsTrue)
   184  	// resume relay
   185  	_, err = ha.PutRelayStage(s.etcdClient, ha.NewRelayStage(pb.Stage_Running, sourceCfg.SourceID))
   186  	c.Assert(err, IsNil)
   187  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   188  		return checkRelayStatus(cli, pb.Stage_Running)
   189  	}), IsTrue)
   190  	// pause task
   191  	_, err = ha.PutSubTaskStage(s.etcdClient, ha.NewSubTaskStage(pb.Stage_Paused, sourceCfg.SourceID, subtaskCfg.Name))
   192  	c.Assert(err, IsNil)
   193  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   194  		return checkSubTaskStatus(cli, pb.Stage_Paused)
   195  	}), IsTrue)
   196  
   197  	// test refresh source cfg
   198  	sourceCfg.MetaDir = "new meta"
   199  	_, err = ha.PutSourceCfg(s.etcdClient, sourceCfg)
   200  	c.Assert(err, IsNil)
   201  	c.Assert(s.worker.refreshSourceCfg(), IsNil)
   202  	c.Assert(s.worker.cfg.MetaDir, Equals, sourceCfg.MetaDir)
   203  
   204  	// check update subtask cfg failed
   205  	tomlStr, tomlErr := subtaskCfg.Toml()
   206  	c.Assert(tomlErr, IsNil)
   207  	ctx := context.Background()
   208  	checkReq := &pb.CheckSubtasksCanUpdateRequest{SubtaskCfgTomlString: tomlStr}
   209  	checkResp, checkErr := s.CheckSubtasksCanUpdate(ctx, checkReq)
   210  	c.Assert(checkErr, IsNil)
   211  	c.Assert(checkResp.Success, IsFalse)
   212  
   213  	// test refresh subtask cfg
   214  	subtaskCfg.SyncerConfig.Batch = 111
   215  	_, err = ha.PutSubTaskCfgStage(s.etcdClient, []config.SubTaskConfig{subtaskCfg}, []ha.Stage{}, []ha.Stage{})
   216  	c.Assert(err, IsNil)
   217  	subTask := s.worker.subTaskHolder.findSubTask(subtaskCfg.Name)
   218  	subTask.setCurrUnit(subTask.units[2]) // set to syncer unit
   219  	c.Assert(s.worker.tryRefreshSubTaskAndSourceConfig(subTask), IsNil)
   220  	subtaskCfgInWorker := s.worker.subTaskHolder.findSubTask(subtaskCfg.Name)
   221  	c.Assert(subtaskCfgInWorker.cfg.SyncerConfig.Batch, Equals, subtaskCfg.SyncerConfig.Batch)
   222  
   223  	// resume task
   224  	_, err = ha.PutSubTaskStage(s.etcdClient, ha.NewSubTaskStage(pb.Stage_Running, sourceCfg.SourceID, subtaskCfg.Name))
   225  	c.Assert(err, IsNil)
   226  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   227  		return checkSubTaskStatus(cli, pb.Stage_Running)
   228  	}), IsTrue)
   229  
   230  	// stop task
   231  	_, err = ha.DeleteSubTaskStage(s.etcdClient, ha.NewSubTaskStage(pb.Stage_Stopped, sourceCfg.SourceID, subtaskCfg.Name))
   232  	c.Assert(err, IsNil)
   233  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   234  		return s.getSourceWorker(true).subTaskHolder.findSubTask(subtaskCfg.Name) == nil
   235  	}), IsTrue)
   236  
        	// a second server built from the same config must fail to start because the address is taken.
   237  	dupServer := NewServer(cfg)
   238  	err = dupServer.Start()
   239  	c.Assert(terror.ErrWorkerStartService.Equal(err), IsTrue)
   240  	c.Assert(err.Error(), Matches, ".*bind: address already in use.*")
   241  
   242  	t.testStopWorkerWhenLostConnect(c, s, ETCD)
   243  	s.Close()
   244  
   245  	c.Assert(utils.WaitSomething(30, 10*time.Millisecond, func() bool {
   246  		return s.closed.Load()
   247  	}), IsTrue)
   248  
   249  	// test source worker, just make sure testing sort
   250  	t.testSourceWorker(c)
   251  }
   252  
   253  func (t *testServer) TestHandleSourceBoundAfterError(c *C) {
   254  	var (
   255  		masterAddr   = tempurl.Alloc()[len("http://"):]
   256  		keepAliveTTL = int64(1)
   257  	)
   258  	// start etcd server
   259  	etcdDir := c.MkDir()
   260  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   261  	c.Assert(err, IsNil)
   262  	defer ETCD.Close()
   263  	cfg := NewConfig()
   264  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   265  	cfg.Join = masterAddr
   266  	cfg.KeepAliveTTL = keepAliveTTL
   267  
   268  	// new etcd client
   269  	etcdCli, err := clientv3.New(clientv3.Config{
   270  		Endpoints:            GetJoinURLs(cfg.Join),
   271  		DialTimeout:          dialTimeout,
   272  		DialKeepAliveTime:    keepaliveTime,
   273  		DialKeepAliveTimeout: keepaliveTimeout,
   274  	})
   275  	c.Assert(err, IsNil)
   276  
   277  	// watch worker event(oneline or offline)
   278  	var (
   279  		wg       sync.WaitGroup
   280  		startRev int64 = 1
   281  	)
   282  	workerEvCh := make(chan ha.WorkerEvent, 10)
   283  	workerErrCh := make(chan error, 10)
   284  	ctx, cancel := context.WithCancel(context.Background())
   285  	wg.Add(1)
   286  	go func() {
   287  		defer func() {
   288  			close(workerEvCh)
   289  			close(workerErrCh)
   290  			wg.Done()
   291  		}()
   292  		ha.WatchWorkerEvent(ctx, etcdCli, startRev, workerEvCh, workerErrCh)
   293  	}()
   294  
   295  	// start worker server
   296  	s := NewServer(cfg)
   297  	defer s.Close()
   298  	go func() {
   299  		err1 := s.Start()
   300  		c.Assert(err1, IsNil)
   301  	}()
   302  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   303  		return !s.closed.Load()
   304  	}), IsTrue)
   305  
   306  	// check if the worker is online
   307  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   308  		select {
   309  		case ev := <-workerEvCh:
   310  			if !ev.IsDeleted {
   311  				return true
   312  			}
   313  		default:
   314  		}
   315  		return false
   316  	}), IsTrue)
   317  
   318  	// enable failpoint
   319  	c.Assert(failpoint.Enable("github.com/pingcap/tiflow/dm/pkg/ha/FailToGetSourceCfg", `return(true)`), IsNil)
   320  	sourceCfg := loadSourceConfigWithoutPassword(c)
   321  	sourceCfg.EnableRelay = false
   322  	_, err = ha.PutSourceCfg(etcdCli, sourceCfg)
   323  	c.Assert(err, IsNil)
   324  	sourceBound := ha.NewSourceBound(sourceCfg.SourceID, s.cfg.Name)
   325  	_, err = ha.PutSourceBound(etcdCli, sourceBound)
   326  	c.Assert(err, IsNil)
   327  
   328  	// do check until worker offline
   329  	c.Assert(utils.WaitSomething(50, 100*time.Millisecond, func() bool {
   330  		select {
   331  		case ev := <-workerEvCh:
   332  			if ev.IsDeleted {
   333  				return true
   334  			}
   335  		default:
   336  		}
   337  		return false
   338  	}), IsTrue)
   339  
   340  	// check if the worker is online
   341  	c.Assert(utils.WaitSomething(5, time.Duration(s.cfg.KeepAliveTTL)*time.Second, func() bool {
   342  		select {
   343  		case ev := <-workerEvCh:
   344  			if !ev.IsDeleted {
   345  				return true
   346  			}
   347  		default:
   348  		}
   349  		return false
   350  	}), IsTrue)
   351  
   352  	// stop watching and disable failpoint
   353  	cancel()
   354  	wg.Wait()
   355  	c.Assert(failpoint.Disable("github.com/pingcap/tiflow/dm/pkg/ha/FailToGetSourceCfg"), IsNil)
   356  
   357  	_, err = ha.PutSourceBound(etcdCli, sourceBound)
   358  	c.Assert(err, IsNil)
   359  	_, err = ha.PutSourceCfg(etcdCli, sourceCfg)
   360  	c.Assert(err, IsNil)
   361  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   362  		return s.getSourceWorker(true) != nil
   363  	}), IsTrue)
   364  
   365  	_, err = ha.DeleteSourceBound(etcdCli, s.cfg.Name)
   366  	c.Assert(err, IsNil)
   367  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   368  		return s.getSourceWorker(true) == nil
   369  	}), IsTrue)
   370  }
   371  
   372  func (t *testServer) TestServerQueryValidator(c *C) {
   373  	var (
   374  		masterAddr   = tempurl.Alloc()[len("http://"):]
   375  		keepAliveTTL = int64(1)
   376  	)
   377  	etcdDir := c.MkDir()
   378  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   379  	c.Assert(err, IsNil)
   380  	defer ETCD.Close()
   381  	cfg := NewConfig()
   382  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   383  	cfg.Join = masterAddr
   384  	cfg.KeepAliveTTL = keepAliveTTL
   385  	cfg.RelayKeepAliveTTL = keepAliveTTL
   386  
   387  	s := NewServer(cfg)
   388  	resp, err := s.GetWorkerValidatorStatus(context.Background(), &pb.GetValidationStatusRequest{})
   389  	c.Assert(err, IsNil)
   390  	c.Assert(resp.Result, IsFalse)
   391  	c.Assert(resp.Msg, Matches, ".*no mysql source is being handled in the worker.*")
   392  }
   393  
   394  func (t *testServer) TestServerQueryValidatorError(c *C) {
   395  	var (
   396  		masterAddr   = tempurl.Alloc()[len("http://"):]
   397  		keepAliveTTL = int64(1)
   398  	)
   399  	etcdDir := c.MkDir()
   400  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   401  	c.Assert(err, IsNil)
   402  	defer ETCD.Close()
   403  	cfg := NewConfig()
   404  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   405  	cfg.Join = masterAddr
   406  	cfg.KeepAliveTTL = keepAliveTTL
   407  	cfg.RelayKeepAliveTTL = keepAliveTTL
   408  
   409  	s := NewServer(cfg)
   410  	resp, err := s.GetValidatorError(context.Background(), &pb.GetValidationErrorRequest{})
   411  	c.Assert(err, IsNil)
   412  	c.Assert(resp.Result, IsFalse)
   413  	c.Assert(resp.Msg, Matches, ".*no mysql source is being handled in the worker.*")
   414  }
   415  
   416  func (t *testServer) TestServerOperateValidatorError(c *C) {
   417  	var (
   418  		masterAddr   = tempurl.Alloc()[len("http://"):]
   419  		keepAliveTTL = int64(1)
   420  	)
   421  	etcdDir := c.MkDir()
   422  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   423  	c.Assert(err, IsNil)
   424  	defer ETCD.Close()
   425  	cfg := NewConfig()
   426  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   427  	cfg.Join = masterAddr
   428  	cfg.KeepAliveTTL = keepAliveTTL
   429  	cfg.RelayKeepAliveTTL = keepAliveTTL
   430  
   431  	s := NewServer(cfg)
   432  	resp, err := s.OperateValidatorError(context.Background(), &pb.OperateValidationErrorRequest{})
   433  	c.Assert(err, IsNil)
   434  	c.Assert(resp.Result, IsFalse)
   435  	c.Assert(resp.Msg, Matches, ".*no mysql source is being handled in the worker.*")
   436  }
   437  
   438  func (t *testServer) TestWatchSourceBoundEtcdCompact(c *C) {
   439  	var (
   440  		masterAddr   = tempurl.Alloc()[len("http://"):]
   441  		keepAliveTTL = int64(1)
   442  		startRev     = int64(1)
   443  	)
   444  	etcdDir := c.MkDir()
   445  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   446  	c.Assert(err, IsNil)
   447  	defer ETCD.Close()
   448  	cfg := NewConfig()
   449  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   450  	cfg.Join = masterAddr
   451  	cfg.KeepAliveTTL = keepAliveTTL
   452  	cfg.RelayKeepAliveTTL = keepAliveTTL
   453  
   454  	s := NewServer(cfg)
   455  	etcdCli, err := clientv3.New(clientv3.Config{
   456  		Endpoints:            GetJoinURLs(s.cfg.Join),
   457  		DialTimeout:          dialTimeout,
   458  		DialKeepAliveTime:    keepaliveTime,
   459  		DialKeepAliveTimeout: keepaliveTimeout,
   460  	})
   461  	s.etcdClient = etcdCli
   462  	s.closed.Store(false)
   463  	c.Assert(err, IsNil)
   464  	sourceCfg := loadSourceConfigWithoutPassword(c)
   465  	sourceCfg.EnableRelay = false
   466  
   467  	ctx, cancel := context.WithCancel(context.Background())
   468  	defer cancel()
   469  
   470  	// step 1: Put a source config and source bound to this worker, then delete it
   471  	_, err = ha.PutSourceCfg(etcdCli, sourceCfg)
   472  	c.Assert(err, IsNil)
   473  	sourceBound := ha.NewSourceBound(sourceCfg.SourceID, cfg.Name)
   474  	_, err = ha.PutSourceBound(etcdCli, sourceBound)
   475  	c.Assert(err, IsNil)
   476  	rev, err := ha.DeleteSourceBound(etcdCli, cfg.Name)
   477  	c.Assert(err, IsNil)
   478  	// step 2: start source at this worker
   479  	w, err := s.getOrStartWorker(sourceCfg, true)
   480  	c.Assert(err, IsNil)
   481  	c.Assert(w.EnableHandleSubtasks(), IsNil)
   482  	// step 3: trigger etcd compaction and check whether we can receive it through watcher
   483  	_, err = etcdCli.Compact(ctx, rev)
   484  	c.Assert(err, IsNil)
   485  	sourceBoundCh := make(chan ha.SourceBound, 10)
   486  	sourceBoundErrCh := make(chan error, 10)
   487  	ha.WatchSourceBound(ctx, etcdCli, cfg.Name, startRev, sourceBoundCh, sourceBoundErrCh)
   488  	select {
   489  	case err = <-sourceBoundErrCh:
   490  		c.Assert(errors.Cause(err), Equals, etcdErrCompacted)
   491  	case <-time.After(300 * time.Millisecond):
   492  		c.Fatal("fail to get etcd error compacted")
   493  	}
   494  	// step 4: watch source bound from startRev
   495  	var wg sync.WaitGroup
   496  	ctx1, cancel1 := context.WithCancel(ctx)
   497  	wg.Add(1)
   498  	go func() {
   499  		defer wg.Done()
   500  		c.Assert(s.observeSourceBound(ctx1, startRev), IsNil)
   501  	}()
   502  	// step 4.1: should stop the running worker, source bound has been deleted, should stop this worker
   503  	c.Assert(utils.WaitSomething(20, 100*time.Millisecond, func() bool {
   504  		return s.getSourceWorker(true) == nil
   505  	}), IsTrue)
   506  	// step 4.2: put a new source bound, source should be started
   507  	_, err = ha.PutSourceBound(etcdCli, sourceBound)
   508  	c.Assert(err, IsNil)
   509  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   510  		return s.getSourceWorker(true) != nil
   511  	}), IsTrue)
   512  	cfg2 := s.getSourceWorker(true).cfg
   513  	c.Assert(cfg2, DeepEquals, sourceCfg)
   514  	cancel1()
   515  	wg.Wait()
   516  	c.Assert(s.stopSourceWorker(sourceCfg.SourceID, true, true), IsNil)
   517  	// step 5: start observeSourceBound from compacted revision again, should start worker
   518  	ctx2, cancel2 := context.WithCancel(ctx)
   519  	wg.Add(1)
   520  	go func() {
   521  		defer wg.Done()
   522  		c.Assert(s.observeSourceBound(ctx2, startRev), IsNil)
   523  	}()
   524  	c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   525  		return s.getSourceWorker(true) != nil
   526  	}), IsTrue)
   527  	cfg2 = s.getSourceWorker(true).cfg
   528  	c.Assert(cfg2, DeepEquals, sourceCfg)
   529  	cancel2()
   530  	wg.Wait()
   531  }
   532  
   533  func (t *testServer) testHTTPInterface(c *C, uri string) {
   534  	// nolint:noctx
   535  	resp, err := http.Get("http://127.0.0.1:8262/" + uri)
   536  	c.Assert(err, IsNil)
   537  	defer resp.Body.Close()
   538  	c.Assert(resp.StatusCode, Equals, 200)
   539  	_, err = io.ReadAll(resp.Body)
   540  	c.Assert(err, IsNil)
   541  }
   542  
   543  func (t *testServer) createClient(c *C, addr string) pb.WorkerClient {
   544  	//nolint:staticcheck
   545  	conn, err := grpc.Dial(addr, grpc.WithInsecure(), grpc.WithBackoffMaxDelay(3*time.Second))
   546  	c.Assert(err, IsNil)
   547  	return pb.NewWorkerClient(conn)
   548  }
   549  
   550  func (t *testServer) testOperateSourceBoundWithoutConfigInEtcd(c *C, s *Server) {
   551  	err := s.operateSourceBound(ha.NewSourceBound("sourceWithoutConfigInEtcd", s.cfg.Name))
   552  	c.Assert(terror.ErrWorkerFailToGetSourceConfigFromEtcd.Equal(err), IsTrue)
   553  }
   554  
   555  func (t *testServer) testOperateWorker(c *C, s *Server, dir string, start bool) {
   556  	// load sourceCfg
   557  	sourceCfg := loadSourceConfigWithoutPassword(c)
   558  	sourceCfg.EnableRelay = true
   559  	sourceCfg.RelayDir = dir
   560  	sourceCfg.MetaDir = c.MkDir()
   561  
   562  	if start {
   563  		// put mysql config into relative etcd key adapter to trigger operation event
   564  		_, err := ha.PutSourceCfg(s.etcdClient, sourceCfg)
   565  		c.Assert(err, IsNil)
   566  		_, err = ha.PutRelayStageRelayConfigSourceBound(s.etcdClient, ha.NewRelayStage(pb.Stage_Running, sourceCfg.SourceID),
   567  			ha.NewSourceBound(sourceCfg.SourceID, s.cfg.Name))
   568  		c.Assert(err, IsNil)
   569  		// worker should be started and without error
   570  		c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   571  			w := s.getSourceWorker(true)
   572  			return w != nil && !w.closed.Load()
   573  		}), IsTrue)
   574  		c.Assert(s.getSourceStatus(true).Result, IsNil)
   575  	} else {
   576  		// worker should be started before stopped
   577  		w := s.getSourceWorker(true)
   578  		c.Assert(w, NotNil)
   579  		c.Assert(w.closed.Load(), IsFalse)
   580  		_, err := ha.DeleteRelayConfig(s.etcdClient, w.name)
   581  		c.Assert(err, IsNil)
   582  		_, err = ha.DeleteSourceCfgRelayStageSourceBound(s.etcdClient, sourceCfg.SourceID, s.cfg.Name)
   583  		c.Assert(err, IsNil)
   584  		// worker should be closed and without error
   585  		c.Assert(utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   586  			currentWorker := s.getSourceWorker(true)
   587  			return currentWorker == nil && w.closed.Load()
   588  		}), IsTrue)
   589  		c.Assert(s.getSourceStatus(true).Result, IsNil)
   590  	}
   591  }
   592  
   593  func (t *testServer) testRetryConnectMaster(c *C, s *Server, etcd *embed.Etcd, dir string, hostName string) *embed.Etcd {
   594  	etcd.Close()
   595  	time.Sleep(6 * time.Second)
   596  	// When worker server fail to keepalive with etcd, server should close its worker
   597  	c.Assert(s.getSourceWorker(true), IsNil)
   598  	c.Assert(s.getSourceStatus(true).Result, IsNil)
   599  	ETCD, err := createMockETCD(dir, "http://"+hostName)
   600  	c.Assert(err, IsNil)
   601  	time.Sleep(3 * time.Second)
   602  	return ETCD
   603  }
   604  
   605  func (t *testServer) testSubTaskRecover(c *C, s *Server, dir string) {
   606  	workerCli := t.createClient(c, "127.0.0.1:8262")
   607  	t.testOperateWorker(c, s, dir, false)
   608  
   609  	status, err := workerCli.QueryStatus(context.Background(), &pb.QueryStatusRequest{Name: "sub-task-name"})
   610  	c.Assert(err, IsNil)
   611  	c.Assert(status.Result, IsFalse)
   612  	c.Assert(status.Msg, Equals, terror.ErrWorkerNoStart.Error())
   613  
   614  	t.testOperateWorker(c, s, dir, true)
   615  
   616  	// because we split starting worker and enabling handling subtasks into two parts, a query-status may occur between
   617  	// them, thus get a result of no subtask running
   618  	utils.WaitSomething(30, 100*time.Millisecond, func() bool {
   619  		status, err = workerCli.QueryStatus(context.Background(), &pb.QueryStatusRequest{Name: "sub-task-name"})
   620  		if err != nil {
   621  			return false
   622  		}
   623  		if status.Result == false {
   624  			return false
   625  		}
   626  		if len(status.SubTaskStatus) == 0 || status.SubTaskStatus[0].Stage != pb.Stage_Running {
   627  			return false
   628  		}
   629  		return true
   630  	})
   631  
   632  	status, err = workerCli.QueryStatus(context.Background(), &pb.QueryStatusRequest{Name: "sub-task-name"})
   633  	c.Assert(err, IsNil)
   634  	c.Assert(status.Result, IsTrue)
   635  	c.Assert(status.SubTaskStatus, HasLen, 1)
   636  	c.Assert(status.SubTaskStatus[0].Stage, Equals, pb.Stage_Running)
   637  }
   638  
   639  func (t *testServer) testStopWorkerWhenLostConnect(c *C, s *Server, etcd *embed.Etcd) {
   640  	etcd.Close()
   641  	c.Assert(utils.WaitSomething(int(defaultKeepAliveTTL+3), time.Second, func() bool {
   642  		return s.getSourceWorker(true) == nil
   643  	}), IsTrue)
   644  	c.Assert(s.getSourceWorker(true), IsNil)
   645  }
   646  
   647  func (t *testServer) TestGetMinLocInAllSubTasks(c *C) {
   648  	subTaskCfg := map[string]config.SubTaskConfig{
   649  		"test2": {Name: "test2"},
   650  		"test3": {Name: "test3"},
   651  		"test1": {Name: "test1"},
   652  	}
   653  	minLoc, err := getMinLocInAllSubTasks(context.Background(), subTaskCfg)
   654  	c.Assert(err, IsNil)
   655  	c.Assert(minLoc.Position.Name, Equals, "mysql-binlog.00001")
   656  	c.Assert(minLoc.Position.Pos, Equals, uint32(12))
   657  
   658  	for k, cfg := range subTaskCfg {
   659  		cfg.EnableGTID = true
   660  		subTaskCfg[k] = cfg
   661  	}
   662  
   663  	minLoc, err = getMinLocInAllSubTasks(context.Background(), subTaskCfg)
   664  	c.Assert(err, IsNil)
   665  	c.Assert(minLoc.Position.Name, Equals, "mysql-binlog.00001")
   666  	c.Assert(minLoc.Position.Pos, Equals, uint32(123))
   667  }
   668  
   669  func getFakeLocForSubTask(ctx context.Context, subTaskCfg config.SubTaskConfig) (minLoc *binlog.Location, err error) {
   670  	gset1, _ := gtid.ParserGTID(mysql.MySQLFlavor, "ba8f633f-1f15-11eb-b1c7-0242ac110001:1-30")
   671  	gset2, _ := gtid.ParserGTID(mysql.MySQLFlavor, "ba8f633f-1f15-11eb-b1c7-0242ac110001:1-50")
   672  	gset3, _ := gtid.ParserGTID(mysql.MySQLFlavor, "ba8f633f-1f15-11eb-b1c7-0242ac110001:1-50,ba8f633f-1f15-11eb-b1c7-0242ac110002:1")
   673  	loc1 := binlog.NewLocation(
   674  		mysql.Position{
   675  			Name: "mysql-binlog.00001",
   676  			Pos:  123,
   677  		},
   678  		gset1,
   679  	)
   680  	loc2 := binlog.NewLocation(
   681  		mysql.Position{
   682  			Name: "mysql-binlog.00001",
   683  			Pos:  12,
   684  		},
   685  		gset2,
   686  	)
   687  	loc3 := binlog.NewLocation(
   688  		mysql.Position{
   689  			Name: "mysql-binlog.00003",
   690  		},
   691  		gset3,
   692  	)
   693  
   694  	switch subTaskCfg.Name {
   695  	case "test1":
   696  		return &loc1, nil
   697  	case "test2":
   698  		return &loc2, nil
   699  	case "test3":
   700  		return &loc3, nil
   701  	default:
   702  		return nil, nil
   703  	}
   704  }
   705  
   706  func checkSubTaskStatus(cli pb.WorkerClient, expect pb.Stage) bool {
   707  	status, err := cli.QueryStatus(context.Background(), &pb.QueryStatusRequest{Name: "sub-task-name"})
   708  	if err != nil {
   709  		return false
   710  	}
   711  	if status.Result == false {
   712  		return false
   713  	}
   714  	return len(status.SubTaskStatus) > 0 && status.SubTaskStatus[0].Stage == expect
   715  }
   716  
   717  func checkRelayStatus(cli pb.WorkerClient, expect pb.Stage) bool {
   718  	status, err := cli.QueryStatus(context.Background(), &pb.QueryStatusRequest{Name: "sub-task-name"})
   719  	if err != nil {
   720  		return false
   721  	}
   722  	if status.Result == false {
   723  		return false
   724  	}
   725  	return status.SourceStatus.RelayStatus.Stage == expect
   726  }
   727  
   728  func loadSourceConfigWithoutPassword(c *C) *config.SourceConfig {
   729  	sourceCfg, err := config.SourceCfgFromYamlAndVerify(config.SampleSourceConfig)
   730  	c.Assert(err, IsNil)
   731  	sourceCfg.From.Password = "" // no password set
   732  	return sourceCfg
   733  }
   734  
   735  func (t *testServer) TestServerDataRace(c *C) {
   736  	var (
   737  		masterAddr   = tempurl.Alloc()[len("http://"):]
   738  		keepAliveTTL = int64(1)
   739  	)
   740  	etcdDir := c.MkDir()
   741  	ETCD, err := createMockETCD(etcdDir, "http://"+masterAddr)
   742  	c.Assert(err, IsNil)
   743  	defer ETCD.Close()
   744  	cfg := NewConfig()
   745  	c.Assert(cfg.Parse([]string{"-config=./dm-worker.toml"}), IsNil)
   746  	cfg.Join = masterAddr
   747  	cfg.KeepAliveTTL = keepAliveTTL
   748  	cfg.RelayKeepAliveTTL = keepAliveTTL
   749  
   750  	s := NewServer(cfg)
   751  	defer s.Close()
   752  
   753  	var wg sync.WaitGroup
   754  	for i := 0; i < 20; i++ {
   755  		wg.Add(2)
   756  		go func() {
   757  			defer wg.Done()
   758  			err1 := s.Start()
   759  			c.Assert(err1 == nil || err1 == terror.ErrWorkerServerClosed, IsTrue)
   760  		}()
   761  		go func() {
   762  			defer wg.Done()
   763  			s.Close()
   764  		}()
   765  		wg.Wait()
   766  	}
   767  }
   768  
   769  func loadSourceConfigWithoutPassword2(t *testing.T) *config.SourceConfig {
   770  	t.Helper()
   771  
   772  	sourceCfg, err := config.SourceCfgFromYamlAndVerify(config.SampleSourceConfig)
   773  	require.NoError(t, err)
   774  	sourceCfg.From.Password = "" // no password set
   775  	return sourceCfg
   776  }