github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/server.go

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package worker
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"net"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/pingcap/errors"
    24  	toolutils "github.com/pingcap/tidb-tools/pkg/utils"
    25  	"github.com/pingcap/tiflow/dm/common"
    26  	"github.com/pingcap/tiflow/dm/config"
    27  	"github.com/pingcap/tiflow/dm/pb"
    28  	"github.com/pingcap/tiflow/dm/pkg/binlog"
    29  	tcontext "github.com/pingcap/tiflow/dm/pkg/context"
    30  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    31  	"github.com/pingcap/tiflow/dm/pkg/ha"
    32  	"github.com/pingcap/tiflow/dm/pkg/log"
    33  	"github.com/pingcap/tiflow/dm/pkg/terror"
    34  	"github.com/pingcap/tiflow/dm/pkg/utils"
    35  	"github.com/pingcap/tiflow/dm/syncer"
    36  	"github.com/pingcap/tiflow/dm/unit"
    37  	"github.com/soheilhy/cmux"
    38  	clientv3 "go.etcd.io/etcd/client/v3"
    39  	"go.uber.org/atomic"
    40  	"go.uber.org/zap"
    41  	"google.golang.org/grpc"
    42  )
    43  
    44  var (
    45  	cmuxReadTimeout           = 10 * time.Second
    46  	dialTimeout               = 3 * time.Second
    47  	keepaliveTimeout          = 3 * time.Second
    48  	keepaliveTime             = 3 * time.Second
    49  	retryGetSourceBoundConfig = 5
    50  	retryGetRelayConfig       = 5
    51  	retryConnectSleepTime     = time.Second
    52  	syncMasterEndpointsTime   = 3 * time.Second
    53  	getMinLocForSubTaskFunc   = getMinLocForSubTask
    54  )
    55  
    56  // Server accepts RPC requests,
    57  // dispatches them to the source worker,
    58  // and sends responses back to the RPC client.
    59  type Server struct {
    60  	// closeMu is used to sync Start/Close and protect the 5 fields below
    61  	closeMu sync.Mutex
    62  	// closed is used to indicate whether dm-worker server is in closed state.
    63  	closed atomic.Bool
    64  	// calledClose indicates that dm-worker has received a signal to close and has closed successfully.
    65  	// we use this variable to avoid calling Start() after Close()
    66  	calledClose bool
    67  	rootLis     net.Listener
    68  	svr         *grpc.Server
    69  	etcdClient  *clientv3.Client
    70  	// end of closeMu
    71  
    72  	wg     sync.WaitGroup
    73  	kaWg   sync.WaitGroup
    74  	httpWg sync.WaitGroup
    75  	runWg  sync.WaitGroup
    76  
    77  	ctx    context.Context
    78  	cancel context.CancelFunc
    79  
    80  	runCtx    context.Context
    81  	runCancel context.CancelFunc
    82  
    83  	kaCtx    context.Context
    84  	kaCancel context.CancelFunc
    85  
    86  	cfg *Config
    87  
    88  	// mu is used to protect worker and sourceStatus. closeMu should be locked first to avoid
    89  	// deadlock when closeMu and mu are both acquired.
    90  	mu     sync.Mutex
    91  	worker *SourceWorker
    92  	// relay status will never be put in server.sourceStatus
    93  	sourceStatus pb.SourceStatus
    94  }
    95  
    96  // NewServer creates a new Server.
    97  func NewServer(cfg *Config) *Server {
    98  	s := Server{
    99  		cfg: cfg,
   100  	}
   101  	s.ctx, s.cancel = context.WithCancel(context.Background())
   102  	s.closed.Store(true) // not start yet
   103  	return &s
   104  }
   105  
   106  // Start starts serving.
   107  // this function should only exit when it can't dial DM-master; for other errors it should not exit.
   108  func (s *Server) Start() error {
   109  	log.L().Info("starting dm-worker server")
   110  	RegistryMetrics()
   111  
   112  	var m cmux.CMux
   113  
   114  	s.runCtx, s.runCancel = context.WithCancel(s.ctx)
   115  
   116  	// protect members from data races. some functions below, like GetRelayConfig and
   117  	// GetSourceBoundConfig, have a built-in timeout so they will not be stuck for a
   118  	// long time.
   119  	startErr := func() error {
   120  		s.closeMu.Lock()
   121  		defer s.closeMu.Unlock()
   122  		// if dm-worker has received a signal and finished closing, Start() should not continue
   123  		if s.calledClose {
   124  			return terror.ErrWorkerServerClosed
   125  		}
   126  
   127  		tls, err := toolutils.NewTLS(s.cfg.SSLCA, s.cfg.SSLCert, s.cfg.SSLKey, s.cfg.AdvertiseAddr, s.cfg.CertAllowedCN)
   128  		if err != nil {
   129  			return terror.ErrWorkerTLSConfigNotValid.Delegate(err)
   130  		}
   131  
   132  		rootLis, err := net.Listen("tcp", s.cfg.WorkerAddr)
   133  		if err != nil {
   134  			return terror.ErrWorkerStartService.Delegate(err)
   135  		}
   136  		s.rootLis = tls.WrapListener(rootLis)
   137  
   138  		s.etcdClient, err = clientv3.New(clientv3.Config{
   139  			Endpoints:            GetJoinURLs(s.cfg.Join),
   140  			DialTimeout:          dialTimeout,
   141  			DialKeepAliveTime:    keepaliveTime,
   142  			DialKeepAliveTimeout: keepaliveTimeout,
   143  			TLS:                  tls.TLSConfig(),
   144  			AutoSyncInterval:     syncMasterEndpointsTime,
   145  		})
   146  		if err != nil {
   147  			return err
   148  		}
   149  
   150  		s.setWorker(nil, true)
   151  
   152  		s.runWg.Add(1)
   153  		go func() {
   154  			s.runBackgroundJob(s.runCtx)
   155  			s.runWg.Done()
   156  		}()
   157  
   158  		s.startKeepAlive()
   159  
   160  		relaySource, revRelay, err := ha.GetRelayConfig(s.etcdClient, s.cfg.Name)
   161  		if err != nil {
   162  			return err
   163  		}
   164  		if relaySource != nil {
   165  			log.L().Warn("worker has been assigned relay before keepalive", zap.String("relay source", relaySource.SourceID))
   166  			if err2 := s.enableRelay(relaySource, true); err2 != nil {
   167  				return err2
   168  			}
   169  		}
   170  
   171  		s.runWg.Add(1)
   172  		go func(ctx context.Context) {
   173  			defer s.runWg.Done()
   174  			// TODO: handle fatal error from observeRelayConfig
   175  			//nolint:errcheck
   176  			s.observeRelayConfig(ctx, revRelay)
   177  		}(s.runCtx)
   178  
   179  		bound, sourceCfg, revBound, err := ha.GetSourceBoundConfig(s.etcdClient, s.cfg.Name)
   180  		if err != nil {
   181  			return err
   182  		}
   183  		if !bound.IsEmpty() {
   184  			log.L().Warn("worker has been assigned source before keepalive", zap.Stringer("bound", bound), zap.Bool("is deleted", bound.IsDeleted))
   185  			if err2 := s.enableHandleSubtasks(sourceCfg, true); err2 != nil {
   186  				return err2
   187  			}
   188  			log.L().Info("started to handle mysql source", zap.String("sourceCfg", sourceCfg.String()))
   189  		}
   190  
   191  		s.runWg.Add(1)
   192  		go func(ctx context.Context) {
   193  			defer s.runWg.Done()
   194  			for {
   195  				err1 := s.observeSourceBound(ctx, revBound)
   196  				if err1 == nil {
   197  					return
   198  				}
   199  				s.restartKeepAlive()
   200  			}
   201  		}(s.runCtx)
   202  
   203  		// create a cmux
   204  		m = cmux.New(s.rootLis)
   205  
   206  		m.SetReadTimeout(cmuxReadTimeout) // set a timeout, ref: https://github.com/pingcap/tidb-binlog/pull/352
   207  
   208  		// match connections in order: first gRPC, then HTTP
   209  		grpcL := m.MatchWithWriters(cmux.HTTP2MatchHeaderFieldSendSettings("content-type", "application/grpc"))
   210  
   211  		httpL := m.Match(cmux.HTTP1Fast())
   212  
   213  		// NOTE: no need to set the tls config, because rootLis already uses tls
   214  		s.svr = grpc.NewServer()
   215  		pb.RegisterWorkerServer(s.svr, s)
   216  
   217  		grpcExitCh := make(chan struct{}, 1)
   218  		s.wg.Add(1)
   219  		go func() {
   220  			err2 := s.svr.Serve(grpcL)
   221  			if err2 != nil && !common.IsErrNetClosing(err2) && err2 != cmux.ErrListenerClosed {
   222  				log.L().Error("gRPC server returned", log.ShortError(err2))
   223  			}
   224  			grpcExitCh <- struct{}{}
   225  		}()
   226  		go func(ctx context.Context) {
   227  			defer s.wg.Done()
   228  			select {
   229  			case <-ctx.Done():
   230  				if s.svr != nil {
   231  					// GracefulStop can not cancel active stream RPCs
   232  					// and the stream RPC may block on Recv or Send
   233  					// so we use Stop instead to cancel all active RPCs
   234  					s.svr.Stop()
   235  				}
   236  			case <-grpcExitCh:
   237  			}
   238  		}(s.ctx)
   239  
   240  		s.httpWg.Add(1)
   241  		go func() {
   242  			defer s.httpWg.Done()
   243  			InitStatus(httpL) // serve status
   244  		}()
   245  
   246  		s.closed.Store(false) // the server started now.
   247  		return nil
   248  	}()
   249  
   250  	if startErr != nil {
   251  		return startErr
   252  	}
   253  
   254  	log.L().Info("listening gRPC API and status request", zap.String("address", s.cfg.WorkerAddr))
   255  
   256  	err := m.Serve()
   257  	if err != nil && common.IsErrNetClosing(err) {
   258  		err = nil
   259  	}
   260  	return terror.ErrWorkerStartService.Delegate(err)
   261  }
   262  
   263  // startKeepAlive starts the worker keepalive with master.
   264  // If the worker loses its connection to master, it stops all tasks and tries to connect to master again.
   265  func (s *Server) startKeepAlive() {
   266  	s.kaWg.Add(1)
   267  	s.kaCtx, s.kaCancel = context.WithCancel(s.ctx)
   268  	go s.doStartKeepAlive()
   269  }
   270  
   271  func (s *Server) doStartKeepAlive() {
   272  	defer s.kaWg.Done()
   273  	s.KeepAlive()
   274  }
   275  
   276  func (s *Server) stopKeepAlive() {
   277  	if s.kaCancel != nil {
   278  		s.kaCancel()
   279  		s.kaWg.Wait()
   280  	}
   281  }
   282  
   283  func (s *Server) restartKeepAlive() {
   284  	s.stopKeepAlive()
   285  	s.startKeepAlive()
   286  }
   287  
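        // observeRelayConfig watches relay config changes for this worker in etcd and
        // enables or disables relay accordingly, retrying after retryable etcd errors.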
   288  func (s *Server) observeRelayConfig(ctx context.Context, rev int64) error {
   289  	var wg sync.WaitGroup
   290  	for {
   291  		relayCh := make(chan ha.RelaySource, 10)
   292  		relayErrCh := make(chan error, 10)
   293  		wg.Add(1)
   294  		// use ctx1, cancel1 to make sure old watcher has been released
   295  		ctx1, cancel1 := context.WithCancel(ctx)
   296  		go func() {
   297  			defer func() {
   298  				close(relayCh)
   299  				close(relayErrCh)
   300  				wg.Done()
   301  			}()
   302  			ha.WatchRelayConfig(ctx1, s.etcdClient, s.cfg.Name, rev+1, relayCh, relayErrCh)
   303  		}()
   304  		err := s.handleRelayConfig(ctx1, relayCh, relayErrCh)
   305  		cancel1()
   306  		wg.Wait()
   307  
   308  		if etcdutil.IsRetryableError(err) {
   309  			rev = 0
   310  			retryNum := 1
   311  			for rev == 0 {
   312  				select {
   313  				case <-ctx.Done():
   314  					return nil
   315  				case <-time.After(500 * time.Millisecond):
   316  					relaySource, rev1, err1 := ha.GetRelayConfig(s.etcdClient, s.cfg.Name)
   317  					if err1 != nil {
   318  						log.L().Error("get relay config from etcd failed, will retry later", zap.Error(err1), zap.Int("retryNum", retryNum))
   319  						retryNum++
   320  						if retryNum > retryGetRelayConfig && etcdutil.IsLimitedRetryableError(err1) {
   321  							return err1
   322  						}
   323  						break
   324  					}
   325  					rev = rev1
   326  					if relaySource == nil {
   327  						if w := s.getSourceWorker(true); w != nil && w.startedRelayBySourceCfg {
   328  							break
   329  						}
   330  						log.L().Info("didn't find relay config after etcd retryable error. Will stop relay now")
   331  						err = s.disableRelay("")
   332  						if err != nil {
   333  							log.L().Error("fail to disableRelay after etcd retryable error", zap.Error(err))
   334  							return err // return if failed to stop the worker.
   335  						}
   336  					} else {
   337  						err2 := func() error {
   338  							s.mu.Lock()
   339  							defer s.mu.Unlock()
   340  
   341  							if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == relaySource.SourceID {
   342  								// we may face both relay config and subtask bound changed in a compaction error, so here
   343  								// we check if observeSourceBound has started a worker
   344  								// TODO: add a test for this situation
   345  								if !w.relayEnabled.Load() {
   346  									if err2 := w.EnableRelay(false); err2 != nil {
   347  										return err2
   348  									}
   349  								}
   350  								return nil
   351  							}
   352  							err = s.stopSourceWorker("", false, true)
   353  							if err != nil {
   354  								log.L().Error("fail to stop worker", zap.Error(err))
   355  								return err // return if failed to stop the worker.
   356  							}
   357  							log.L().Info("will recover observeRelayConfig",
   358  								zap.String("relay source", relaySource.SourceID))
   359  							return s.enableRelay(relaySource, false)
   360  						}()
   361  						if err2 != nil {
   362  							return err2
   363  						}
   364  					}
   365  				}
   366  			}
   367  		} else {
   368  			if err != nil {
   369  				log.L().Error("observeRelayConfig is failed and will quit now", zap.Error(err))
   370  			} else {
   371  				log.L().Info("observeRelayConfig will quit now")
   372  			}
   373  			return err
   374  		}
   375  	}
   376  }
   377  
   378  // observeSourceBound will
   379  // 1. keep bound relation updated from DM-master
   380  // 2. keep enable-relay in source config updated. (TODO) This relies on DM-master re-putting SourceBound after changing it.
   381  func (s *Server) observeSourceBound(ctx context.Context, rev int64) error {
   382  	var wg sync.WaitGroup
   383  	for {
   384  		sourceBoundCh := make(chan ha.SourceBound, 10)
   385  		sourceBoundErrCh := make(chan error, 10)
   386  		wg.Add(1)
   387  		// use ctx1, cancel1 to make sure old watcher has been released
   388  		ctx1, cancel1 := context.WithCancel(ctx)
   389  		go func() {
   390  			defer func() {
   391  				close(sourceBoundCh)
   392  				close(sourceBoundErrCh)
   393  				wg.Done()
   394  			}()
   395  			ha.WatchSourceBound(ctx1, s.etcdClient, s.cfg.Name, rev+1, sourceBoundCh, sourceBoundErrCh)
   396  		}()
   397  		err := s.handleSourceBound(ctx1, sourceBoundCh, sourceBoundErrCh)
   398  		cancel1()
   399  		wg.Wait()
   400  
   401  		if etcdutil.IsRetryableError(err) {
   402  			rev = 0
   403  			retryNum := 1
   404  			for rev == 0 {
   405  				select {
   406  				case <-ctx.Done():
   407  					return nil
   408  				case <-time.After(500 * time.Millisecond):
   409  					bound, cfg, rev1, err1 := ha.GetSourceBoundConfig(s.etcdClient, s.cfg.Name)
   410  					if err1 != nil {
   411  						log.L().Error("get source bound from etcd failed, will retry later", zap.Error(err1), zap.Int("retryNum", retryNum))
   412  						retryNum++
   413  						if retryNum > retryGetSourceBoundConfig && etcdutil.IsLimitedRetryableError(err1) {
   414  							return err1
   415  						}
   416  						break
   417  					}
   418  					rev = rev1
   419  					if bound.IsEmpty() {
   420  						err = s.disableHandleSubtasks("")
   421  						if err != nil {
   422  							log.L().Error("fail to disableHandleSubtasks after etcd retryable error", zap.Error(err))
   423  							return err // return if failed to stop the worker.
   424  						}
   425  					} else {
   426  						err2 := func() error {
   427  							s.mu.Lock()
   428  							defer s.mu.Unlock()
   429  
   430  							if w := s.getSourceWorker(false); w != nil && w.cfg.SourceID == bound.Source {
   431  								// we may face both relay config and subtask bound changed in a compaction error, so here
   432  								// we check if observeRelayConfig has started a worker
   433  								// TODO: add a test for this situation
   434  								if !w.subTaskEnabled.Load() {
   435  									if err2 := w.EnableHandleSubtasks(); err2 != nil {
   436  										return err2
   437  									}
   438  								}
   439  								return nil
   440  							}
   441  							err = s.stopSourceWorker("", false, true)
   442  							if err != nil {
   443  								log.L().Error("fail to stop worker", zap.Error(err))
   444  								return err // return if failed to stop the worker.
   445  							}
   446  							log.L().Info("will recover observeSourceBound",
   447  								zap.String("relay source", cfg.SourceID))
   448  							return s.enableHandleSubtasks(cfg, false)
   449  						}()
   450  						if err2 != nil {
   451  							if terror.ErrWorkerServerClosed.Equal(err2) {
   452  								// return nil to exit the loop in caller
   453  								return nil
   454  							}
   455  							return err2
   456  						}
   457  					}
   458  				}
   459  			}
   460  		} else {
   461  			if err != nil {
   462  				log.L().Error("observeSourceBound is failed and will quit now", zap.Error(err))
   463  			} else {
   464  				log.L().Info("observeSourceBound will quit now")
   465  			}
   466  			return err
   467  		}
   468  	}
   469  }
   470  
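        // doClose stops background jobs, the source worker and the root listener.
        // the caller should hold closeMu.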
   471  func (s *Server) doClose() {
   472  	if s.closed.Load() {
   473  		return
   474  	}
   475  	// stop server in advance, stop receiving source bound and relay bound
   476  	s.runCancel()
   477  	s.runWg.Wait()
   478  
   479  	// stop the worker and wait for it to return (we already lock the whole Server, so there is no need to use a lock to get the source worker)
   480  	if w := s.getSourceWorker(true); w != nil {
   481  		w.Stop(true)
   482  	}
   483  
   484  	// close listener at last, so we can get status from it if worker failed to close in previous step
   485  	if s.rootLis != nil {
   486  		err2 := s.rootLis.Close()
   487  		if err2 != nil && !common.IsErrNetClosing(err2) {
   488  			log.L().Error("fail to close net listener", log.ShortError(err2))
   489  		}
   490  	}
   491  	s.httpWg.Wait()
   492  
   493  	s.closed.Store(true)
   494  }
   495  
   496  // Close closes the RPC server, this function can be called multiple times.
   497  func (s *Server) Close() {
   498  	s.closeMu.Lock()
   499  	defer s.closeMu.Unlock()
   500  	s.doClose() // we should stop current sync first, otherwise master may schedule task on new worker while we are closing
   501  	s.stopKeepAlive()
   502  
   503  	s.cancel()
   504  	s.wg.Wait()
   505  
   506  	if s.etcdClient != nil {
   507  		s.etcdClient.Close()
   508  	}
   509  	s.calledClose = true
   510  }
   511  
   512  // if needLock is false, the caller must make sure the Server has already been locked.
   513  func (s *Server) getSourceWorker(needLock bool) *SourceWorker {
   514  	if needLock {
   515  		s.mu.Lock()
   516  		defer s.mu.Unlock()
   517  	}
   518  	return s.worker
   519  }
   520  
   521  // if needLock is false, the caller must make sure the Server has already been locked.
   522  func (s *Server) setWorker(worker *SourceWorker, needLock bool) {
   523  	if needLock {
   524  		s.mu.Lock()
   525  		defer s.mu.Unlock()
   526  	}
   527  	s.worker = worker
   528  }
   529  
   530  // nolint:unparam
   531  func (s *Server) getSourceStatus(needLock bool) pb.SourceStatus {
   532  	if needLock {
   533  		s.mu.Lock()
   534  		defer s.mu.Unlock()
   535  	}
   536  	return s.sourceStatus
   537  }
   538  
   539  // TODO: move some call to setWorker/getOrStartWorker.
   540  func (s *Server) setSourceStatus(source string, err error, needLock bool) {
   541  	if needLock {
   542  		s.mu.Lock()
   543  		defer s.mu.Unlock()
   544  	}
   545  	// setSourceStatus may be called concurrently now; skip setting a source status if the worker has been closed
   546  	if s.getSourceWorker(false) == nil && source != "" {
   547  		return
   548  	}
   549  	s.sourceStatus = pb.SourceStatus{
   550  		Source: source,
   551  		Worker: s.cfg.Name,
   552  	}
   553  	if err != nil {
   554  		s.sourceStatus.Result = &pb.ProcessResult{
   555  			Errors: []*pb.ProcessError{
   556  				unit.NewProcessError(err),
   557  			},
   558  		}
   559  	}
   560  }
   561  
   562  // if sourceID is set to "", the worker will be closed directly.
   563  // if sourceID is not "", we will check sourceID against w.cfg.SourceID.
   564  func (s *Server) stopSourceWorker(sourceID string, needLock, graceful bool) error {
   565  	if needLock {
   566  		s.mu.Lock()
   567  		defer s.mu.Unlock()
   568  	}
   569  	w := s.getSourceWorker(false)
   570  	if w == nil {
   571  		log.L().Warn("worker has not been started, no need to stop", zap.String("source", sourceID))
   572  		return nil // no need to stop because not started yet
   573  	}
   574  	if sourceID != "" && w.cfg.SourceID != sourceID {
   575  		return terror.ErrWorkerSourceNotMatch
   576  	}
   577  	s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL)
   578  	s.setWorker(nil, false)
   579  	s.setSourceStatus("", nil, false)
   580  	w.Stop(graceful)
   581  	return nil
   582  }
   583  
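        // handleSourceBound processes source bound events from boundCh until the context is done
        // or the channels are closed; retryable etcd errors are returned to the caller for retry.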
   584  func (s *Server) handleSourceBound(ctx context.Context, boundCh chan ha.SourceBound, errCh chan error) error {
   585  OUTER:
   586  	for {
   587  		select {
   588  		case <-ctx.Done():
   589  			break OUTER
   590  		case bound, ok := <-boundCh:
   591  			if !ok {
   592  				break OUTER
   593  			}
   594  			log.L().Info("receive source bound", zap.Stringer("bound", bound), zap.Bool("is deleted", bound.IsDeleted))
   595  			err := s.operateSourceBound(bound)
   596  			s.setSourceStatus(bound.Source, err, true)
   597  			if err != nil {
   598  				opErrCounter.WithLabelValues(s.cfg.Name, opErrTypeSourceBound).Inc()
   599  				log.L().Error("fail to operate sourceBound on worker", zap.Stringer("bound", bound), zap.Bool("is deleted", bound.IsDeleted), zap.Error(err))
   600  				if etcdutil.IsRetryableError(err) {
   601  					return err
   602  				}
   603  			}
   604  		case err, ok := <-errCh:
   605  			if !ok {
   606  				break OUTER
   607  			}
   608  			// TODO: Deal with err
   609  			log.L().Error("WatchSourceBound received an error", zap.Error(err))
   610  			if etcdutil.IsRetryableError(err) {
   611  				return err
   612  			}
   613  		}
   614  	}
   615  	log.L().Info("handleSourceBound will quit now")
   616  	return nil
   617  }
   618  
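        // handleRelayConfig processes relay source events from relayCh until the context is done
        // or the channels are closed; retryable etcd errors are returned to the caller for retry.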
   619  func (s *Server) handleRelayConfig(ctx context.Context, relayCh chan ha.RelaySource, errCh chan error) error {
   620  OUTER:
   621  	for {
   622  		select {
   623  		case <-ctx.Done():
   624  			break OUTER
   625  		case relaySource, ok := <-relayCh:
   626  			if !ok {
   627  				break OUTER
   628  			}
   629  			log.L().Info("receive relay source", zap.String("relay source", relaySource.Source), zap.Bool("is deleted", relaySource.IsDeleted))
   630  			err := s.operateRelaySource(relaySource)
   631  			s.setSourceStatus(relaySource.Source, err, true)
   632  			if err != nil {
   633  				opErrCounter.WithLabelValues(s.cfg.Name, opErrTypeRelaySource).Inc()
   634  				log.L().Error("fail to operate relay source on worker",
   635  					zap.String("relay source", relaySource.Source),
   636  					zap.Bool("is deleted", relaySource.IsDeleted),
   637  					zap.Error(err))
   638  				if etcdutil.IsRetryableError(err) {
   639  					return err
   640  				}
   641  			}
   642  		case err, ok := <-errCh:
   643  			// currently no value is sent to errCh
   644  			if !ok {
   645  				break OUTER
   646  			}
   647  			// TODO: Deal with err
   648  			log.L().Error("WatchRelayConfig received an error", zap.Error(err))
   649  			if etcdutil.IsRetryableError(err) {
   650  				return err
   651  			}
   652  		}
   653  	}
   654  	log.L().Info("worker server is closed, handleRelayConfig will quit now")
   655  	return nil
   656  }
   657  
   658  func (s *Server) operateSourceBound(bound ha.SourceBound) error {
   659  	if bound.IsDeleted {
   660  		return s.disableHandleSubtasks(bound.Source)
   661  	}
   662  	scm, _, err := ha.GetSourceCfg(s.etcdClient, bound.Source, bound.Revision)
   663  	if err != nil {
   664  		// TODO: need retry
   665  		return err
   666  	}
   667  	sourceCfg, ok := scm[bound.Source]
   668  	if !ok {
   669  		return terror.ErrWorkerFailToGetSourceConfigFromEtcd.Generate(bound.Source)
   670  	}
   671  	return s.enableHandleSubtasks(sourceCfg, true)
   672  }
   673  
   674  func (s *Server) enableHandleSubtasks(sourceCfg *config.SourceConfig, needLock bool) error {
   675  	if needLock {
   676  		s.mu.Lock()
   677  		defer s.mu.Unlock()
   678  	}
   679  
   680  	w, err := s.getOrStartWorker(sourceCfg, false)
   681  	s.setSourceStatus(sourceCfg.SourceID, err, false)
   682  	if err != nil {
   683  		return err
   684  	}
   685  
   686  	if sourceCfg.EnableRelay {
   687  		log.L().Info("will start relay by `enable-relay` in source config")
   688  		if err2 := w.EnableRelay(true); err2 != nil {
   689  			log.L().Error("found an `enable-relay: true` source, but failed to enable relay for the DM worker",
   690  				zap.Error(err2))
   691  			return err2
   692  		}
   693  	} else if w.startedRelayBySourceCfg {
   694  		log.L().Info("will disable relay by `enable-relay: false` in source config")
   695  		w.DisableRelay()
   696  	}
   697  
   698  	if err2 := w.EnableHandleSubtasks(); err2 != nil {
   699  		s.setSourceStatus(sourceCfg.SourceID, err2, false)
   700  		return err2
   701  	}
   702  	return nil
   703  }
   704  
   705  func (s *Server) disableHandleSubtasks(source string) error {
   706  	s.mu.Lock()
   707  	defer s.mu.Unlock()
   708  	w := s.getSourceWorker(false)
   709  	if w == nil {
   710  		log.L().Warn("worker has already stopped before DisableHandleSubtasks", zap.String("source", source))
   711  		return nil
   712  	}
   713  
   714  	w.DisableHandleSubtasks()
   715  
   716  	// now the worker is unbound, stop relay if it's started by source config
   717  	if w.cfg.EnableRelay && w.startedRelayBySourceCfg {
   718  		log.L().Info("stop relay because the source is unbound")
   719  		w.DisableRelay()
   720  	}
   721  
   722  	var err error
   723  	if !w.relayEnabled.Load() {
   724  		log.L().Info("relay is not enabled after disabling subtask, so stop worker")
   725  		err = s.stopSourceWorker(source, false, true)
   726  	}
   727  	return err
   728  }
   729  
   730  func (s *Server) operateRelaySource(relaySource ha.RelaySource) error {
   731  	if relaySource.IsDeleted {
   732  		return s.disableRelay(relaySource.Source)
   733  	}
   734  	scm, _, err := ha.GetSourceCfg(s.etcdClient, relaySource.Source, relaySource.Revision)
   735  	if err != nil {
   736  		// TODO: need retry
   737  		return err
   738  	}
   739  	sourceCfg, ok := scm[relaySource.Source]
   740  	if !ok {
   741  		return terror.ErrWorkerFailToGetSourceConfigFromEtcd.Generate(relaySource.Source)
   742  	}
   743  	return s.enableRelay(sourceCfg, true)
   744  }
   745  
   746  func (s *Server) enableRelay(sourceCfg *config.SourceConfig, needLock bool) error {
   747  	if needLock {
   748  		s.mu.Lock()
   749  		defer s.mu.Unlock()
   750  	}
   751  
   752  	w, err2 := s.getOrStartWorker(sourceCfg, false)
   753  	s.setSourceStatus(sourceCfg.SourceID, err2, false)
   754  	if err2 != nil {
   755  		// if DM-worker can't handle the pre-assigned source before keepalive, it simply exits with the error,
   756  		// because no re-assignment mechanism exists for a keepalived DM-worker yet.
   757  		return err2
   758  	}
   759  	if err2 = w.EnableRelay(false); err2 != nil {
   760  		s.setSourceStatus(sourceCfg.SourceID, err2, false)
   761  		return err2
   762  	}
   763  	s.UpdateKeepAliveTTL(s.cfg.RelayKeepAliveTTL)
   764  	return nil
   765  }
   766  
   767  func (s *Server) disableRelay(source string) error {
   768  	s.mu.Lock()
   769  	defer s.mu.Unlock()
   770  	w := s.getSourceWorker(false)
   771  	if w == nil {
   772  		log.L().Warn("worker has already stopped before DisableRelay", zap.Any("relaySource", source))
   773  		return nil
   774  	}
   775  	s.UpdateKeepAliveTTL(s.cfg.KeepAliveTTL)
   776  	w.DisableRelay()
   777  	var err error
   778  	if !w.subTaskEnabled.Load() {
   779  		log.L().Info("subtask is not enabled after disabling relay, so stop worker")
   780  		err = s.stopSourceWorker(source, false, true)
   781  	}
   782  	return err
   783  }
   784  
   785  // QueryStatus implements WorkerServer.QueryStatus.
   786  func (s *Server) QueryStatus(ctx context.Context, req *pb.QueryStatusRequest) (*pb.QueryStatusResponse, error) {
   787  	log.L().Info("", zap.String("request", "QueryStatus"), zap.Stringer("payload", req))
   788  
   789  	sourceStatus := s.getSourceStatus(true)
   790  	sourceStatus.Worker = s.cfg.Name
   791  	resp := &pb.QueryStatusResponse{
   792  		Result:       true,
   793  		SourceStatus: &sourceStatus,
   794  	}
   795  
   796  	w := s.getSourceWorker(true)
   797  	if w == nil {
   798  		log.L().Warn("fail to call QueryStatus, because no mysql source is being handled in the worker")
   799  		resp.Result = false
   800  		resp.Msg = terror.ErrWorkerNoStart.Error()
   801  		return resp, nil
   802  	}
   803  
   804  	var err error
   805  	resp.SubTaskStatus, sourceStatus.RelayStatus, err = w.QueryStatus(ctx, req.Name)
   806  
   807  	if err != nil {
   808  		resp.Msg = fmt.Sprintf("error when get master status: %v", err)
   809  	} else if len(resp.SubTaskStatus) == 0 {
   810  		resp.Msg = "no sub task started"
   811  	}
   812  	return resp, nil
   813  }
   814  
   815  // PurgeRelay implements WorkerServer.PurgeRelay.
   816  func (s *Server) PurgeRelay(ctx context.Context, req *pb.PurgeRelayRequest) (*pb.CommonWorkerResponse, error) {
   817  	log.L().Info("", zap.String("request", "PurgeRelay"), zap.Stringer("payload", req))
   818  	w := s.getSourceWorker(true)
   819  	if w == nil {
   820  		log.L().Warn("fail to call PurgeRelay, because no mysql source is being handled in the worker")
   821  		return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil
   822  	}
   823  
   824  	err := w.PurgeRelay(ctx, req)
   825  	if err != nil {
   826  		log.L().Error("fail to purge relay", zap.String("request", "PurgeRelay"), zap.Stringer("payload", req), zap.Error(err))
   827  	}
   828  	return makeCommonWorkerResponse(err), nil
   829  }
   830  
   831  // OperateSchema operates schema for an upstream table.
   832  func (s *Server) OperateSchema(ctx context.Context, req *pb.OperateWorkerSchemaRequest) (*pb.CommonWorkerResponse, error) {
   833  	log.L().Info("", zap.String("request", "OperateSchema"), zap.Stringer("payload", req))
   834  
   835  	w := s.getSourceWorker(true)
   836  	if w == nil {
   837  		log.L().Warn("fail to call OperateSchema, because no mysql source is being handled in the worker")
   838  		return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil
   839  	}
   840  	w.RLock()
   841  	sourceID := w.cfg.SourceID
   842  	w.RUnlock()
   843  	if req.Source != sourceID {
   844  		log.L().Error("fail to call OperateSchema, because source mismatch", zap.String("request", req.Source), zap.String("current", sourceID))
   845  		return makeCommonWorkerResponse(terror.ErrWorkerSourceNotMatch.Generate()), nil
   846  	}
   847  
   848  	schema, err := w.OperateSchema(ctx, req)
   849  	if err != nil {
   850  		return makeCommonWorkerResponse(err), nil
   851  	}
   852  	return &pb.CommonWorkerResponse{
   853  		Result: true,
   854  		Msg:    schema, // if any schema return for `GET`, we place it in the `msg` field now.
   855  		Source: req.Source,
   856  		Worker: s.cfg.Name,
   857  	}, nil
   858  }
   859  
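        // getOrStartWorker returns the current source worker if it already handles cfg.SourceID,
        // otherwise it starts a new one; it is an error if a worker for another source is running.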
   860  func (s *Server) getOrStartWorker(cfg *config.SourceConfig, needLock bool) (*SourceWorker, error) {
   861  	if needLock {
   862  		s.mu.Lock()
   863  		defer s.mu.Unlock()
   864  	}
   865  
   866  	if w := s.getSourceWorker(false); w != nil {
   867  		if w.cfg.SourceID == cfg.SourceID {
   868  			log.L().Info("mysql source is being handled", zap.String("sourceID", s.worker.cfg.SourceID))
   869  			return w, nil
   870  		}
   871  		return nil, terror.ErrWorkerAlreadyStart.Generate(w.name, w.cfg.SourceID, cfg.SourceID)
   872  	}
   873  
   874  	log.L().Info("will start a new worker", zap.String("sourceID", cfg.SourceID))
   875  	w, err := NewSourceWorker(cfg, s.etcdClient, s.cfg.Name, s.cfg.RelayDir)
   876  	if err != nil {
   877  		return nil, err
   878  	}
   879  	s.setWorker(w, false)
   880  
   881  	go w.Start()
   882  
   883  	isStarted := utils.WaitSomething(50, 100*time.Millisecond, func() bool {
   884  		return !w.closed.Load()
   885  	})
   886  	if !isStarted {
   887  		// TODO: add more mechanisms to wait for, or unbind, the source
   888  		return nil, terror.ErrWorkerNoStart
   889  	}
   890  	return w, nil
   891  }
   892  
   893  func makeCommonWorkerResponse(reqErr error) *pb.CommonWorkerResponse {
   894  	resp := &pb.CommonWorkerResponse{
   895  		Result: true,
   896  	}
   897  	if reqErr != nil {
   898  		resp.Result = false
   899  		resp.Msg = reqErr.Error()
   900  	}
   901  	return resp
   902  }
   903  
   904  // all subtasks in subTaskCfgs should have the same source.
   905  // this function returns the min location among all subtasks, used for the relay's location.
   906  func getMinLocInAllSubTasks(ctx context.Context, subTaskCfgs map[string]config.SubTaskConfig) (minLoc *binlog.Location, err error) {
   907  	for _, subTaskCfg := range subTaskCfgs {
   908  		loc, err := getMinLocForSubTaskFunc(ctx, subTaskCfg)
   909  		if err != nil {
   910  			return nil, err
   911  		}
   912  
   913  		if loc == nil {
   914  			continue
   915  		}
   916  
   917  		if minLoc == nil {
   918  			minLoc = loc
   919  		} else if binlog.CompareLocation(*minLoc, *loc, subTaskCfg.EnableGTID) >= 1 {
   920  			minLoc = loc
   921  		}
   922  	}
   923  
   924  	return minLoc, nil
   925  }
   926  
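        // getMinLocForSubTask loads the subtask's saved global checkpoint and returns its location,
        // or nil if the task mode has no sync unit.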
   927  func getMinLocForSubTask(ctx context.Context, subTaskCfg config.SubTaskConfig) (minLoc *binlog.Location, err error) {
   928  	if !config.HasSync(subTaskCfg.Mode) {
   929  		return nil, nil
   930  	}
   931  	subTaskCfg2, err := subTaskCfg.DecryptedClone()
   932  	if err != nil {
   933  		return nil, errors.Annotate(err, "get min position from checkpoint")
   934  	}
   935  
   936  	tctx := tcontext.NewContext(ctx, log.L())
   937  	checkpoint := syncer.NewRemoteCheckPoint(tctx, subTaskCfg2, nil, subTaskCfg2.SourceID)
   938  	err = checkpoint.Init(tctx)
   939  	if err != nil {
   940  		return nil, errors.Annotate(err, "get min position from checkpoint")
   941  	}
   942  	defer checkpoint.Close()
   943  
   944  	err = checkpoint.Load(tctx)
   945  	if err != nil {
   946  		return nil, errors.Annotate(err, "get min position from checkpoint")
   947  	}
   948  
   949  	location := checkpoint.GlobalPoint()
   950  	return &location, nil
   951  }
   952  
   953  // HandleError handles a worker error request.
   954  func (s *Server) HandleError(ctx context.Context, req *pb.HandleWorkerErrorRequest) (*pb.CommonWorkerResponse, error) {
   955  	log.L().Info("", zap.String("request", "HandleError"), zap.Stringer("payload", req))
   956  
   957  	w := s.getSourceWorker(true)
   958  	if w == nil {
   959  		log.L().Warn("fail to call HandleError, because no mysql source is being handled in the worker")
   960  		return makeCommonWorkerResponse(terror.ErrWorkerNoStart.Generate()), nil
   961  	}
   962  
   963  	msg, err := w.HandleError(ctx, req)
   964  	if err != nil {
   965  		return makeCommonWorkerResponse(err), nil
   966  	}
   967  	return &pb.CommonWorkerResponse{
   968  		Result: true,
   969  		Worker: s.cfg.Name,
   970  		Msg:    msg,
   971  	}, nil
   972  }
   973  
   974  // GetWorkerCfg gets the worker config.
   975  func (s *Server) GetWorkerCfg(ctx context.Context, req *pb.GetWorkerCfgRequest) (*pb.GetWorkerCfgResponse, error) {
   976  	log.L().Info("", zap.String("request", "GetWorkerCfg"), zap.Stringer("payload", req))
   977  	var err error
   978  	resp := &pb.GetWorkerCfgResponse{}
   979  
   980  	resp.Cfg, err = s.cfg.Toml()
   981  	return resp, err
   982  }
   983  
   984  // CheckSubtasksCanUpdate checks whether the input subtask config can be updated.
   985  func (s *Server) CheckSubtasksCanUpdate(ctx context.Context, req *pb.CheckSubtasksCanUpdateRequest) (*pb.CheckSubtasksCanUpdateResponse, error) {
   986  	log.L().Info("", zap.String("request", "CheckSubtasksCanUpdate"), zap.Stringer("payload", req))
   987  	resp := &pb.CheckSubtasksCanUpdateResponse{}
   988  	defer func() {
   989  		log.L().Info("", zap.String("request", "CheckSubtasksCanUpdate"), zap.Stringer("resp", resp))
   990  	}()
   991  	w := s.getSourceWorker(true)
   992  	if w == nil {
   993  		msg := "fail to call CheckSubtasksCanUpdate, because no mysql source is being handled in the worker"
   994  		log.L().Warn(msg)
   995  		resp.Msg = msg
   996  		return resp, nil
   997  	}
   998  	cfg := config.NewSubTaskConfig()
   999  	if err := cfg.Decode(req.SubtaskCfgTomlString, false); err != nil {
  1000  		resp.Msg = err.Error()
  1001  		// nolint:nilerr
  1002  		return resp, nil
  1003  	}
  1004  	if err := w.CheckCfgCanUpdated(cfg); err != nil {
  1005  		resp.Msg = err.Error()
  1006  		// nolint:nilerr
  1007  		return resp, nil
  1008  	}
  1009  	resp.Success = true
  1010  	return resp, nil
  1011  }
  1012  
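        // GetWorkerValidatorStatus returns the validator status and per-table validation status
        // of the given task handled by this worker.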
  1013  func (s *Server) GetWorkerValidatorStatus(ctx context.Context, req *pb.GetValidationStatusRequest) (*pb.GetValidationStatusResponse, error) {
  1014  	log.L().Info("", zap.String("request", "GetWorkerValidatorStatus"), zap.Stringer("payload", req))
  1015  
  1016  	resp := &pb.GetValidationStatusResponse{
  1017  		Result: true,
  1018  	}
  1019  	w := s.getSourceWorker(true)
  1020  	if w == nil {
  1021  		log.L().Warn("fail to call GetWorkerValidatorStatus, because no mysql source is being handled in the worker")
  1022  		resp.Result = false
  1023  		resp.Msg = terror.ErrWorkerNoStart.Error()
  1024  		return resp, nil
  1025  	}
  1026  	validatorStatus, err := w.GetValidatorStatus(req.TaskName)
  1027  	if err != nil {
  1028  		return resp, err
  1029  	}
  1030  	res, err := w.GetValidatorTableStatus(req.TaskName, req.FilterStatus)
  1031  	if err != nil {
  1032  		return resp, err
  1033  	}
  1034  
  1035  	resp.Validators = []*pb.ValidationStatus{validatorStatus}
  1036  	resp.TableStatuses = res
  1037  	return resp, nil
  1038  }
  1039  
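        // GetValidatorError returns validation errors of the given task, filtered by error state.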
  1040  func (s *Server) GetValidatorError(ctx context.Context, req *pb.GetValidationErrorRequest) (*pb.GetValidationErrorResponse, error) {
  1041  	w := s.getSourceWorker(true)
  1042  	resp := &pb.GetValidationErrorResponse{
  1043  		Result: true,
  1044  	}
  1045  	if w == nil {
  1046  		log.L().Warn("fail to get validator error, because no mysql source is being handled in the worker")
  1047  		resp.Result = false
  1048  		resp.Msg = terror.ErrWorkerNoStart.Error()
  1049  		return resp, nil
  1050  	}
  1051  	validatorErrs, err := w.GetWorkerValidatorErr(req.TaskName, req.ErrState)
  1052  	if err != nil {
  1053  		resp.Msg = err.Error()
  1054  		resp.Result = false
  1055  	} else {
  1056  		resp.Error = validatorErrs
  1057  	}
  1058  	return resp, nil
  1059  }
  1060  
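        // OperateValidatorError applies the requested operation to validation errors of the given task.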
  1061  func (s *Server) OperateValidatorError(ctx context.Context, req *pb.OperateValidationErrorRequest) (*pb.OperateValidationErrorResponse, error) {
  1062  	log.L().Info("operate validation error", zap.Stringer("payload", req))
  1063  	w := s.getSourceWorker(true)
  1064  	resp := &pb.OperateValidationErrorResponse{
  1065  		Result: true,
  1066  	}
  1067  	if w == nil {
  1068  		log.L().Warn("fail to operate validator error, because no mysql source is being handled in the worker")
  1069  		resp.Result = false
  1070  		resp.Msg = terror.ErrWorkerNoStart.Error()
  1071  		return resp, nil
  1072  	}
  1073  	err := w.OperateWorkerValidatorErr(req.TaskName, req.Op, req.ErrId, req.IsAllError)
  1074  	if err != nil {
  1075  		resp.Result = false
  1076  		resp.Msg = err.Error()
  1077  		//nolint:nilerr
  1078  		return resp, nil
  1079  	}
  1080  	//nolint:nilerr
  1081  	return resp, nil
  1082  }
  1083  
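        // UpdateValidator updates the validation config of the given task on this worker.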
  1084  func (s *Server) UpdateValidator(ctx context.Context, req *pb.UpdateValidationWorkerRequest) (*pb.CommonWorkerResponse, error) {
  1085  	log.L().Info("update validation", zap.Stringer("payload", req))
  1086  	w := s.getSourceWorker(true)
  1087  	resp := &pb.CommonWorkerResponse{
  1088  		Result: true,
  1089  	}
  1090  	if w == nil {
  1091  		log.L().Warn("fail to update validator, because no mysql source is being handled in the worker")
  1092  		resp.Result = false
  1093  		resp.Msg = terror.ErrWorkerNoStart.Error()
  1094  		return resp, nil
  1095  	}
  1096  	err := w.UpdateWorkerValidator(req)
  1097  	if err != nil {
  1098  		resp.Result = false
  1099  		resp.Msg = err.Error()
  1100  		//nolint:nilerr
  1101  		return resp, nil
  1102  	}
  1103  	resp.Source = w.cfg.SourceID
  1104  	resp.Worker = s.cfg.Name
  1105  	//nolint:nilerr
  1106  	return resp, nil
  1107  }