github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/scheduler/scheduler.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package scheduler
    15  
    16  import (
    17  	"context"
    18  	"sort"
    19  	"sync"
    20  	"time"
    21  
    22  	"github.com/pingcap/errors"
    23  	"github.com/pingcap/failpoint"
    24  	"github.com/pingcap/tiflow/dm/config"
    25  	"github.com/pingcap/tiflow/dm/config/dbconfig"
    26  	"github.com/pingcap/tiflow/dm/config/security"
    27  	"github.com/pingcap/tiflow/dm/master/metrics"
    28  	"github.com/pingcap/tiflow/dm/master/workerrpc"
    29  	"github.com/pingcap/tiflow/dm/pb"
    30  	"github.com/pingcap/tiflow/dm/pkg/etcdutil"
    31  	"github.com/pingcap/tiflow/dm/pkg/ha"
    32  	"github.com/pingcap/tiflow/dm/pkg/log"
    33  	"github.com/pingcap/tiflow/dm/pkg/terror"
    34  	"github.com/pingcap/tiflow/dm/pkg/utils"
    35  	clientv3 "go.etcd.io/etcd/client/v3"
    36  	"go.uber.org/atomic"
    37  	"go.uber.org/zap"
    38  )
    39  
    40  const (
    41  	maxQueryWorkerRetryNum = 10
    42  )
    43  
    44  // Scheduler schedules tasks for DM-worker instances, including:
    45  // - register/unregister DM-worker instances.
    46  // - observe the online/offline status of DM-worker instances.
    47  // - observe add/remove operations for upstream sources' config.
    48  // - schedule upstream sources to DM-worker instances.
    49  // - schedule data migration subtask operations.
    50  // - holds agents of DM-worker instances.
    51  // NOTE: the DM-master server MUST wait for this scheduler become started before handling client requests.
    52  // Cases trigger a source-to-worker bound try:
    53  // - a worker from Offline to Free:
    54  //   - receive keep-alive.
    55  //
    56  // - a worker from Bound to Free:
    57  //   - trigger by unbound: `a source removed`.
    58  //
    59  // - a new source added:
    60  //   - add source request from user.
    61  //
    62  // - a source unbound from another worker:
    63  //   - trigger by unbound: `a worker from Bound to Offline`.
    64  //   - TODO(csuzhangxc): design a strategy to ensure the old worker already shutdown its work.
    65  //
    66  // Cases trigger a source-to-worker unbound try.
    67  // - a worker from Bound to Offline:
    68  //   - lost keep-alive.
    69  //
    70  // - a source removed:
    71  //   - remove source request from user.
    72  //
    73  // TODO: try to handle the return `err` of etcd operations,
    74  //
    75  //	because may put into etcd, but the response to the etcd client interrupted.
    76  //
    77  // Relay scheduling:
    78  //   - scheduled by source
    79  //     DM-worker will enable relay according to its bound source, in current implementation, it will read `enable-relay`
    80  //     of source config and decide whether to enable relay.
    81  //     turn on `enable-relay`:
    82  //   - use `enable-relay: true` when create source
    83  //   - `start-relay -s source` to dynamically change `enable-relay`
    84  //     turn off `enable-relay`:
    85  //   - use `enable-relay: false` when create source
    86  //   - `stop-relay -s source` to dynamically change `enable-relay`
    87  //   - found conflict schedule type with (source, worker) when scheduler bootstrap
    88  //   - scheduled by (source, worker)
    89  //     DM-worker will check if relay is assigned to it no matter it's bound or not. In current implementation, it will
    90  //     read UpstreamRelayWorkerKeyAdapter in etcd.
    91  //     add UpstreamRelayWorkerKeyAdapter:
    92  //   - use `start-relay -s source -w worker`
    93  //     remove UpstreamRelayWorkerKeyAdapter:
    94  //   - use `stop-relay -s source -w worker`
    95  //   - remove worker by `offline-member`
    96  type Scheduler struct {
    97  	mu sync.RWMutex
    98  
    99  	logger log.Logger
   100  
   101  	started atomic.Bool // whether the scheduler already started for work.
   102  	cancel  context.CancelFunc
   103  	wg      sync.WaitGroup
   104  
   105  	etcdCli *clientv3.Client
   106  
   107  	// must acquire latch from subtaskLatch before accessing subTaskCfgs and expectSubTaskStages,
   108  	// the latch key is task name.
   109  	// TODO: also sourceLatch, relayLatch
   110  	subtaskLatch *latches
   111  
   112  	// all source configs, source ID -> source config.
   113  	// add:
   114  	// - add source by user request (calling `AddSourceCfg`).
   115  	// - recover from etcd (calling `recoverSources`).
   116  	// delete:
   117  	// - remove source by user request (calling `RemoveSourceCfg`).
   118  	sourceCfgs map[string]*config.SourceConfig
   119  
   120  	// all subtask configs, task name -> source ID -> subtask config.
   121  	// add:
   122  	// - add/start subtask by user request (calling `AddSubTasks`).
   123  	// - recover from etcd (calling `recoverSubTasks`).
   124  	// delete:
   125  	// - remove/stop subtask by user request (calling `RemoveSubTasks`).
   126  	subTaskCfgs sync.Map
   127  
   128  	// all DM-workers, worker name -> worker.
   129  	// add:
   130  	// - add worker by user request (calling `AddWorker`).
   131  	// - recover from etcd (calling `recoverWorkersBounds`).
   132  	// delete:
   133  	// - remove worker by user request (calling `RemoveWorker`).
   134  	workers map[string]*Worker
   135  
   136  	// all bound relationship, source ID -> worker.
   137  	// add:
   138  	// - when bind a source to a worker, in updateStatusToBound
   139  	// delete:
   140  	// - when unbind a source from a worker, in updateStatusToUnbound
   141  	// see `Cases trigger a source-to-worker bound try` above.
   142  	bounds map[string]*Worker
   143  
   144  	// unbound (pending to bound) sources.
   145  	// NOTE: refactor to support scheduling by priority.
   146  	// add:
   147  	// - add source by user request (calling `AddSourceCfg`).
   148  	// - recover from etcd (calling `recoverWorkersBounds`).
   149  	// - when the bounding worker become offline, in updateStatusToUnbound.
   150  	// delete:
   151  	// - remove source by user request (calling `RemoveSourceCfg`).
   152  	// - when bound the source to a worker, in updateStatusToBound.
   153  	unbounds map[string]struct{}
   154  
   155  	// a mirror of bounds whose element is not deleted when worker unbound. worker -> SourceBound
   156  	lastBound map[string]ha.SourceBound
   157  
   158  	// expectant relay stages for sources, source ID -> stage.
   159  	// add:
   160  	// - bound the source to a worker (at first time).
   161  	// - recover from etcd (calling `recoverSources`).
   162  	// update:
   163  	// - update stage by user request (calling `UpdateExpectRelayStage`).
   164  	// delete:
   165  	// - remove source by user request (calling `RemoveSourceCfg`).
   166  	expectRelayStages map[string]ha.Stage
   167  
   168  	// expectant subtask stages for tasks & sources, task name -> source ID -> stage.
   169  	// add:
   170  	// - add/start subtask by user request (calling `AddSubTasks`).
   171  	// - recover from etcd (calling `recoverSubTasks`).
   172  	// update:
   173  	// - update stage by user request (calling `UpdateExpectSubTaskStage`).
   174  	// delete:
   175  	// - remove/stop subtask by user request (calling `RemoveSubTasks`).
   176  	expectSubTaskStages sync.Map
   177  
   178  	// a source has its relay workers. source-id -> set(worker-name)
   179  	// add:
   180  	// - start-relay
   181  	// - recover from etcd (calling `recoverRelayConfigs`)
   182  	// delete:
   183  	// - stop-relay
   184  	relayWorkers map[string]map[string]struct{}
   185  
   186  	// expectant validator stages, task name -> source ID -> stage.
   187  	// add:
   188  	// - on subtask start with validator mode not none
   189  	// - start validator manually
   190  	// - recover from etcd
   191  	// update
   192  	// - update stage by user request
   193  	// delete:
   194  	// - when subtask is removed by user request
   195  	expectValidatorStages sync.Map
   196  
   197  	// workers in load stage
   198  	// task -> source -> worker
   199  	loadTasks map[string]map[string]string
   200  
   201  	securityCfg security.Security
   202  }
   203  
   204  // NewScheduler creates a new scheduler instance.
   205  func NewScheduler(pLogger *log.Logger, securityCfg security.Security) *Scheduler {
   206  	return &Scheduler{
   207  		logger:            pLogger.WithFields(zap.String("component", "scheduler")),
   208  		subtaskLatch:      newLatches(),
   209  		sourceCfgs:        make(map[string]*config.SourceConfig),
   210  		workers:           make(map[string]*Worker),
   211  		bounds:            make(map[string]*Worker),
   212  		unbounds:          make(map[string]struct{}),
   213  		lastBound:         make(map[string]ha.SourceBound),
   214  		expectRelayStages: make(map[string]ha.Stage),
   215  		relayWorkers:      make(map[string]map[string]struct{}),
   216  		loadTasks:         make(map[string]map[string]string),
   217  		securityCfg:       securityCfg,
   218  	}
   219  }
   220  
   221  // Start starts the scheduler for work.
   222  // NOTE: for logic errors, it should start without returning errors (but report via metrics or log) so that the user can fix them.
   223  func (s *Scheduler) Start(pCtx context.Context, etcdCli *clientv3.Client) (err error) {
   224  	s.logger.Info("the scheduler is starting")
   225  
   226  	s.mu.Lock()
   227  	defer func() {
   228  		if err != nil {
   229  			s.CloseAllWorkers()
   230  		}
   231  		s.mu.Unlock()
   232  	}()
   233  
   234  	if s.started.Load() {
   235  		return terror.ErrSchedulerStarted.Generate()
   236  	}
   237  
   238  	s.etcdCli = etcdCli // set s.etcdCli first for safety, observeWorkerEvent will use s.etcdCli in retry
   239  	s.reset()           // reset previous status.
   240  
   241  	// recover previous status from etcd.
   242  	err = s.recoverSources()
   243  	if err != nil {
   244  		return err
   245  	}
   246  	err = s.recoverSubTasks()
   247  	if err != nil {
   248  		return err
   249  	}
   250  	err = s.recoverRelayConfigs()
   251  	if err != nil {
   252  		return err
   253  	}
   254  
   255  	var loadTaskRev int64
   256  	loadTaskRev, err = s.recoverLoadTasks(false)
   257  	if err != nil {
   258  		return err
   259  	}
   260  
   261  	var rev int64
   262  	rev, err = s.recoverWorkersBounds()
   263  	if err != nil {
   264  		return err
   265  	}
   266  
   267  	// check if we can bind free or relay source and workers
   268  	for _, w := range s.workers {
   269  		if w.stage == WorkerFree || w.stage == WorkerRelay {
   270  			bound, err := s.tryBoundForWorker(w)
   271  			if err != nil {
   272  				return err
   273  			}
   274  			if !bound {
   275  				break
   276  			}
   277  		}
   278  	}
   279  
   280  	ctx, cancel := context.WithCancel(pCtx)
   281  
   282  	s.wg.Add(1)
   283  	go func(rev1 int64) {
   284  		defer s.wg.Done()
   285  		// starting to observe status of DM-worker instances.
   286  		// TODO: handle fatal error from observeWorkerEvent
   287  		//nolint:errcheck
   288  		s.observeWorkerEvent(ctx, rev1)
   289  	}(rev)
   290  
   291  	s.wg.Add(1)
   292  	go func(rev1 int64) {
   293  		defer s.wg.Done()
   294  		// starting to observe load task.
   295  		// TODO: handle fatal error from observeLoadTask
   296  		//nolint:errcheck
   297  		s.observeLoadTask(ctx, rev1)
   298  	}(loadTaskRev)
   299  
   300  	s.started.Store(true) // started now
   301  	s.cancel = cancel
   302  	s.logger.Info("the scheduler has started")
   303  	return nil
   304  }
   305  
   306  // Close closes the scheduler.
   307  func (s *Scheduler) Close() {
   308  	s.mu.Lock()
   309  
   310  	if !s.started.Load() {
   311  		s.mu.Unlock()
   312  		return
   313  	}
   314  
   315  	s.logger.Info("the scheduler is closing")
   316  	if s.cancel != nil {
   317  		s.cancel()
   318  		s.cancel = nil
   319  	}
   320  	s.CloseAllWorkers()
   321  	s.mu.Unlock()
   322  
   323  	// need to wait for goroutines to return which may hold the mutex.
   324  	s.wg.Wait()
   325  
   326  	s.mu.Lock()
   327  	defer s.mu.Unlock()
   328  	s.started.Store(false) // closed now.
   329  	s.logger.Info("the scheduler has closed")
   330  }
   331  
   332  // CloseAllWorkers closes all the scheduler's workers.
   333  func (s *Scheduler) CloseAllWorkers() {
   334  	for _, worker := range s.workers {
   335  		worker.Close()
   336  	}
   337  }
   338  
   339  // AddSourceCfg adds the upstream source config to the cluster, and try to bound source to worker
   340  // NOTE: please verify the config before call this.
   341  func (s *Scheduler) AddSourceCfg(cfg *config.SourceConfig) error {
   342  	s.mu.Lock()
   343  	defer s.mu.Unlock()
   344  
   345  	if !s.started.Load() {
   346  		return terror.ErrSchedulerNotStarted.Generate()
   347  	}
   348  
   349  	err := s.addSource(cfg)
   350  	if err != nil {
   351  		return err
   352  	}
   353  
   354  	// try to bound it to a Free worker.
   355  	_, err = s.tryBoundForSource(cfg.SourceID)
   356  	return err
   357  }
   358  
   359  // AddSourceCfgWithWorker adds the upstream source config to the cluster, and try to bound source to specify worker
   360  // NOTE: please verify the config before call this.
   361  func (s *Scheduler) AddSourceCfgWithWorker(cfg *config.SourceConfig, workerName string) error {
   362  	s.mu.Lock()
   363  	defer s.mu.Unlock()
   364  
   365  	if !s.started.Load() {
   366  		return terror.ErrSchedulerNotStarted.Generate()
   367  	}
   368  
   369  	// check whether worker exists.
   370  	w, ok := s.workers[workerName]
   371  	if !ok {
   372  		return terror.ErrSchedulerWorkerNotExist.Generate(workerName)
   373  	}
   374  
   375  	if w.stage != WorkerFree {
   376  		return terror.ErrSchedulerWorkerNotFree.Generate(workerName)
   377  	}
   378  
   379  	if err := s.addSource(cfg); err != nil {
   380  		return err
   381  	}
   382  
   383  	return s.boundSourceToWorker(cfg.SourceID, w)
   384  }
   385  
   386  // addSource adds the upstream source config to the cluster.
   387  func (s *Scheduler) addSource(cfg *config.SourceConfig) error {
   388  	// 1. check whether exists.
   389  	if _, ok := s.sourceCfgs[cfg.SourceID]; ok {
   390  		return terror.ErrSchedulerSourceCfgExist.Generate(cfg.SourceID)
   391  	}
   392  	// 2. put the config into etcd.
   393  	_, err := ha.PutSourceCfg(s.etcdCli, cfg)
   394  	if err != nil {
   395  		return err
   396  	}
   397  
   398  	// 3. record the config in the scheduler.
   399  	s.sourceCfgs[cfg.SourceID] = cfg
   400  	s.unbounds[cfg.SourceID] = struct{}{}
   401  	return nil
   402  }
   403  
   404  // UpdateSourceCfg update the upstream source config to the cluster.
   405  func (s *Scheduler) UpdateSourceCfg(cfg *config.SourceConfig) error {
   406  	s.mu.Lock()
   407  	defer s.mu.Unlock()
   408  
   409  	if !s.started.Load() {
   410  		return terror.ErrSchedulerNotStarted.Generate()
   411  	}
   412  
   413  	// 1. check whether the config exists.
   414  	_, ok := s.sourceCfgs[cfg.SourceID]
   415  	if !ok {
   416  		return terror.ErrSchedulerSourceCfgNotExist.Generate(cfg.SourceID)
   417  	}
   418  	// 2. check if tasks using this configuration are running
   419  	runningStage := pb.Stage_Running
   420  	if tasks := s.GetTaskNameListBySourceName(cfg.SourceID, &runningStage); len(tasks) > 0 {
   421  		return terror.ErrSchedulerSourceCfgUpdate.Generate(cfg.SourceID)
   422  	}
   423  	// 3. check if this source is enable relay
   424  	if _, ok := s.expectRelayStages[cfg.SourceID]; ok {
   425  		return terror.ErrSchedulerSourceCfgUpdate.Generate(cfg.SourceID)
   426  	}
   427  	// 4. put the config into etcd.
   428  	_, err := ha.PutSourceCfg(s.etcdCli, cfg)
   429  	if err != nil {
   430  		return err
   431  	}
   432  	// 5. record the config in the scheduler.
   433  	s.sourceCfgs[cfg.SourceID] = cfg
   434  	return nil
   435  }
   436  
   437  // RemoveSourceCfg removes the upstream source config in the cluster.
   438  // when removing the upstream source config, it should also remove:
   439  // - any existing relay stage.
   440  // - any source-worker bound relationship.
   441  func (s *Scheduler) RemoveSourceCfg(source string) error {
   442  	s.mu.Lock()
   443  	defer s.mu.Unlock()
   444  
   445  	if !s.started.Load() {
   446  		return terror.ErrSchedulerNotStarted.Generate()
   447  	}
   448  
   449  	// 1. check whether the config exists.
   450  	if _, ok := s.sourceCfgs[source]; !ok {
   451  		return terror.ErrSchedulerSourceCfgNotExist.Generate(source)
   452  	}
   453  
   454  	// 2. check whether any subtask or relay config exists for the source.
   455  	existingSubtasksM := make(map[string]struct{})
   456  	s.subTaskCfgs.Range(func(k, v interface{}) bool {
   457  		task := k.(string)
   458  		cfg := v.(map[string]config.SubTaskConfig)
   459  		for source2 := range cfg {
   460  			if source2 == source {
   461  				existingSubtasksM[task] = struct{}{}
   462  			}
   463  		}
   464  		return true
   465  	})
   466  
   467  	existingSubtasks := strMapToSlice(existingSubtasksM)
   468  	if len(existingSubtasks) > 0 {
   469  		return terror.ErrSchedulerSourceOpTaskExist.Generate(source, existingSubtasks)
   470  	}
   471  	relayWorkers := s.relayWorkers[source]
   472  	if len(relayWorkers) != 0 {
   473  		return terror.ErrSchedulerSourceOpRelayExist.Generate(source, strMapToSlice(relayWorkers))
   474  	}
   475  
   476  	// 3. find worker name by source ID.
   477  	var (
   478  		workerName string // empty should be fine below.
   479  		worker     *Worker
   480  	)
   481  	if w, ok2 := s.bounds[source]; ok2 {
   482  		worker = w
   483  		workerName = w.BaseInfo().Name
   484  	}
   485  
   486  	// 4. delete the info in etcd.
   487  	_, err := ha.DeleteSourceCfgRelayStageSourceBound(s.etcdCli, source, workerName)
   488  	if err != nil {
   489  		return err
   490  	}
   491  
   492  	// 5. delete the config and expectant stage in the scheduler
   493  	delete(s.sourceCfgs, source)
   494  	delete(s.expectRelayStages, source)
   495  
   496  	// 6. unbound for the source.
   497  	s.updateStatusToUnbound(source)
   498  
   499  	// 7. remove it from unbounds.
   500  	delete(s.unbounds, source)
   501  
   502  	// 8. try to bound the worker for another source.
   503  	if worker != nil {
   504  		_, err = s.tryBoundForWorker(worker)
   505  		if err != nil {
   506  			return err
   507  		}
   508  	}
   509  	return nil
   510  }
   511  
   512  // GetSourceCfgs gets all source cfgs, return nil when error happens.
   513  func (s *Scheduler) GetSourceCfgs() map[string]*config.SourceConfig {
   514  	s.mu.RLock()
   515  	defer s.mu.RUnlock()
   516  	clone := make(map[string]*config.SourceConfig, len(s.sourceCfgs))
   517  	for sourceID, sourceCfg := range s.sourceCfgs {
   518  		cloneCfg := sourceCfg.Clone()
   519  		clone[sourceID] = cloneCfg
   520  	}
   521  	return clone
   522  }
   523  
   524  // GetSourceCfgIDs gets all added source ID.
   525  func (s *Scheduler) GetSourceCfgIDs() []string {
   526  	s.mu.RLock()
   527  	defer s.mu.RUnlock()
   528  
   529  	id := make([]string, 0, len(s.sourceCfgs))
   530  	for i := range s.sourceCfgs {
   531  		id = append(id, i)
   532  	}
   533  	return id
   534  }
   535  
   536  // GetSourceCfgByID gets source config by source ID.
   537  func (s *Scheduler) GetSourceCfgByID(source string) *config.SourceConfig {
   538  	s.mu.RLock()
   539  	defer s.mu.RUnlock()
   540  	cfg, ok := s.sourceCfgs[source]
   541  	if !ok {
   542  		return nil
   543  	}
   544  	clone := *cfg
   545  	return &clone
   546  }
   547  
   548  // transferWorkerAndSource swaps two sources between two workers (maybe empty). The input means before invocation of
   549  // this function, left worker and left source are bound, right worker and right source are bound. After this function,
   550  // left worker should be bound to right source and vice versa.
   551  // lworker, "", "", rsource				This means an unbound source bound to a free worker
   552  // lworker, lsource, rworker, "" 		This means transfer a source from a worker to another free worker
   553  // lworker, lsource, "", rsource		This means transfer a worker from a bound source to another unbound source
   554  // lworker, lsource, rworker, rsource	This means transfer two bound relations.
   555  func (s *Scheduler) transferWorkerAndSource(lworker, lsource, rworker, rsource string) error {
   556  	// in first four arrays, index 0 is for left worker, index 1 is for right worker
   557  	var (
   558  		inputWorkers [2]string
   559  		inputSources [2]string
   560  		workers      [2]*Worker
   561  		bounds       [2]ha.SourceBound
   562  		boundWorkers []string
   563  		boundsToPut  []ha.SourceBound
   564  		ok           bool
   565  	)
   566  
   567  	s.logger.Info("transfer source and worker", zap.String("left worker", lworker), zap.String("left source", lsource), zap.String("right worker", rworker), zap.String("right source", rsource))
   568  
   569  	inputWorkers[0], inputWorkers[1] = lworker, rworker
   570  	inputSources[0], inputSources[1] = lsource, rsource
   571  
   572  	for i, workerName := range inputWorkers {
   573  		if workerName != "" {
   574  			workers[i], ok = s.workers[workerName]
   575  			// should not happen, avoid panic
   576  			if !ok {
   577  				s.logger.Error("could not found worker in scheduler", zap.String("worker", workerName))
   578  				return terror.ErrSchedulerWorkerNotExist.Generate(workerName)
   579  			}
   580  		}
   581  	}
   582  
   583  	// check if the swap is valid, to avoid we messing up metadata in etcd.
   584  	for i := range inputWorkers {
   585  		if inputWorkers[i] != "" {
   586  			got := workers[i].bound.Source
   587  			expect := inputSources[i]
   588  			if got != expect {
   589  				return terror.ErrSchedulerWrongWorkerInput.Generate(inputWorkers[i], expect, got)
   590  			}
   591  
   592  			// if the worker has started-relay for a source, it can't be bound to another source.
   593  			relaySource := workers[i].RelaySourceID()
   594  			another := i ^ 1 // make use of XOR to flip 0 and 1
   595  			toBindSource := inputSources[another]
   596  			if relaySource != "" && toBindSource != "" && relaySource != toBindSource {
   597  				return terror.ErrSchedulerBoundDiffWithStartedRelay.Generate(inputWorkers[i], toBindSource, relaySource)
   598  			}
   599  		}
   600  	}
   601  
   602  	// get current bound workers.
   603  	for i := range inputWorkers {
   604  		if inputWorkers[i] != "" && inputSources[i] != "" {
   605  			boundWorkers = append(boundWorkers, inputWorkers[i])
   606  		}
   607  	}
   608  
   609  	// del current bound relations.
   610  	if _, err := ha.DeleteSourceBound(s.etcdCli, boundWorkers...); err != nil {
   611  		return err
   612  	}
   613  
   614  	// update unbound sources
   615  	for _, sourceID := range inputSources {
   616  		if sourceID != "" {
   617  			s.updateStatusToUnbound(sourceID)
   618  		}
   619  	}
   620  
   621  	// put new bound relations.
   622  	// TODO: move this and above DeleteSourceBound in one txn.
   623  	for i := range inputWorkers {
   624  		another := i ^ 1 // make use of XOR to flip 0 and 1
   625  		if inputWorkers[i] != "" && inputSources[another] != "" {
   626  			b := ha.NewSourceBound(inputSources[another], inputWorkers[i])
   627  			bounds[i] = b
   628  			boundsToPut = append(boundsToPut, b)
   629  		}
   630  	}
   631  	if _, err := ha.PutSourceBound(s.etcdCli, boundsToPut...); err != nil {
   632  		return err
   633  	}
   634  
   635  	// update bound sources and workers
   636  	for i := range inputWorkers {
   637  		another := i ^ 1 // make use of XOR to flip 0 and 1
   638  		if inputWorkers[i] != "" && inputSources[another] != "" {
   639  			err := s.updateStatusToBound(workers[i], bounds[i])
   640  			// TODO: if we failed here, etcd has been modified!! we should try this memory check then modify persistent data
   641  			// and revert if failed
   642  			if err != nil {
   643  				s.logger.DPanic("failed to update status to bound, but has written etcd", zap.Error(err))
   644  			}
   645  		}
   646  	}
   647  
   648  	// if one of the workers/sources become free/unbound
   649  	// try bound it.
   650  	for i := range inputWorkers {
   651  		another := i ^ 1 // make use of XOR to flip 0 and 1
   652  		if inputWorkers[i] != "" && inputSources[another] == "" {
   653  			if _, err := s.tryBoundForWorker(workers[i]); err != nil {
   654  				return err
   655  			}
   656  		}
   657  	}
   658  	for i := range inputSources {
   659  		another := i ^ 1 // make use of XOR to flip 0 and 1
   660  		if inputSources[i] != "" && inputWorkers[another] == "" {
   661  			if _, err := s.tryBoundForSource(inputSources[i]); err != nil {
   662  				return err
   663  			}
   664  		}
   665  	}
   666  
   667  	return nil
   668  }
   669  
   670  // TransferSource unbinds the `source` and binds it to a free or same-source-relay `worker`.
   671  // If fails halfway, the old worker should try recover.
   672  func (s *Scheduler) TransferSource(ctx context.Context, source, worker string) error {
   673  	if !s.started.Load() {
   674  		return terror.ErrSchedulerNotStarted.Generate()
   675  	}
   676  	s.mu.RLock()
   677  	// 1. check existence or no need
   678  	if _, ok := s.sourceCfgs[source]; !ok {
   679  		s.mu.RUnlock()
   680  		return terror.ErrSchedulerSourceCfgNotExist.Generate(source)
   681  	}
   682  	w, ok := s.workers[worker]
   683  	if !ok {
   684  		s.mu.RUnlock()
   685  		return terror.ErrSchedulerWorkerNotExist.Generate(worker)
   686  	}
   687  	oldWorker, hasOldWorker := s.bounds[source]
   688  	if hasOldWorker && oldWorker.BaseInfo().Name == worker {
   689  		s.mu.RUnlock()
   690  		return nil
   691  	}
   692  	s.mu.RUnlock()
   693  
   694  	// 2. check new worker is free and not started relay for another source
   695  	switch w.Stage() {
   696  	case WorkerOffline, WorkerBound:
   697  		return terror.ErrSchedulerWorkerInvalidTrans.Generate(worker, w.Stage(), WorkerBound)
   698  	case WorkerFree:
   699  	case WorkerRelay:
   700  		if relaySource := w.RelaySourceID(); relaySource != source {
   701  			return terror.ErrSchedulerBoundDiffWithStartedRelay.Generate(worker, source, relaySource)
   702  		}
   703  	}
   704  
   705  	// 3. if no old worker, bound it directly
   706  	if !hasOldWorker {
   707  		s.logger.Warn("in transfer source, found a free worker and not bound source, which should not happened",
   708  			zap.String("source", source),
   709  			zap.String("worker", worker))
   710  		return s.boundSourceToWorker(source, w)
   711  	}
   712  
   713  	// 4. check if old worker has running tasks
   714  	runningStage := pb.Stage_Running
   715  	if runningTasks := s.GetTaskNameListBySourceName(source, &runningStage); len(runningTasks) > 0 {
   716  		// we only allow automatically transfer-source if all subtasks are in the sync phase.
   717  		resp, err := oldWorker.queryStatus(ctx)
   718  		if err != nil {
   719  			return terror.Annotatef(err, "failed to query worker: %s status err", oldWorker.baseInfo.Name)
   720  		}
   721  		for _, status := range resp.QueryStatus.GetSubTaskStatus() {
   722  			if status.GetUnit() != pb.UnitType_Sync {
   723  				return terror.ErrSchedulerRequireRunningTaskInSyncUnit.Generate(runningTasks, source)
   724  			}
   725  		}
   726  		// pause running tasks
   727  		if batchPauseErr := s.BatchOperateTaskOnWorker(ctx, oldWorker, runningTasks, source, pb.Stage_Paused, true); batchPauseErr != nil {
   728  			return batchPauseErr
   729  		}
   730  		// we need resume tasks that we just paused, we use another goroutine to do this because if error happens
   731  		// just logging this message and let user handle it manually
   732  		defer func() {
   733  			go func() {
   734  				if err := s.BatchOperateTaskOnWorker(context.Background(), w, runningTasks, source, pb.Stage_Running, false); err != nil {
   735  					s.logger.Warn(
   736  						"auto resume task failed", zap.Any("tasks", runningTasks),
   737  						zap.String("source", source), zap.String("worker", worker), zap.Error(err))
   738  				}
   739  			}()
   740  		}()
   741  	}
   742  
   743  	// 5. replace the source bound
   744  	failpoint.Inject("failToReplaceSourceBound", func(_ failpoint.Value) {
   745  		failpoint.Return(errors.New("failToPutSourceBound"))
   746  	})
   747  	s.mu.Lock()
   748  	_, err := ha.ReplaceSourceBound(s.etcdCli, source, oldWorker.BaseInfo().Name, worker)
   749  	if err != nil {
   750  		s.mu.Unlock()
   751  		return err
   752  	}
   753  	if err2 := oldWorker.Unbound(); err2 != nil {
   754  		s.logger.DPanic("the oldWorker is get from s.bound, so there should not be an error", zap.Error(err2))
   755  	}
   756  	if err2 := s.updateStatusToBound(w, ha.NewSourceBound(source, worker)); err2 != nil {
   757  		s.logger.DPanic("we have checked w.stage is free, so there should not be an error", zap.Error(err2))
   758  	}
   759  	// 6. now this old worker is free, try bound source to it
   760  	_, err = s.tryBoundForWorker(oldWorker)
   761  	if err != nil {
   762  		s.logger.Warn("in transfer source, error when try bound the old worker", zap.Error(err))
   763  	}
   764  	s.mu.Unlock()
   765  	return nil
   766  }
   767  
   768  // BatchOperateTaskOnWorker batch operate tasks in one worker and use query-status to make sure all tasks are in expected stage if needWait=true.
   769  func (s *Scheduler) BatchOperateTaskOnWorker(
   770  	ctx context.Context, worker *Worker, tasks []string, source string, stage pb.Stage, needWait bool,
   771  ) error {
   772  	if len(tasks) == 0 {
   773  		return nil
   774  	}
   775  	for _, taskName := range tasks {
   776  		if err := s.UpdateExpectSubTaskStage(stage, taskName, source); err != nil {
   777  			return err
   778  		}
   779  	}
   780  	if !needWait {
   781  		return nil
   782  	}
   783  	// wait all tasks are in expected stage before actually starting scheduling
   784  WaitLoop:
   785  	for retry := 0; retry < maxQueryWorkerRetryNum; retry++ {
   786  		resp, err := worker.queryStatus(ctx)
   787  		if err != nil {
   788  			return terror.Annotatef(err, "failed to query worker: %s status", worker.baseInfo.Name)
   789  		}
   790  
   791  		failpoint.Inject("batchOperateTaskOnWorkerMustRetry", func(v failpoint.Value) {
   792  			if retry < v.(int) {
   793  				resp.QueryStatus.SubTaskStatus[0].Stage = pb.Stage_InvalidStage
   794  				log.L().Info("batchOperateTaskOnWorkerMustRetry failpoint triggered", zap.Int("retry", retry))
   795  			} else {
   796  				log.L().Info("batchOperateTaskOnWorkerMustRetry passed", zap.Int("retry", retry))
   797  			}
   798  		})
   799  
   800  		for _, status := range resp.QueryStatus.GetSubTaskStatus() {
   801  			if status == nil {
   802  				// this should not happen when rpc logic in server side not changed
   803  				return errors.Errorf("expect a query-status with subtask status but got a nil, resp %v", resp)
   804  			}
   805  			if status.Stage != stage {
   806  				// NOTE: the defaultRPCTimeout is 10m, use 1s * retry times to increase the waiting time
   807  				sleepTime := time.Second * time.Duration(maxQueryWorkerRetryNum-retry)
   808  				s.logger.Info(
   809  					"waiting task",
   810  					zap.String("task", status.Name),
   811  					zap.Int("retry times", retry),
   812  					zap.Duration("sleep time", sleepTime),
   813  					zap.String("want stage", stage.String()),
   814  					zap.String("current stage", status.Stage.String()),
   815  				)
   816  				failpoint.Inject("skipBatchOperateTaskOnWorkerSleep", func(_ failpoint.Value) {
   817  					failpoint.Continue("WaitLoop")
   818  				})
   819  				select {
   820  				case <-ctx.Done():
   821  					return terror.Annotatef(err, "failed to wait task on worker: %s because context is canceled", worker.baseInfo.Name)
   822  				case <-time.After(sleepTime):
   823  					continue WaitLoop
   824  				}
   825  			}
   826  		}
   827  		return nil // all task are in expected stage
   828  	}
   829  	return terror.ErrSchedulerPauseTaskForTransferSource.Generate(tasks) // failed to pause tasks, need user to handle it manually
   830  }
   831  
   832  // AcquireSubtaskLatch tries acquiring a latch for subtask name.
   833  func (s *Scheduler) AcquireSubtaskLatch(name string) (ReleaseFunc, error) {
   834  	return s.subtaskLatch.tryAcquire(name)
   835  }
   836  
   837  // AddSubTasks adds the information of one or more subtasks for one task.
   838  // use s.mu.RLock() to protect s.bound, and s.subtaskLatch to protect subtask related members.
   839  // setting `latched` to true means caller has acquired latch.
   840  func (s *Scheduler) AddSubTasks(latched bool, expectStage pb.Stage, cfgs ...config.SubTaskConfig) error {
   841  	s.mu.RLock()
   842  	defer s.mu.RUnlock()
   843  
   844  	if !s.started.Load() {
   845  		return terror.ErrSchedulerNotStarted.Generate()
   846  	}
   847  
   848  	if len(cfgs) == 0 {
   849  		return nil // no subtasks need to add, this should not happen.
   850  	}
   851  
   852  	var (
   853  		taskNamesM    = make(map[string]struct{}, 1)
   854  		existSourcesM = make(map[string]struct{}, len(cfgs))
   855  		allSources    = make([]string, 0, len(cfgs))
   856  	)
   857  
   858  	for _, cfg := range cfgs {
   859  		taskNamesM[cfg.Name] = struct{}{}
   860  	}
   861  	taskNames := strMapToSlice(taskNamesM)
   862  	if len(taskNames) > 1 {
   863  		// only subtasks from one task supported now.
   864  		return terror.ErrSchedulerMultiTask.Generate(taskNames)
   865  	}
   866  
   867  	if !latched {
   868  		release, err := s.subtaskLatch.tryAcquire(taskNames[0])
   869  		if err != nil {
   870  			return terror.ErrSchedulerLatchInUse.Generate("AddSubTasks", taskNames[0])
   871  		}
   872  		defer release()
   873  	}
   874  
   875  	// 1. check whether exists.
   876  	for _, cfg := range cfgs {
   877  		allSources = append(allSources, cfg.SourceID)
   878  		v, ok := s.subTaskCfgs.Load(cfg.Name)
   879  		if !ok {
   880  			continue
   881  		}
   882  		cfgM := v.(map[string]config.SubTaskConfig)
   883  		_, ok = cfgM[cfg.SourceID]
   884  		if !ok {
   885  			continue
   886  		}
   887  		existSourcesM[cfg.SourceID] = struct{}{}
   888  	}
   889  
   890  	existSources := strMapToSlice(existSourcesM)
   891  	switch {
   892  	case len(existSources) == len(cfgs):
   893  		// all subtasks already exist, return an error.
   894  		return terror.ErrSchedulerSubTaskExist.Generate(taskNames[0], existSources)
   895  	case len(existSources) > 0:
   896  		// some subtasks already exists, log a warn.
   897  		s.logger.Warn("some subtasks already exist", zap.String("task", taskNames[0]), zap.Strings("sources", existSources))
   898  	}
   899  
   900  	// 2. construct `Running` stages when adding.
   901  	newCfgs := make([]config.SubTaskConfig, 0, len(cfgs)-len(existSources))
   902  	newStages := make([]ha.Stage, 0, cap(newCfgs))
   903  	validatorStages := make([]ha.Stage, 0, cap(newCfgs))
   904  	unbounds := make([]string, 0)
   905  	for _, cfg := range cfgs {
   906  		if _, ok := existSourcesM[cfg.SourceID]; ok {
   907  			continue
   908  		}
   909  		newCfgs = append(newCfgs, cfg)
   910  		newStages = append(newStages, ha.NewSubTaskStage(expectStage, cfg.SourceID, cfg.Name))
   911  		if cfg.ValidatorCfg.Mode != config.ValidationNone {
   912  			validatorStages = append(validatorStages, ha.NewValidatorStage(pb.Stage_Running, cfg.SourceID, cfg.Name))
   913  		}
   914  		if _, ok := s.bounds[cfg.SourceID]; !ok {
   915  			unbounds = append(unbounds, cfg.SourceID)
   916  		}
   917  	}
   918  
   919  	// 3. check whether any sources unbound.
   920  	if len(unbounds) > 0 {
   921  		return terror.ErrSchedulerSourcesUnbound.Generate(unbounds)
   922  	}
   923  
   924  	// 4. put the lightning status, configs and stages into etcd.
   925  	if config.HasLoad(cfgs[0].Mode) && cfgs[0].LoaderConfig.ImportMode == config.LoadModePhysical {
   926  		if len(existSources) > 0 {
   927  			// don't support add new lightning subtask when some subtasks already exist.
   928  			return terror.ErrSchedulerSubTaskExist.Generate(taskNames[0], existSources)
   929  		}
   930  		_, err := ha.PutLightningNotReadyForAllSources(s.etcdCli, taskNames[0], allSources)
   931  		if err != nil {
   932  			return err
   933  		}
   934  	}
   935  	_, err := ha.PutSubTaskCfgStage(s.etcdCli, newCfgs, newStages, validatorStages)
   936  	if err != nil {
   937  		return err
   938  	}
   939  
   940  	// 5. record the config and the expectant stage.
   941  	for _, cfg := range newCfgs {
   942  		v, _ := s.subTaskCfgs.LoadOrStore(cfg.Name, map[string]config.SubTaskConfig{})
   943  		m := v.(map[string]config.SubTaskConfig)
   944  		m[cfg.SourceID] = cfg
   945  	}
   946  	for _, stage := range newStages {
   947  		v, _ := s.expectSubTaskStages.LoadOrStore(stage.Task, map[string]ha.Stage{})
   948  		m := v.(map[string]ha.Stage)
   949  		m[stage.Source] = stage
   950  	}
   951  	for _, stage := range validatorStages {
   952  		v, _ := s.expectValidatorStages.LoadOrStore(stage.Task, map[string]ha.Stage{})
   953  		m := v.(map[string]ha.Stage)
   954  		m[stage.Source] = stage
   955  	}
   956  
   957  	return nil
   958  }
   959  
   960  // RemoveSubTasks removes the information of one or more subtasks for one task.
   961  func (s *Scheduler) RemoveSubTasks(task string, sources ...string) error {
   962  	if !s.started.Load() {
   963  		return terror.ErrSchedulerNotStarted.Generate()
   964  	}
   965  
   966  	if task == "" || len(sources) == 0 {
   967  		return nil // no subtask need to stop, this should not happen.
   968  	}
   969  
   970  	release, err := s.subtaskLatch.tryAcquire(task)
   971  	if err != nil {
   972  		return terror.ErrSchedulerLatchInUse.Generate("RemoveSubTasks", task)
   973  	}
   974  	defer release()
   975  
   976  	// 1. check the task exists.
   977  	stagesMapV, ok1 := s.expectSubTaskStages.Load(task)
   978  	cfgsMapV, ok2 := s.subTaskCfgs.Load(task)
   979  	if !ok1 || !ok2 {
   980  		return terror.ErrSchedulerSubTaskOpTaskNotExist.Generate(task)
   981  	}
   982  
   983  	var validatorStageM map[string]ha.Stage
   984  	if validatorStageV, ok := s.expectValidatorStages.Load(task); ok {
   985  		validatorStageM = validatorStageV.(map[string]ha.Stage)
   986  	}
   987  
   988  	var (
   989  		stagesM          = stagesMapV.(map[string]ha.Stage)
   990  		cfgsM            = cfgsMapV.(map[string]config.SubTaskConfig)
   991  		notExistSourcesM = make(map[string]struct{})
   992  		stages           = make([]ha.Stage, 0, len(sources))
   993  		validatorStages  = make([]ha.Stage, 0, len(sources))
   994  		cfgs             = make([]config.SubTaskConfig, 0, len(sources))
   995  	)
   996  	for _, source := range sources {
   997  		if stage, ok := stagesM[source]; !ok {
   998  			notExistSourcesM[source] = struct{}{}
   999  		} else {
  1000  			stages = append(stages, stage)
  1001  		}
  1002  		if stage, ok := validatorStageM[source]; ok {
  1003  			validatorStages = append(validatorStages, stage)
  1004  		}
  1005  		if cfg, ok := cfgsM[source]; ok {
  1006  			cfgs = append(cfgs, cfg)
  1007  		}
  1008  	}
  1009  	notExistSources := strMapToSlice(notExistSourcesM)
  1010  	if len(notExistSources) > 0 {
  1011  		// some sources not exist, reject the request.
  1012  		return terror.ErrSchedulerSubTaskOpSourceNotExist.Generate(notExistSources)
  1013  	}
  1014  
  1015  	// 2. delete the configs and the stages.
  1016  	_, err = ha.DeleteSubTaskCfgStage(s.etcdCli, cfgs, stages, validatorStages)
  1017  	if err != nil {
  1018  		return err
  1019  	}
  1020  
  1021  	// 3. clear the config and the expectant stage.
  1022  	for _, cfg := range cfgs {
  1023  		delete(cfgsM, cfg.SourceID)
  1024  	}
  1025  	if len(cfgsM) == 0 {
  1026  		s.subTaskCfgs.Delete(task)
  1027  	}
  1028  	for _, stage := range stages {
  1029  		delete(stagesM, stage.Source)
  1030  	}
  1031  	if len(stagesM) == 0 {
  1032  		s.expectSubTaskStages.Delete(task)
  1033  	}
  1034  	for _, stage := range validatorStages {
  1035  		delete(validatorStageM, stage.Source)
  1036  	}
  1037  	if len(validatorStageM) == 0 {
  1038  		s.expectValidatorStages.Delete(task)
  1039  	}
  1040  
  1041  	return nil
  1042  }
  1043  
  1044  // UpdateSubTasks update the information of one or more subtasks for one task.
  1045  func (s *Scheduler) UpdateSubTasks(ctx context.Context, cfgs ...config.SubTaskConfig) error {
  1046  	s.mu.Lock()
  1047  	defer s.mu.Unlock()
  1048  	if !s.started.Load() {
  1049  		return terror.ErrSchedulerNotStarted.Generate()
  1050  	}
  1051  	if len(cfgs) == 0 {
  1052  		return nil // no subtasks need to add, this should not happen.
  1053  	}
  1054  	taskNamesM := make(map[string]struct{}, 1)
  1055  	for _, cfg := range cfgs {
  1056  		taskNamesM[cfg.Name] = struct{}{}
  1057  	}
  1058  	if len(taskNamesM) > 1 {
  1059  		// only subtasks from one task supported now.
  1060  		return terror.ErrSchedulerMultiTask.Generate(strMapToSlice(taskNamesM))
  1061  	}
  1062  	// check whether exists.
  1063  	cfg := cfgs[0]
  1064  	v, ok := s.subTaskCfgs.Load(cfg.Name)
  1065  	if !ok {
  1066  		return terror.ErrSchedulerTaskNotExist.Generate(cfg.Name)
  1067  	}
  1068  	cfgM := v.(map[string]config.SubTaskConfig)
  1069  	for _, cfg := range cfgs {
  1070  		_, ok = cfgM[cfg.SourceID]
  1071  		if !ok {
  1072  			return terror.ErrSchedulerSubTaskNotExist.Generate(cfg.Name, cfg.SourceID)
  1073  		}
  1074  	}
  1075  	// check whether in running stage
  1076  	stage := s.GetExpectSubTaskStage(cfg.Name, cfg.SourceID)
  1077  	if stage.Expect == pb.Stage_Running {
  1078  		return terror.ErrSchedulerSubTaskCfgUpdate.Generate(cfg.Name, cfg.SourceID)
  1079  	}
  1080  
  1081  	// check by workers todo batch
  1082  	for _, cfg := range cfgs {
  1083  		worker := s.bounds[cfg.SourceID]
  1084  		if worker == nil {
  1085  			return terror.ErrSchedulerSubTaskCfgUpdate.Generatef("this source: %s have not bound to worker", cfg.SourceID)
  1086  		}
  1087  		resp, err := worker.checkSubtasksCanUpdate(ctx, &cfg)
  1088  		if err != nil {
  1089  			return err
  1090  		}
  1091  		if !resp.CheckSubtasksCanUpdate.Success {
  1092  			return terror.ErrSchedulerSubTaskCfgUpdate.Generatef("can not update because %s", resp.CheckSubtasksCanUpdate.Msg)
  1093  		}
  1094  	}
  1095  	// put the configs and stages into etcd.
  1096  	_, err := ha.PutSubTaskCfgStage(s.etcdCli, cfgs, []ha.Stage{}, []ha.Stage{})
  1097  	if err != nil {
  1098  		return err
  1099  	}
  1100  	// record the config
  1101  	for _, cfg := range cfgs {
  1102  		v, _ := s.subTaskCfgs.LoadOrStore(cfg.Name, map[string]config.SubTaskConfig{})
  1103  		m := v.(map[string]config.SubTaskConfig)
  1104  		m[cfg.SourceID] = cfg
  1105  	}
  1106  	return nil
  1107  }
  1108  
  1109  // getSubTaskCfgByTaskSource gets subtask config by task name and source ID. Only used in tests.
  1110  func (s *Scheduler) getSubTaskCfgByTaskSource(task, source string) *config.SubTaskConfig {
  1111  	v, ok := s.subTaskCfgs.Load(task)
  1112  	if !ok {
  1113  		return nil
  1114  	}
  1115  
  1116  	cfgM := v.(map[string]config.SubTaskConfig)
  1117  	cfg, ok := cfgM[source]
  1118  	if !ok {
  1119  		return nil
  1120  	}
  1121  	clone := cfg
  1122  	return &clone
  1123  }
  1124  
  1125  // GetDownstreamMetaByTask gets downstream db config and meta config by task name.
  1126  func (s *Scheduler) GetDownstreamMetaByTask(task string) (*dbconfig.DBConfig, string) {
  1127  	v, ok := s.subTaskCfgs.Load(task)
  1128  	if !ok {
  1129  		return nil, ""
  1130  	}
  1131  	cfgM := v.(map[string]config.SubTaskConfig)
  1132  	for _, cfg := range cfgM {
  1133  		return cfg.To.Clone(), cfg.MetaSchema
  1134  	}
  1135  	return nil, ""
  1136  }
  1137  
  1138  // GetSubTaskCfgsByTask gets subtask configs' map by task name.
  1139  func (s *Scheduler) GetSubTaskCfgsByTask(task string) map[string]*config.SubTaskConfig {
  1140  	v, ok := s.subTaskCfgs.Load(task)
  1141  	if !ok {
  1142  		return nil
  1143  	}
  1144  
  1145  	cfgM := v.(map[string]config.SubTaskConfig)
  1146  	cloneM := make(map[string]*config.SubTaskConfig, len(cfgM))
  1147  	for source, cfg := range cfgM {
  1148  		clone := cfg
  1149  		cloneM[source] = &clone
  1150  	}
  1151  	return cloneM
  1152  }
  1153  
  1154  func (s *Scheduler) GetSubTaskCfgsByTaskAndSource(taskName string, sources []string) map[string]map[string]config.SubTaskConfig {
  1155  	var ret map[string]map[string]config.SubTaskConfig // task-name->sourceID->*config.SubTaskConfig
  1156  	if len(taskName) == 0 {
  1157  		ret = s.GetSubTaskCfgs()
  1158  	} else {
  1159  		// get subtask by name
  1160  		ret = map[string]map[string]config.SubTaskConfig{}
  1161  		tmp := s.GetSubTaskCfgsByTask(taskName)
  1162  		if tmp == nil {
  1163  			// no subtask matches the `task-name`
  1164  			return ret
  1165  		}
  1166  		ret[taskName] = map[string]config.SubTaskConfig{}
  1167  		for source, cfg := range tmp {
  1168  			ret[taskName][source] = *cfg
  1169  		}
  1170  	}
  1171  	// filter the source that we don't want
  1172  	if len(sources) > 0 {
  1173  		filterSource := map[string]interface{}{}
  1174  		for _, source := range sources {
  1175  			filterSource[source] = true // the source we want
  1176  		}
  1177  		for taskName, sourceCfgs := range ret {
  1178  			for source := range sourceCfgs {
  1179  				if _, ok := filterSource[source]; !ok {
  1180  					delete(sourceCfgs, source)
  1181  				}
  1182  			}
  1183  			if len(ret[taskName]) == 0 {
  1184  				delete(ret, taskName)
  1185  			}
  1186  		}
  1187  	}
  1188  	return ret
  1189  }
  1190  
  1191  // GetSubTaskCfgs gets all subconfig, return nil when error happens.
  1192  func (s *Scheduler) GetSubTaskCfgs() map[string]map[string]config.SubTaskConfig {
  1193  	// taskName -> sourceName -> SubTaskConfig
  1194  	clone := make(map[string]map[string]config.SubTaskConfig)
  1195  	s.subTaskCfgs.Range(func(k, v interface{}) bool {
  1196  		task := k.(string)
  1197  		m := v.(map[string]config.SubTaskConfig)
  1198  		clone2 := make(map[string]config.SubTaskConfig, len(m))
  1199  		for source, cfg := range m {
  1200  			cfg2, err := cfg.Clone()
  1201  			if err != nil {
  1202  				return true
  1203  			}
  1204  			clone2[source] = *cfg2
  1205  		}
  1206  		clone[task] = clone2
  1207  		return true
  1208  	})
  1209  	return clone
  1210  }
  1211  
  1212  // GetSubTaskCfgs gets all subTask config pointer, return nil when error happens.
  1213  func (s *Scheduler) GetALlSubTaskCfgs() map[string]map[string]*config.SubTaskConfig {
  1214  	s.mu.RLock()
  1215  	defer s.mu.RUnlock()
  1216  	// taskName -> sourceName -> SubTaskConfig
  1217  	clone := make(map[string]map[string]*config.SubTaskConfig)
  1218  	s.subTaskCfgs.Range(func(k, v interface{}) bool {
  1219  		task := k.(string)
  1220  		m := v.(map[string]config.SubTaskConfig)
  1221  		clone2 := make(map[string]*config.SubTaskConfig, len(m))
  1222  		for source, cfg := range m {
  1223  			cfg2, err := cfg.Clone()
  1224  			if err != nil {
  1225  				return true
  1226  			}
  1227  			clone2[source] = cfg2
  1228  		}
  1229  		clone[task] = clone2
  1230  		return true
  1231  	})
  1232  	return clone
  1233  }
  1234  
  1235  // GetTaskNameListBySourceName gets task name list by source name.
  1236  func (s *Scheduler) GetTaskNameListBySourceName(sourceName string, expectStage *pb.Stage) []string {
  1237  	var taskNameList []string
  1238  	s.expectSubTaskStages.Range(func(k, v interface{}) bool {
  1239  		subtaskM := v.(map[string]ha.Stage)
  1240  		subtaskStage, ok2 := subtaskM[sourceName]
  1241  		if !ok2 {
  1242  			return true
  1243  		}
  1244  		task := k.(string)
  1245  		if expectStage == nil {
  1246  			taskNameList = append(taskNameList, task)
  1247  		} else if subtaskStage.Expect == *expectStage {
  1248  			taskNameList = append(taskNameList, task)
  1249  		}
  1250  		return true
  1251  	})
  1252  	return taskNameList
  1253  }
  1254  
  1255  // AddWorker adds the information of the DM-worker when registering a new instance.
  1256  // This only adds the information of the DM-worker,
  1257  // in order to know whether it's online (ready to handle works),
  1258  // we need to wait for its healthy status through keep-alive.
  1259  func (s *Scheduler) AddWorker(name, addr string) error {
  1260  	s.mu.Lock()
  1261  	defer s.mu.Unlock()
  1262  
  1263  	if !s.started.Load() {
  1264  		return terror.ErrSchedulerNotStarted.Generate()
  1265  	}
  1266  
  1267  	// 1. check whether exists.
  1268  	if w, ok := s.workers[name]; ok {
  1269  		// NOTE: we do not support add the worker with different address now, support if needed later.
  1270  		// but we support add the worker with all the same information multiple times, and only the first one take effect,
  1271  		// because this is needed when restarting the worker.
  1272  		if addr == w.BaseInfo().Addr {
  1273  			s.logger.Warn("add the same worker again", zap.Stringer("worker info", w.BaseInfo()))
  1274  			return nil
  1275  		}
  1276  		return terror.ErrSchedulerWorkerExist.Generate(w.BaseInfo())
  1277  	}
  1278  
  1279  	// 2. put the base info into etcd.
  1280  	info := ha.NewWorkerInfo(name, addr)
  1281  	_, err := ha.PutWorkerInfo(s.etcdCli, info)
  1282  	if err != nil {
  1283  		return err
  1284  	}
  1285  
  1286  	// generate an agent of DM-worker (with Offline stage) and keep it in the scheduler.
  1287  	_, err = s.recordWorker(info)
  1288  	return err
  1289  }
  1290  
  1291  // RemoveWorker removes the information of the DM-worker when removing the instance manually.
  1292  // The user should shutdown the DM-worker instance before removing its information.
  1293  func (s *Scheduler) RemoveWorker(name string) error {
  1294  	s.mu.Lock()
  1295  	defer s.mu.Unlock()
  1296  
  1297  	if !s.started.Load() {
  1298  		return terror.ErrSchedulerNotStarted.Generate()
  1299  	}
  1300  
  1301  	w, ok := s.workers[name]
  1302  	if !ok {
  1303  		return terror.ErrSchedulerWorkerNotExist.Generate(name)
  1304  	} else if w.Stage() != WorkerOffline {
  1305  		return terror.ErrSchedulerWorkerOnline.Generate(name)
  1306  	}
  1307  
  1308  	// delete the info in etcd.
  1309  	_, err := ha.DeleteWorkerInfoRelayConfig(s.etcdCli, name)
  1310  	if err != nil {
  1311  		return err
  1312  	}
  1313  	s.deleteWorker(name)
  1314  	return nil
  1315  }
  1316  
  1317  // GetAllWorkers gets all worker agent.
  1318  func (s *Scheduler) GetAllWorkers() ([]*Worker, error) {
  1319  	s.mu.RLock()
  1320  	defer s.mu.RUnlock()
  1321  
  1322  	if !s.started.Load() {
  1323  		return nil, terror.ErrSchedulerNotStarted.Generate()
  1324  	}
  1325  
  1326  	workers := make([]*Worker, 0, len(s.workers))
  1327  	for _, value := range s.workers {
  1328  		workers = append(workers, value)
  1329  	}
  1330  	return workers, nil
  1331  }
  1332  
  1333  // GetWorkerByName gets worker agent by worker name.
  1334  func (s *Scheduler) GetWorkerByName(name string) *Worker {
  1335  	s.mu.RLock()
  1336  	defer s.mu.RUnlock()
  1337  	return s.workers[name]
  1338  }
  1339  
  1340  // GetWorkerBySource gets the current bound worker agent by source ID,
  1341  // returns nil if the source not bound.
  1342  func (s *Scheduler) GetWorkerBySource(source string) *Worker {
  1343  	s.mu.RLock()
  1344  	defer s.mu.RUnlock()
  1345  	return s.bounds[source]
  1346  }
  1347  
  1348  // BoundSources returns all bound source IDs in increasing order.
  1349  func (s *Scheduler) BoundSources() []string {
  1350  	s.mu.RLock()
  1351  	defer s.mu.RUnlock()
  1352  	IDs := make([]string, 0, len(s.bounds))
  1353  	for ID := range s.bounds {
  1354  		IDs = append(IDs, ID)
  1355  	}
  1356  	sort.Strings(IDs)
  1357  	return IDs
  1358  }
  1359  
  1360  // UnboundSources returns all unbound source IDs in increasing order.
  1361  func (s *Scheduler) UnboundSources() []string {
  1362  	s.mu.RLock()
  1363  	defer s.mu.RUnlock()
  1364  	IDs := make([]string, 0, len(s.unbounds))
  1365  	for ID := range s.unbounds {
  1366  		IDs = append(IDs, ID)
  1367  	}
  1368  	sort.Strings(IDs)
  1369  	return IDs
  1370  }
  1371  
// StartRelay puts etcd key-value pairs to start relay on some workers.
//
// Without worker names, it sets `EnableRelay` in the source config (rejected when relay workers
// are already recorded for the source) and, if the source is bound, puts a Running relay stage
// together with the source bound.
// With worker names, it is rejected when `EnableRelay` is set in the source config, when a worker
// does not exist, when a worker is bound to another source, or when a worker already has relay
// started for another source. Otherwise the relay config of these workers is put into etcd
// (plus a Running relay stage if no relay worker was recorded for the source before) and the
// in-memory records are updated.
func (s *Scheduler) StartRelay(source string, workers []string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if !s.started.Load() {
		return terror.ErrSchedulerNotStarted.Generate()
	}

	// 1. precheck
	sourceCfg, ok := s.sourceCfgs[source]
	if !ok {
		return terror.ErrSchedulerSourceCfgNotExist.Generate(source)
	}
	startedWorkers := s.relayWorkers[source]

	// quick path for `start-relay` without worker name
	if len(workers) == 0 {
		if len(startedWorkers) != 0 {
			return terror.ErrSchedulerStartRelayOnSpecified.Generate(utils.SetToSlice(startedWorkers))
		}
		// update enable-relay in source config
		// NOTE(review): if s.sourceCfgs holds pointers, the in-memory config is already changed
		// here even when the put below fails — confirm against the field's type.
		sourceCfg.EnableRelay = true
		_, err := ha.PutSourceCfg(s.etcdCli, sourceCfg)
		if err != nil {
			return err
		}
		s.sourceCfgs[source] = sourceCfg
		// notify bound worker
		w, ok2 := s.bounds[source]
		if !ok2 {
			// the source is not bound to any worker, nothing more to write.
			return nil
		}
		stage := ha.NewRelayStage(pb.Stage_Running, source)
		_, err = ha.PutRelayStageSourceBound(s.etcdCli, stage, w.Bound())
		return err
	} else if sourceCfg.EnableRelay {
		// error when `enable-relay` and `start-relay` with worker name
		return terror.ErrSchedulerStartRelayOnBound.Generate()
	}

	// make sure a set exists for this source; it is registered before the checks below,
	// so an empty set stays in s.relayWorkers even when the request is rejected.
	if startedWorkers == nil {
		startedWorkers = map[string]struct{}{}
		s.relayWorkers[source] = startedWorkers
	}
	var (
		notExistWorkers []string
		// below two list means the worker that requested start-relay has bound to another source
		boundWorkers, boundSources []string
		alreadyStarted             []string
		// currently we forbid one worker starting multiple relay
		busyWorkers, busySources []string
	)
	for _, workerName := range workers {
		var (
			worker *Worker
			ok     bool
		)
		if worker, ok = s.workers[workerName]; !ok {
			notExistWorkers = append(notExistWorkers, workerName)
			continue
		}
		// an already started worker is only reported in a warning later, it still goes through the checks below.
		if _, ok = startedWorkers[workerName]; ok {
			alreadyStarted = append(alreadyStarted, workerName)
		}

		// for Bound and Offline worker
		if worker.Bound().Source != "" && worker.Bound().Source != source {
			boundWorkers = append(boundWorkers, workerName)
			boundSources = append(boundSources, worker.Bound().Source)
		}
		if relaySource := worker.RelaySourceID(); relaySource != "" && relaySource != source {
			busyWorkers = append(busyWorkers, workerName)
			busySources = append(busySources, relaySource)
		}
	}

	// report the problems found above: the first non-empty list decides the returned error.
	if len(notExistWorkers) > 0 {
		return terror.ErrSchedulerWorkerNotExist.Generate(notExistWorkers)
	}
	if len(boundWorkers) > 0 {
		return terror.ErrSchedulerRelayWorkersWrongBound.Generate(boundWorkers, boundSources)
	}
	if len(busyWorkers) > 0 {
		return terror.ErrSchedulerRelayWorkersBusy.Generate(busyWorkers, busySources)
	}
	if len(alreadyStarted) > 0 {
		s.logger.Warn("some workers already started relay",
			zap.String("source", source),
			zap.Strings("already started workers", alreadyStarted))
	}

	// 2. put etcd and update memory cache
	// if there's no relay stage, create a running one. otherwise we should respect paused stage
	if len(startedWorkers) == 0 {
		stage := ha.NewRelayStage(pb.Stage_Running, source)
		if _, err := ha.PutRelayStage(s.etcdCli, stage); err != nil {
			return err
		}
		s.expectRelayStages[source] = stage
	}
	if _, err := ha.PutRelayConfig(s.etcdCli, source, workers...); err != nil {
		return err
	}
	// etcd is updated, now record the relay workers and mark relay as started on each worker agent.
	for _, workerName := range workers {
		s.relayWorkers[source][workerName] = struct{}{}
		if err := s.workers[workerName].StartRelay(source); err != nil {
			s.logger.DPanic("we have checked the prerequisite and updated etcd, so should be no error",
				zap.Error(err))
		}
	}
	return nil
}
  1485  
// StopRelay deletes etcd key-value pairs to stop relay on some workers.
//
// Without worker names, it clears `EnableRelay` in the source config (rejected when relay workers
// are recorded for the source) and, if the source is bound, puts the source bound again.
// With worker names, it is rejected when `EnableRelay` is set in the source config, when a worker
// does not exist, or when a worker has relay started for another source. Otherwise the relay config
// of these workers is deleted from etcd and from memory, and the relay stage of the source is
// deleted as well once no relay worker is left for it.
func (s *Scheduler) StopRelay(source string, workers []string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if !s.started.Load() {
		return terror.ErrSchedulerNotStarted.Generate()
	}

	// 1. precheck
	sourceCfg, ok := s.sourceCfgs[source]
	if !ok {
		return terror.ErrSchedulerSourceCfgNotExist.Generate(source)
	}
	// quick path for `stop-relay` without worker name
	if len(workers) == 0 {
		startedWorker := s.relayWorkers[source]
		if len(startedWorker) != 0 {
			return terror.ErrSchedulerStopRelayOnSpecified.Generate(utils.SetToSlice(startedWorker))
		}
		// update enable-relay in source config
		// NOTE(review): if s.sourceCfgs holds pointers, the in-memory config is already changed
		// here even when the put below fails — confirm against the field's type.
		sourceCfg.EnableRelay = false
		_, err := ha.PutSourceCfg(s.etcdCli, sourceCfg)
		if err != nil {
			return err
		}
		s.sourceCfgs[source] = sourceCfg
		// notify bound worker
		w, ok2 := s.bounds[source]
		if !ok2 {
			// the source is not bound to any worker, nothing more to write.
			return nil
		}
		// TODO: remove orphan relay stage
		_, err = ha.PutSourceBound(s.etcdCli, w.Bound())
		return err
	} else if sourceCfg.EnableRelay {
		// error when `enable-relay` and `stop-relay` with worker name
		return terror.ErrSchedulerStopRelayOnBound.Generate()
	}

	var (
		notExistWorkers                    []string
		unmatchedWorkers, unmatchedSources []string
		alreadyStopped                     []string
	)
	for _, workerName := range workers {
		var (
			worker *Worker
			ok     bool
		)

		if worker, ok = s.workers[workerName]; !ok {
			notExistWorkers = append(notExistWorkers, workerName)
			continue
		}

		// a worker without a relay source is only reported in a warning later.
		startedRelay := worker.RelaySourceID()
		if startedRelay == "" {
			alreadyStopped = append(alreadyStopped, workerName)
			continue
		}

		// the worker has relay started, but for another source.
		if startedRelay != source {
			unmatchedWorkers = append(unmatchedWorkers, workerName)
			unmatchedSources = append(unmatchedSources, startedRelay)
		}
	}
	// report the problems found above: the first non-empty list decides the returned error.
	if len(notExistWorkers) > 0 {
		return terror.ErrSchedulerWorkerNotExist.Generate(notExistWorkers)
	}
	if len(unmatchedWorkers) > 0 {
		return terror.ErrSchedulerRelayWorkersWrongRelay.Generate(unmatchedWorkers, unmatchedSources)
	}
	if len(alreadyStopped) > 0 {
		s.logger.Warn("some workers already stopped relay",
			zap.String("source", source),
			zap.Strings("already stopped workers", alreadyStopped))
	}

	// 2. delete from etcd and update memory cache
	if _, err := ha.DeleteRelayConfig(s.etcdCli, workers...); err != nil {
		return err
	}
	for _, workerName := range workers {
		delete(s.relayWorkers[source], workerName)
		s.workers[workerName].StopRelay()
	}
	// no relay worker is left for this source: drop its relay stage in etcd and in memory too.
	if len(s.relayWorkers[source]) == 0 {
		if _, err := ha.DeleteRelayStage(s.etcdCli, source); err != nil {
			return err
		}
		delete(s.relayWorkers, source)
		delete(s.expectRelayStages, source)
	}
	return nil
}
  1582  
  1583  // GetRelayWorkers returns all alive worker instances for a relay source.
  1584  func (s *Scheduler) GetRelayWorkers(source string) ([]*Worker, error) {
  1585  	s.mu.RLock()
  1586  	defer s.mu.RUnlock()
  1587  	if !s.started.Load() {
  1588  		return nil, terror.ErrSchedulerNotStarted.Generate()
  1589  	}
  1590  	workers := s.relayWorkers[source]
  1591  	ret := make([]*Worker, 0, len(workers))
  1592  	for w := range workers {
  1593  		worker, ok := s.workers[w]
  1594  		if !ok {
  1595  			// should not happen
  1596  			s.logger.Error("worker instance for relay worker not found", zap.String("worker", w))
  1597  			continue
  1598  		}
  1599  		ret = append(ret, worker)
  1600  	}
  1601  	sort.Slice(ret, func(i, j int) bool {
  1602  		return ret[i].baseInfo.Name < ret[j].baseInfo.Name
  1603  	})
  1604  	return ret, nil
  1605  }
  1606  
  1607  // UpdateExpectRelayStage updates the current expect relay stage.
  1608  // now, only support updates:
  1609  // - from `Running` to `Paused`.
  1610  // - from `Paused` to `Running`.
  1611  // NOTE: from `Running` to `Running` and `Paused` to `Paused` still update the data in etcd,
  1612  // because some user may want to update `{Running, Paused, ...}` to `{Running, Running, ...}`.
  1613  // so, this should be also supported in DM-worker.
  1614  func (s *Scheduler) UpdateExpectRelayStage(newStage pb.Stage, sources ...string) error {
  1615  	s.mu.Lock()
  1616  	defer s.mu.Unlock()
  1617  
  1618  	if !s.started.Load() {
  1619  		return terror.ErrSchedulerNotStarted.Generate()
  1620  	}
  1621  
  1622  	if len(sources) == 0 {
  1623  		return nil // no sources need to update the stage, this should not happen.
  1624  	}
  1625  
  1626  	// 1. check the new expectant stage.
  1627  	switch newStage {
  1628  	case pb.Stage_Running, pb.Stage_Paused:
  1629  	default:
  1630  		return terror.ErrSchedulerRelayStageInvalidUpdate.Generate(newStage)
  1631  	}
  1632  
  1633  	var (
  1634  		notExistSourcesM = make(map[string]struct{})
  1635  		currStagesM      = make(map[string]struct{})
  1636  		stages           = make([]ha.Stage, 0, len(sources))
  1637  	)
  1638  	for _, source := range sources {
  1639  		if _, ok := s.sourceCfgs[source]; !ok {
  1640  			notExistSourcesM[source] = struct{}{}
  1641  			continue
  1642  		}
  1643  
  1644  		if currStage, ok := s.expectRelayStages[source]; ok {
  1645  			currStagesM[currStage.Expect.String()] = struct{}{}
  1646  		} else {
  1647  			s.logger.Warn("will write relay stage for a source that doesn't have previous stage",
  1648  				zap.String("source", source))
  1649  		}
  1650  		stages = append(stages, ha.NewRelayStage(newStage, source))
  1651  	}
  1652  	notExistSources := strMapToSlice(notExistSourcesM)
  1653  	currStages := strMapToSlice(currStagesM)
  1654  	if len(notExistSources) > 0 {
  1655  		// some sources not exist, reject the request.
  1656  		return terror.ErrSchedulerRelayStageSourceNotExist.Generate(notExistSources)
  1657  	} else if len(currStages) > 1 {
  1658  		// more than one current relay stage exist, but need to update to the same one, log a warn.
  1659  		s.logger.Warn("update more than one current expectant relay stage to the same one",
  1660  			zap.Strings("from", currStages), zap.Stringer("to", newStage))
  1661  	}
  1662  
  1663  	// 2. put the stages into etcd.
  1664  	_, err := ha.PutRelayStage(s.etcdCli, stages...)
  1665  	if err != nil {
  1666  		return err
  1667  	}
  1668  
  1669  	// 3. update the stages in the scheduler.
  1670  	for _, stage := range stages {
  1671  		s.expectRelayStages[stage.Source] = stage
  1672  	}
  1673  
  1674  	return nil
  1675  }
  1676  
  1677  // GetExpectRelayStage returns the current expect relay stage.
  1678  // If the stage not exists, an invalid stage is returned.
  1679  // This func is used for testing.
  1680  func (s *Scheduler) GetExpectRelayStage(source string) ha.Stage {
  1681  	s.mu.RLock()
  1682  	defer s.mu.RUnlock()
  1683  	if stage, ok := s.expectRelayStages[source]; ok {
  1684  		return stage
  1685  	}
  1686  	return ha.NewRelayStage(pb.Stage_InvalidStage, source)
  1687  }
  1688  
  1689  // UpdateExpectSubTaskStage updates the current expect subtask stage.
  1690  // now, only support updates:
  1691  // - from `Running` to `Paused/Stopped`.
  1692  // - from `Paused/Stopped` to `Running`.
  1693  // NOTE: from `Running` to `Running` and `Paused` to `Paused` still update the data in etcd,
  1694  // because some user may want to update `{Running, Paused, ...}` to `{Running, Running, ...}`.
  1695  // so, this should be also supported in DM-worker.
  1696  func (s *Scheduler) UpdateExpectSubTaskStage(newStage pb.Stage, taskName string, sources ...string) error {
  1697  	if !s.started.Load() {
  1698  		return terror.ErrSchedulerNotStarted.Generate()
  1699  	}
  1700  
  1701  	if taskName == "" || len(sources) == 0 {
  1702  		return nil // no subtask need to update, this should not happen.
  1703  	}
  1704  
  1705  	// 1. check the new expectant stage.
  1706  	switch newStage {
  1707  	case pb.Stage_Running, pb.Stage_Paused, pb.Stage_Stopped:
  1708  	default:
  1709  		return terror.ErrSchedulerSubTaskStageInvalidUpdate.Generate(newStage)
  1710  	}
  1711  
  1712  	release, err := s.subtaskLatch.tryAcquire(taskName)
  1713  	if err != nil {
  1714  		return terror.ErrSchedulerLatchInUse.Generate("UpdateExpectSubTaskStage", taskName)
  1715  	}
  1716  	defer release()
  1717  
  1718  	// 2. check the task exists.
  1719  	v, ok := s.expectSubTaskStages.Load(taskName)
  1720  	if !ok {
  1721  		return terror.ErrSchedulerSubTaskOpTaskNotExist.Generate(taskName)
  1722  	}
  1723  
  1724  	var (
  1725  		stagesM          = v.(map[string]ha.Stage)
  1726  		notExistSourcesM = make(map[string]struct{})
  1727  		currStagesM      = make(map[string]struct{})
  1728  		stages           = make([]ha.Stage, 0, len(sources))
  1729  	)
  1730  	for _, source := range sources {
  1731  		if currStage, ok := stagesM[source]; !ok {
  1732  			notExistSourcesM[source] = struct{}{}
  1733  		} else {
  1734  			currStagesM[currStage.Expect.String()] = struct{}{}
  1735  		}
  1736  		stages = append(stages, ha.NewSubTaskStage(newStage, source, taskName))
  1737  	}
  1738  	notExistSources := strMapToSlice(notExistSourcesM)
  1739  	currStages := strMapToSlice(currStagesM)
  1740  	if len(notExistSources) > 0 {
  1741  		// some sources not exist, reject the request.
  1742  		return terror.ErrSchedulerSubTaskOpSourceNotExist.Generate(notExistSources)
  1743  	} else if len(currStages) > 1 {
  1744  		// more than one current subtask stage exist, but need to update to the same one, log a warn.
  1745  		s.logger.Warn("update more than one current expectant subtask stage to the same one",
  1746  			zap.Strings("from", currStages), zap.Stringer("to", newStage))
  1747  	}
  1748  
  1749  	// 3. put the stages into etcd.
  1750  	_, err = ha.PutSubTaskStage(s.etcdCli, stages...)
  1751  	if err != nil {
  1752  		return err
  1753  	}
  1754  
  1755  	// 4. update the stages in the scheduler.
  1756  	for _, stage := range stages {
  1757  		stagesM[stage.Source] = stage
  1758  	}
  1759  
  1760  	return nil
  1761  }
  1762  
  1763  // GetExpectSubTaskStage returns the current expect subtask stage.
  1764  // If the stage not exists, an invalid stage is returned.
  1765  func (s *Scheduler) GetExpectSubTaskStage(task, source string) ha.Stage {
  1766  	invalidStage := ha.NewSubTaskStage(pb.Stage_InvalidStage, source, task)
  1767  
  1768  	release, err := s.subtaskLatch.tryAcquire(task)
  1769  	if err != nil {
  1770  		return invalidStage
  1771  	}
  1772  	defer release()
  1773  
  1774  	v, ok := s.expectSubTaskStages.Load(task)
  1775  	if !ok {
  1776  		return invalidStage
  1777  	}
  1778  	stageM := v.(map[string]ha.Stage)
  1779  	stage, ok := stageM[source]
  1780  	if !ok {
  1781  		return invalidStage
  1782  	}
  1783  	return stage
  1784  }
  1785  
// Started returns if the scheduler is started.
// it only reads the `started` flag and does not acquire the scheduler's mutex.
func (s *Scheduler) Started() bool {
	return s.started.Load()
}
  1790  
  1791  // recoverSourceCfgs recovers history source configs and expectant relay stages from etcd.
  1792  func (s *Scheduler) recoverSources() error {
  1793  	// get all source configs.
  1794  	cfgM, _, err := ha.GetSourceCfg(s.etcdCli, "", 0)
  1795  	if err != nil {
  1796  		return err
  1797  	}
  1798  	// get all relay stages.
  1799  	stageM, _, err := ha.GetAllRelayStage(s.etcdCli)
  1800  	if err != nil {
  1801  		return err
  1802  	}
  1803  
  1804  	// recover in-memory data.
  1805  	for source, cfg := range cfgM {
  1806  		s.sourceCfgs[source] = cfg
  1807  	}
  1808  	for source, stage := range stageM {
  1809  		s.expectRelayStages[source] = stage
  1810  	}
  1811  
  1812  	return nil
  1813  }
  1814  
  1815  // recoverSubTasks recovers history subtask configs and expectant subtask stages from etcd.
  1816  func (s *Scheduler) recoverSubTasks() error {
  1817  	// get all subtask configs.
  1818  	cfgMM, _, err := ha.GetAllSubTaskCfg(s.etcdCli)
  1819  	if err != nil {
  1820  		return err
  1821  	}
  1822  	// get all subtask stages.
  1823  	stageMM, _, err := ha.GetAllSubTaskStage(s.etcdCli)
  1824  	if err != nil {
  1825  		return err
  1826  	}
  1827  	validatorStageMM, _, err := ha.GetAllValidatorStage(s.etcdCli)
  1828  	if err != nil {
  1829  		return err
  1830  	}
  1831  
  1832  	// recover in-memory data.
  1833  	for source, cfgM := range cfgMM {
  1834  		for task, cfg := range cfgM {
  1835  			v, _ := s.subTaskCfgs.LoadOrStore(task, map[string]config.SubTaskConfig{})
  1836  			m := v.(map[string]config.SubTaskConfig)
  1837  			m[source] = cfg
  1838  		}
  1839  	}
  1840  	for source, stageM := range stageMM {
  1841  		for task, stage := range stageM {
  1842  			v, _ := s.expectSubTaskStages.LoadOrStore(task, map[string]ha.Stage{})
  1843  			m := v.(map[string]ha.Stage)
  1844  			m[source] = stage
  1845  		}
  1846  	}
  1847  	for source, stageM := range validatorStageMM {
  1848  		for task, stage := range stageM {
  1849  			v, _ := s.expectValidatorStages.LoadOrStore(task, map[string]ha.Stage{})
  1850  			m := v.(map[string]ha.Stage)
  1851  			m[source] = stage
  1852  		}
  1853  	}
  1854  
  1855  	return nil
  1856  }
  1857  
  1858  // recoverRelayConfigs recovers history relay configs for each worker from etcd.
  1859  // This function also removes conflicting relay schedule types, which means if a source has both `enable-relay` and
  1860  // (source, worker) relay config, we remove the latter.
  1861  // should be called after recoverSources.
  1862  func (s *Scheduler) recoverRelayConfigs() error {
  1863  	relayWorkers, _, err := ha.GetAllRelayConfig(s.etcdCli)
  1864  	if err != nil {
  1865  		return err
  1866  	}
  1867  
  1868  	for source, workers := range relayWorkers {
  1869  		sourceCfg, ok := s.sourceCfgs[source]
  1870  		if !ok {
  1871  			s.logger.Warn("found a not existing source by relay config", zap.String("source", source))
  1872  			continue
  1873  		}
  1874  		if sourceCfg.EnableRelay {
  1875  			// current etcd max-txn-op is 2048
  1876  			_, err2 := ha.DeleteRelayConfig(s.etcdCli, utils.SetToSlice(workers)...)
  1877  			if err2 != nil {
  1878  				return err2
  1879  			}
  1880  			delete(relayWorkers, source)
  1881  		}
  1882  	}
  1883  
  1884  	s.relayWorkers = relayWorkers
  1885  	return nil
  1886  }
  1887  
  1888  // recoverLoadTasks recovers history load workers from etcd.
  1889  func (s *Scheduler) recoverLoadTasks(needLock bool) (int64, error) {
  1890  	if needLock {
  1891  		s.mu.Lock()
  1892  		defer s.mu.Unlock()
  1893  	}
  1894  	loadTasks, rev, err := ha.GetAllLoadTask(s.etcdCli)
  1895  	if err != nil {
  1896  		return 0, err
  1897  	}
  1898  
  1899  	s.loadTasks = loadTasks
  1900  	return rev, nil
  1901  }
  1902  
// recoverWorkersBounds recovers history DM-worker info and status from etcd,
// and it also recovers the bound/unbound relationship.
// it reads `s.relayWorkers` and `s.sourceCfgs`, so those must be recovered before calling it.
// on success it returns the revision got from `GetKeepAliveWorkers`; on any failure it returns 0 and the error.
func (s *Scheduler) recoverWorkersBounds() (int64, error) {
	// 1. get all history base info.
	// no new DM-worker should be registered between this call and the below `GetKeepAliveWorkers`,
	// because no DM-master leader is handling DM-worker register requests.
	wim, _, err := ha.GetAllWorkerInfo(s.etcdCli)
	if err != nil {
		return 0, err
	}

	// 2. get all history bound relationships.
	// no new bound relationship should be added between this call and the below `GetKeepAliveWorkers`,
	// because no DM-master leader is doing the scheduling.
	sbm, _, err := ha.GetSourceBound(s.etcdCli, "")
	if err != nil {
		return 0, err
	}
	lastSourceBoundM, _, err := ha.GetLastSourceBounds(s.etcdCli)
	if err != nil {
		return 0, err
	}
	s.lastBound = lastSourceBoundM

	// 3. get all history offline status.
	// the revision returned here is the one returned to the caller.
	kam, rev, err := ha.GetKeepAliveWorkers(s.etcdCli)
	if err != nil {
		return 0, err
	}

	scm := s.sourceCfgs
	boundsToTrigger := make([]ha.SourceBound, 0)

	// 4. recover DM-worker info and status.
	// prepare a worker -> relay source map
	relayInfo := map[string]string{}
	for source, workers := range s.relayWorkers {
		for worker := range workers {
			relayInfo[worker] = source
		}
	}

	for name, info := range wim {
		// create and record the worker agent.
		w, err2 := s.recordWorker(info)
		if err2 != nil {
			return 0, err2
		}
		// set the stage as Free if it's keep alive.
		// a worker without keepalive info is only recorded; its bound (if any) stays in `sbm`.
		if _, ok := kam[name]; ok {
			w.ToFree()
			if source, ok2 := relayInfo[name]; ok2 {
				if err3 := w.StartRelay(source); err3 != nil {
					s.logger.DPanic("", zap.Error(err3))
				}
			}

			// set the stage as Bound and record the bound relationship if exists.
			if bound, ok := sbm[name]; ok {
				// source bounds without source configuration should be deleted later
				if _, ok := scm[bound.Source]; ok {
					err2 = s.updateStatusToBound(w, bound)
					if err2 != nil {
						// if etcd has saved KV that worker1 started relay for source1, but bound to source2,
						// we remove the bound to avoid DM master leader failed to bootstrap.
						// the bound is kept in `sbm` here, so it's deleted from etcd in step 5.
						if terror.ErrSchedulerBoundDiffWithStartedRelay.Equal(err2) {
							continue
						}
						return 0, err2
					}
					boundsToTrigger = append(boundsToTrigger, bound)
					// only successfully recovered bounds are removed from `sbm`.
					delete(sbm, name)
				} else {
					s.logger.Warn("find source bound without config", zap.Stringer("bound", bound))
				}
			}
		}
	}

	failpoint.Inject("failToRecoverWorkersBounds", func(_ failpoint.Value) {
		log.L().Info("mock failure", zap.String("failpoint", "failToRecoverWorkersBounds"))
		failpoint.Return(0, errors.New("failToRecoverWorkersBounds"))
	})
	// 5. delete invalid source bound info in etcd.
	// whatever is left in `sbm` was not recovered above, so it's treated as invalid.
	if len(sbm) > 0 {
		invalidSourceBounds := make([]string, 0, len(sbm))
		for name := range sbm {
			invalidSourceBounds = append(invalidSourceBounds, name)
		}
		_, err = ha.DeleteSourceBound(s.etcdCli, invalidSourceBounds...)
		if err != nil {
			return 0, err
		}
	}

	// 6. put trigger source bounds info to etcd to order dm-workers to start source
	if len(boundsToTrigger) > 0 {
		_, err = ha.PutSourceBound(s.etcdCli, boundsToTrigger...)
		if err != nil {
			return 0, err
		}
	}

	// 7. recover bounds/unbounds, all sources which are not in bounds should be in unbounds.
	for source := range s.sourceCfgs {
		if _, ok := s.bounds[source]; !ok {
			s.unbounds[source] = struct{}{}
		}
	}

	return rev, nil
}
  2015  
  2016  func (s *Scheduler) resetWorkerEv() (int64, error) {
  2017  	s.mu.Lock()
  2018  	defer s.mu.Unlock()
  2019  
  2020  	rwm := s.workers
  2021  	kam, rev, err := ha.GetKeepAliveWorkers(s.etcdCli)
  2022  	if err != nil {
  2023  		return 0, err
  2024  	}
  2025  
  2026  	// update all registered workers status
  2027  	for name := range rwm {
  2028  		ev := ha.WorkerEvent{WorkerName: name}
  2029  		// set the stage as Free if it's keep alive.
  2030  		if _, ok := kam[name]; ok {
  2031  			err = s.handleWorkerOnline(ev, false)
  2032  			if err != nil {
  2033  				return 0, err
  2034  			}
  2035  		} else {
  2036  			err = s.handleWorkerOffline(ev, false)
  2037  			if err != nil {
  2038  				return 0, err
  2039  			}
  2040  		}
  2041  	}
  2042  	return rev, nil
  2043  }
  2044  
  2045  // handleWorkerEv handles the online/offline status change event of DM-worker instances.
  2046  func (s *Scheduler) handleWorkerEv(ctx context.Context, evCh <-chan ha.WorkerEvent, errCh <-chan error) error {
  2047  	for {
  2048  		select {
  2049  		case <-ctx.Done():
  2050  			return nil
  2051  		case ev, ok := <-evCh:
  2052  			if !ok {
  2053  				return nil
  2054  			}
  2055  			s.logger.Info("receive worker status change event", zap.Bool("delete", ev.IsDeleted), zap.Stringer("event", ev))
  2056  			var err error
  2057  			if ev.IsDeleted {
  2058  				err = s.handleWorkerOffline(ev, true)
  2059  			} else {
  2060  				err = s.handleWorkerOnline(ev, true)
  2061  			}
  2062  			if err != nil {
  2063  				s.logger.Error("fail to handle worker status change event", zap.Bool("delete", ev.IsDeleted), zap.Stringer("event", ev), zap.Error(err))
  2064  				metrics.ReportWorkerEventErr(metrics.WorkerEventHandle)
  2065  			}
  2066  		case err, ok := <-errCh:
  2067  			if !ok {
  2068  				return nil
  2069  			}
  2070  			// error here are caused by etcd error or worker event decoding
  2071  			s.logger.Error("receive error when watching worker status change event", zap.Error(err))
  2072  			metrics.ReportWorkerEventErr(metrics.WorkerEventWatch)
  2073  			if etcdutil.IsRetryableError(err) {
  2074  				return err
  2075  			}
  2076  		}
  2077  	}
  2078  }
  2079  
// observeWorkerEvent watches the worker status change events in etcd starting from `rev+1`
// and handles them with handleWorkerEv until `ctx` is done or a non-retryable result is got.
// when handleWorkerEv returns a retryable error, the status of all workers is reset through
// resetWorkerEv (retried every 500ms until it succeeds) and a new watch is started from the
// revision it returns.
// nolint:dupl
func (s *Scheduler) observeWorkerEvent(ctx context.Context, rev int64) error {
	var wg sync.WaitGroup
	for {
		// fresh channels for every round, because the watcher goroutine closes them when it exits.
		workerEvCh := make(chan ha.WorkerEvent, 10)
		workerErrCh := make(chan error, 10)
		wg.Add(1)
		// use ctx1, cancel1 to make sure old watcher has been released
		ctx1, cancel1 := context.WithCancel(ctx)
		go func() {
			defer func() {
				close(workerEvCh)
				close(workerErrCh)
				wg.Done()
			}()
			ha.WatchWorkerEvent(ctx1, s.etcdCli, rev+1, workerEvCh, workerErrCh)
		}()
		err := s.handleWorkerEv(ctx1, workerEvCh, workerErrCh)
		// stop the watcher and wait for its goroutine to exit before starting another round or returning.
		cancel1()
		wg.Wait()

		if etcdutil.IsRetryableError(err) {
			// rev == 0 marks that the worker status has not been reset successfully yet.
			rev = 0
			retryNum := 1
			for rev == 0 {
				select {
				case <-ctx.Done():
					return nil
				case <-time.After(500 * time.Millisecond):
					// resetWorkerEv returns 0 as the revision when it fails, which keeps this loop retrying.
					rev, err = s.resetWorkerEv()
					if err != nil {
						log.L().Error("resetWorkerEv is failed, will retry later", zap.Error(err), zap.Int("retryNum", retryNum))
					}
				}
				retryNum++
			}
		} else {
			// err is nil (ctx done or channel closed) or a non-retryable error, quit in both cases.
			if err != nil {
				log.L().Error("observeWorkerEvent is failed and will quit now", zap.Error(err))
			} else {
				log.L().Info("observeWorkerEvent will quit now")
			}
			return err
		}
	}
}
  2126  
  2127  // handleWorkerOnline handles the scheduler when a DM-worker become online.
  2128  // This should try to bound an unbound source to it.
  2129  // NOTE: this func need to hold the mutex.
  2130  func (s *Scheduler) handleWorkerOnline(ev ha.WorkerEvent, toLock bool) error {
  2131  	if toLock {
  2132  		s.mu.Lock()
  2133  		defer s.mu.Unlock()
  2134  	}
  2135  
  2136  	// 1. find the worker.
  2137  	w, ok := s.workers[ev.WorkerName]
  2138  	if !ok {
  2139  		s.logger.Warn("worker for the event not exists", zap.Stringer("event", ev))
  2140  		return nil
  2141  	}
  2142  
  2143  	// 2. check whether is bound.
  2144  	if w.Stage() == WorkerBound {
  2145  		// also put identical relay config for this worker
  2146  		if source := w.RelaySourceID(); source != "" {
  2147  			_, err := ha.PutRelayConfig(s.etcdCli, source, w.BaseInfo().Name)
  2148  			if err != nil {
  2149  				return err
  2150  			}
  2151  		}
  2152  		// TODO: When dm-worker keepalive is broken, it will turn off its own running source
  2153  		// After keepalive is restored, this dm-worker should continue to run the previously bound source
  2154  		// So we PutSourceBound here to trigger dm-worker to get this event and start source again.
  2155  		// If this worker still start a source, it doesn't matter. dm-worker will omit same source and reject source with different name
  2156  		s.logger.Warn("worker already bound", zap.Stringer("bound", w.Bound()))
  2157  		_, err := ha.PutSourceBound(s.etcdCli, w.Bound())
  2158  		return err
  2159  	}
  2160  
  2161  	// 3. change the stage (from Offline) to Free or Relay.
  2162  	lastRelaySource := w.RelaySourceID()
  2163  	if lastRelaySource == "" {
  2164  		// when worker is removed (for example lost keepalive when master scheduler boots up), w.RelaySourceID() is
  2165  		// of course nothing, so we find the relay source from a better place
  2166  		for source, workerM := range s.relayWorkers {
  2167  			if _, ok2 := workerM[w.BaseInfo().Name]; ok2 {
  2168  				lastRelaySource = source
  2169  				break
  2170  			}
  2171  		}
  2172  	}
  2173  	w.ToFree()
  2174  	// TODO: rename ToFree to Online and move below logic inside it
  2175  	if lastRelaySource != "" {
  2176  		if err := w.StartRelay(lastRelaySource); err != nil {
  2177  			s.logger.DPanic("", zap.Error(err))
  2178  		}
  2179  	}
  2180  
  2181  	// 4. try to bind an unbound source.
  2182  	_, err := s.tryBoundForWorker(w)
  2183  	return err
  2184  }
  2185  
  2186  // handleWorkerOffline handles the scheduler when a DM-worker become offline.
  2187  // This should unbind any previous bound source.
  2188  // NOTE: this func need to hold the mutex.
  2189  func (s *Scheduler) handleWorkerOffline(ev ha.WorkerEvent, toLock bool) error {
  2190  	if toLock {
  2191  		s.mu.Lock()
  2192  		defer s.mu.Unlock()
  2193  	}
  2194  
  2195  	// 1. find the worker.
  2196  	w, ok := s.workers[ev.WorkerName]
  2197  	if !ok {
  2198  		s.logger.Warn("worker for the event not exists", zap.Stringer("event", ev))
  2199  		return nil
  2200  	}
  2201  
  2202  	// 2. find the bound relationship.
  2203  	bound := w.Bound()
  2204  
  2205  	// 3. check whether bound before.
  2206  	if bound.Source == "" {
  2207  		// 3.1. change the stage (from Free) to Offline.
  2208  		w.ToOffline()
  2209  		s.logger.Info("worker not bound, no need to unbound", zap.Stringer("event", ev))
  2210  		return nil
  2211  	}
  2212  
  2213  	// 4. delete the bound relationship in etcd.
  2214  	_, err := ha.DeleteSourceBound(s.etcdCli, bound.Worker)
  2215  	if err != nil {
  2216  		return err
  2217  	}
  2218  
  2219  	// 5. unbound for the source.
  2220  	s.updateStatusToUnbound(bound.Source)
  2221  
  2222  	// 6. change the stage (from Free) to Offline.
  2223  	w.ToOffline()
  2224  
  2225  	s.logger.Info("unbound the worker for source", zap.Stringer("bound", bound), zap.Stringer("event", ev))
  2226  
  2227  	// 7. try to bound the source to a Free worker again.
  2228  	_, err = s.tryBoundForSource(bound.Source)
  2229  	return err
  2230  }
  2231  
// tryBoundForWorker tries to bind a source to the given worker. The order of picking source is
// - try to bind sources on which the worker has unfinished load task
// - try to bind the last bound source
// - if enabled relay, bind to the relay source or keep unbound
// - try to bind any unbound sources
// if the source is bound to a relay enabled worker, we must check that the source is also the relay source of worker.
// pulling binlog using relay or not is determined by whether the worker has enabled relay.
// it returns whether a source has been bound to the worker, and the error met when binding or transferring.
func (s *Scheduler) tryBoundForWorker(w *Worker) (bound bool, err error) {
	// 1. handle this worker has unfinished load task.
	worker, sourceID := s.getNextLoadTaskTransfer(w.BaseInfo().Name, "")
	if sourceID != "" {
		s.logger.Info("found unfinished load task source when worker bound",
			zap.String("worker", w.BaseInfo().Name),
			zap.String("source", sourceID))
		// TODO: tolerate a failed transfer because of start-relay conflicts with loadTask
		err = s.transferWorkerAndSource(w.BaseInfo().Name, "", worker, sourceID)
		return err == nil, err
	}

	// check if last bound is still available.
	// NOTE: if worker isn't in lastBound, we'll get "zero" SourceBound and it's OK, because "zero" string is not in
	// unbounds
	source := s.lastBound[w.baseInfo.Name].Source
	if _, ok := s.unbounds[source]; !ok {
		source = ""
	}

	if source != "" {
		relaySource := w.RelaySourceID()
		if relaySource != "" && relaySource != source {
			// the worker has started relay for another source, so the last bound source can't be used.
			source = ""
		} else {
			// worker not enable relay or last bound is relay source
			s.logger.Info("found history source when worker bound",
				zap.String("worker", w.BaseInfo().Name),
				zap.String("source", source))
		}
	}

	// try to find its relay source (currently only one relay source)
	if source == "" {
		source = w.RelaySourceID()
		if source != "" {
			s.logger.Info("found relay source when worker bound",
				zap.String("worker", w.BaseInfo().Name),
				zap.String("source", source))
			// currently worker can only handle same relay source and source bound, so we don't try bound another source
			if oldWorker, ok := s.bounds[source]; ok {
				s.logger.Info("worker has started relay for a source, but that source is bound to another worker, so we let this worker free",
					zap.String("worker", w.BaseInfo().Name),
					zap.String("relay source", source),
					zap.String("bound worker for its relay source", oldWorker.BaseInfo().Name))
				return false, nil
			}
		}
	}

	// randomly pick one from unbounds (map iteration order is not specified).
	if source == "" {
		for source = range s.unbounds {
			s.logger.Info("found unbound source when worker bound",
				zap.String("worker", w.BaseInfo().Name),
				zap.String("source", source))
			break // got a source.
		}
	}

	if source == "" {
		s.logger.Info("no unbound sources need to bound", zap.Stringer("worker", w.BaseInfo()))
		return false, nil
	}

	// 2. try to bound them.
	err = s.boundSourceToWorker(source, w)
	if err != nil {
		return false, err
	}
	return true, nil
}
  2311  
// tryBoundForSource tries to bound a source to a random Free worker. The order of picking worker is
// - try to bind a worker which has unfinished load task
// - try to bind a relay worker which has been bound to this source before
// - try to bind any relay worker
// - try to bind any worker which has been bound to this source before
// - try to bind any free worker
// pulling binlog using relay or not is determined by whether the worker has enabled relay.
// caller should update the s.unbounds.
// caller should make sure this source has source config.
// it returns whether the source has been bound to a worker, and the error met when binding or transferring.
func (s *Scheduler) tryBoundForSource(source string) (bool, error) {
	var worker *Worker

	// pick a worker which has subtask in load stage.
	workerName, sourceID := s.getNextLoadTaskTransfer("", source)
	if workerName != "" {
		// TODO: check relay source conflict
		err := s.transferWorkerAndSource("", source, workerName, sourceID)
		return err == nil, err
	}

	relayWorkers := s.relayWorkers[source]
	// 1. try to find a history worker in relay workers...
	if len(relayWorkers) > 0 {
		for workerName, bound := range s.lastBound {
			if bound.Source == source {
				w, ok := s.workers[workerName]
				if !ok {
					// a not found worker
					continue
				}
				// the worker must be a relay worker of this source and in Relay stage, so it is not Offline
				if _, ok2 := relayWorkers[workerName]; ok2 && w.Stage() == WorkerRelay {
					worker = w
					s.logger.Info("found history relay worker when source bound",
						zap.String("worker", workerName),
						zap.String("source", source))
					break
				}
			}
		}
	}
	// then a relay worker for this source...
	if worker == nil {
		for workerName := range relayWorkers {
			w, ok := s.workers[workerName]
			if !ok {
				// a not found worker, should not happen
				s.logger.DPanic("worker instance not found for relay worker", zap.String("worker", workerName))
				continue
			}
			// the worker is in Relay stage, so it is not Offline
			if w.Stage() == WorkerRelay {
				worker = w
				s.logger.Info("found relay worker when source bound",
					zap.String("worker", workerName),
					zap.String("source", source))
				break
			}
		}
	}
	// then a history worker for this source...
	if worker == nil {
		for workerName, bound := range s.lastBound {
			if bound.Source == source {
				w, ok := s.workers[workerName]
				if !ok {
					// a not found worker
					continue
				}
				if w.Stage() == WorkerFree {
					worker = w
					s.logger.Info("found history worker when source bound",
						zap.String("worker", workerName),
						zap.String("source", source))
					break
				}
			}
		}
	}

	// and then a random Free worker (map iteration order is not specified).
	if worker == nil {
		for _, w := range s.workers {
			if w.Stage() == WorkerFree {
				worker = w
				s.logger.Info("found free worker when source bound",
					zap.String("worker", w.BaseInfo().Name),
					zap.String("source", source))
				break
			}
		}
	}

	if worker == nil {
		s.logger.Info("no free worker exists for bound", zap.String("source", source))
		return false, nil
	}

	// 2. try to bound them.
	err := s.boundSourceToWorker(source, worker)
	if err != nil {
		return false, err
	}
	return true, nil
}
  2417  
  2418  // boundSourceToWorker bounds the source and worker together.
  2419  // we should check the bound relationship of the source and the stage of the worker in the caller.
  2420  func (s *Scheduler) boundSourceToWorker(source string, w *Worker) error {
  2421  	// 1. put the bound relationship into etcd.
  2422  	var err error
  2423  	bound := ha.NewSourceBound(source, w.BaseInfo().Name)
  2424  	sourceCfg, ok := s.sourceCfgs[source]
  2425  	if ok && sourceCfg.EnableRelay {
  2426  		stage := ha.NewRelayStage(pb.Stage_Running, source)
  2427  		_, err = ha.PutRelayStageSourceBound(s.etcdCli, stage, bound)
  2428  	} else {
  2429  		_, err = ha.PutSourceBound(s.etcdCli, bound)
  2430  	}
  2431  	if err != nil {
  2432  		return err
  2433  	}
  2434  
  2435  	// 2. update the bound relationship in the scheduler.
  2436  	err = s.updateStatusToBound(w, bound)
  2437  	if err != nil {
  2438  		return err
  2439  	}
  2440  
  2441  	s.logger.Info("bound the source to worker", zap.Stringer("bound", bound))
  2442  	return nil
  2443  }
  2444  
  2445  // recordWorker creates the worker agent (with Offline stage) and records in the scheduler.
  2446  // this func is used when adding a new worker.
  2447  // NOTE: trigger scheduler when the worker become online, not when added.
  2448  func (s *Scheduler) recordWorker(info ha.WorkerInfo) (*Worker, error) {
  2449  	w, err := NewWorker(info, s.securityCfg)
  2450  	if err != nil {
  2451  		return nil, err
  2452  	}
  2453  	s.workers[info.Name] = w
  2454  	return w, nil
  2455  }
  2456  
  2457  // deleteWorker deletes the recorded worker and bound.
  2458  // this func is used when removing the worker.
  2459  // NOTE: trigger scheduler when the worker become offline, not when deleted.
  2460  func (s *Scheduler) deleteWorker(name string) {
  2461  	for _, workers := range s.relayWorkers {
  2462  		delete(workers, name)
  2463  	}
  2464  	w, ok := s.workers[name]
  2465  	if !ok {
  2466  		return
  2467  	}
  2468  	w.Close()
  2469  	delete(s.workers, name)
  2470  	metrics.RemoveWorkerState(w.baseInfo.Name)
  2471  }
  2472  
  2473  // updateStatusToBound updates the in-memory status for bound, including:
  2474  // - update the stage of worker to `Bound`.
  2475  // - record the bound relationship and last bound relationship in the scheduler.
  2476  // - remove the unbound relationship in the scheduler.
  2477  // this func is called after the bound relationship existed in etcd.
  2478  func (s *Scheduler) updateStatusToBound(w *Worker, b ha.SourceBound) error {
  2479  	if err := w.ToBound(b); err != nil {
  2480  		return err
  2481  	}
  2482  	s.bounds[b.Source] = w
  2483  	s.lastBound[b.Worker] = b
  2484  	delete(s.unbounds, b.Source)
  2485  	return nil
  2486  }
  2487  
  2488  // updateStatusToUnbound updates the in-memory status for unbound, including:
  2489  // - update the stage of worker to `Free` or `Relay`.
  2490  // - remove the bound relationship in the scheduler.
  2491  // - record the unbound relationship in the scheduler.
  2492  // this func is called after the bound relationship removed from etcd.
  2493  func (s *Scheduler) updateStatusToUnbound(source string) {
  2494  	s.unbounds[source] = struct{}{}
  2495  	w, ok := s.bounds[source]
  2496  	if !ok {
  2497  		return
  2498  	}
  2499  	if err := w.Unbound(); err != nil {
  2500  		s.logger.DPanic("cannot updateStatusToUnbound", zap.Error(err))
  2501  	}
  2502  	delete(s.bounds, source)
  2503  }
  2504  
  2505  // reset resets the internal status.
  2506  func (s *Scheduler) reset() {
  2507  	s.subtaskLatch = newLatches()
  2508  	s.sourceCfgs = make(map[string]*config.SourceConfig)
  2509  	s.subTaskCfgs = sync.Map{}
  2510  	s.workers = make(map[string]*Worker)
  2511  	s.bounds = make(map[string]*Worker)
  2512  	s.unbounds = make(map[string]struct{})
  2513  	s.expectRelayStages = make(map[string]ha.Stage)
  2514  	s.expectSubTaskStages = sync.Map{}
  2515  	s.loadTasks = make(map[string]map[string]string)
  2516  }
  2517  
  2518  // strMapToSlice converts a `map[string]struct{}` to `[]string` in increasing order.
  2519  func strMapToSlice(m map[string]struct{}) []string {
  2520  	ret := make([]string, 0, len(m))
  2521  	for s := range m {
  2522  		ret = append(ret, s)
  2523  	}
  2524  	sort.Strings(ret)
  2525  	return ret
  2526  }
  2527  
  2528  // SetWorkerClientForTest sets mockWorkerClient for specified worker, only used for test.
  2529  func (s *Scheduler) SetWorkerClientForTest(name string, mockCli workerrpc.Client) {
  2530  	if _, ok := s.workers[name]; ok {
  2531  		s.workers[name].cli = mockCli
  2532  	}
  2533  }
  2534  
  2535  // nolint:dupl
  2536  func (s *Scheduler) observeLoadTask(ctx context.Context, rev int64) error {
  2537  	var wg sync.WaitGroup
  2538  	for {
  2539  		loadTaskCh := make(chan ha.LoadTask, 10)
  2540  		loadTaskErrCh := make(chan error, 10)
  2541  		wg.Add(1)
  2542  		// use ctx1, cancel1 to make sure old watcher has been released
  2543  		ctx1, cancel1 := context.WithCancel(ctx)
  2544  		go func() {
  2545  			defer func() {
  2546  				close(loadTaskCh)
  2547  				close(loadTaskErrCh)
  2548  				wg.Done()
  2549  			}()
  2550  			ha.WatchLoadTask(ctx1, s.etcdCli, rev+1, loadTaskCh, loadTaskErrCh)
  2551  		}()
  2552  		err := s.handleLoadTask(ctx1, loadTaskCh, loadTaskErrCh)
  2553  		cancel1()
  2554  		wg.Wait()
  2555  
  2556  		if etcdutil.IsRetryableError(err) {
  2557  			rev = 0
  2558  			retryNum := 1
  2559  			for rev == 0 {
  2560  				select {
  2561  				case <-ctx.Done():
  2562  					return nil
  2563  				case <-time.After(500 * time.Millisecond):
  2564  					rev, err = s.recoverLoadTasks(true)
  2565  					if err != nil {
  2566  						log.L().Error("resetLoadTask is failed, will retry later", zap.Error(err), zap.Int("retryNum", retryNum))
  2567  					}
  2568  				}
  2569  				retryNum++
  2570  			}
  2571  		} else {
  2572  			if err != nil {
  2573  				log.L().Error("observeLoadTask is failed and will quit now", zap.Error(err))
  2574  			} else {
  2575  				log.L().Info("observeLoadTask will quit now")
  2576  			}
  2577  			return err
  2578  		}
  2579  	}
  2580  }
  2581  
  2582  // RemoveLoadTaskAndLightningStatus removes the loadtask and lightning status by task.
  2583  func (s *Scheduler) RemoveLoadTaskAndLightningStatus(task string) error {
  2584  	s.mu.Lock()
  2585  	defer s.mu.Unlock()
  2586  
  2587  	if !s.started.Load() {
  2588  		return terror.ErrSchedulerNotStarted.Generate()
  2589  	}
  2590  	_, _, err := ha.DelLoadTaskByTask(s.etcdCli, task)
  2591  	if err != nil {
  2592  		return err
  2593  	}
  2594  	delete(s.loadTasks, task)
  2595  	_, err = ha.DeleteLightningStatusForTask(s.etcdCli, task)
  2596  	return err
  2597  }
  2598  
  2599  // getTransferWorkerAndSource tries to get transfer worker and source.
  2600  // return (worker, source) that is used by transferWorkerAndSource, to try to resolve a paused load task that the source can't be bound to the worker which has its dump files.
  2601  // worker, source	This means a subtask finish load stage, often called by handleLoadTaskDel.
  2602  // worker, ""		This means a free worker online, often called by tryBoundForWorker.
  2603  // "", source		This means a unbound source online, often called by tryBoundForSource.
  2604  func (s *Scheduler) getNextLoadTaskTransfer(worker, source string) (string, string) {
  2605  	// origin worker not free, try to get a source.
  2606  	if worker != "" {
  2607  		// try to get a unbound source
  2608  		for sourceID := range s.unbounds {
  2609  			if sourceID != source && s.hasLoadTaskByWorkerAndSource(worker, sourceID) {
  2610  				return "", sourceID
  2611  			}
  2612  		}
  2613  		// try to get a bound source
  2614  		for sourceID, w := range s.bounds {
  2615  			if sourceID != source && s.hasLoadTaskByWorkerAndSource(worker, sourceID) && !s.hasLoadTaskByWorkerAndSource(w.baseInfo.Name, sourceID) {
  2616  				return w.baseInfo.Name, sourceID
  2617  			}
  2618  		}
  2619  	}
  2620  
  2621  	// origin source is bound, try to get a worker
  2622  	if source != "" {
  2623  		// try to get a free worker
  2624  		for _, w := range s.workers {
  2625  			workerName := w.baseInfo.Name
  2626  			if workerName != worker && w.Stage() == WorkerFree && s.hasLoadTaskByWorkerAndSource(workerName, source) {
  2627  				return workerName, ""
  2628  			}
  2629  		}
  2630  
  2631  		// try to get a bound worker
  2632  		for _, w := range s.workers {
  2633  			workerName := w.baseInfo.Name
  2634  			if workerName != worker && w.Stage() == WorkerBound {
  2635  				if s.hasLoadTaskByWorkerAndSource(workerName, source) && !s.hasLoadTaskByWorkerAndSource(workerName, w.bound.Source) {
  2636  					return workerName, w.bound.Source
  2637  				}
  2638  			}
  2639  		}
  2640  	}
  2641  
  2642  	return "", ""
  2643  }
  2644  
  2645  // hasLoadTaskByWorkerAndSource check whether there is an existing load subtask for the worker and source.
  2646  func (s *Scheduler) hasLoadTaskByWorkerAndSource(worker, source string) bool {
  2647  	for taskName, sourceWorkerMap := range s.loadTasks {
  2648  		// don't consider removed subtask
  2649  		subtasksV, ok := s.subTaskCfgs.Load(taskName)
  2650  		if !ok {
  2651  			continue
  2652  		}
  2653  		subtasks := subtasksV.(map[string]config.SubTaskConfig)
  2654  		if _, ok2 := subtasks[source]; !ok2 {
  2655  			continue
  2656  		}
  2657  
  2658  		if workerName, ok2 := sourceWorkerMap[source]; ok2 && workerName == worker {
  2659  			return true
  2660  		}
  2661  	}
  2662  	return false
  2663  }
  2664  
  2665  // TryResolveLoadTask checks if there are sources whose load task has local files and not bound to the worker which is
  2666  // accessible to the local files. If so, trigger a transfer source.
  2667  func (s *Scheduler) TryResolveLoadTask(sources []string) {
  2668  	for _, source := range sources {
  2669  		s.mu.Lock()
  2670  		worker, ok := s.bounds[source]
  2671  		if !ok {
  2672  			s.mu.Unlock()
  2673  			continue
  2674  		}
  2675  		if err := s.tryResolveLoadTask(worker.baseInfo.Name, source); err != nil {
  2676  			s.logger.Error("tryResolveLoadTask failed", zap.Error(err))
  2677  		}
  2678  		s.mu.Unlock()
  2679  	}
  2680  }
  2681  
  2682  func (s *Scheduler) tryResolveLoadTask(originWorker, originSource string) error {
  2683  	if s.hasLoadTaskByWorkerAndSource(originWorker, originSource) {
  2684  		return nil
  2685  	}
  2686  
  2687  	worker, source := s.getNextLoadTaskTransfer(originWorker, originSource)
  2688  	if worker == "" && source == "" {
  2689  		return nil
  2690  	}
  2691  
  2692  	return s.transferWorkerAndSource(originWorker, originSource, worker, source)
  2693  }
  2694  
  2695  func (s *Scheduler) handleLoadTaskDel(loadTask ha.LoadTask) error {
  2696  	s.mu.Lock()
  2697  	defer s.mu.Unlock()
  2698  
  2699  	if _, ok := s.loadTasks[loadTask.Task]; !ok {
  2700  		return nil
  2701  	}
  2702  	if _, ok := s.loadTasks[loadTask.Task][loadTask.Source]; !ok {
  2703  		return nil
  2704  	}
  2705  
  2706  	originWorker := s.loadTasks[loadTask.Task][loadTask.Source]
  2707  	delete(s.loadTasks[loadTask.Task], loadTask.Source)
  2708  	if len(s.loadTasks[loadTask.Task]) == 0 {
  2709  		delete(s.loadTasks, loadTask.Task)
  2710  	}
  2711  
  2712  	return s.tryResolveLoadTask(originWorker, loadTask.Source)
  2713  }
  2714  
  2715  func (s *Scheduler) handleLoadTaskPut(loadTask ha.LoadTask) {
  2716  	s.mu.Lock()
  2717  	defer s.mu.Unlock()
  2718  
  2719  	if _, ok := s.loadTasks[loadTask.Task]; !ok {
  2720  		s.loadTasks[loadTask.Task] = make(map[string]string)
  2721  	}
  2722  	s.loadTasks[loadTask.Task][loadTask.Source] = loadTask.Worker
  2723  }
  2724  
  2725  // handleLoadTask handles the load worker status change event.
  2726  func (s *Scheduler) handleLoadTask(ctx context.Context, loadTaskCh <-chan ha.LoadTask, errCh <-chan error) error {
  2727  	for {
  2728  		select {
  2729  		case <-ctx.Done():
  2730  			return nil
  2731  		case loadTask, ok := <-loadTaskCh:
  2732  			if !ok {
  2733  				return nil
  2734  			}
  2735  			s.logger.Info("receive load task", zap.Bool("delete", loadTask.IsDelete), zap.String("task", loadTask.Task), zap.String("source", loadTask.Source), zap.String("worker", loadTask.Worker))
  2736  			var err error
  2737  			if loadTask.IsDelete {
  2738  				err = s.handleLoadTaskDel(loadTask)
  2739  			} else {
  2740  				s.handleLoadTaskPut(loadTask)
  2741  			}
  2742  			if err != nil {
  2743  				s.logger.Error("fail to handle worker status change event", zap.Error(err))
  2744  			}
  2745  		case err, ok := <-errCh:
  2746  			if !ok {
  2747  				return nil
  2748  			}
  2749  			// error here are caused by etcd error or load worker decoding
  2750  			s.logger.Error("receive error when watching load worker", zap.Error(err))
  2751  			if etcdutil.IsRetryableError(err) {
  2752  				return err
  2753  			}
  2754  		}
  2755  	}
  2756  }
  2757  
  2758  // OperateValidationTask operate validator of subtask.
  2759  //
  2760  //	tasks: tasks need to operate
  2761  //	validatorStages: stage info of subtask validators
  2762  //	changedSubtaskCfgs: changed subtask configs
  2763  //
  2764  // see server.StartValidation/StopValidation for more detail.
  2765  func (s *Scheduler) OperateValidationTask(validatorStages []ha.Stage, changedSubtaskCfgs []config.SubTaskConfig) error {
  2766  	s.mu.Lock()
  2767  	defer s.mu.Unlock()
  2768  	if !s.started.Load() {
  2769  		return terror.ErrSchedulerNotStarted.Generate()
  2770  	}
  2771  
  2772  	// 2. setting subtask stage in etcd
  2773  	if len(changedSubtaskCfgs) > 0 || len(validatorStages) > 0 {
  2774  		_, err := ha.PutSubTaskCfgStage(s.etcdCli, changedSubtaskCfgs, []ha.Stage{}, validatorStages)
  2775  		if err != nil {
  2776  			return terror.Annotate(err, "fail to set new validator stage")
  2777  		}
  2778  	}
  2779  	// 3. cache validator stage
  2780  	for _, stage := range validatorStages {
  2781  		v, _ := s.expectValidatorStages.LoadOrStore(stage.Task, map[string]ha.Stage{})
  2782  		m := v.(map[string]ha.Stage)
  2783  		m[stage.Source] = stage
  2784  	}
  2785  	for _, cfg := range changedSubtaskCfgs {
  2786  		v, _ := s.subTaskCfgs.LoadOrStore(cfg.Name, map[string]config.SubTaskConfig{})
  2787  		m := v.(map[string]config.SubTaskConfig)
  2788  		m[cfg.SourceID] = cfg
  2789  	}
  2790  	return nil
  2791  }
  2792  
  2793  // ValidatorEnabled returns true when validator of task-source pair has enabled, i.e. validation mode is not none.
  2794  // enabled validator can be in running or stopped stage.
  2795  func (s *Scheduler) ValidatorEnabled(task, source string) bool {
  2796  	return s.GetValidatorStage(task, source) != nil
  2797  }
  2798  
  2799  // GetValidatorStage get validator stage of task-source pair.
  2800  func (s *Scheduler) GetValidatorStage(task, source string) *ha.Stage {
  2801  	s.mu.RLock()
  2802  	defer s.mu.RUnlock()
  2803  	v, ok := s.expectValidatorStages.Load(task)
  2804  	if !ok {
  2805  		return nil
  2806  	}
  2807  	m := v.(map[string]ha.Stage)
  2808  	if stage, ok2 := m[source]; ok2 {
  2809  		return &stage
  2810  	}
  2811  	return nil
  2812  }