
     1  // Copyright 2021 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    14  package owner
    16  import (
    17  	"math"
    19  	""
    20  	""
    21  	""
    22  	""
    23  	cerror ""
    24  	""
    25  )
// schedulerJobType names the kind of table operation carried by a schedulerJob.
type schedulerJobType string
// Job types emitted by the scheduler: "ADD" jobs add a table to the target
// capture's task status, "REMOVE" jobs remove a table from it.
const (
	schedulerJobTypeAddTable    schedulerJobType = "ADD"
	schedulerJobTypeRemoveTable schedulerJobType = "REMOVE"
)
// schedulerJob describes one add-table or remove-table operation for the
// table TableID, to be applied to the task status of TargetCapture.
type schedulerJob struct {
	Tp      schedulerJobType
	TableID model.TableID
	// BoundaryTs is the checkpoint ts when the job removes a table,
	// and the start ts when the job adds a table.
	BoundaryTs    uint64
	TargetCapture model.CaptureID
}
// moveTableJob is a queued request, created by MoveTable, to move the table
// tableID to the capture target.
type moveTableJob struct {
	tableID model.TableID
	target  model.CaptureID
}
// scheduler dispatches tables to captures and keeps the bookkeeping needed
// for manual move-table requests and rebalancing.
type scheduler struct {
	// Inputs of the current tick; Tick overwrites all three on every call.
	state         *model.ChangefeedReactorState
	currentTables []model.TableID
	captures      map[model.CaptureID]*model.CaptureInfo
	// moveTableTargets remembers the requested destination of tables that are being moved,
	// moveTableJobQueue holds the requests appended by MoveTable until handleMoveTableJob processes them,
	// needRebalanceNextTick is set by Rebalance and cleared by shouldRebalance,
	// lastTickCaptureCount is the number of captures seen at the end of the previous Tick.
	moveTableTargets      map[model.TableID]model.CaptureID
	moveTableJobQueue     []*moveTableJob
	needRebalanceNextTick bool
	lastTickCaptureCount  int
}
    59  func newScheduler() *scheduler {
    60  	return &scheduler{
    61  		moveTableTargets: make(map[model.TableID]model.CaptureID),
    62  	}
    63  }
    65  // Tick is the main function of scheduler. It dispatches tables to captures and handles move-table and rebalance events.
    66  // Tick returns a bool representing whether the changefeed's state can be updated in this tick.
    67  // The state can be updated only if all the tables which should be listened to have been dispatched to captures and no operations have been sent to captures in this tick.
    68  func (s *scheduler) Tick(state *model.ChangefeedReactorState, currentTables []model.TableID, captures map[model.CaptureID]*model.CaptureInfo) (shouldUpdateState bool, err error) {
    69  	s.state = state
    70  	s.currentTables = currentTables
    71  	s.captures = captures
    73  	s.cleanUpFinishedOperations()
    74  	pendingJob, err := s.syncTablesWithCurrentTables()
    75  	if err != nil {
    76  		return false, errors.Trace(err)
    77  	}
    78  	s.dispatchToTargetCaptures(pendingJob)
    79  	if len(pendingJob) != 0 {
    80  		log.Debug("scheduler:generated pending job to be executed", zap.Any("pendingJob", pendingJob))
    81  	}
    82  	s.handleJobs(pendingJob)
    84  	// only if the pending job list is empty and no table is being rebalanced or moved,
    85  	// can the global resolved ts and checkpoint ts be updated
    86  	shouldUpdateState = len(pendingJob) == 0
    87  	shouldUpdateState = s.rebalance() && shouldUpdateState
    88  	shouldUpdateStateInMoveTable, err := s.handleMoveTableJob()
    89  	if err != nil {
    90  		return false, errors.Trace(err)
    91  	}
    92  	shouldUpdateState = shouldUpdateStateInMoveTable && shouldUpdateState
    93  	s.lastTickCaptureCount = len(captures)
    94  	return shouldUpdateState, nil
    95  }
    97  func (s *scheduler) MoveTable(tableID model.TableID, target model.CaptureID) {
    98  	s.moveTableJobQueue = append(s.moveTableJobQueue, &moveTableJob{
    99  		tableID: tableID,
   100  		target:  target,
   101  	})
   102  }
   104  // handleMoveTableJob handles the move table job add be MoveTable function
   105  func (s *scheduler) handleMoveTableJob() (shouldUpdateState bool, err error) {
   106  	shouldUpdateState = true
   107  	if len(s.moveTableJobQueue) == 0 {
   108  		return
   109  	}
   110  	table2CaptureIndex, err := s.table2CaptureIndex()
   111  	if err != nil {
   112  		return false, errors.Trace(err)
   113  	}
   114  	for _, job := range s.moveTableJobQueue {
   115  		source, exist := table2CaptureIndex[job.tableID]
   116  		if !exist {
   117  			return
   118  		}
   119  		s.moveTableTargets[job.tableID] =
   120  		job := job
   121  		shouldUpdateState = false
   122  		// for all move table job, here just remove the table from the source capture.
   123  		// and the table removed by this function will be added to target capture by syncTablesWithCurrentTables in the next tick.
   124  		s.state.PatchTaskStatus(source, func(status *model.TaskStatus) (*model.TaskStatus, bool, error) {
   125  			if status == nil {
   126  				// the capture may be down, just skip remove this table
   127  				return status, false, nil
   128  			}
   129  			if status.Operation != nil && status.Operation[job.tableID] != nil {
   130  				// skip removing this table to avoid the remove operation created by the rebalance function interfering with the operation created by another function
   131  				return status, false, nil
   132  			}
   133  			status.RemoveTable(job.tableID, s.state.Status.CheckpointTs, false)
   134  			return status, true, nil
   135  		})
   136  	}
   137  	s.moveTableJobQueue = nil
   138  	return
   139  }
// Rebalance requests a table rebalance. It only raises needRebalanceNextTick;
// shouldRebalance consumes and clears that flag.
func (s *scheduler) Rebalance() {
	s.needRebalanceNextTick = true
}
   145  func (s *scheduler) table2CaptureIndex() (map[model.TableID]model.CaptureID, error) {
   146  	table2CaptureIndex := make(map[model.TableID]model.CaptureID)
   147  	for captureID, taskStatus := range s.state.TaskStatuses {
   148  		for tableID := range taskStatus.Tables {
   149  			if preCaptureID, exist := table2CaptureIndex[tableID]; exist && preCaptureID != captureID {
   150  				return nil, cerror.ErrTableListenReplicated.GenWithStackByArgs(tableID, preCaptureID, captureID)
   151  			}
   152  			table2CaptureIndex[tableID] = captureID
   153  		}
   154  		for tableID := range taskStatus.Operation {
   155  			if preCaptureID, exist := table2CaptureIndex[tableID]; exist && preCaptureID != captureID {
   156  				return nil, cerror.ErrTableListenReplicated.GenWithStackByArgs(tableID, preCaptureID, captureID)
   157  			}
   158  			table2CaptureIndex[tableID] = captureID
   159  		}
   160  	}
   161  	return table2CaptureIndex, nil
   162  }
   164  // dispatchToTargetCaptures sets the the TargetCapture of scheduler jobs
   165  // If the TargetCapture of a job is not set, it chooses a capture with the minimum workload and sets the TargetCapture to the capture.
   166  func (s *scheduler) dispatchToTargetCaptures(pendingJobs []*schedulerJob) {
   167  	workloads := make(map[model.CaptureID]uint64)
   169  	for captureID := range s.captures {
   170  		workloads[captureID] = 0
   171  		taskWorkload := s.state.Workloads[captureID]
   172  		if taskWorkload == nil {
   173  			continue
   174  		}
   175  		for _, workload := range taskWorkload {
   176  			workloads[captureID] += workload.Workload
   177  		}
   178  	}
   180  	for _, pendingJob := range pendingJobs {
   181  		if pendingJob.TargetCapture == "" {
   182  			target, exist := s.moveTableTargets[pendingJob.TableID]
   183  			if !exist {
   184  				continue
   185  			}
   186  			pendingJob.TargetCapture = target
   187  			delete(s.moveTableTargets, pendingJob.TableID)
   188  			continue
   189  		}
   190  		switch pendingJob.Tp {
   191  		case schedulerJobTypeAddTable:
   192  			workloads[pendingJob.TargetCapture] += 1
   193  		case schedulerJobTypeRemoveTable:
   194  			workloads[pendingJob.TargetCapture] -= 1
   195  		default:
   196  			log.Panic("Unreachable, please report a bug",
   197  				zap.String("changefeed", s.state.ID), zap.Any("job", pendingJob))
   198  		}
   199  	}
   201  	getMinWorkloadCapture := func() model.CaptureID {
   202  		minCapture := ""
   203  		minWorkLoad := uint64(math.MaxUint64)
   204  		for captureID, workload := range workloads {
   205  			if workload < minWorkLoad {
   206  				minCapture = captureID
   207  				minWorkLoad = workload
   208  			}
   209  		}
   211  		if minCapture == "" {
   212  			log.Panic("Unreachable, no capture is found")
   213  		}
   214  		return minCapture
   215  	}
   217  	for _, pendingJob := range pendingJobs {
   218  		if pendingJob.TargetCapture != "" {
   219  			continue
   220  		}
   221  		minCapture := getMinWorkloadCapture()
   222  		pendingJob.TargetCapture = minCapture
   223  		workloads[minCapture] += 1
   224  	}
   225  }
   227  // syncTablesWithCurrentTables iterates all current tables to check whether it should be listened or not.
   228  // this function will return schedulerJob to make sure all tables will be listened.
   229  func (s *scheduler) syncTablesWithCurrentTables() ([]*schedulerJob, error) {
   230  	var pendingJob []*schedulerJob
   231  	allTableListeningNow, err := s.table2CaptureIndex()
   232  	if err != nil {
   233  		return nil, errors.Trace(err)
   234  	}
   235  	globalCheckpointTs := s.state.Status.CheckpointTs
   236  	for _, tableID := range s.currentTables {
   237  		if _, exist := allTableListeningNow[tableID]; exist {
   238  			delete(allTableListeningNow, tableID)
   239  			continue
   240  		}
   241  		// For each table which should be listened but is not, add an adding-table job to the pending job list
   242  		pendingJob = append(pendingJob, &schedulerJob{
   243  			Tp:         schedulerJobTypeAddTable,
   244  			TableID:    tableID,
   245  			BoundaryTs: globalCheckpointTs,
   246  		})
   247  	}
   248  	// The remaining tables are the tables which should be not listened
   249  	tablesThatShouldNotBeListened := allTableListeningNow
   250  	for tableID, captureID := range tablesThatShouldNotBeListened {
   251  		opts := s.state.TaskStatuses[captureID].Operation
   252  		if opts != nil && opts[tableID] != nil && opts[tableID].Delete {
   253  			// the table is being removed, skip
   254  			continue
   255  		}
   256  		pendingJob = append(pendingJob, &schedulerJob{
   257  			Tp:            schedulerJobTypeRemoveTable,
   258  			TableID:       tableID,
   259  			BoundaryTs:    globalCheckpointTs,
   260  			TargetCapture: captureID,
   261  		})
   262  	}
   263  	return pendingJob, nil
   264  }
   266  func (s *scheduler) handleJobs(jobs []*schedulerJob) {
   267  	for _, job := range jobs {
   268  		job := job
   269  		s.state.PatchTaskStatus(job.TargetCapture, func(status *model.TaskStatus) (*model.TaskStatus, bool, error) {
   270  			switch job.Tp {
   271  			case schedulerJobTypeAddTable:
   272  				if status == nil {
   273  					// if task status is not found, we can just skip adding the adding-table operation, since this table will be added in the next tick
   274  					log.Warn("task status of the capture is not found, may be the capture is already down. specify a new capture and redo the job", zap.Any("job", job))
   275  					return status, false, nil
   276  				}
   277  				status.AddTable(job.TableID, &model.TableReplicaInfo{
   278  					StartTs:     job.BoundaryTs,
   279  					MarkTableID: 0, // mark table ID will be set in processors
   280  				}, job.BoundaryTs)
   281  			case schedulerJobTypeRemoveTable:
   282  				failpoint.Inject("OwnerRemoveTableError", func() {
   283  					// just skip removing this table
   284  					failpoint.Return(status, false, nil)
   285  				})
   286  				if status == nil {
   287  					log.Warn("Task status of the capture is not found. Maybe the capture is already down. Specify a new capture and redo the job", zap.Any("job", job))
   288  					return status, false, nil
   289  				}
   290  				status.RemoveTable(job.TableID, job.BoundaryTs, false)
   291  			default:
   292  				log.Panic("Unreachable, please report a bug", zap.Any("job", job))
   293  			}
   294  			return status, true, nil
   295  		})
   296  	}
   297  }
   299  // cleanUpFinishedOperations clean up the finished operations.
   300  func (s *scheduler) cleanUpFinishedOperations() {
   301  	for captureID := range s.state.TaskStatuses {
   302  		s.state.PatchTaskStatus(captureID, func(status *model.TaskStatus) (*model.TaskStatus, bool, error) {
   303  			changed := false
   304  			for tableID, operation := range status.Operation {
   305  				if operation.Status == model.OperFinished {
   306  					delete(status.Operation, tableID)
   307  					changed = true
   308  				}
   309  			}
   310  			return status, changed, nil
   311  		})
   312  	}
   313  }
   315  func (s *scheduler) rebalance() (shouldUpdateState bool) {
   316  	if !s.shouldRebalance() {
   317  		// if no table is rebalanced, we can update the resolved ts and checkpoint ts
   318  		return true
   319  	}
   320  	// we only support rebalance by table number for now
   321  	return s.rebalanceByTableNum()
   322  }
   324  func (s *scheduler) shouldRebalance() bool {
   325  	if s.needRebalanceNextTick {
   326  		s.needRebalanceNextTick = false
   327  		return true
   328  	}
   329  	if s.lastTickCaptureCount != len(s.captures) {
   330  		// a new capture online and no table distributed to the capture
   331  		// or some captures offline
   332  		return true
   333  	}
   334  	// TODO periodic trigger rebalance
   335  	return false
   336  }
   338  // rebalanceByTableNum removes tables from captures replicating an above-average number of tables.
   339  // the removed table will be dispatched again by syncTablesWithCurrentTables function
   340  func (s *scheduler) rebalanceByTableNum() (shouldUpdateState bool) {
   341  	totalTableNum := len(s.currentTables)
   342  	captureNum := len(s.captures)
   343  	upperLimitPerCapture := int(math.Ceil(float64(totalTableNum) / float64(captureNum)))
   344  	shouldUpdateState = true
   346  	log.Info("Start rebalancing",
   347  		zap.String("changefeed", s.state.ID),
   348  		zap.Int("table-num", totalTableNum),
   349  		zap.Int("capture-num", captureNum),
   350  		zap.Int("target-limit", upperLimitPerCapture))
   352  	for captureID, taskStatus := range s.state.TaskStatuses {
   353  		tableNum2Remove := len(taskStatus.Tables) - upperLimitPerCapture
   354  		if tableNum2Remove <= 0 {
   355  			continue
   356  		}
   358  		// here we pick `tableNum2Remove` tables to delete,
   359  		// and then the removed tables will be dispatched by `syncTablesWithCurrentTables` function in the next tick
   360  		for tableID := range taskStatus.Tables {
   361  			tableID := tableID
   362  			if tableNum2Remove <= 0 {
   363  				break
   364  			}
   365  			shouldUpdateState = false
   366  			s.state.PatchTaskStatus(captureID, func(status *model.TaskStatus) (*model.TaskStatus, bool, error) {
   367  				if status == nil {
   368  					// the capture may be down, just skip remove this table
   369  					return status, false, nil
   370  				}
   371  				if status.Operation != nil && status.Operation[tableID] != nil {
   372  					// skip remove this table to avoid the remove operation created by rebalance function to influence the operation created by other function
   373  					return status, false, nil
   374  				}
   375  				status.RemoveTable(tableID, s.state.Status.CheckpointTs, false)
   376  				log.Info("Rebalance: Move table",
   377  					zap.Int64("table-id", tableID),
   378  					zap.String("capture", captureID),
   379  					zap.String("changefeed-id", s.state.ID))
   380  				return status, true, nil
   381  			})
   382  			tableNum2Remove--
   383  		}
   384  	}
   385  	return
   386  }