github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/jobmaster/dm/task_manager.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package dm
    15  
    16  import (
    17  	"context"
    18  	"sync"
    19  	"time"
    20  
    21  	dmconfig "github.com/pingcap/tiflow/dm/config"
    22  	frameModel "github.com/pingcap/tiflow/engine/framework/model"
    23  	"github.com/pingcap/tiflow/engine/jobmaster/dm/config"
    24  	"github.com/pingcap/tiflow/engine/jobmaster/dm/metadata"
    25  	"github.com/pingcap/tiflow/engine/jobmaster/dm/runtime"
    26  	dmpkg "github.com/pingcap/tiflow/engine/pkg/dm"
    27  	"github.com/pingcap/tiflow/engine/pkg/dm/ticker"
    28  	"github.com/pingcap/tiflow/engine/pkg/promutil"
    29  	"github.com/pingcap/tiflow/pkg/errors"
    30  	"github.com/prometheus/client_golang/prometheus"
    31  	"go.uber.org/zap"
    32  )
    33  
    34  var (
    35  	taskNormalInterval = time.Second * 30
    36  	taskErrorInterval  = time.Second * 10
    37  )
    38  
    39  // TaskManager checks and operates task.
    40  type TaskManager struct {
    41  	*ticker.DefaultTicker
    42  
    43  	jobID        string
    44  	jobStore     *metadata.JobStore
    45  	messageAgent dmpkg.MessageAgent
    46  	logger       *zap.Logger
    47  	// tasks record the runtime task status
    48  	// taskID -> TaskStatus
    49  	tasks sync.Map
    50  
    51  	gaugeVec *prometheus.GaugeVec
    52  }
    53  
    54  // NewTaskManager creates a new TaskManager instance
    55  func NewTaskManager(
    56  	jobID string,
    57  	initTaskStatus []runtime.TaskStatus,
    58  	jobStore *metadata.JobStore,
    59  	messageAgent dmpkg.MessageAgent,
    60  	pLogger *zap.Logger,
    61  	metricFactory promutil.Factory,
    62  ) *TaskManager {
    63  	taskManager := &TaskManager{
    64  		jobID:         jobID,
    65  		DefaultTicker: ticker.NewDefaultTicker(taskNormalInterval, taskErrorInterval),
    66  		jobStore:      jobStore,
    67  		logger:        pLogger.With(zap.String("component", "task_manager")),
    68  		messageAgent:  messageAgent,
    69  		gaugeVec: metricFactory.NewGaugeVec(
    70  			prometheus.GaugeOpts{
    71  				Namespace: "dm",
    72  				Subsystem: "worker",
    73  				Name:      "task_state",
    74  				Help:      "task state of dm worker in this job",
    75  			}, []string{"task", "source_id"}),
    76  	}
    77  	taskManager.DefaultTicker.Ticker = taskManager
    78  
    79  	for _, taskStatus := range initTaskStatus {
    80  		taskManager.UpdateTaskStatus(taskStatus)
    81  	}
    82  	return taskManager
    83  }
    84  
    85  // OperateTask updates the task status in metadata and triggers the task manager to check and operate task.
    86  // called by user request.
    87  func (tm *TaskManager) OperateTask(ctx context.Context, op dmpkg.OperateType, jobCfg *config.JobCfg, tasks []string) (err error) {
    88  	tm.logger.Info("operate task", zap.Stringer("op", op), zap.Strings("tasks", tasks))
    89  	defer func() {
    90  		if err == nil {
    91  			tm.SetNextCheckTime(time.Now())
    92  		}
    93  	}()
    94  
    95  	var stage metadata.TaskStage
    96  	switch op {
    97  	case dmpkg.Create:
    98  		return tm.jobStore.Put(ctx, metadata.NewJob(jobCfg))
    99  	case dmpkg.Update:
   100  		return tm.jobStore.UpdateConfig(ctx, jobCfg)
   101  	// Deleting marks the job as deleting.
   102  	case dmpkg.Deleting:
   103  		return tm.jobStore.MarkDeleting(ctx)
   104  	// Delete deletes the job in metadata.
   105  	case dmpkg.Delete:
   106  		return tm.jobStore.Delete(ctx)
   107  	case dmpkg.Resume:
   108  		stage = metadata.StageRunning
   109  	case dmpkg.Pause:
   110  		stage = metadata.StagePaused
   111  	default:
   112  		return errors.New("unknown operate type")
   113  	}
   114  
   115  	return tm.jobStore.UpdateStages(ctx, tasks, stage)
   116  }
   117  
   118  // UpdateTaskStatus is called when receive task status from worker.
   119  func (tm *TaskManager) UpdateTaskStatus(taskStatus runtime.TaskStatus) {
   120  	tm.logger.Debug(
   121  		"update task status",
   122  		zap.String("task_id", taskStatus.Task),
   123  		zap.Stringer("stage", taskStatus.Stage),
   124  		zap.Stringer("unit", taskStatus.Unit),
   125  		zap.Uint64("config_modify_revison", taskStatus.CfgModRevision),
   126  	)
   127  	tm.tasks.Store(taskStatus.Task, taskStatus)
   128  	tm.gaugeVec.WithLabelValues(tm.jobID, taskStatus.Task).Set(float64(taskStatus.Stage))
   129  }
   130  
   131  // TaskStatus return the task status.
   132  func (tm *TaskManager) TaskStatus() map[string]runtime.TaskStatus {
   133  	result := make(map[string]runtime.TaskStatus)
   134  	tm.tasks.Range(func(key, value interface{}) bool {
   135  		result[key.(string)] = value.(runtime.TaskStatus)
   136  		return true
   137  	})
   138  	return result
   139  }
   140  
   141  // TickImpl removes tasks that are not in the job config.
   142  // TickImpl checks and operates task if needed.
   143  func (tm *TaskManager) TickImpl(ctx context.Context) error {
   144  	tm.logger.Info("start to check and operate tasks")
   145  	state, err := tm.jobStore.Get(ctx)
   146  	if err != nil || state.(*metadata.Job).Deleting {
   147  		tm.logger.Info("on job deleting", zap.Error(err))
   148  		tm.onJobDel()
   149  		return err
   150  	}
   151  	job := state.(*metadata.Job)
   152  
   153  	tm.removeTaskStatus(job)
   154  	return tm.checkAndOperateTasks(ctx, job)
   155  }
   156  
   157  func (tm *TaskManager) checkAndOperateTasks(ctx context.Context, job *metadata.Job) error {
   158  	var (
   159  		runningTask runtime.TaskStatus
   160  		recordError error
   161  	)
   162  
   163  	// check and operate task
   164  	for taskID, persistentTask := range job.Tasks {
   165  		task, ok := tm.tasks.Load(taskID)
   166  		if ok {
   167  			runningTask = task.(runtime.TaskStatus)
   168  		}
   169  
   170  		// task unbounded or worker offline
   171  		if !ok || runningTask.Stage == metadata.StageUnscheduled {
   172  			recordError = errors.New("get task running status failed")
   173  			tm.logger.Error("failed to schedule task", zap.String("task_id", taskID), zap.Error(recordError))
   174  			continue
   175  		}
   176  
   177  		op := genOp(runningTask.Stage, runningTask.StageUpdatedTime, persistentTask.Stage, persistentTask.StageUpdatedTime)
   178  		if op == dmpkg.None {
   179  			tm.logger.Debug(
   180  				"task status will not be changed",
   181  				zap.String("task_id", taskID),
   182  				zap.Stringer("stage", runningTask.Stage),
   183  			)
   184  			continue
   185  		}
   186  
   187  		tm.logger.Info(
   188  			"unexpected task status",
   189  			zap.String("task_id", taskID),
   190  			zap.Stringer("op", op),
   191  			zap.Stringer("expected_stage", persistentTask.Stage),
   192  			zap.Stringer("stage", runningTask.Stage),
   193  		)
   194  		// operateTaskMessage should be a asynchronous request
   195  		if err := tm.operateTaskMessage(ctx, taskID, op); err != nil {
   196  			recordError = err
   197  			tm.logger.Error("operate task failed", zap.Error(recordError))
   198  			continue
   199  		}
   200  	}
   201  	return recordError
   202  }
   203  
   204  // remove all tasks, usually happened when delete jobs.
   205  func (tm *TaskManager) onJobDel() {
   206  	tm.logger.Info("clear all task status")
   207  	tm.tasks.Range(func(key, value interface{}) bool {
   208  		tm.tasks.Delete(key)
   209  		tm.gaugeVec.DeleteLabelValues(tm.jobID, key.(string))
   210  		return true
   211  	})
   212  }
   213  
   214  // remove deleted task status, usually happened when update-job delete some tasks.
   215  func (tm *TaskManager) removeTaskStatus(job *metadata.Job) {
   216  	tm.tasks.Range(func(key, value interface{}) bool {
   217  		taskID := key.(string)
   218  		if _, ok := job.Tasks[taskID]; !ok {
   219  			tm.logger.Info("remove task status", zap.String("task_id", taskID))
   220  			tm.tasks.Delete(taskID)
   221  			tm.gaugeVec.DeleteLabelValues(tm.jobID, taskID)
   222  		}
   223  		return true
   224  	})
   225  }
   226  
   227  // GetTaskStatus gets task status by taskID
   228  func (tm *TaskManager) GetTaskStatus(taskID string) (runtime.TaskStatus, bool) {
   229  	value, ok := tm.tasks.Load(taskID)
   230  	if !ok {
   231  		return runtime.NewOfflineStatus(taskID), false
   232  	}
   233  	return value.(runtime.TaskStatus), true
   234  }
   235  
   236  func genOp(
   237  	runningStage metadata.TaskStage,
   238  	runningStageUpdatedTime time.Time,
   239  	expectedStage metadata.TaskStage,
   240  	expectedStageUpdatedTime time.Time,
   241  ) dmpkg.OperateType {
   242  	switch {
   243  	case expectedStage == metadata.StagePaused && (runningStage == metadata.StageRunning || runningStage == metadata.StageError):
   244  		return dmpkg.Pause
   245  	case expectedStage == metadata.StageRunning:
   246  		if runningStage == metadata.StagePaused {
   247  			return dmpkg.Resume
   248  		}
   249  		// only resume a error task for a manual Resume action by checking expectedStageUpdatedTime
   250  		if runningStage == metadata.StageError && expectedStageUpdatedTime.After(runningStageUpdatedTime) {
   251  			return dmpkg.Resume
   252  		}
   253  		return dmpkg.None
   254  	// TODO: support update
   255  	default:
   256  		return dmpkg.None
   257  	}
   258  }
   259  
   260  func (tm *TaskManager) operateTaskMessage(ctx context.Context, taskID string, op dmpkg.OperateType) error {
   261  	msg := &dmpkg.OperateTaskMessage{
   262  		Task: taskID,
   263  		Op:   op,
   264  	}
   265  	return tm.messageAgent.SendMessage(ctx, taskID, dmpkg.OperateTask, msg)
   266  }
   267  
   268  func (tm *TaskManager) allFinished(ctx context.Context) bool {
   269  	state, err := tm.jobStore.Get(ctx)
   270  	if err != nil {
   271  		return false
   272  	}
   273  	job := state.(*metadata.Job)
   274  
   275  	for taskID, task := range job.Tasks {
   276  		t, ok := tm.tasks.Load(taskID)
   277  		if !ok {
   278  			return false
   279  		}
   280  		runningTask := t.(runtime.TaskStatus)
   281  		if runningTask.Stage != metadata.StageFinished {
   282  			return false
   283  		}
   284  		// update if we add new task mode
   285  		switch task.Cfg.TaskMode {
   286  		case dmconfig.ModeFull:
   287  			if runningTask.Unit != frameModel.WorkerDMLoad {
   288  				return false
   289  			}
   290  		case dmconfig.ModeDump:
   291  		default:
   292  			return false
   293  		}
   294  	}
   295  	return true
   296  }