github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/engine/jobmaster/dm/task_manager.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package dm

import (
	"context"
	"sync"
	"time"

	dmconfig "github.com/pingcap/tiflow/dm/config"
	frameModel "github.com/pingcap/tiflow/engine/framework/model"
	"github.com/pingcap/tiflow/engine/jobmaster/dm/config"
	"github.com/pingcap/tiflow/engine/jobmaster/dm/metadata"
	"github.com/pingcap/tiflow/engine/jobmaster/dm/runtime"
	dmpkg "github.com/pingcap/tiflow/engine/pkg/dm"
	"github.com/pingcap/tiflow/engine/pkg/dm/ticker"
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/pingcap/tiflow/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
)

var (
	taskNormalInterval = time.Second * 30
	taskErrorInterval  = time.Second * 10
)

// TaskManager checks and operates tasks.
type TaskManager struct {
	*ticker.DefaultTicker

	jobID        string
	jobStore     *metadata.JobStore
	messageAgent dmpkg.MessageAgent
	logger       *zap.Logger
	// tasks records the runtime task status
	// taskID -> TaskStatus
	tasks sync.Map

	gaugeVec *prometheus.GaugeVec
}

// NewTaskManager creates a new TaskManager instance.
func NewTaskManager(
	jobID string,
	initTaskStatus []runtime.TaskStatus,
	jobStore *metadata.JobStore,
	messageAgent dmpkg.MessageAgent,
	pLogger *zap.Logger,
	metricFactory promutil.Factory,
) *TaskManager {
	taskManager := &TaskManager{
		jobID:         jobID,
		DefaultTicker: ticker.NewDefaultTicker(taskNormalInterval, taskErrorInterval),
		jobStore:      jobStore,
		logger:        pLogger.With(zap.String("component", "task_manager")),
		messageAgent:  messageAgent,
		gaugeVec: metricFactory.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "dm",
				Subsystem: "worker",
				Name:      "task_state",
				Help:      "task state of dm worker in this job",
			}, []string{"task", "source_id"}),
	}
	taskManager.DefaultTicker.Ticker = taskManager

	for _, taskStatus := range initTaskStatus {
		taskManager.UpdateTaskStatus(taskStatus)
	}
	return taskManager
}

// OperateTask updates the task status in metadata and triggers the task manager to check and operate tasks.
// It is called on user request.
func (tm *TaskManager) OperateTask(ctx context.Context, op dmpkg.OperateType, jobCfg *config.JobCfg, tasks []string) (err error) {
	tm.logger.Info("operate task", zap.Stringer("op", op), zap.Strings("tasks", tasks))
	defer func() {
		if err == nil {
			tm.SetNextCheckTime(time.Now())
		}
	}()

	var stage metadata.TaskStage
	switch op {
	case dmpkg.Create:
		return tm.jobStore.Put(ctx, metadata.NewJob(jobCfg))
	case dmpkg.Update:
		return tm.jobStore.UpdateConfig(ctx, jobCfg)
	// Deleting marks the job as deleting.
	case dmpkg.Deleting:
		return tm.jobStore.MarkDeleting(ctx)
	// Delete deletes the job in metadata.
	case dmpkg.Delete:
		return tm.jobStore.Delete(ctx)
	case dmpkg.Resume:
		stage = metadata.StageRunning
	case dmpkg.Pause:
		stage = metadata.StagePaused
	default:
		return errors.New("unknown operate type")
	}

	return tm.jobStore.UpdateStages(ctx, tasks, stage)
}

// UpdateTaskStatus is called when a task status is received from a worker.
func (tm *TaskManager) UpdateTaskStatus(taskStatus runtime.TaskStatus) {
	tm.logger.Debug(
		"update task status",
		zap.String("task_id", taskStatus.Task),
		zap.Stringer("stage", taskStatus.Stage),
		zap.Stringer("unit", taskStatus.Unit),
		zap.Uint64("config_modify_revision", taskStatus.CfgModRevision),
	)
	tm.tasks.Store(taskStatus.Task, taskStatus)
	tm.gaugeVec.WithLabelValues(tm.jobID, taskStatus.Task).Set(float64(taskStatus.Stage))
}

// TaskStatus returns the status of all tasks.
func (tm *TaskManager) TaskStatus() map[string]runtime.TaskStatus {
	result := make(map[string]runtime.TaskStatus)
	tm.tasks.Range(func(key, value interface{}) bool {
		result[key.(string)] = value.(runtime.TaskStatus)
		return true
	})
	return result
}

// TickImpl removes tasks that are no longer in the job config,
// then checks and operates on tasks if needed.
func (tm *TaskManager) TickImpl(ctx context.Context) error {
	tm.logger.Info("start to check and operate tasks")
	state, err := tm.jobStore.Get(ctx)
	if err != nil || state.(*metadata.Job).Deleting {
		tm.logger.Info("on job deleting", zap.Error(err))
		tm.onJobDel()
		return err
	}
	job := state.(*metadata.Job)

	tm.removeTaskStatus(job)
	return tm.checkAndOperateTasks(ctx, job)
}

// checkAndOperateTasks compares each task's running stage with the expected stage
// persisted in metadata and sends an operate-task message when they differ.
func (tm *TaskManager) checkAndOperateTasks(ctx context.Context, job *metadata.Job) error {
	var (
		runningTask runtime.TaskStatus
		recordError error
	)

	// check and operate task
	for taskID, persistentTask := range job.Tasks {
		task, ok := tm.tasks.Load(taskID)
		if ok {
			runningTask = task.(runtime.TaskStatus)
		}

		// task unbound or worker offline
		if !ok || runningTask.Stage == metadata.StageUnscheduled {
			recordError = errors.New("get task running status failed")
			tm.logger.Error("failed to schedule task", zap.String("task_id", taskID), zap.Error(recordError))
			continue
		}

		op := genOp(runningTask.Stage, runningTask.StageUpdatedTime, persistentTask.Stage, persistentTask.StageUpdatedTime)
		if op == dmpkg.None {
			tm.logger.Debug(
				"task status will not be changed",
				zap.String("task_id", taskID),
				zap.Stringer("stage", runningTask.Stage),
			)
			continue
		}

		tm.logger.Info(
			"unexpected task status",
			zap.String("task_id", taskID),
			zap.Stringer("op", op),
			zap.Stringer("expected_stage", persistentTask.Stage),
			zap.Stringer("stage", runningTask.Stage),
		)
		// operateTaskMessage should be an asynchronous request
		if err := tm.operateTaskMessage(ctx, taskID, op); err != nil {
			recordError = err
			tm.logger.Error("operate task failed", zap.Error(recordError))
			continue
		}
	}
	return recordError
}

// onJobDel removes all task statuses; this usually happens when the job is deleted.
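// It also deletes the corresponding per-task gauge metrics so that no stale series remain.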
func (tm *TaskManager) onJobDel() {
	tm.logger.Info("clear all task status")
	tm.tasks.Range(func(key, value interface{}) bool {
		tm.tasks.Delete(key)
		tm.gaugeVec.DeleteLabelValues(tm.jobID, key.(string))
		return true
	})
}

// removeTaskStatus removes the status of deleted tasks; this usually happens
// when an update-job removes some tasks.
func (tm *TaskManager) removeTaskStatus(job *metadata.Job) {
	tm.tasks.Range(func(key, value interface{}) bool {
		taskID := key.(string)
		if _, ok := job.Tasks[taskID]; !ok {
			tm.logger.Info("remove task status", zap.String("task_id", taskID))
			tm.tasks.Delete(taskID)
			tm.gaugeVec.DeleteLabelValues(tm.jobID, taskID)
		}
		return true
	})
}

// GetTaskStatus gets the task status by taskID.
func (tm *TaskManager) GetTaskStatus(taskID string) (runtime.TaskStatus, bool) {
	value, ok := tm.tasks.Load(taskID)
	if !ok {
		return runtime.NewOfflineStatus(taskID), false
	}
	return value.(runtime.TaskStatus), true
}

// genOp decides which operation, if any, is needed to drive a task from its
// running stage toward the expected stage recorded in metadata.
func genOp(
	runningStage metadata.TaskStage,
	runningStageUpdatedTime time.Time,
	expectedStage metadata.TaskStage,
	expectedStageUpdatedTime time.Time,
) dmpkg.OperateType {
	switch {
	case expectedStage == metadata.StagePaused && (runningStage == metadata.StageRunning || runningStage == metadata.StageError):
		return dmpkg.Pause
	case expectedStage == metadata.StageRunning:
		if runningStage == metadata.StagePaused {
			return dmpkg.Resume
		}
		// only resume an error task for a manual Resume action, by checking expectedStageUpdatedTime
		if runningStage == metadata.StageError && expectedStageUpdatedTime.After(runningStageUpdatedTime) {
			return dmpkg.Resume
		}
		return dmpkg.None
	// TODO: support update
	default:
		return dmpkg.None
	}
}

// operateTaskMessage sends an operate-task message for the given task via the message agent.
func (tm *TaskManager) operateTaskMessage(ctx context.Context, taskID string, op dmpkg.OperateType) error {
	msg := &dmpkg.OperateTaskMessage{
		Task: taskID,
		Op:   op,
	}
	return tm.messageAgent.SendMessage(ctx, taskID, dmpkg.OperateTask, msg)
}

// allFinished reports whether every task in the job has finished, taking the
// task mode into account (e.g. a full-mode task counts as finished only when
// its load unit has finished).
func (tm *TaskManager) allFinished(ctx context.Context) bool {
	state, err := tm.jobStore.Get(ctx)
	if err != nil {
		return false
	}
	job := state.(*metadata.Job)

	for taskID, task := range job.Tasks {
		t, ok := tm.tasks.Load(taskID)
		if !ok {
			return false
		}
		runningTask := t.(runtime.TaskStatus)
		if runningTask.Stage != metadata.StageFinished {
			return false
		}
		// update this switch if we add a new task mode
		switch task.Cfg.TaskMode {
		case dmconfig.ModeFull:
			if runningTask.Unit != frameModel.WorkerDMLoad {
				return false
			}
		case dmconfig.ModeDump:
		default:
			return false
		}
	}
	return true
}
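
// An illustrative usage sketch (the variable names and the "task-1" ID below are
// hypothetical, and the caller shown is a rough stand-in for the DM job master);
// it only uses the API defined in this file:
//
//	tm := NewTaskManager(jobID, initTaskStatus, jobStore, messageAgent, logger, metricFactory)
//
//	// Forward every task status reported by a worker.
//	tm.UpdateTaskStatus(status)
//
//	// Handle a user request, e.g. pausing one task; jobCfg is only needed for Create/Update.
//	_ = tm.OperateTask(ctx, dmpkg.Pause, nil, []string{"task-1"})
//
//	// The embedded ticker.DefaultTicker drives TickImpl on its check schedule, which
//	// reconciles each task's running stage with the expected stage stored in metadata.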