github.com/matrixorigin/matrixone@v1.2.0/pkg/taskservice/daemon_task.go (about) 1 // Copyright 2021 - 2023 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package taskservice 16 17 import ( 18 "context" 19 "strings" 20 "sync/atomic" 21 "time" 22 23 "github.com/matrixorigin/matrixone/pkg/common/moerr" 24 "github.com/matrixorigin/matrixone/pkg/pb/task" 25 "go.uber.org/zap" 26 ) 27 28 type TaskHandler interface { 29 Handle(ctx context.Context) error 30 } 31 32 type startTask struct { 33 runner *taskRunner 34 task *daemonTask 35 } 36 37 func newStartTask(r *taskRunner, t *daemonTask) *startTask { 38 return &startTask{ 39 runner: r, 40 task: t, 41 } 42 } 43 44 func (t *startTask) Handle(_ context.Context) error { 45 if err := t.runner.stopper.RunTask(func(ctx context.Context) { 46 defer t.runner.removeDaemonTask(t.task.task.ID) 47 48 ok, err := t.runner.startDaemonTask(ctx, t.task) 49 if err != nil { 50 t.runner.setDaemonTaskError(ctx, t.task, err) 51 return 52 } 53 54 // ok value is false, means that the task cannot be started by 55 // this runner, maybe it has been started by another runner. 56 if !ok { 57 return 58 } 59 60 // Start the go-routine to execute the task. It hangs here until 61 // the task encounters some error or be canceled. 62 if err := t.task.executor(ctx, &t.task.task); err != nil { 63 // set the record of this task error message. 64 t.runner.setDaemonTaskError(ctx, t.task, err) 65 } 66 }); err != nil { 67 return err 68 } 69 return nil 70 } 71 72 type resumeTask struct { 73 runner *taskRunner 74 task *daemonTask 75 } 76 77 func newResumeTask(r *taskRunner, t *daemonTask) *resumeTask { 78 return &resumeTask{ 79 runner: r, 80 task: t, 81 } 82 } 83 84 func (t *resumeTask) Handle(ctx context.Context) error { 85 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 86 defer cancel() 87 tasks, err := t.runner.service.QueryDaemonTask(ctx, WithTaskIDCond(EQ, t.task.task.ID)) 88 if err != nil { 89 return err 90 } 91 if len(tasks) != 1 { 92 return moerr.NewInternalError(ctx, "count of tasks is wrong %d", len(tasks)) 93 } 94 95 tk := tasks[0] 96 // We cannot resume a task which is not on local runner. 97 if !strings.EqualFold(tk.TaskRunner, t.runner.runnerID) { 98 return moerr.NewInternalError(ctx, "the task is not on local runner, prev runner %s, "+ 99 "local runner %s", tk.TaskRunner, t.runner.runnerID) 100 } 101 102 tk.TaskStatus = task.TaskStatus_Running 103 nowTime := time.Now() 104 tk.LastRun = nowTime 105 tk.LastHeartbeat = nowTime 106 _, err = t.runner.service.UpdateDaemonTask(ctx, []task.DaemonTask{tk}) 107 if err != nil { 108 return err 109 } 110 111 ar := t.task.activeRoutine.Load() 112 if ar == nil || *ar == nil { 113 return moerr.NewInternalError(ctx, "cannot handle resume operation, "+ 114 "active routine not set for task %d", t.task.task.ID) 115 } 116 return (*ar).Resume() 117 } 118 119 type pauseTask struct { 120 runner *taskRunner 121 task *daemonTask 122 } 123 124 func newPauseTask(r *taskRunner, t *daemonTask) *pauseTask { 125 return &pauseTask{ 126 runner: r, 127 task: t, 128 } 129 } 130 131 func (t *pauseTask) Handle(ctx context.Context) error { 132 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 133 defer cancel() 134 tasks, err := t.runner.service.QueryDaemonTask(ctx, WithTaskIDCond(EQ, t.task.task.ID)) 135 if err != nil { 136 return err 137 } 138 if len(tasks) != 1 { 139 return moerr.NewInternalError(ctx, "count of tasks is wrong %d", len(tasks)) 140 } 141 142 tk := tasks[0] 143 tk.TaskStatus = task.TaskStatus_Paused 144 _, err = t.runner.service.UpdateDaemonTask(ctx, []task.DaemonTask{tk}) 145 if err != nil { 146 return err 147 } 148 149 if t.runner.exists(tk.ID) { 150 ar := t.task.activeRoutine.Load() 151 if ar == nil || *ar == nil { 152 return moerr.NewInternalError(ctx, "cannot handle pause operation, "+ 153 "active routine not set for task %d", t.task.task.ID) 154 } 155 if err := (*ar).Pause(); err != nil { 156 return err 157 } 158 } 159 return nil 160 } 161 162 type cancelTask struct { 163 runner *taskRunner 164 task *daemonTask 165 } 166 167 func newCancelTask(r *taskRunner, t *daemonTask) *cancelTask { 168 return &cancelTask{ 169 runner: r, 170 task: t, 171 } 172 } 173 174 func (t *cancelTask) Handle(ctx context.Context) error { 175 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 176 defer cancel() 177 tasks, err := t.runner.service.QueryDaemonTask(ctx, WithTaskIDCond(EQ, t.task.task.ID)) 178 if err != nil { 179 return err 180 } 181 if len(tasks) != 1 { 182 return moerr.NewInternalError(ctx, "count of tasks is wrong %d", len(tasks)) 183 } 184 185 tk := tasks[0] 186 tk.TaskStatus = task.TaskStatus_Canceled 187 tk.EndAt = time.Now() 188 _, err = t.runner.service.UpdateDaemonTask(ctx, []task.DaemonTask{tk}) 189 if err != nil { 190 return err 191 } 192 if t.runner.exists(tk.ID) { 193 ar := t.task.activeRoutine.Load() 194 if ar == nil || *ar == nil { 195 return moerr.NewInternalError(ctx, "cannot handle cancel operation, "+ 196 "active routine not set for task %d", t.task.task.ID) 197 } 198 return (*ar).Cancel() 199 } 200 return nil 201 } 202 203 // ActiveRoutine is an interface that the go routine of the daemon task 204 // should implement. 205 type ActiveRoutine interface { 206 // Resume resumes the go routine of the daemon task. 207 Resume() error 208 // Pause pauses the go routine of the daemon task. 209 Pause() error 210 // Cancel cancels the go routine of the daemon task. 211 Cancel() error 212 } 213 214 type daemonTask struct { 215 task task.DaemonTask 216 executor TaskExecutor 217 // activeRoutine is the go-routine runs in background to execute 218 // the daemon task. 219 activeRoutine atomic.Pointer[ActiveRoutine] 220 } 221 222 func (r *taskRunner) newDaemonTask(t task.DaemonTask) (*daemonTask, error) { 223 executor, err := r.getExecutor(t.Metadata.Executor) 224 if err != nil { 225 return nil, err 226 } 227 dt := &daemonTask{ 228 task: t, 229 executor: executor, 230 } 231 return dt, nil 232 } 233 234 func (r *taskRunner) startDaemonTaskWorker() error { 235 if err := r.stopper.RunNamedTask("poll-daemon-tasks", r.poll); err != nil { 236 return err 237 } 238 if err := r.stopper.RunNamedTask("handle-daemon-tasks", r.handleTask); err != nil { 239 return err 240 } 241 if err := r.stopper.RunNamedTask("daemon-tasks-heartbeat", r.sendHeartbeat); err != nil { 242 return err 243 } 244 return nil 245 } 246 247 func (r *taskRunner) poll(ctx context.Context) { 248 timer := time.NewTimer(r.options.fetchInterval) 249 defer timer.Stop() 250 for { 251 select { 252 case <-ctx.Done(): 253 r.logger.Info("daemon task poll worker stopped") 254 return 255 256 case <-timer.C: 257 if taskFrameworkDisabled() { 258 continue 259 } 260 r.dispatchTaskHandle(ctx) 261 timer.Reset(r.options.fetchInterval) 262 } 263 } 264 } 265 266 func (r *taskRunner) enqueue(handler TaskHandler) { 267 r.pendingTaskHandle <- handler 268 } 269 270 func (r *taskRunner) newStartTask(t task.DaemonTask) { 271 dt, err := r.newDaemonTask(t) 272 if err != nil { 273 r.logger.Error("failed to dispatch daemon task", 274 zap.Uint64("task ID", t.ID), zap.Error(err)) 275 return 276 } 277 r.enqueue(newStartTask(r, dt)) 278 } 279 280 func (r *taskRunner) dispatchTaskHandle(ctx context.Context) { 281 r.daemonTasks.Lock() 282 defer r.daemonTasks.Unlock() 283 for _, t := range r.startTasks(ctx) { 284 r.newStartTask(t) 285 } 286 for _, t := range r.resumeTasks(ctx) { 287 dt, ok := r.daemonTasks.m[t.ID] 288 if ok { 289 r.enqueue(newResumeTask(r, dt)) 290 } else { 291 r.newStartTask(t) 292 } 293 } 294 for _, t := range r.pauseTasks(ctx) { 295 dt, ok := r.daemonTasks.m[t.ID] 296 if ok { 297 r.enqueue(newPauseTask(r, dt)) 298 } else { 299 dt, err := r.newDaemonTask(t) 300 if err != nil { 301 r.logger.Error("failed to dispatch daemon task", 302 zap.Uint64("task ID", t.ID), zap.Error(err)) 303 return 304 } 305 r.enqueue(newPauseTask(r, dt)) 306 } 307 } 308 for _, t := range r.cancelTasks(ctx) { 309 dt, ok := r.daemonTasks.m[t.ID] 310 if ok { 311 r.enqueue(newCancelTask(r, dt)) 312 } else { 313 dt, err := r.newDaemonTask(t) 314 if err != nil { 315 r.logger.Error("failed to dispatch daemon task", 316 zap.Uint64("task ID", t.ID), zap.Error(err)) 317 return 318 } 319 r.enqueue(newCancelTask(r, dt)) 320 } 321 } 322 } 323 324 func (r *taskRunner) queryDaemonTasks(ctx context.Context, c ...Condition) []task.DaemonTask { 325 ctx, cancel := context.WithTimeout(ctx, r.options.fetchTimeout) 326 defer cancel() 327 t, err := r.service.QueryDaemonTask(ctx, c...) 328 if err != nil { 329 r.logger.Error("failed to get tasks", zap.Error(err)) 330 return nil 331 } 332 return t 333 } 334 335 // mergeTasks merges all the tasks in all the slices. It not only remove the duplicated tasks, 336 // but also filter out the tasks if the runner cannot run. 337 func (r *taskRunner) mergeTasks(tasksSlice ...[]task.DaemonTask) []task.DaemonTask { 338 taskIDs := make(map[uint64]struct{}) 339 var res []task.DaemonTask 340 for _, tasks := range tasksSlice { 341 for _, t := range tasks { 342 if _, ok := taskIDs[t.ID]; ok { 343 continue 344 } 345 if !r.canClaimDaemonTask(t.Account) { 346 continue 347 } 348 taskIDs[t.ID] = struct{}{} 349 res = append(res, t) 350 } 351 } 352 return res 353 } 354 355 // resumeTasks gets the tasks that need to start. 356 // - status: task.TaskStatus_Created 357 // - status: task.TaskStatus_Running AND last-heartbeat: timeout 358 func (r *taskRunner) startTasks(ctx context.Context) []task.DaemonTask { 359 return r.mergeTasks( 360 r.queryDaemonTasks(ctx, 361 WithTaskStatusCond(task.TaskStatus_Created), 362 ), 363 r.queryDaemonTasks(ctx, 364 WithTaskStatusCond(task.TaskStatus_Running, task.TaskStatus_ResumeRequested), 365 WithLastHeartbeat(LE, time.Now().UnixNano()-r.options.heartbeatTimeout.Nanoseconds()), 366 ), 367 ) 368 } 369 370 // resumeTasks gets the tasks that need to resume. 371 // - status equals to task.TaskStatus_ResumeRequested and runner equals to local 372 func (r *taskRunner) resumeTasks(ctx context.Context) []task.DaemonTask { 373 // We only resume the tasks that already running on this runner. For the tasks that 374 // run on other runners and heartbeat timeout, startTasks() will handle them. 375 return r.mergeTasks( 376 r.queryDaemonTasks(ctx, 377 WithTaskStatusCond(task.TaskStatus_ResumeRequested), 378 WithTaskRunnerCond(EQ, r.runnerID), 379 ), 380 ) 381 } 382 383 // pauseTasks gets the tasks that need to pause. 384 // - status equals to task.TaskStatus_PauseRequested and runner equals to local 385 func (r *taskRunner) pauseTasks(ctx context.Context) []task.DaemonTask { 386 // Handle the tasks which is in PauseRequested status: 387 // 1. the task is on current runner 388 // 2. the task is on other runners, but heartbeat timeout or null. In the handler, 389 // do NOT pause the active routine in this case. 390 return r.mergeTasks( 391 r.queryDaemonTasks(ctx, 392 WithTaskStatusCond(task.TaskStatus_PauseRequested), 393 WithTaskRunnerCond(EQ, r.runnerID), 394 ), 395 r.queryDaemonTasks(ctx, 396 WithTaskStatusCond(task.TaskStatus_PauseRequested), 397 WithLastHeartbeat(LE, time.Now().UnixNano()-r.options.heartbeatTimeout.Nanoseconds()), 398 ), 399 ) 400 } 401 402 // cancelTasks gets the tasks that need to cancel. 403 func (r *taskRunner) cancelTasks(ctx context.Context) []task.DaemonTask { 404 // Handle the tasks which is in CancelRequested status: 405 // 1. the task is on current runner 406 // 2. the task is on other runners, but heartbeat timeout or null. In the handler, 407 // do NOT cancel the active routine in this case. 408 return r.mergeTasks( 409 r.queryDaemonTasks(ctx, 410 WithTaskStatusCond(task.TaskStatus_CancelRequested), 411 WithTaskRunnerCond(EQ, r.runnerID), 412 ), 413 r.queryDaemonTasks(ctx, 414 WithTaskStatusCond(task.TaskStatus_CancelRequested), 415 WithLastHeartbeat(LE, time.Now().UnixNano()-r.options.heartbeatTimeout.Nanoseconds()), 416 ), 417 ) 418 } 419 420 func (r *taskRunner) handleTask(ctx context.Context) { 421 for { 422 select { 423 case <-ctx.Done(): 424 return 425 case h := <-r.pendingTaskHandle: 426 if err := h.Handle(ctx); err != nil { 427 r.logger.Error("failed to handle task", zap.Error(err)) 428 } 429 } 430 } 431 } 432 433 func (r *taskRunner) sendHeartbeat(ctx context.Context) { 434 ticker := time.NewTicker(r.options.heartbeatInterval) 435 defer ticker.Stop() 436 for { 437 select { 438 case <-ctx.Done(): 439 r.logger.Debug("heartbeat task stopped") 440 return 441 case <-ticker.C: 442 if taskFrameworkDisabled() { 443 continue 444 } 445 r.doSendHeartbeat(ctx) 446 } 447 } 448 } 449 450 func (r *taskRunner) doSendHeartbeat(ctx context.Context) { 451 r.daemonTasks.Lock() 452 tasks := make([]*daemonTask, 0, len(r.daemonTasks.m)) 453 for _, dt := range r.daemonTasks.m { 454 tasks = append(tasks, dt) 455 } 456 r.daemonTasks.Unlock() 457 458 for _, dt := range tasks { 459 if err := r.service.HeartbeatDaemonTask(ctx, dt.task); err != nil { 460 r.logger.Error("task heartbeat failed", 461 zap.Uint64("task ID", dt.task.ID), 462 zap.Error(err)) 463 } 464 } 465 } 466 467 func (r *taskRunner) startDaemonTask(ctx context.Context, dt *daemonTask) (bool, error) { 468 t := dt.task 469 t.TaskRunner = r.runnerID 470 t.TaskStatus = task.TaskStatus_Running 471 nowTime := time.Now() 472 t.UpdateAt = nowTime 473 t.LastRun = nowTime 474 475 // Update the last heartbeat if the daemon task is started successfully. 476 // The new value is used to prevent other runners to start this task at 477 // the same time. 478 t.LastHeartbeat = nowTime 479 480 // Clear the error message of the task when start it. And if it fails to 481 // start, new error message will be set again. 482 t.Details.Error = "" 483 484 // When update the daemon task, add the condition that last heartbeat of 485 // the task must be timeout or be null, which means that other runners does 486 // NOT try to start this task. 487 c, err := r.service.UpdateDaemonTask(ctx, []task.DaemonTask{t}, 488 WithLastHeartbeat(LE, nowTime.UnixNano()-r.options.heartbeatTimeout.Nanoseconds())) 489 if err != nil { 490 return false, err 491 } 492 493 // The daemon task may be updated by other runners, so do not start the task on this runner. 494 if c != 1 { 495 return false, nil 496 } 497 498 r.addDaemonTask(dt) 499 return true, nil 500 } 501 502 func (r *taskRunner) setDaemonTaskError(ctx context.Context, dt *daemonTask, errMsg error) { 503 r.logger.Info("daemon task stopped with error", zap.Uint64("task ID", dt.task.ID), 504 zap.Error(errMsg)) 505 t := dt.task 506 nowTime := time.Now() 507 t.UpdateAt = nowTime 508 t.Details.Error = errMsg.Error() 509 // TODO(volgariver6): if it is a retryable error, do not update the status, 510 // otherwise, set the status to Error. 511 _, err := r.service.UpdateDaemonTask(ctx, []task.DaemonTask{t}) 512 if err != nil { 513 r.logger.Error("failed to set error message to task", 514 zap.Uint64("task ID", t.ID), 515 zap.String("error message", errMsg.Error()), 516 zap.Error(err)) 517 } 518 } 519 520 func (r *taskRunner) addDaemonTask(dt *daemonTask) { 521 r.daemonTasks.Lock() 522 defer r.daemonTasks.Unlock() 523 if _, ok := r.daemonTasks.m[dt.task.ID]; ok { 524 return 525 } 526 r.daemonTasks.m[dt.task.ID] = dt 527 } 528 529 func (r *taskRunner) removeDaemonTask(id uint64) { 530 r.daemonTasks.Lock() 531 defer r.daemonTasks.Unlock() 532 delete(r.daemonTasks.m, id) 533 } 534 535 func (r *taskRunner) exists(id uint64) bool { 536 r.daemonTasks.Lock() 537 defer r.daemonTasks.Unlock() 538 if _, ok := r.daemonTasks.m[id]; ok { 539 return true 540 } 541 return false 542 }